PyPI - torchmonarch-nightly - Versions diffs - 2025.6.4__cp310-cp310-manylinux2014_x86_64.whl - Mend

torchmonarch-nightly 2025.6.4__cp310-cp310-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (157) hide show

monarch/__init__.py +189 -0
monarch/_monarch/__init__.py +5 -0
monarch/_monarch/hyperactor/__init__.py +74 -0
monarch/_monarch/selection/__init__.py +13 -0
monarch/_monarch/worker/__init__.py +0 -0
monarch/_monarch/worker/debugger.py +117 -0
monarch/_monarch/worker/logging.py +107 -0
monarch/_rust_bindings.so +0 -0
monarch/_testing.py +198 -0
monarch/actor_mesh.py +692 -0
monarch/allocator.py +62 -0
monarch/bootstrap_main.py +75 -0
monarch/builtins/__init__.py +14 -0
monarch/builtins/log.py +22 -0
monarch/builtins/random.py +69 -0
monarch/cached_remote_function.py +257 -0
monarch/common/_C.pyi +11 -0
monarch/common/_C.so +0 -0
monarch/common/__init__.py +0 -0
monarch/common/_coalescing.py +308 -0
monarch/common/_device_utils.py +18 -0
monarch/common/_tensor_to_table.py +172 -0
monarch/common/base_tensor.py +28 -0
monarch/common/borrows.py +143 -0
monarch/common/client.py +646 -0
monarch/common/constants.py +10 -0
monarch/common/context_manager.py +40 -0
monarch/common/controller_api.py +104 -0
monarch/common/device_mesh.py +443 -0
monarch/common/fake.py +55 -0
monarch/common/function.py +160 -0
monarch/common/function_caching.py +164 -0
monarch/common/future.py +168 -0
monarch/common/invocation.py +125 -0
monarch/common/mast.py +221 -0
monarch/common/messages.py +572 -0
monarch/common/mock_cuda.py +41 -0
monarch/common/opaque_ref.py +98 -0
monarch/common/pickle_flatten.py +48 -0
monarch/common/pipe.py +152 -0
monarch/common/process_group.py +55 -0
monarch/common/recording.py +127 -0
monarch/common/reference.py +33 -0
monarch/common/remote.py +304 -0
monarch/common/selection.py +9 -0
monarch/common/shape.py +204 -0
monarch/common/stream.py +111 -0
monarch/common/tensor.py +793 -0
monarch/common/tensor_factory.py +31 -0
monarch/common/tree.py +73 -0
monarch/controller/__init__.py +7 -0
monarch/controller/backend.py +223 -0
monarch/controller/controller.py +223 -0
monarch/controller/debugger.py +47 -0
monarch/controller/history.py +90 -0
monarch/controller/rust_backend/__init__.py +7 -0
monarch/controller/rust_backend/controller.py +245 -0
monarch/fetch.py +55 -0
monarch/future.py +25 -0
monarch/gradient/__init__.py +11 -0
monarch/gradient/_gradient_generator.pyi +22 -0
monarch/gradient/_gradient_generator.so +0 -0
monarch/gradient_generator.py +185 -0
monarch/memory.py +43 -0
monarch/monarch_controller +0 -0
monarch/notebook.py +761 -0
monarch/opaque_module.py +235 -0
monarch/opaque_object.py +88 -0
monarch/parallel/__init__.py +9 -0
monarch/parallel/pipelining/__init__.py +7 -0
monarch/parallel/pipelining/runtime.py +847 -0
monarch/parallel/pipelining/schedule_ir.py +692 -0
monarch/parallel/pipelining/scheduler.py +249 -0
monarch/proc_mesh.py +188 -0
monarch/profiler.py +160 -0
monarch/python_local_mesh.py +107 -0
monarch/random.py +61 -0
monarch/rdma.py +190 -0
monarch/remote_class.py +114 -0
monarch/rust_backend_mesh.py +280 -0
monarch/rust_local_mesh.py +1402 -0
monarch/sim_mesh.py +357 -0
monarch/simulator/__init__.py +7 -0
monarch/simulator/command_history.py +424 -0
monarch/simulator/config.py +21 -0
monarch/simulator/interface.py +59 -0
monarch/simulator/ir.py +770 -0
monarch/simulator/mock_controller.py +214 -0
monarch/simulator/profiling.py +424 -0
monarch/simulator/simulator.py +1052 -0
monarch/simulator/task.py +255 -0
monarch/simulator/tensor.py +373 -0
monarch/simulator/trace.py +395 -0
monarch/simulator/utils.py +41 -0
monarch/simulator/worker.py +389 -0
monarch/tensor_worker_main.py +260 -0
monarch/tensorboard.py +84 -0
monarch/timer/__init__.py +21 -0
monarch/timer/example_monarch.py +78 -0
monarch/timer/example_spmd.py +55 -0
monarch/timer/execution_timer.py +199 -0
monarch/timer/execution_timer_test.py +131 -0
monarch/tools/__init__.py +7 -0
monarch/tools/cli.py +167 -0
monarch/tools/commands.py +189 -0
monarch/tools/components/__init__.py +7 -0
monarch/tools/components/hyperactor.py +57 -0
monarch/tools/config/__init__.py +20 -0
monarch/tools/config/defaults.py +54 -0
monarch/tools/mesh_spec.py +121 -0
monarch/worker/__init__.py +7 -0
monarch/worker/_testing_function.py +481 -0
monarch/worker/compiled_block.py +270 -0
monarch/worker/debugger.py +125 -0
monarch/worker/lines.py +47 -0
monarch/worker/monitor.py +53 -0
monarch/worker/worker.py +1191 -0
monarch/world_mesh.py +34 -0
monarch_supervisor/__init__.py +1044 -0
monarch_supervisor/_testing.py +44 -0
monarch_supervisor/function_call.py +30 -0
monarch_supervisor/host.py +386 -0
monarch_supervisor/launchers.py +145 -0
monarch_supervisor/log_pstree.py +48 -0
monarch_supervisor/logging.py +103 -0
monarch_supervisor/python_executable.py +42 -0
tests/__init__.py +0 -0
tests/dispatch_bench.py +124 -0
tests/dispatch_bench_helper.py +25 -0
tests/error_test_binary.py +139 -0
tests/simulator/__init__.py +0 -0
tests/simulator/test_profiling.py +136 -0
tests/simulator/test_simulator.py +411 -0
tests/simulator/test_task.py +64 -0
tests/simulator/test_worker.py +102 -0
tests/sleep_binary.py +35 -0
tests/test_actor_error.py +112 -0
tests/test_alloc.py +25 -0
tests/test_coalescing.py +492 -0
tests/test_controller.py +835 -0
tests/test_device_mesh.py +132 -0
tests/test_fault_tolerance.py +398 -0
tests/test_future.py +94 -0
tests/test_grad_generator.py +121 -0
tests/test_mock_cuda.py +74 -0
tests/test_pdb_actor.py +110 -0
tests/test_python_actors.py +372 -0
tests/test_remote_functions.py +1271 -0
tests/test_rust_backend.py +182 -0
tests/test_signal_safe_block_on.py +103 -0
tests/test_sim_backend.py +54 -0
torchmonarch_nightly-2025.6.4.dist-info/METADATA +94 -0
torchmonarch_nightly-2025.6.4.dist-info/RECORD +157 -0
torchmonarch_nightly-2025.6.4.dist-info/WHEEL +5 -0
torchmonarch_nightly-2025.6.4.dist-info/entry_points.txt +3 -0
torchmonarch_nightly-2025.6.4.dist-info/licenses/LICENSE +29 -0
torchmonarch_nightly-2025.6.4.dist-info/top_level.txt +3 -0

monarch/parallel/pipelining/scheduler.py ADDED Viewed

@@ -0,0 +1,249 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+import math
+from functools import cache
+from logging import getLogger
+from timeit import default_timer as timer
+from .schedule_ir import (
+    _Action,
+    _add_send_recv,
+    _ComputationType,
+    _dump_csv,
+    _format_pipeline_order,
+    _merge_bw,
+    BACKWARD,
+    FORWARD,
+    FULL_BACKWARD,
+)
+logger = getLogger()  # called without a name, so this is the root logger
+def get_stage_str(model_chunk_index, training_stage, mb_index):
+    ctype = _ComputationType.from_str(training_stage)
+    return str(_Action(model_chunk_index, ctype, mb_index))
+def get_dora_schedule(
+    num_model_chunks,
+    pipeline_parallel_size,
+    num_round,
+    num_microbatch_per_round,
+    zero_bubble,
+    total_num_microbatches,
+    num_microbatches,
+    dfs=False,
+    prefetch_weight_latency=1.0,
+    enable_weight_sharding_in_pp=False,
+    enable_wgrad_sharding_in_pp=False,
+):
+    """Build a per-rank action schedule and lower it with `_add_send_recv`.
+    For every pipeline rank the function emits action strings in four phases:
+    warm-up forwards, a steady-state loop (one "F", one "B"/"BW" and one "W"
+    per iteration), cool-down "B"/"W" pairs, and finally any "W" actions that
+    were deferred during the steady state. The strings are parsed back into
+    `_Action` objects whose stage index is
+    `chunk_index * pipeline_parallel_size + rank`, passed through `_merge_bw`
+    and then through `_add_send_recv`.
+    Side effects: `dump_scheduler_ir` is hard-coded to True, so every call
+    opens "lowered_compute.log" and "lowered_comms.log" for writing (relative
+    paths) and passes "lowered_compute.csv" and
+    "lowered_compute_with_send_recv.csv" to `_dump_csv`.
+    Args:
+        num_model_chunks: number of model chunks cycled through on each rank.
+        pipeline_parallel_size: number of pipeline ranks to schedule.
+        num_round: accepted but never read in this function.
+        num_microbatch_per_round: microbatches processed on one chunk before
+            the forward/backward chunk index advances.
+        zero_bubble: when true the steady state emits "B" and may defer "W";
+            otherwise it emits "BW" (and still appends a "W" action).
+        total_num_microbatches: number of forward actions emitted per rank
+            (warm-up plus steady state).
+        num_microbatches: only compared against `pipeline_parallel_size` to
+            pick the number of deferred-"W" iterations.
+        dfs: when true, disables deferred "W" and the additional 1b1w fill.
+        prefetch_weight_latency: accepted but never read in this function.
+        enable_weight_sharding_in_pp, enable_wgrad_sharding_in_pp: only
+            decide whether the generation time is logged before returning.
+    Returns:
+        The result of `_add_send_recv` applied to the merged per-rank schedule.
+    """
+    start_time = timer()
+    # Pass 1: per-rank phase sizes.
+    num_warmup_microbatches_list = []
+    num_1f1b_microbatches_list = []
+    num_additional_1b1w_list = []
+    for pipeline_parallel_rank in range(pipeline_parallel_size):
+        num_warmup_microbatches = 0
+        # The number of microbatches that last pipeline stage run before 1f1b.
+        num_warmup_microbatches += (num_model_chunks - 1) * num_microbatch_per_round
+        # From last PP stage up, each rank will be 2 more than the previous one.
+        num_warmup_microbatches += (
+            pipeline_parallel_size - pipeline_parallel_rank - 1
+        ) * 2
+        # Never warm up with more forwards than exist in total.
+        num_warmup_microbatches = min(num_warmup_microbatches, total_num_microbatches)
+        # The number of 1f1b for zero bubble schedule
+        # (used below as the count of leading steady-state iterations whose
+        # "W" action is deferred into weight_store).
+        if num_microbatches == pipeline_parallel_size:
+            num_1f1b_microbatches = pipeline_parallel_rank
+        else:
+            num_1f1b_microbatches = 2 * pipeline_parallel_rank
+        # Extra B/W pairs inserted once before a forward; clamped at zero.
+        num_additional_1b1w = max(
+            int(math.ceil((pipeline_parallel_size - 4) / 2)) - pipeline_parallel_rank,
+            0,
+        )
+        if dfs:
+            num_1f1b_microbatches = 0
+            num_additional_1b1w = 0
+        num_warmup_microbatches_list.append(num_warmup_microbatches)
+        num_1f1b_microbatches_list.append(num_1f1b_microbatches)
+        num_additional_1b1w_list.append(num_additional_1b1w)
+    schedules = []
+    # NOTE(review): the two helpers below are defined but never called in
+    # this function.
+    def get_last_pp_rank(i):
+        return (i - 1) % pipeline_parallel_size, i - 1 < 0
+    def get_next_pp_rank(i):
+        return (i + 1) % pipeline_parallel_size, i + 1 >= pipeline_parallel_size
+    # Pass 2: emit the action strings for each rank.
+    for pipeline_parallel_rank in range(pipeline_parallel_size):
+        s = []
+        # Next microbatch index to issue per model chunk, for fwd and bwd.
+        fwd_mb_index_list = [0 for i in range(num_model_chunks)]
+        bwd_mb_index_list = [0 for i in range(num_model_chunks)]
+        # Forwards start at chunk 0; backwards start at the last chunk.
+        fwd_model_chunk_index = 0
+        bwd_model_chunk_index = num_model_chunks - 1
+        # Deferred "W" action strings, appended after the cool-down phase.
+        weight_store = []
+        num_warmup_microbatches = num_warmup_microbatches_list[pipeline_parallel_rank]
+        num_1f1b_microbatches = num_1f1b_microbatches_list[pipeline_parallel_rank]
+        num_additional_1b1w = num_additional_1b1w_list[pipeline_parallel_rank]
+        fwd_mb_index = fwd_mb_index_list[fwd_model_chunk_index]
+        bwd_mb_index = bwd_mb_index_list[bwd_model_chunk_index]
+        # Set once the additional B/W pairs have been emitted for this rank.
+        fill_1b1w = False
+        for _ in range(num_warmup_microbatches):  # warm up fwd
+            fwd_mb_index = fwd_mb_index_list[fwd_model_chunk_index]
+            bwd_mb_index = bwd_mb_index_list[bwd_model_chunk_index]
+            tmp = get_stage_str(fwd_model_chunk_index, "F", fwd_mb_index)
+            s.append(tmp)
+            fwd_mb_index_list[fwd_model_chunk_index] += 1
+            # After a full round on this chunk, move to the next chunk (wrapping).
+            if fwd_mb_index_list[fwd_model_chunk_index] % num_microbatch_per_round == 0:
+                if fwd_model_chunk_index < num_model_chunks - 1:
+                    fwd_model_chunk_index += 1
+                else:
+                    fwd_model_chunk_index = 0
+        for i in range(
+            total_num_microbatches - num_warmup_microbatches
+        ):  # 1f1b and 1f1b1w
+            if (
+                fwd_model_chunk_index == 1 and not fill_1b1w
+            ):  # additional 1b1w to fill before fwd
+                fill_1b1w = True
+                for _ in range(num_additional_1b1w):
+                    bwd_mb_index = bwd_mb_index_list[bwd_model_chunk_index]
+                    tmp = get_stage_str(bwd_model_chunk_index, "B", bwd_mb_index)
+                    s.append(tmp)
+                    tmp = get_stage_str(bwd_model_chunk_index, "W", bwd_mb_index)
+                    s.append(tmp)
+                    bwd_mb_index_list[bwd_model_chunk_index] += 1
+                    if (
+                        bwd_mb_index_list[bwd_model_chunk_index]
+                        % num_microbatch_per_round
+                        == 0
+                    ):
+                        if bwd_model_chunk_index > 0:
+                            bwd_model_chunk_index -= 1
+                        else:
+                            bwd_model_chunk_index = num_model_chunks - 1
+            fwd_mb_index = fwd_mb_index_list[fwd_model_chunk_index]
+            bwd_mb_index = bwd_mb_index_list[bwd_model_chunk_index]
+            tmp = get_stage_str(fwd_model_chunk_index, "F", fwd_mb_index)
+            s.append(tmp)
+            fwd_mb_index_list[fwd_model_chunk_index] += 1
+            if fwd_mb_index_list[fwd_model_chunk_index] % num_microbatch_per_round == 0:
+                if fwd_model_chunk_index < num_model_chunks - 1:
+                    fwd_model_chunk_index += 1
+                else:
+                    fwd_model_chunk_index = 0
+            tmp = get_stage_str(
+                bwd_model_chunk_index, "B" if zero_bubble else "BW", bwd_mb_index
+            )
+            s.append(tmp)
+            tmp = get_stage_str(bwd_model_chunk_index, "W", bwd_mb_index)
+            # Defer "W" for the first num_1f1b_microbatches iterations of a
+            # zero-bubble schedule; otherwise emit it immediately.
+            # NOTE(review): with zero_bubble false a "W" still follows the
+            # "BW" action — confirm this is intended.
+            if zero_bubble and i < num_1f1b_microbatches:
+                weight_store.append(tmp)
+            else:
+                s.append(tmp)
+            bwd_mb_index_list[bwd_model_chunk_index] += 1
+            # After a full round on this chunk, move to the previous chunk (wrapping).
+            if bwd_mb_index_list[bwd_model_chunk_index] % num_microbatch_per_round == 0:
+                if bwd_model_chunk_index > 0:
+                    bwd_model_chunk_index -= 1
+                else:
+                    bwd_model_chunk_index = num_model_chunks - 1
+        # Backwards already emitted by the additional-1b1w fill are not repeated.
+        num_cooldown = (
+            num_warmup_microbatches - num_additional_1b1w
+            if fill_1b1w
+            else num_warmup_microbatches
+        )
+        for _ in range(num_cooldown):  # cooldown bwd
+            fwd_mb_index = fwd_mb_index_list[fwd_model_chunk_index]
+            bwd_mb_index = bwd_mb_index_list[bwd_model_chunk_index]
+            tmp = get_stage_str(bwd_model_chunk_index, "B", bwd_mb_index)
+            s.append(tmp)
+            tmp = get_stage_str(bwd_model_chunk_index, "W", bwd_mb_index)
+            s.append(tmp)
+            bwd_mb_index_list[bwd_model_chunk_index] += 1
+            if bwd_mb_index_list[bwd_model_chunk_index] % num_microbatch_per_round == 0:
+                if bwd_model_chunk_index > 0:
+                    bwd_model_chunk_index -= 1
+                else:
+                    bwd_model_chunk_index = num_model_chunks - 1
+        # Flush the deferred "W" actions at the very end of the rank's schedule.
+        if len(weight_store) > 0:
+            s += weight_store
+        schedules.append(s)
+    # Pass 3: parse the strings back into _Action objects, replacing the local
+    # chunk index with a stage index that also encodes the rank.
+    compute_schedules = {}
+    for rank in range(pipeline_parallel_size):
+        compute_schedules[rank] = []
+        for action_str in schedules[rank]:
+            action = _Action.from_str(action_str)
+            stage_index = action.stage_index * pipeline_parallel_size + rank
+            action = _Action(
+                stage_index, action.computation_type, action.microbatch_index
+            )
+            compute_schedules[rank].append(action)
+    # This is an alias, not a copy: the _merge_bw results written below are
+    # also visible through compute_schedules.
+    lowered_comm_schedule = compute_schedules
+    for rank in lowered_comm_schedule:
+        lowered_comm_schedule[rank] = _merge_bw(lowered_comm_schedule[rank])
+    # Hard-coded switch: the dump files are written on every call.
+    dump_scheduler_ir = True
+    if dump_scheduler_ir:
+        compute_str = _format_pipeline_order(lowered_comm_schedule)
+        with open("lowered_compute.log", "w") as logf:
+            logf.write(compute_str)
+        _dump_csv(compute_schedules, "lowered_compute.csv")
+    lowered_comm_schedule = _add_send_recv(
+        lowered_comm_schedule,
+        stage_to_rank=lambda chunk_index: chunk_index % pipeline_parallel_size,
+        num_stages=num_model_chunks * pipeline_parallel_size,
+    )
+    comms_str = _format_pipeline_order(lowered_comm_schedule)
+    if dump_scheduler_ir:
+        with open("lowered_comms.log", "w") as logf:
+            logf.write(comms_str)
+        _dump_csv(lowered_comm_schedule, "lowered_compute_with_send_recv.csv")
+    logger.debug("---------- lowered IR\n%s----------", comms_str)
+    # Both exits return the same object; the timing log below is only reached
+    # when at least one of the sharding flags is set.
+    if not enable_weight_sharding_in_pp and not enable_wgrad_sharding_in_pp:
+        return lowered_comm_schedule
+    generation_time = timer() - start_time
+    logger.info(f"schedule generation took {generation_time:.6f} seconds")
+    return lowered_comm_schedule
+# TODO - replace bfs / dfs functions below with new IR generators
+# Registry of schedule names accepted by generate_schedule. Each value is a
+# callable that forwards its arguments to a schedule builder; only "dora-dfs"
+# is currently enabled, the other entries are commented out.
+ir_schedules = {
+    # "dora": get_dora_schedule,
+    "dora-dfs": lambda *args, **kwargs: get_dora_schedule(*args, **kwargs, dfs=True),
+    # "zbv": get_zbv_schedule,
+    # "zbw": get_zbw_schedule,
+}
+# Per-schedule boolean flag, keyed by the same names as ir_schedules.
+is_zero_bubble = {
+    # "dora": True,
+    "dora-dfs": True,
+    # "zbv": True,
+    # "zbw": True,
+}
+@cache
+def generate_schedule(name: str, *args, **kwargs):
+    assert name in ir_schedules, f"{name} is not a supported schedule type"
+    schedules = ir_schedules[name](*args, **kwargs)
+    stage_to_rank = {}
+    for rank, schedule_actions_rank in schedules.items():
+        for action in schedule_actions_rank:
+            comp_type = action.computation_type
+            stage_idx = action.stage_index
+            if comp_type == FORWARD:
+                stage_to_rank[stage_idx] = rank
+            if comp_type in (BACKWARD, FULL_BACKWARD):
+                stage_to_rank[stage_idx] = rank
+    return schedules, stage_to_rank

monarch/proc_mesh.py ADDED Viewed

@@ -0,0 +1,188 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+import sys
+from typing import Any, cast, Optional, Type, TypeVar
+import monarch
+from monarch import ActorFuture as Future
+from monarch._rust_bindings.hyperactor_extension.alloc import (  # @manual=//monarch/monarch_extension:monarch_extension  # @manual=//monarch/monarch_extension:monarch_extension
+    Alloc,
+    AllocConstraints,
+    AllocSpec,
+)
+from monarch._rust_bindings.monarch_hyperactor.mailbox import Mailbox
+from monarch._rust_bindings.monarch_hyperactor.proc_mesh import ProcMesh as HyProcMesh
+from monarch.actor_mesh import _Actor, _ActorMeshRefImpl, Actor, ActorMeshRef
+from monarch.common._device_utils import _local_device_count
+from monarch.rdma import RDMAManager
+T = TypeVar("T")
+# IN_PAR records whether ``fbmake`` can be imported from ``__manifest__``;
+# _get_bootstrap_args uses it to choose how child processes are launched.
+try:
+    from __manifest__ import fbmake  # noqa
+    IN_PAR = True
+except ImportError:
+    IN_PAR = False
+async def _allocate_nonblocking(alloc: Alloc) -> "ProcMesh":
+    return ProcMesh(await HyProcMesh.allocate_nonblocking(alloc))
+def _allocate_blocking(alloc: Alloc) -> "ProcMesh":
+    return ProcMesh(HyProcMesh.allocate_blocking(alloc))
+class ProcMesh:
+    def __init__(self, hy_proc_mesh: HyProcMesh) -> None:
+        self._proc_mesh = hy_proc_mesh
+        self._mailbox: Mailbox = self._proc_mesh.client
+        self._rdma_manager = self._spawn_blocking("rdma_manager", RDMAManager)
+    def spawn(self, name: str, Class: Type[T], *args: Any, **kwargs: Any) -> Future[T]:
+        return Future(
+            lambda: self._spawn_nonblocking(name, Class, *args, **kwargs),
+            lambda: self._spawn_blocking(name, Class, *args, **kwargs),
+        )
+    @classmethod
+    def from_alloc(self, alloc: Alloc) -> Future["ProcMesh"]:
+        return Future(
+            lambda: _allocate_nonblocking(alloc),
+            lambda: _allocate_blocking(alloc),
+        )
+    def _spawn_blocking(
+        self, name: str, Class: Type[T], *args: Any, **kwargs: Any
+    ) -> T:
+        if not issubclass(Class, Actor):
+            raise ValueError(
+                f"{Class} must subclass monarch.service.Actor to spawn it."
+            )
+        actor_mesh = self._proc_mesh.spawn_blocking(name, _Actor)
+        service = ActorMeshRef(
+            Class,
+            _ActorMeshRefImpl.from_hyperactor_mesh(self._mailbox, actor_mesh),
+            self._mailbox,
+        )
+        # useful to have this separate, because eventually we can reconstitute ActorMeshRef objects across pickling by
+        # doing `ActorMeshRef(Class, actor_handle)` but not calling _create.
+        service._create(args, kwargs)
+        return cast(T, service)
+    def __repr__(self) -> str:
+        return repr(self._proc_mesh)
+    def __str__(self) -> str:
+        return str(self._proc_mesh)
+    async def _spawn_nonblocking(
+        self, name: str, Class: Type[T], *args: Any, **kwargs: Any
+    ) -> T:
+        if not issubclass(Class, Actor):
+            raise ValueError(
+                f"{Class} must subclass monarch.service.Actor to spawn it."
+            )
+        actor_mesh = await self._proc_mesh.spawn_nonblocking(name, _Actor)
+        service = ActorMeshRef(
+            Class,
+            _ActorMeshRefImpl.from_hyperactor_mesh(self._mailbox, actor_mesh),
+            self._mailbox,
+        )
+        # useful to have this separate, because eventually we can reconstitute ActorMeshRef objects across pickling by
+        # doing `ActorMeshRef(Class, actor_handle)` but not calling _create.
+        service._create(args, kwargs)
+        return cast(T, service)
+async def local_proc_mesh_nonblocking(
+    *, gpus: Optional[int] = None, hosts: int = 1
+) -> ProcMesh:
+    if gpus is None:
+        gpus = _local_device_count()
+    spec = AllocSpec(AllocConstraints(), gpus=gpus, hosts=hosts)
+    allocator = monarch.LocalAllocator()
+    alloc = await allocator.allocate(spec)
+    return await ProcMesh.from_alloc(alloc)
+def local_proc_mesh_blocking(*, gpus: Optional[int] = None, hosts: int = 1) -> ProcMesh:
+    if gpus is None:
+        gpus = _local_device_count()
+    spec = AllocSpec(AllocConstraints(), gpus=gpus, hosts=hosts)
+    allocator = monarch.LocalAllocator()
+    alloc = allocator.allocate(spec).get()
+    return ProcMesh.from_alloc(alloc).get()
+def local_proc_mesh(*, gpus: Optional[int] = None, hosts: int = 1) -> Future[ProcMesh]:
+    return Future(
+        lambda: local_proc_mesh_nonblocking(gpus=gpus, hosts=hosts),
+        lambda: local_proc_mesh_blocking(gpus=gpus, hosts=hosts),
+    )
+_BOOTSTRAP_MAIN = "monarch.bootstrap_main"
+def _get_bootstrap_args() -> tuple[str, Optional[list[str]], dict[str, str]]:
+    if IN_PAR:
+        cmd = sys.argv[0]
+        args = None
+        env = {
+            "PAR_MAIN_OVERRIDE": _BOOTSTRAP_MAIN,
+        }
+    else:
+        cmd = sys.executable
+        args = ["-m", _BOOTSTRAP_MAIN]
+        env = {}
+    return cmd, args, env
+async def proc_mesh_nonblocking(
+    *, gpus: Optional[int] = None, hosts: int = 1, env: Optional[dict[str, str]] = None
+) -> ProcMesh:
+    if gpus is None:
+        gpus = _local_device_count()
+    spec = AllocSpec(AllocConstraints(), gpus=gpus, hosts=hosts)
+    env = env or {}
+    cmd, args, base_env = _get_bootstrap_args()
+    env.update(base_env)
+    env["HYPERACTOR_MANAGED_SUBPROCESS"] = "1"
+    allocator = monarch.ProcessAllocator(cmd, args, env)
+    alloc = await allocator.allocate(spec)
+    return await ProcMesh.from_alloc(alloc)
+def proc_mesh_blocking(
+    *, gpus: Optional[int] = None, hosts: int = 1, env: Optional[dict[str, str]] = None
+) -> ProcMesh:
+    if gpus is None:
+        gpus = _local_device_count()
+    spec = AllocSpec(AllocConstraints(), gpus=gpus, hosts=hosts)
+    env = env or {}
+    cmd, args, base_env = _get_bootstrap_args()
+    env.update(base_env)
+    env["HYPERACTOR_MANAGED_SUBPROCESS"] = "1"
+    allocator = monarch.ProcessAllocator(cmd, args, env)
+    alloc = allocator.allocate(spec).get()
+    return ProcMesh.from_alloc(alloc).get()
+def proc_mesh(
+    *, gpus: Optional[int] = None, hosts: int = 1, env: Optional[dict[str, str]] = None
+) -> Future[ProcMesh]:
+    return Future(
+        lambda: proc_mesh_nonblocking(gpus=gpus, hosts=hosts, env=env),
+        lambda: proc_mesh_blocking(gpus=gpus, hosts=hosts, env=env),
+    )

monarch/profiler.py ADDED Viewed

@@ -0,0 +1,160 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+# pyre-unsafe
+import itertools
+import os
+from dataclasses import dataclass
+from functools import partial
+from pathlib import Path
+from typing import Any, Dict, NamedTuple, Optional, Tuple
+import torch
+from monarch.common.remote import remote
+from monarch.remote_class import ControllerRemoteClass, WorkerRemoteClass
+class Schedule(NamedTuple):
+    """Plain-data description of a profiler schedule.
+    `_profiler_enter` expands an instance with
+    `torch.profiler.schedule(**schedule._asdict())`, so the field names must
+    match that function's keyword arguments.
+    """
+    # Required fields.
+    wait: int
+    warmup: int
+    active: int
+    # Optional fields, both defaulting to 0.
+    repeat: int = 0
+    skip_first: int = 0
+class profile:
+    """
+    The class wraps `torch.profiler.profile()` to allow invoking the profiler remotely.
+    There are two main differences:
+    1) `on_trace_ready` can only be a string, indicating the folder where the traces
+        will be saved.
+    2) `schedule` must be of type `monarch.profiler.Schedule`.
+    """
+    PATH_KEY = "on_trace_ready"
+    _counter = itertools.count()
+    def __init__(self, *args, **kwargs) -> None:
+        assert isinstance(kwargs.get(self.PATH_KEY, None), str), (
+            f"{self.PATH_KEY} must be passed and must be a string to represent the "
+            "path to save the profiler."
+        )
+        schedule = kwargs.get("schedule", None)
+        assert (
+            isinstance(schedule, Schedule) or schedule is None
+        ), "schedule can only be monarch.profiler.Schedule or None."
+        self.id = next(self._counter)
+        _profiler_controller_init(self.id, *args, **kwargs)
+    def __enter__(self) -> "profile":
+        _profiler_controller_enter(self.id)
+        return self
+    def __exit__(self, *args, **kwargs) -> None:
+        _profiler_controller_exit(self.id)
+    def step(self) -> None:
+        _profiler_controller_step(self.id)
+@dataclass
+class _Profiler:
+    """Arguments recorded by `_profiler_init` plus the profiler built from them."""
+    # Positional and keyword arguments later passed to torch.profiler.profile.
+    args: Tuple[Any, ...]
+    kwargs: Dict[str, Any]
+    # Stays None until _profiler_enter creates the torch profiler.
+    profiler: Optional[torch.profiler.profile] = None
+# Registry of profilers keyed by the id given to _profiler_init; entries are
+# removed again by _profiler_exit.
+_profilers: Dict[int, _Profiler] = {}
+def _profiler_init(ident, *args, **kwargs) -> None:
+    global _profilers
+    assert (
+        ident not in _profilers
+    ), f"Initializing an already existing profiler, {ident=}"
+    _profilers[ident] = _Profiler(args, kwargs)
+    # It's unclear why we cannot create the profiler here. Even though
+    # the thread is the same, profiler complains thread id mismatch.
+def _profiler_enter(ident, *args, **kwargs) -> None:
+    def on_trace_ready(prof, dir_path):
+        dir_path = Path(dir_path).absolute()
+        os.makedirs(dir_path, exist_ok=True)
+        # This is not a synchronized call, so it is okay to call without
+        # device mesh.
+        rank = torch.distributed.get_rank() if torch.distributed.is_initialized() else 0
+        prof.export_chrome_trace(f"{dir_path}/trace_{rank}.json")
+    profiler = _profilers[ident]
+    profiler.kwargs[profile.PATH_KEY] = partial(
+        on_trace_ready, dir_path=profiler.kwargs[profile.PATH_KEY]
+    )
+    schedule = profiler.kwargs.get("schedule", None)
+    if schedule is not None:
+        profiler.kwargs["schedule"] = torch.profiler.schedule(**schedule._asdict())
+    profiler.profiler = torch.profiler.profile(*profiler.args, **profiler.kwargs)
+    profiler.profiler.__enter__()
+def _profiler_exit(ident, *args, **kwargs) -> None:
+    profiler = _profilers[ident].profiler
+    assert profiler is not None
+    profiler.__exit__(None, None, None)
+    _profilers.pop(ident)
+def _profiler_step(ident, *args, **kwargs) -> None:
+    profiler = _profilers[ident].profiler
+    assert profiler is not None
+    profiler.step()
+# Remote wrappers, one per _profiler_* function above, each referenced by its
+# dotted path. The `profile` class calls these with its id.
+# NOTE(review): the meaning of propagate="inspect" is defined by
+# monarch.common.remote and is not visible here.
+_profiler_controller_init = remote(
+    "monarch.profiler._profiler_init", propagate="inspect"
+)
+_profiler_controller_enter = remote(
+    "monarch.profiler._profiler_enter", propagate="inspect"
+)
+_profiler_controller_exit = remote(
+    "monarch.profiler._profiler_exit", propagate="inspect"
+)
+_profiler_controller_step = remote(
+    "monarch.profiler._profiler_step", propagate="inspect"
+)
+class record_function(ControllerRemoteClass):
+    """
+    The class wraps `torch.profiler.record_function()` to allow invoking the
+    record_function remotely.
+    The constructor passes the dotted path "monarch.profiler.WorkerRecordFunction"
+    together with `name` and `args` to `ControllerRemoteClass`.
+    """
+    def __init__(self, name: str, args: Optional[str] = None) -> None:
+        super().__init__("monarch.profiler.WorkerRecordFunction", name, args)
+    # NOTE(review): the local bodies below do nothing besides returning;
+    # presumably `remote_method` forwards the call to the worker class —
+    # confirm against ControllerRemoteClass.
+    @ControllerRemoteClass.remote_method
+    def __enter__(self) -> "record_function":
+        return self
+    @ControllerRemoteClass.remote_method
+    def __exit__(self, *args, **kwargs) -> None:
+        return
+class WorkerRecordFunction(WorkerRemoteClass):
+    """Holds a `torch.profiler.record_function` and delegates enter/exit to it."""
+    def __init__(self, *args, **kwargs) -> None:
+        # All arguments are forwarded unchanged to torch.profiler.record_function.
+        self._record_function = torch.profiler.record_function(*args, **kwargs)
+    def __enter__(self) -> None:
+        # The value returned by the wrapped __enter__ is discarded.
+        self._record_function.__enter__()
+    def __exit__(self, *args, **kwargs) -> None:
+        self._record_function.__exit__(*args, **kwargs)

monarch/python_local_mesh.py ADDED Viewed

@@ -0,0 +1,107 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+# pyre-unsafe
+import os
+import subprocess
+from time import sleep
+from typing import Optional, TYPE_CHECKING
+import monarch_supervisor
+from monarch.common._device_utils import _local_device_count
+from monarch.common.fake import fake_call
+from monarch.common.invocation import DeviceException, RemoteException
+from monarch.world_mesh import world_mesh
+from monarch_supervisor import Context, HostConnected
+from monarch_supervisor.python_executable import PYTHON_EXECUTABLE
+if TYPE_CHECKING:
+    from monarch.common.device_mesh import DeviceMesh
+class PythonLocalContext:
+    def __init__(self, N: int):
+        # do a fake call to instantiate ThreadPoolExecutor so we don't block GIL later
+        fake_call(lambda: 0)
+        self.ctx = ctx = Context()
+        ctx.request_hosts(N)
+        # we want ctx to start its listener threads
+        # before creating the hosts because
+        # initialization will happen faster in this case
+        sleep(0)
+        supervisor_addr = f"tcp://127.0.0.1:{ctx.port}"
+        env = {
+            **os.environ,
+            "TORCH_SUPERVISOR_HEARTBEAT_INTERVAL": str(
+                monarch_supervisor.HEARTBEAT_INTERVAL
+            ),
+            # This is needed to avoid a hard failure in ncclx when we do not
+            # have backend topology info (eg. on RE).
+            "NCCL_IGNORE_TOPO_LOAD_FAILURE": "true",
+        }
+        # start_new_session=True, because we want the host managers to be able to kill
+        # any worker processes before they exit, even if the supervisor crashes, or we ctrl-c
+        # it in testing.
+        self.host_managers = [
+            subprocess.Popen(
+                [
+                    PYTHON_EXECUTABLE,
+                    "-m",
+                    "monarch_supervisor.host",
+                    supervisor_addr,
+                ],
+                env=env,
+                start_new_session=True,
+            )
+            for _ in range(N)
+        ]
+        connections = ctx.messagefilter(HostConnected)
+        self.hosts = [connections.recv(timeout=30).sender for _ in range(N)]
+    def shutdown(self):
+        self.ctx.shutdown()
+        for host_manager in self.host_managers:
+            host_manager.wait(timeout=10)
+def python_local_mesh(*, gpus: Optional[int] = None, hosts: int = 1) -> "DeviceMesh":
+    """
+    Creates a local device mesh with the given number of hosts and gpus per host.
+    Easy way to use PythonLocalContext.
+    Args:
+        gpus (Optional[int]): number of gpus per host.
+                              Default: the number of GPUs this machine has.
+        hosts (int): number of hosts, primarily used for simulating multiple machines locally.
+                     Default: 1
+    Example::
+        local_mesh = python_local_mesh(gpus=2)
+        with local_mesh.activate():
+            x = torch.rand(3, 4)
+            local_tensor = fetch_shard(x).result()
+        # Cleanly shut down the local mesh and exit.
+        local_mesh.exit()
+    """
+    ctx = PythonLocalContext(hosts)
+    if gpus is None:
+        gpus = _local_device_count()
+    dm = world_mesh(ctx.ctx, ctx.hosts, gpus)
+    def exit(
+        error: Optional[RemoteException | DeviceException | Exception] = None,
+    ) -> None:
+        dm.client.shutdown(True, error)
+        ctx.shutdown()
+    dm.exit = exit
+    return dm