PyPI - torchmonarch-nightly - Versions diffs - 2025.6.30__cp312-cp312-manylinux2014_x86_64.whl → 2025.7.25__cp312-cp312-manylinux2014_x86_64.whl - Mend

torchmonarch-nightly 2025.6.30__cp312-cp312-manylinux2014_x86_64.whl → 2025.7.25__cp312-cp312-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (87) hide show

monarch/__init__.py +13 -9
monarch/_rust_bindings.so +0 -0
monarch/{_monarch/selection → _src/actor}/__init__.py +3 -7
monarch/_src/actor/actor_mesh.py +874 -0
monarch/{allocator.py → _src/actor/allocator.py} +26 -17
monarch/_src/actor/bootstrap_main.py +73 -0
monarch/{code_sync.py → _src/actor/code_sync/__init__.py} +3 -1
monarch/_src/actor/code_sync/auto_reload.py +223 -0
monarch/_src/actor/debugger.py +565 -0
monarch/_src/actor/endpoint.py +270 -0
monarch/_src/actor/event_loop.py +97 -0
monarch/_src/actor/future.py +100 -0
monarch/{pdb_wrapper.py → _src/actor/pdb_wrapper.py} +47 -46
monarch/{common/pickle_flatten.py → _src/actor/pickle.py} +26 -2
monarch/_src/actor/proc_mesh.py +500 -0
monarch/_src/actor/sync_state.py +18 -0
monarch/{telemetry.py → _src/actor/telemetry/__init__.py} +1 -1
monarch/_src/actor/telemetry/rust_span_tracing.py +159 -0
monarch/_src/actor/tensor_engine_shim.py +56 -0
monarch/_src/tensor_engine/rdma.py +180 -0
monarch/_testing.py +3 -2
monarch/actor/__init__.py +51 -0
monarch/actor_mesh.py +6 -752
monarch/bootstrap_main.py +8 -47
monarch/common/client.py +1 -1
monarch/common/controller_api.py +2 -1
monarch/common/device_mesh.py +12 -2
monarch/common/messages.py +12 -1
monarch/common/recording.py +4 -3
monarch/common/remote.py +135 -52
monarch/common/tensor.py +2 -1
monarch/controller/backend.py +2 -2
monarch/controller/controller.py +2 -1
monarch/controller/rust_backend/controller.py +2 -1
monarch/fetch.py +3 -5
monarch/mesh_controller.py +201 -139
monarch/monarch_controller +0 -0
monarch/opaque_module.py +4 -6
monarch/opaque_object.py +3 -3
monarch/proc_mesh.py +6 -309
monarch/python_local_mesh.py +1 -1
monarch/rust_backend_mesh.py +2 -1
monarch/rust_local_mesh.py +4 -2
monarch/sim_mesh.py +10 -19
monarch/simulator/command_history.py +1 -1
monarch/simulator/interface.py +2 -1
monarch/simulator/mock_controller.py +1 -1
monarch/simulator/simulator.py +1 -1
monarch/tensor_engine/__init__.py +23 -0
monarch/tensor_worker_main.py +3 -1
monarch/tools/cli.py +3 -1
monarch/tools/commands.py +95 -35
monarch/tools/mesh_spec.py +55 -0
monarch/tools/utils.py +38 -0
monarch/worker/worker.py +1 -1
monarch/world_mesh.py +2 -1
monarch_supervisor/python_executable.py +6 -3
tests/error_test_binary.py +75 -9
tests/test_actor_error.py +370 -21
tests/test_alloc.py +1 -1
tests/test_allocator.py +373 -17
tests/test_controller.py +2 -0
tests/test_debugger.py +416 -0
tests/test_env_before_cuda.py +162 -0
tests/test_python_actors.py +184 -332
tests/test_rdma.py +198 -0
tests/test_remote_functions.py +40 -12
tests/test_rust_backend.py +7 -5
tests/test_sim_backend.py +1 -4
tests/test_tensor_engine.py +55 -1
{torchmonarch_nightly-2025.6.30.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/METADATA +6 -1
{torchmonarch_nightly-2025.6.30.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/RECORD +80 -68
torchmonarch_nightly-2025.7.25.dist-info/entry_points.txt +3 -0
monarch/_monarch/hyperactor/__init__.py +0 -58
monarch/_monarch/worker/debugger.py +0 -117
monarch/_monarch/worker/logging.py +0 -107
monarch/debugger.py +0 -379
monarch/future.py +0 -76
monarch/rdma.py +0 -162
torchmonarch_nightly-2025.6.30.dist-info/entry_points.txt +0 -3
/monarch/{_monarch/worker → _src}/__init__.py +0 -0
/monarch/{common/_device_utils.py → _src/actor/device_utils.py} +0 -0
/monarch/{common → _src/actor}/shape.py +0 -0
/monarch/{_monarch → _src/tensor_engine}/__init__.py +0 -0
{torchmonarch_nightly-2025.6.30.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/WHEEL +0 -0
{torchmonarch_nightly-2025.6.30.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/licenses/LICENSE +0 -0
{torchmonarch_nightly-2025.6.30.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/top_level.txt +0 -0

monarch/tools/commands.py CHANGED Viewed

@@ -7,7 +7,6 @@
 # pyre-strict
 import argparse
-import functools
 import inspect
 import logging
 import os
@@ -21,8 +20,8 @@ from monarch.tools.config import (  # @manual=//monarch/python/monarch/tools/con
 )
 from monarch.tools.mesh_spec import mesh_spec_from_metadata, ServerSpec
-from torchx.runner import Runner
-from torchx.specs import AppDef, AppDryRunInfo, AppState, CfgVal
+from torchx.runner import Runner  # @manual=//torchx/runner:lib_core
+from torchx.specs import AppDef, AppDryRunInfo, AppState, CfgVal, parse_app_handle
 from torchx.specs.builders import parse_args
 from torchx.util.types import decode, decode_optional
@@ -84,14 +83,10 @@ def component_args_from_cli(
 def create(
     config: Config,
-    component_fn: Optional[Callable[..., AppDef]] = None,
-) -> Callable[..., Union[str, AppDryRunInfo]]:
+    appdef: AppDef,
+) -> Union[str, AppDryRunInfo]:
     """Creates a monarch server by submitting it as a job to the target scheduler.
-    Note that this function returns a `Callable` that has to be called with the
-    same arguments that one would call the `component_fn` to actually submit
-    the job that runs the monarch server.
     Usage:
     .. doc-test::
@@ -99,6 +94,8 @@ def create(
         from monarch.tools.config import defaults
         config = defaults.config(scheduler="slurm")
+        appdef = defaults.component_fn(scheduler=config.scheduler)()
         config.scheduler_args.update(
             {
                 "partition": "prod",
@@ -108,7 +105,7 @@ def create(
         )
         config.dryrun = True
-        create(default_config)(host_type="gpu.medium", num_hosts=4)
+        create(config, appdef)
     Args:
@@ -120,33 +117,26 @@ def create(
     """
     scheduler: str = config.scheduler
     cfg: Mapping[str, CfgVal] = config.scheduler_args
-    component: Callable[..., AppDef] = component_fn or defaults.component_fn(scheduler)
-    @functools.wraps(component)
-    def _run(*args: Any, **kwargs: Any) -> Union[str, AppDryRunInfo]:
-        # for logging call-site context in application metadata
-        os.environ["TORCHX_CONTEXT_NAME"] = os.getenv("TORCHX_CONTEXT_NAME", "monarch")
-        appdef = component(*args, **kwargs)
+    # for logging call-site context in application metadata
+    os.environ["TORCHX_CONTEXT_NAME"] = os.getenv("TORCHX_CONTEXT_NAME", "monarch")
-        with torchx_runner() as runner:
-            info = runner.dryrun(appdef, scheduler, cfg, config.workspace)
-            info_json_fmt = AppDryRunInfo(
-                info.request,
-                fmt=defaults.dryrun_info_formatter(info),
-            )
-            info_json_fmt._app = info._app
-            info_json_fmt._cfg = info._cfg
-            info_json_fmt._scheduler = info._scheduler
+    with torchx_runner() as runner:
+        info = runner.dryrun(appdef, scheduler, cfg, config.workspace)
-            if config.dryrun:
-                return info_json_fmt
-            else:
-                server_handle = runner.schedule(info)
-                return server_handle
+        info_json_fmt = AppDryRunInfo(
+            info.request,
+            fmt=defaults.dryrun_info_formatter(info),
+        )
+        info_json_fmt._app = info._app
+        info_json_fmt._cfg = info._cfg
+        info_json_fmt._scheduler = info._scheduler
-    return _run
+        if config.dryrun:
+            return info_json_fmt
+        else:
+            server_handle = runner.schedule(info)
+            return server_handle
 def info(server_handle: str) -> Optional[ServerSpec]:
@@ -183,14 +173,22 @@ def info(server_handle: str) -> Optional[ServerSpec]:
         mesh_specs.append(spec)
-    return ServerSpec(name=appdef.name, state=status.state, meshes=mesh_specs)
+    scheduler, namespace, _ = parse_app_handle(server_handle)
+    return ServerSpec(
+        name=appdef.name,
+        state=status.state,
+        meshes=mesh_specs,
+        scheduler=scheduler,
+        namespace=namespace,
+    )
 _5_SECONDS = timedelta(seconds=5)
 async def server_ready(
-    server_handle: str, check_interval: timedelta = _5_SECONDS
+    server_handle: str,
+    check_interval: timedelta = _5_SECONDS,
 ) -> Optional[ServerSpec]:
     """Waits until the server's job is in RUNNING state to returns the server spec.
     Returns `None` if the server does not exist.
@@ -236,6 +234,68 @@ async def server_ready(
             return server_spec
+async def get_or_create(
+    name: str,
+    config: Config,
+    appdef: AppDef,
+    check_interval: timedelta = _5_SECONDS,
+) -> ServerSpec:
+    """Waits for the server called `name` in the scheduler specified in the `config`
+    to be ready (e.g. RUNNING). If the server is not found then this function creates one
+    per the `appdef` spec, and waits for the server to be ready before returning.
+    Usage:
+    .. code-block:: python
+        import getpass
+        from monarch.tools.config import defaults
+        USER = getpass.getuser()
+        config = defaults.config(scheduler)
+        appdef = defaults.component_fn(config.scheduler)()
+        server_info = await get_or_create(f"{USER}_monarch", config, appdef)
+        server_handle = server_info.server_handle
+    Returns: A `ServerSpec` containing information about either the existing or the newly
+        created server.
+    """
+    assert not config.dryrun, "dryrun is not supported for get_or_create(), for dryrun use the create() API instead"
+    server_handle = f"{config.scheduler}:///{name}"
+    server_info = await server_ready(server_handle, check_interval)
+    if not server_info or not server_info.is_running:  # then create one
+        logger.info(
+            "no existing RUNNING server `%s` creating new one...", server_handle
+        )
+        # no dryrun (see assertion above) support so will always be a handle (str)
+        new_server_handle = str(create(config, appdef))
+        logger.info(f"created new `{new_server_handle}` waiting for it to be ready...")
+        server_info = await server_ready(new_server_handle, check_interval)
+        if not server_info:
+            raise RuntimeError(
+                f"the new server `{new_server_handle}` went missing (should never happen)"
+            )
+        if not server_info.is_running:
+            raise RuntimeError(
+                f"the new server `{new_server_handle}` has {server_info.state}"
+            )
+        logger.info(f"server `{new_server_handle}` is: {server_info.state}")
+        return server_info
+    else:
+        logger.info("found existing RUNNING server `%s`", server_handle)
+        return server_info
 def kill(server_handle: str) -> None:
     with torchx_runner() as runner:
         runner.cancel(server_handle)

monarch/tools/mesh_spec.py CHANGED Viewed

@@ -11,6 +11,7 @@ from typing import Any, Optional
 from monarch.tools.network import get_sockaddr
 from torchx import specs
+from torchx.specs.api import is_terminal
 DEFAULT_REMOTE_ALLOCATOR_PORT = 26600
@@ -122,11 +123,64 @@ class ServerSpec:
     name: str
     state: specs.AppState
     meshes: list[MeshSpec]
+    scheduler: str
+    namespace: str = ""
+    @property
+    def server_handle(self) -> str:
+        return f"{self.scheduler}://{self.namespace}/{self.name}"
     @property
     def is_running(self) -> bool:
         return self.state == specs.AppState.RUNNING
+    def host0(self, mesh_name: str) -> str:
+        """The hostname of the first node in the given mesh.
+        The return value of this method can be used to set `MASTER_ADDR` env var for torch.distributed.
+        NOTE: the state of this server must be RUNNING for this method to return a valid value.
+        Usage:
+        .. code-block:: python
+            from monarch.tools.commands import get_or_create
+            server_info = await get_or_create(...)
+            assert server_info.is_running
+            # allocate proc mesh -> create actor (code omitted for brevity)...
+            trainer_actor.call(
+                MASTER_ADDR=server_info.host0("trainer"),  # trainer mesh's 1st host
+                MASTER_PORT=29500,
+                ...
+            )
+        NOTE: The ordering of the hostnames is exactly the same as what comes back from the underlying
+        scheduler's `describe_job` or `list_*` API. Please find the exact semantics in the
+        respective scheduler's implementation in https://github.com/pytorch/torchx/tree/main/torchx/schedulers.
+        """
+        mesh_spec = self.get_mesh_spec(mesh_name)
+        if self.is_running:
+            # hostnames are only valid when the server is RUNNING
+            if not mesh_spec.hostnames:
+                raise RuntimeError(f"{self.server_handle} does not have any hosts")
+            return mesh_spec.hostnames[0]
+        elif self.state in [specs.AppState.SUBMITTED, specs.AppState.PENDING]:
+            raise RuntimeError(
+                f"{self.server_handle} is {self.state}."
+                f" Use `monarch.tools.commands.server_ready()` to wait for the server to be {specs.AppState.RUNNING}"
+            )
+        elif is_terminal(self.state):
+            raise RuntimeError(
+                f"{self.server_handle} is {self.state}."
+                " Use `monarch.tools.commands.get_or_create()` to create a new server"
+            )
+        else:
+            raise RuntimeError(
+                f"{self.server_handle} is in an invalid state: {self.state}. Please report this as a bug"
+            )
     def get_mesh_spec(self, mesh_name: str) -> MeshSpec:
         for mesh_spec in self.meshes:
             if mesh_spec.name == mesh_name:
@@ -152,6 +206,7 @@ class ServerSpec:
         return {
             "name": self.name,
+            "server_handle": self.server_handle,
             "state": self.state.name,
             "meshes": {
                 mesh.name: {

monarch/tools/utils.py ADDED Viewed

@@ -0,0 +1,38 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+# pyre-strict
+import os
+from typing import Optional
+class conda:
+    """Conda related util functions."""
+    @staticmethod
+    def active_env_dir() -> Optional[str]:
+        """
+        Returns the currently active conda environment's directory.
+        `None` if run outside of a conda environment.
+        """
+        return os.getenv("CONDA_PREFIX")
+    @staticmethod
+    def active_env_name() -> Optional[str]:
+        """
+        Returns the currently active conda environment name.
+        `None` if run outside of a conda environment.
+        """
+        env_name = os.getenv("CONDA_DEFAULT_ENV")
+        if not env_name:
+            # conda envs activated with metaconda don't set CONDA_DEFAULT_ENV, so
+            # fall back to CONDA_PREFIX, which points to the path of the currently active conda environment,
+            # e.g. /home/$USER/.conda/envs/{env_name}
+            if env_dir := conda.active_env_dir():
+                env_name = os.path.basename(env_dir)
+        return env_name

monarch/worker/worker.py CHANGED Viewed

@@ -37,13 +37,13 @@ import torch.distributed
 import torch.fx
 import zmq
 import zmq.asyncio
+from monarch._src.actor.shape import NDSlice
 from monarch.common import messages
 from monarch.common.function import ResolvableFunction
 from monarch.common.messages import DependentOnError, Dims
 from monarch.common.process_group import SingleControllerProcessGroupWrapper
 from monarch.common.reference import Ref, Referenceable
-from monarch.common.shape import NDSlice
 from monarch.common.tensor_factory import TensorFactory
 from monarch.common.tree import flatten, flattener
 from monarch_supervisor import get_message_queue, Letter

monarch/world_mesh.py CHANGED Viewed

@@ -8,10 +8,11 @@
 from typing import List
+from monarch._src.actor.shape import NDSlice
 from monarch.common.client import Client
 from monarch.common.device_mesh import DeviceMesh
-from monarch.common.shape import NDSlice
 from monarch.controller.backend import ProcessBackend

monarch_supervisor/python_executable.py CHANGED Viewed

@@ -11,7 +11,10 @@ import sys
 try:
     from __manifest__ import fbmake  # noqa
-    IN_PAR = True
+    # simply checking for the existence of __manifest__ is not enough to tell if we are in a PAR
+    # because monarch wheels include a dummy __manifest__ (see fbcode//monarch/python/monarch/session/meta/__manifest__.py)
+    # so that we can use libfb programmatically. Hence additionally check if the `par_style` key is not null/empty
+    IN_PAR = bool(fbmake.get("par_style"))
 except ImportError:
     IN_PAR = False
@@ -26,8 +29,8 @@ if IN_PAR:
         PYTHON_EXECUTABLE = os.environ["FB_XAR_INVOKED_NAME"]
     else:
         try:
-            with importlib.resources.path(
-                "monarch_tensor_worker_env", "worker_env"
+            with importlib.resources.as_file(
+                importlib.resources.files("monarch_tensor_worker_env") / "worker_env"
             ) as path:
                 if not path.exists():
                     raise ImportError()

tests/error_test_binary.py CHANGED Viewed

@@ -9,11 +9,11 @@ import ctypes
 import sys
 import click
+from monarch._rust_bindings.monarch_extension.blocking import blocking_function
 from monarch._rust_bindings.monarch_extension.panic import panicking_function
-from monarch.actor_mesh import Actor, endpoint, send
-from monarch.proc_mesh import proc_mesh
+from monarch.actor import Actor, endpoint, proc_mesh, send
 class ErrorActor(Actor):
@@ -36,12 +36,24 @@ class ErrorActor(Actor):
         """Endpoint that calls a Rust function that panics."""
         panicking_function()
+    @endpoint
+    async def cause_stuck(self) -> None:
+        """Endpoint that causes the process to hang indefinitely."""
+        blocking_function()
     @endpoint
     async def await_then_error(self) -> None:
         await asyncio.sleep(0.1)
         await asyncio.sleep(0.1)
         raise RuntimeError("oh noez")
+    @endpoint
+    async def get_pid(self) -> int:
+        """Endpoint that returns the process PID."""
+        import os
+        return os.getpid()
 class ErrorActorSync(Actor):
     """An actor that has endpoints cause segfaults."""
@@ -73,8 +85,7 @@ def _run_error_test_sync(num_procs, sync_endpoint, endpoint_name):
     error_actor = proc.spawn("error_actor", actor_class).get()
     # This output is checked in the test to make sure that the process actually got here
-    print("I actually ran")
-    sys.stdout.flush()
+    print("Started function error_test", flush=True)
     if endpoint_name == "cause_segfault":
         endpoint = error_actor.cause_segfault
@@ -104,8 +115,7 @@ def _run_error_test(num_procs, sync_endpoint, endpoint_name):
         error_actor = await proc.spawn("error_actor", actor_class)
         # This output is checked in the test to make sure that the process actually got here
-        print("I actually ran")
-        sys.stdout.flush()
+        print("Started function error_test", flush=True)
         if endpoint_name == "cause_segfault":
             endpoint = error_actor.cause_segfault
@@ -147,12 +157,30 @@ def error_endpoint(num_procs, sync_test_impl, sync_endpoint, endpoint_name):
 @main.command("error-bootstrap")
 def error_bootstrap():
-    print("I actually ran")
-    sys.stdout.flush()
+    print("Started function error_bootstrap", flush=True)
     proc_mesh(gpus=4, env={"MONARCH_ERROR_DURING_BOOTSTRAP_FOR_TESTING": "1"}).get()
+async def _error_unmonitored():
+    print("Started function _error_unmonitored", flush=True)
+    proc = await proc_mesh(gpus=1)
+    actor = await proc.spawn("error_actor", ErrorActor)
+    # fire and forget
+    send(actor.await_then_error, (), {}, None, "all")
+    # Wait. Eventually a supervision event will get propagated and the process
+    # will exit.
+    #
+    # If an event is not delivered, the test will time out before this sleep
+    # finishes.
+    await asyncio.sleep(300)
+"""
+TODO: This test should be enabled when stop() is fully implemented.
 async def _error_unmonitored():
     print("I actually ran")
     sys.stdout.flush()
@@ -161,7 +189,8 @@ async def _error_unmonitored():
     actor = await proc.spawn("error_actor", ErrorActor)
     # fire and forget
-    send(actor.await_then_error, (), {}, None, "all")
+    send(actor.cause_stuck, (), {}, None, "all")
+    proc_mesh.stop()
     # Wait. Eventually a supervision event will get propagated and the process
     # will exit.
@@ -169,6 +198,7 @@ async def _error_unmonitored():
     # If an event is not delivered, the test will time out before this sleep
     # finishes.
     await asyncio.sleep(300)
+"""
 @main.command("error-unmonitored")
@@ -176,5 +206,41 @@ def error_unmonitored():
     asyncio.run(_error_unmonitored())
+async def _error_cleanup():
+    """Test function that spawns an 8 process procmesh and calls an endpoint that returns a normal exception."""
+    print("Started function _error_cleanup() for parent process", flush=True)
+    # Spawn an 8 process procmesh
+    proc = await proc_mesh(gpus=8)
+    error_actor = await proc.spawn("error_actor", ErrorActor)
+    print("Procmesh spawned, collecting child PIDs from actors", flush=True)
+    # Get PIDs from all actor processes
+    try:
+        # Call get_pid endpoint on all actors to collect their PIDs
+        pids = await error_actor.get_pid.call()
+        child_pids = [str(pid) for _, pid in pids]
+        print(f"CHILD_PIDS: {','.join(child_pids)}", flush=True)
+    except Exception as e:
+        print(f"Error getting child PIDs from actors: {e}", flush=True)
+    print("About to call endpoint that raises exception", flush=True)
+    # Call an endpoint that raises a normal exception
+    try:
+        await error_actor.await_then_error.call()
+    except Exception as e:
+        print(f"Expected exception caught: {e}", flush=True)
+        # Re-raise to cause the process to exit with non-zero code
+        raise
+@main.command("error-cleanup")
+def error_cleanup():
+    """Command that spawns an 8 process procmesh and calls an endpoint that returns a normal exception."""
+    asyncio.run(_error_cleanup())
 if __name__ == "__main__":
     main()