torchmonarch-nightly 2025.7.1__cp313-cp313-manylinux2014_x86_64.whl → 2025.7.26__cp313-cp313-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91)
  1. monarch/__init__.py +13 -9
  2. monarch/_rust_bindings.so +0 -0
  3. monarch/{_monarch/selection → _src/actor}/__init__.py +3 -7
  4. monarch/_src/actor/actor_mesh.py +878 -0
  5. monarch/{allocator.py → _src/actor/allocator.py} +26 -17
  6. monarch/_src/actor/bootstrap_main.py +73 -0
  7. monarch/{code_sync.py → _src/actor/code_sync/__init__.py} +3 -1
  8. monarch/_src/actor/code_sync/auto_reload.py +223 -0
  9. monarch/_src/actor/debugger.py +565 -0
  10. monarch/_src/actor/endpoint.py +303 -0
  11. monarch/_src/actor/event_loop.py +97 -0
  12. monarch/_src/actor/future.py +100 -0
  13. monarch/{pdb_wrapper.py → _src/actor/pdb_wrapper.py} +47 -46
  14. monarch/{common/pickle_flatten.py → _src/actor/pickle.py} +26 -2
  15. monarch/_src/actor/proc_mesh.py +508 -0
  16. monarch/_src/actor/sync_state.py +18 -0
  17. monarch/{telemetry.py → _src/actor/telemetry/__init__.py} +1 -1
  18. monarch/_src/actor/telemetry/rust_span_tracing.py +159 -0
  19. monarch/_src/actor/tensor_engine_shim.py +59 -0
  20. monarch/_src/tensor_engine/rdma.py +180 -0
  21. monarch/_testing.py +3 -2
  22. monarch/actor/__init__.py +53 -0
  23. monarch/actor_mesh.py +6 -765
  24. monarch/bootstrap_main.py +8 -47
  25. monarch/common/client.py +1 -1
  26. monarch/common/controller_api.py +2 -1
  27. monarch/common/device_mesh.py +12 -2
  28. monarch/common/messages.py +21 -1
  29. monarch/common/recording.py +4 -3
  30. monarch/common/remote.py +135 -52
  31. monarch/common/tensor.py +2 -1
  32. monarch/controller/backend.py +2 -2
  33. monarch/controller/controller.py +2 -1
  34. monarch/controller/rust_backend/controller.py +2 -1
  35. monarch/fetch.py +3 -5
  36. monarch/gradient/_gradient_generator.so +0 -0
  37. monarch/mesh_controller.py +263 -139
  38. monarch/monarch_controller +0 -0
  39. monarch/opaque_module.py +4 -6
  40. monarch/opaque_object.py +3 -3
  41. monarch/proc_mesh.py +6 -309
  42. monarch/python_local_mesh.py +1 -1
  43. monarch/rust_backend_mesh.py +2 -1
  44. monarch/rust_local_mesh.py +4 -2
  45. monarch/sim_mesh.py +10 -19
  46. monarch/simulator/command_history.py +1 -1
  47. monarch/simulator/interface.py +2 -1
  48. monarch/simulator/mock_controller.py +1 -1
  49. monarch/simulator/simulator.py +1 -1
  50. monarch/tensor_engine/__init__.py +23 -0
  51. monarch/tensor_worker_main.py +3 -1
  52. monarch/tools/cli.py +3 -1
  53. monarch/tools/commands.py +129 -47
  54. monarch/tools/components/hyperactor.py +5 -3
  55. monarch/tools/config/__init__.py +18 -1
  56. monarch/tools/config/defaults.py +2 -2
  57. monarch/tools/mesh_spec.py +59 -1
  58. monarch/tools/utils.py +38 -0
  59. monarch/worker/worker.py +1 -1
  60. monarch/world_mesh.py +2 -1
  61. monarch_supervisor/python_executable.py +6 -3
  62. tests/error_test_binary.py +48 -10
  63. tests/test_actor_error.py +370 -21
  64. tests/test_alloc.py +1 -1
  65. tests/test_allocator.py +369 -17
  66. tests/test_controller.py +2 -0
  67. tests/test_debugger.py +416 -0
  68. tests/test_env_before_cuda.py +161 -0
  69. tests/test_python_actors.py +184 -333
  70. tests/test_rdma.py +198 -0
  71. tests/test_remote_functions.py +40 -12
  72. tests/test_rust_backend.py +7 -5
  73. tests/test_sim_backend.py +1 -4
  74. tests/test_tensor_engine.py +81 -1
  75. {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.26.dist-info}/METADATA +39 -1
  76. {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.26.dist-info}/RECORD +84 -72
  77. torchmonarch_nightly-2025.7.26.dist-info/entry_points.txt +3 -0
  78. monarch/_monarch/hyperactor/__init__.py +0 -58
  79. monarch/_monarch/worker/debugger.py +0 -117
  80. monarch/_monarch/worker/logging.py +0 -107
  81. monarch/debugger.py +0 -379
  82. monarch/future.py +0 -76
  83. monarch/rdma.py +0 -162
  84. torchmonarch_nightly-2025.7.1.dist-info/entry_points.txt +0 -3
  85. /monarch/{_monarch/worker → _src}/__init__.py +0 -0
  86. /monarch/{common/_device_utils.py → _src/actor/device_utils.py} +0 -0
  87. /monarch/{common → _src/actor}/shape.py +0 -0
  88. /monarch/{_monarch → _src/tensor_engine}/__init__.py +0 -0
  89. {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.26.dist-info}/WHEEL +0 -0
  90. {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.26.dist-info}/licenses/LICENSE +0 -0
  91. {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.26.dist-info}/top_level.txt +0 -0
monarch/tools/commands.py CHANGED
@@ -7,22 +7,22 @@
 # pyre-strict
 
 import argparse
-import functools
+import asyncio
 import inspect
 import logging
 import os
-import time
-from datetime import timedelta
+from datetime import datetime, timedelta
 from typing import Any, Callable, Mapping, Optional, Union
 
+from monarch.tools.components.hyperactor import DEFAULT_NAME
+
 from monarch.tools.config import (  # @manual=//monarch/python/monarch/tools/config/meta:defaults
     Config,
     defaults,
 )
-
 from monarch.tools.mesh_spec import mesh_spec_from_metadata, ServerSpec
-from torchx.runner import Runner
-from torchx.specs import AppDef, AppDryRunInfo, AppState, CfgVal
+from torchx.runner import Runner  # @manual=//torchx/runner:lib_core
+from torchx.specs import AppDef, AppDryRunInfo, AppState, CfgVal, parse_app_handle
 from torchx.specs.builders import parse_args
 from torchx.util.types import decode, decode_optional
 
@@ -84,14 +84,10 @@ def component_args_from_cli(
 
 def create(
     config: Config,
-    component_fn: Optional[Callable[..., AppDef]] = None,
-) -> Callable[..., Union[str, AppDryRunInfo]]:
+    name: str = DEFAULT_NAME,
+) -> Union[str, AppDryRunInfo]:
     """Creates a monarch server by submitting it as a job to the target scheduler.
 
-    Note that this function returns a `Callable` that has to be called with the
-    same arguments that one would call the `component_fn` to actually submit
-    the job that runs the monarch server.
-
     Usage:
 
     .. doc-test::
@@ -99,6 +95,8 @@ def create(
         from monarch.tools.config import defaults
 
         config = defaults.config(scheduler="slurm")
+        config.appdef = defaults.component_fn(scheduler=config.scheduler)()
+
         config.scheduler_args.update(
             {
                 "partition": "prod",
@@ -108,7 +106,7 @@ def create(
         )
         config.dryrun = True
 
-        create(default_config)(host_type="gpu.medium", num_hosts=4)
+        create(config)
 
 
     Args:
@@ -117,36 +115,32 @@ def create(
         component_fn: a function that returns the AppDef (job def).
             If not provided, defaults to the configured default for the scheduler
             (in most cases ``monarch.tools.components.hyperactor.proc_mesh``)
+        name: the name of the job. If none, a default job name will be created.
     """
     scheduler: str = config.scheduler
     cfg: Mapping[str, CfgVal] = config.scheduler_args
-    component: Callable[..., AppDef] = component_fn or defaults.component_fn(scheduler)
-
-    @functools.wraps(component)
-    def _run(*args: Any, **kwargs: Any) -> Union[str, AppDryRunInfo]:
-        # for logging call-site context in application metadata
-        os.environ["TORCHX_CONTEXT_NAME"] = os.getenv("TORCHX_CONTEXT_NAME", "monarch")
 
-        appdef = component(*args, **kwargs)
+    # for logging call-site context in application metadata
+    os.environ["TORCHX_CONTEXT_NAME"] = os.getenv("TORCHX_CONTEXT_NAME", "monarch")
 
-        with torchx_runner() as runner:
-            info = runner.dryrun(appdef, scheduler, cfg, config.workspace)
+    with torchx_runner() as runner:
+        appdef: AppDef = AppDef(name, config.appdef.roles, config.appdef.metadata)
 
-            info_json_fmt = AppDryRunInfo(
-                info.request,
-                fmt=defaults.dryrun_info_formatter(info),
-            )
-            info_json_fmt._app = info._app
-            info_json_fmt._cfg = info._cfg
-            info_json_fmt._scheduler = info._scheduler
+        info = runner.dryrun(appdef, scheduler, cfg, config.workspace)
 
-            if config.dryrun:
-                return info_json_fmt
-            else:
-                server_handle = runner.schedule(info)
-                return server_handle
+        info_json_fmt = AppDryRunInfo(
+            info.request,
+            fmt=defaults.dryrun_info_formatter(info),
+        )
+        info_json_fmt._app = info._app
+        info_json_fmt._cfg = info._cfg
+        info_json_fmt._scheduler = info._scheduler
 
-    return _run
+        if config.dryrun:
+            return info_json_fmt
+        else:
+            server_handle = runner.schedule(info)
+            return server_handle
 
 
 def info(server_handle: str) -> Optional[ServerSpec]:
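Net effect: `create()` is no longer a factory that wraps a `component_fn`; the job definition travels on `config.appdef` and the job name becomes a plain argument, with the named `AppDef` assembled inside the runner context. A minimal sketch of the new calling convention, following the updated docstring (the job name here is illustrative):

    from monarch.tools.commands import create
    from monarch.tools.config import defaults

    config = defaults.config(scheduler="slurm")
    # component functions now return an UnnamedAppDef; create() attaches the name
    config.appdef = defaults.component_fn(scheduler=config.scheduler)()

    # returns a server handle (str), or an AppDryRunInfo when config.dryrun is True
    server_handle = create(config, name="my-monarch-job")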
@@ -180,17 +174,27 @@ def info(server_handle: str) -> Optional[ServerSpec]:
         # null-guard since some schedulers do not fill replica_status
         if host_status := replica_status.get(role.name):
             spec.hostnames = [h.hostname for h in host_status]
+            # the mesh status is based on the "least progressive" replica status
+            spec.state = min(h.state for h in host_status)
 
         mesh_specs.append(spec)
 
-    return ServerSpec(name=appdef.name, state=status.state, meshes=mesh_specs)
+    scheduler, namespace, _ = parse_app_handle(server_handle)
+    return ServerSpec(
+        name=appdef.name,
+        state=status.state,
+        meshes=mesh_specs,
+        scheduler=scheduler,
+        namespace=namespace,
+    )
 
 
 _5_SECONDS = timedelta(seconds=5)
 
 
 async def server_ready(
-    server_handle: str, check_interval: timedelta = _5_SECONDS
+    server_handle: str,
+    check_interval: timedelta = _5_SECONDS,
 ) -> Optional[ServerSpec]:
     """Waits until the server's job is in RUNNING state to returns the server spec.
     Returns `None` if the server does not exist.
@@ -213,6 +217,8 @@ async def server_ready(
 
     """
 
+    check_interval_seconds = check_interval.total_seconds()
+    start = datetime.now()
     while True:
         server_spec = info(server_handle)
 
@@ -222,18 +228,94 @@ async def server_ready(
         if server_spec.state <= AppState.PENDING:  # UNSUBMITTED or SUBMITTED or PENDING
             # NOTE: TorchX currently does not have async APIs so need to loop-on-interval
             # TODO maybe inverse exponential backoff instead of constant interval?
-            check_interval_seconds = check_interval.total_seconds()
-            logger.info(
-                "waiting for %s to be %s (current: %s), will check again in %g seconds...",
-                server_handle,
-                AppState.RUNNING,
-                server_spec.state,
-                check_interval_seconds,
+            print(
+                f"Waiting for {server_handle} to be {AppState.RUNNING} (current: {server_spec.state}); "
+                f"will check again in {check_interval_seconds} seconds. "
+                f"Total wait time: {datetime.now() - start}",
+                end="\r",
             )
-            time.sleep(check_interval_seconds)
+            await asyncio.sleep(check_interval_seconds)
             continue
-        else:
-            return server_spec
+
+        # check if hosts are allocated for all the meshes
+        if server_spec.state == AppState.RUNNING:
+            running = True
+            for mesh_spec in server_spec.meshes:
+                if mesh_spec.state <= AppState.PENDING:
+                    print(
+                        f"Job {server_handle} is running but waiting for mesh {mesh_spec.name} "
+                        f"to be {AppState.RUNNING} (current: {mesh_spec.state}); "
+                        f"will check again in {check_interval_seconds} seconds. "
+                        f"Total wait time: {datetime.now() - start}",
+                        end="\r",
+                    )
+                    running = False
+                    break
+            if not running:
+                await asyncio.sleep(check_interval_seconds)
+                continue
+
+        return server_spec
+
+
+# TODO: this API is overloaded. Ideally, we do not need config to get or a handle to create.
+async def get_or_create(
+    name: str,
+    config: Config,
+    check_interval: timedelta = _5_SECONDS,
+) -> ServerSpec:
+    """Waits for the server based on identity `name` in the scheduler specified in the `config`
+    to be ready (e.g. RUNNING). If the server is not found then this function creates one
+    per the `config` spec, and waits for the server to be ready before returning.
+
+    Usage:
+
+    .. code-block:: python
+
+        from monarch.tools.config import defaults
+
+        config = defaults.config(scheduler)
+        config.appdef = defaults.component_fn(config.scheduler)()
+
+        server_handle = get_or_create("my_job_name", config)
+        server_info = info(server_handle)
+
+    Returns: A `ServerSpec` containing information about either the existing or the newly
+    created server.
+
+    """
+    assert not config.dryrun, "dryrun is not supported for get_or_create(), for dryrun use the create() API instead"
+
+    server_handle = f"{config.scheduler}:///{name}"
+    server_info = await server_ready(server_handle, check_interval)
+
+    if not server_info or not server_info.is_running:  # then create one
+        logger.info(
+            "no existing RUNNING server `%s` creating new one...", server_handle
+        )
+
+        # no dryrun (see assertion above) support so will always be a handle (str)
+        new_server_handle = str(create(config, name))
+
+        logger.info(f"created new `{new_server_handle}` waiting for it to be ready...")
+
+        server_info = await server_ready(new_server_handle, check_interval)
+
+        if not server_info:
+            raise RuntimeError(
+                f"the new server `{new_server_handle}` went missing (should never happen)"
+            )
+
+        if not server_info.is_running:
+            raise RuntimeError(
+                f"the new server `{new_server_handle}` has {server_info.state}"
+            )
+
+        print(f"\x1b[36mNew job `{new_server_handle}` is ready to serve. \x1b[0m")
+        return server_info
+    else:
+        print(f"\x1b[36mFound existing job `{server_handle}` ready to serve. \x1b[0m")
+        return server_info
 
 
 def kill(server_handle: str) -> None:
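`get_or_create()` stitches these pieces together: it first polls `server_ready()` (which now also waits for every mesh to leave PENDING), and only calls `create()` when no RUNNING server exists under `{scheduler}:///{name}`. A hedged usage sketch, reusing the config from the `create()` example above (names are illustrative):

    import asyncio

    from monarch.tools.commands import get_or_create
    from monarch.tools.config import defaults

    async def main() -> None:
        config = defaults.config(scheduler="slurm")
        config.appdef = defaults.component_fn(scheduler=config.scheduler)()

        # reuses a RUNNING server named "my_job_name", or creates one per `config`
        server_info = await get_or_create("my_job_name", config)
        assert server_info.is_running
        print(server_info.server_handle)  # e.g. "slurm:///my_job_name"

    asyncio.run(main())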
monarch/tools/components/hyperactor.py CHANGED
@@ -9,6 +9,7 @@ import getpass
 from typing import Optional
 
 from monarch.tools import mesh_spec
+from monarch.tools.config import UnnamedAppDef
 from monarch.tools.mesh_spec import mesh_spec_from_str
 from torchx import specs
 
@@ -16,17 +17,18 @@ _DEFAULT_MESHES = ["mesh_0:1:gpu.small"]
 
 _USER: str = getpass.getuser()
 
+DEFAULT_NAME: str = f"monarch-{_USER}"
+
 __version__ = "latest"  # TODO get version from monarch.__version_
 
 
 def proc_mesh(
-    name: str = f"monarch-{_USER}",
     image: str = f"ghcr.io/pytorch-labs/monarch:{__version__}",  # TODO docker needs to be built and pushed to ghcr
     meshes: list[str] = _DEFAULT_MESHES,
     env: Optional[dict[str, str]] = None,
     port: int = mesh_spec.DEFAULT_REMOTE_ALLOCATOR_PORT,
     program: str = "monarch_bootstrap",  # installed with monarch wheel (as console script)
-) -> specs.AppDef:
+) -> UnnamedAppDef:
     """
     Args:
         name: the name of the monarch server job
@@ -37,7 +39,7 @@
         program: path to the binary that the remote process allocator spawns on an allocation request
     """
 
-    appdef = specs.AppDef(name)
+    appdef = UnnamedAppDef()
 
     for mesh in [mesh_spec_from_str(mesh) for mesh in meshes]:
         mesh_role = specs.Role(
monarch/tools/config/__init__.py CHANGED
@@ -6,15 +6,32 @@
 
 # pyre-strict
 from dataclasses import dataclass, field
-from typing import Any, Optional
+from typing import Any, Dict, List, Optional
+
+from torchx.specs import Role
 
 
 NOT_SET: str = "__NOT_SET__"
 
 
+@dataclass
+class UnnamedAppDef:
+    """
+    A TorchX AppDef without a name.
+    """
+
+    roles: List[Role] = field(default_factory=list)
+    metadata: Dict[str, str] = field(default_factory=dict)
+
+
 @dataclass
 class Config:
+    """
+    All configs needed to schedule a mesh of allocators.
+    """
+
     scheduler: str = NOT_SET
     scheduler_args: dict[str, Any] = field(default_factory=dict)
     workspace: Optional[str] = None
     dryrun: bool = False
+    appdef: UnnamedAppDef = UnnamedAppDef()
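`UnnamedAppDef` carries the roles and metadata of a TorchX `AppDef` while deferring the job name to `create()`. A minimal sketch of how the two dataclasses compose (field values are illustrative); note that passing an explicit `appdef`, as here, sidesteps the pitfall of sharing the class-level default instance across `Config` objects:

    from monarch.tools.config import Config, UnnamedAppDef

    appdef = UnnamedAppDef()  # roles=[], metadata={}
    config = Config(scheduler="slurm", appdef=appdef)
    config.scheduler_args["partition"] = "prod"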
monarch/tools/config/defaults.py CHANGED
@@ -11,7 +11,7 @@
 from typing import Callable, Optional
 
 from monarch.tools.components import hyperactor
-from monarch.tools.config import Config
+from monarch.tools.config import Config, UnnamedAppDef
 
 from torchx import specs
 from torchx.schedulers import (
@@ -23,7 +23,7 @@ from torchx.schedulers import (
 )
 
 
-def component_fn(scheduler: str) -> Callable[..., specs.AppDef]:
+def component_fn(scheduler: str) -> Callable[..., UnnamedAppDef]:
     """The default TorchX component function for the scheduler"""
     return hyperactor.proc_mesh
 
monarch/tools/mesh_spec.py CHANGED
@@ -9,8 +9,11 @@ import string
 from dataclasses import dataclass, field
 from typing import Any, Optional
 
+from monarch.tools.config import UnnamedAppDef
+
 from monarch.tools.network import get_sockaddr
 from torchx import specs
+from torchx.specs.api import is_terminal
 
 DEFAULT_REMOTE_ALLOCATOR_PORT = 26600
 
@@ -38,6 +41,7 @@ class MeshSpec:
     transport: str = "tcp"
     port: int = DEFAULT_REMOTE_ALLOCATOR_PORT
     hostnames: list[str] = field(default_factory=list)
+    state: specs.AppState = specs.AppState.UNSUBMITTED
 
     def server_addrs(
         self, transport: Optional[str] = None, port: Optional[int] = None
@@ -68,7 +72,7 @@ def _tag(mesh_name: str, tag_template: str) -> str:
     return string.Template(tag_template).substitute(mesh_name=mesh_name)
 
 
-def tag_as_metadata(mesh_spec: MeshSpec, appdef: specs.AppDef) -> None:
+def tag_as_metadata(mesh_spec: MeshSpec, appdef: UnnamedAppDef) -> None:
     appdef.metadata[_tag(mesh_spec.name, _TAG_HOST_TYPE)] = mesh_spec.host_type
     appdef.metadata[_tag(mesh_spec.name, _TAG_GPUS)] = str(mesh_spec.gpus)
     appdef.metadata[_tag(mesh_spec.name, _TAG_TRANSPORT)] = mesh_spec.transport
@@ -122,11 +126,64 @@ class ServerSpec:
     name: str
     state: specs.AppState
     meshes: list[MeshSpec]
+    scheduler: str
+    namespace: str = ""
+
+    @property
+    def server_handle(self) -> str:
+        return f"{self.scheduler}://{self.namespace}/{self.name}"
 
     @property
     def is_running(self) -> bool:
         return self.state == specs.AppState.RUNNING
 
+    def host0(self, mesh_name: str) -> str:
+        """The hostname of the first node in the given mesh.
+        The return value of this method can be used to set `MASTER_ADDR` env var for torch.distributed.
+
+        NOTE: the state of this server must be RUNNING for this method to return a valid value.
+
+        Usage:
+
+        .. code-block:: python
+
+            from monarch.tools.commands import get_or_create
+
+            server_info = await get_or_create(...)
+            assert server_info.is_running
+
+            # allocate proc mesh -> create actor (code omitted for brevity)...
+
+            trainer_actor.call(
+                MASTER_ADDR=server_info.host0("trainer"),  # trainer mesh's 1st host
+                MASTER_PORT=29500,
+                ...
+            )
+
+        NOTE: The ordering of the hostnames is exactly the same as what comes back from the underlying
+        scheduler's `describe_job` or `list_*` API. Please find the exact semantics in the
+        respective scheduler's implementation in https://github.com/pytorch/torchx/tree/main/torchx/schedulers.
+        """
+        mesh_spec = self.get_mesh_spec(mesh_name)
+        if self.is_running:
+            # hostnames are only valid when the server is RUNNING
+            if not mesh_spec.hostnames:
+                raise RuntimeError(f"{self.server_handle} does not have any hosts")
+            return mesh_spec.hostnames[0]
+        elif self.state in [specs.AppState.SUBMITTED, specs.AppState.PENDING]:
+            raise RuntimeError(
+                f"{self.server_handle} is {self.state}."
+                f" Use `monarch.tools.commands.server_ready()` to wait for the server to be {specs.AppState.RUNNING}"
+            )
+        elif is_terminal(self.state):
+            raise RuntimeError(
+                f"{self.server_handle} is {self.state}."
+                " Use `monarch.tools.commands.get_or_create()` to create a new server"
+            )
+        else:
+            raise RuntimeError(
+                f"{self.server_handle} is in an invalid state: {self.state}. Please report this as a bug"
+            )
+
     def get_mesh_spec(self, mesh_name: str) -> MeshSpec:
         for mesh_spec in self.meshes:
             if mesh_spec.name == mesh_name:
@@ -152,6 +209,7 @@ class ServerSpec:
 
         return {
             "name": self.name,
+            "server_handle": self.server_handle,
            "state": self.state.name,
            "meshes": {
                mesh.name: {
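`ServerSpec` can now reconstruct its own handle (`scheduler://namespace/name`), and `host0()` guards hostname access behind the server state. A hedged sketch of the torch.distributed wiring the `host0` docstring describes, assuming a mesh named "trainer" and an illustrative port:

    from monarch.tools.commands import get_or_create

    async def wire_master(config) -> dict[str, str]:
        server_info = await get_or_create("my_job_name", config)
        assert server_info.is_running
        return {
            "MASTER_ADDR": server_info.host0("trainer"),  # first host of the "trainer" mesh
            "MASTER_PORT": "29500",
        }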
monarch/tools/utils.py ADDED
@@ -0,0 +1,38 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
+import os
+from typing import Optional
+
+
+class conda:
+    """Conda related util functions."""
+
+    @staticmethod
+    def active_env_dir() -> Optional[str]:
+        """
+        Returns the currently active conda environment's directory.
+        `None` if run outside of a conda environment.
+        """
+        return os.getenv("CONDA_PREFIX")
+
+    @staticmethod
+    def active_env_name() -> Optional[str]:
+        """
+        Returns the currently active conda environment name.
+        `None` if run outside of a conda environment.
+        """
+        env_name = os.getenv("CONDA_DEFAULT_ENV")
+
+        if not env_name:
+            # conda envs activated with metaconda don't set CONDA_DEFAULT_ENV so
+            # fall back to CONDA_PREFIX, which points to the path of the currently active conda environment
+            # e.g. /home/$USER/.conda/envs/{env_name}
+            if env_dir := conda.active_env_dir():
+                env_name = os.path.basename(env_dir)
+
+        return env_name
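These helpers only read environment variables that conda itself sets, so they are safe to call outside any conda environment. A quick usage sketch (output depends on the active environment):

    from monarch.tools.utils import conda

    if (env_name := conda.active_env_name()) is not None:
        print(f"running in conda env: {env_name}")
    else:
        print("not inside a conda environment")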
monarch/worker/worker.py CHANGED
@@ -37,13 +37,13 @@ import torch.distributed
 import torch.fx
 import zmq
 import zmq.asyncio
+from monarch._src.actor.shape import NDSlice
 
 from monarch.common import messages
 from monarch.common.function import ResolvableFunction
 from monarch.common.messages import DependentOnError, Dims
 from monarch.common.process_group import SingleControllerProcessGroupWrapper
 from monarch.common.reference import Ref, Referenceable
-from monarch.common.shape import NDSlice
 from monarch.common.tensor_factory import TensorFactory
 from monarch.common.tree import flatten, flattener
 from monarch_supervisor import get_message_queue, Letter
monarch/world_mesh.py CHANGED
@@ -8,10 +8,11 @@
 
 from typing import List
 
+from monarch._src.actor.shape import NDSlice
+
 from monarch.common.client import Client
 
 from monarch.common.device_mesh import DeviceMesh
-from monarch.common.shape import NDSlice
 
 from monarch.controller.backend import ProcessBackend
 
monarch_supervisor/python_executable.py CHANGED
@@ -11,7 +11,10 @@ import sys
 try:
     from __manifest__ import fbmake  # noqa
 
-    IN_PAR = True
+    # simply checking for the existence of __manifest__ is not enough to tell if we are in a PAR
+    # because monarch wheels include a dummy __manifest__ (see fbcode//monarch/python/monarch/session/meta/__manifest__.py)
+    # so that we can use libfb programmatically. Hence additionally check if the `par_style` key is not null/empty
+    IN_PAR = bool(fbmake.get("par_style"))
 except ImportError:
     IN_PAR = False
 
@@ -26,8 +29,8 @@ if IN_PAR:
     PYTHON_EXECUTABLE = os.environ["FB_XAR_INVOKED_NAME"]
 else:
     try:
-        with importlib.resources.path(
-            "monarch_tensor_worker_env", "worker_env"
+        with importlib.resources.as_file(
+            importlib.resources.files("monarch_tensor_worker_env") / "worker_env"
         ) as path:
             if not path.exists():
                 raise ImportError()
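The second hunk migrates from `importlib.resources.path()`, deprecated since Python 3.11, to the `files()` / `as_file()` pair, which also handles resources packaged inside zips or wheels. The general stdlib pattern, sketched against a hypothetical package:

    import importlib.resources

    resource = importlib.resources.files("some_package") / "some_resource"
    with importlib.resources.as_file(resource) as path:
        if path.exists():
            print(f"resource available at {path}")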
tests/error_test_binary.py CHANGED
@@ -13,8 +13,7 @@ from monarch._rust_bindings.monarch_extension.blocking import blocking_function
 
 from monarch._rust_bindings.monarch_extension.panic import panicking_function
 
-from monarch.actor_mesh import Actor, endpoint, send
-from monarch.proc_mesh import proc_mesh
+from monarch.actor import Actor, endpoint, proc_mesh, send
 
 
 class ErrorActor(Actor):
@@ -48,6 +47,13 @@ class ErrorActor(Actor):
         await asyncio.sleep(0.1)
         raise RuntimeError("oh noez")
 
+    @endpoint
+    async def get_pid(self) -> int:
+        """Endpoint that returns the process PID."""
+        import os
+
+        return os.getpid()
+
 
 class ErrorActorSync(Actor):
     """An actor that has endpoints cause segfaults."""
@@ -79,8 +85,7 @@ def _run_error_test_sync(num_procs, sync_endpoint, endpoint_name):
     error_actor = proc.spawn("error_actor", actor_class).get()
 
     # This output is checked in the test to make sure that the process actually got here
-    print("I actually ran")
-    sys.stdout.flush()
+    print("Started function error_test", flush=True)
 
     if endpoint_name == "cause_segfault":
         endpoint = error_actor.cause_segfault
@@ -110,8 +115,7 @@ def _run_error_test(num_procs, sync_endpoint, endpoint_name):
     error_actor = await proc.spawn("error_actor", actor_class)
 
     # This output is checked in the test to make sure that the process actually got here
-    print("I actually ran")
-    sys.stdout.flush()
+    print("Started function error_test", flush=True)
 
     if endpoint_name == "cause_segfault":
         endpoint = error_actor.cause_segfault
@@ -153,15 +157,13 @@ def error_endpoint(num_procs, sync_test_impl, sync_endpoint, endpoint_name):
 
 @main.command("error-bootstrap")
 def error_bootstrap():
-    print("I actually ran")
-    sys.stdout.flush()
+    print("Started function error_bootstrap", flush=True)
 
     proc_mesh(gpus=4, env={"MONARCH_ERROR_DURING_BOOTSTRAP_FOR_TESTING": "1"}).get()
 
 
 async def _error_unmonitored():
-    print("I actually ran")
-    sys.stdout.flush()
+    print("Started function _error_unmonitored", flush=True)
 
     proc = await proc_mesh(gpus=1)
     actor = await proc.spawn("error_actor", ErrorActor)
@@ -204,5 +206,41 @@ def error_unmonitored():
     asyncio.run(_error_unmonitored())
 
 
+async def _error_cleanup():
+    """Test function that spawns an 8-process procmesh and calls an endpoint that raises a normal exception."""
+    print("Started function _error_cleanup() for parent process", flush=True)
+
+    # Spawn an 8 process procmesh
+    proc = await proc_mesh(gpus=8)
+    error_actor = await proc.spawn("error_actor", ErrorActor)
+
+    print("Procmesh spawned, collecting child PIDs from actors", flush=True)
+
+    # Get PIDs from all actor processes
+    try:
+        # Call get_pid endpoint on all actors to collect their PIDs
+        pids = await error_actor.get_pid.call()
+        child_pids = [str(pid) for _, pid in pids]
+        print(f"CHILD_PIDS: {','.join(child_pids)}", flush=True)
+    except Exception as e:
+        print(f"Error getting child PIDs from actors: {e}", flush=True)
+
+    print("About to call endpoint that raises exception", flush=True)
+
+    # Call an endpoint that raises a normal exception
+    try:
+        await error_actor.await_then_error.call()
+    except Exception as e:
+        print(f"Expected exception caught: {e}", flush=True)
+        # Re-raise to cause the process to exit with non-zero code
+        raise
+
+
+@main.command("error-cleanup")
+def error_cleanup():
+    """Command that spawns an 8-process procmesh and calls an endpoint that raises a normal exception."""
+    asyncio.run(_error_cleanup())
+
+
 if __name__ == "__main__":
     main()
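The new `error-cleanup` command is driven from the test suite as a subprocess: the parent prints its actors' PIDs (the `CHILD_PIDS:` line) and then exits non-zero by re-raising, letting the test verify that all child processes were cleaned up. A hedged sketch of such a driver, assuming the script is invoked directly by path:

    import subprocess

    result = subprocess.run(
        ["python", "tests/error_test_binary.py", "error-cleanup"],
        capture_output=True,
        text=True,
    )
    assert result.returncode != 0  # the re-raised exception fails the process
    pids_line = next(
        line for line in result.stdout.splitlines() if line.startswith("CHILD_PIDS:")
    )
    child_pids = pids_line.removeprefix("CHILD_PIDS: ").split(",")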