PyPI - torchmonarch-nightly - Versions diffs - 2025.7.1__cp312-cp312-manylinux2014_x86_64.whl → 2025.7.26__cp312-cp312-manylinux2014_x86_64.whl - Mend

torchmonarch-nightly 2025.7.1__cp312-cp312-manylinux2014_x86_64.whl → 2025.7.26__cp312-cp312-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (91)

monarch/__init__.py +13 -9
monarch/_rust_bindings.so +0 -0
monarch/{_monarch/selection → _src/actor}/__init__.py +3 -7
monarch/_src/actor/actor_mesh.py +878 -0
monarch/{allocator.py → _src/actor/allocator.py} +26 -17
monarch/_src/actor/bootstrap_main.py +73 -0
monarch/{code_sync.py → _src/actor/code_sync/__init__.py} +3 -1
monarch/_src/actor/code_sync/auto_reload.py +223 -0
monarch/_src/actor/debugger.py +565 -0
monarch/_src/actor/endpoint.py +303 -0
monarch/_src/actor/event_loop.py +97 -0
monarch/_src/actor/future.py +100 -0
monarch/{pdb_wrapper.py → _src/actor/pdb_wrapper.py} +47 -46
monarch/{common/pickle_flatten.py → _src/actor/pickle.py} +26 -2
monarch/_src/actor/proc_mesh.py +508 -0
monarch/_src/actor/sync_state.py +18 -0
monarch/{telemetry.py → _src/actor/telemetry/__init__.py} +1 -1
monarch/_src/actor/telemetry/rust_span_tracing.py +159 -0
monarch/_src/actor/tensor_engine_shim.py +59 -0
monarch/_src/tensor_engine/rdma.py +180 -0
monarch/_testing.py +3 -2
monarch/actor/__init__.py +53 -0
monarch/actor_mesh.py +6 -765
monarch/bootstrap_main.py +8 -47
monarch/common/client.py +1 -1
monarch/common/controller_api.py +2 -1
monarch/common/device_mesh.py +12 -2
monarch/common/messages.py +21 -1
monarch/common/recording.py +4 -3
monarch/common/remote.py +135 -52
monarch/common/tensor.py +2 -1
monarch/controller/backend.py +2 -2
monarch/controller/controller.py +2 -1
monarch/controller/rust_backend/controller.py +2 -1
monarch/fetch.py +3 -5
monarch/gradient/_gradient_generator.so +0 -0
monarch/mesh_controller.py +263 -139
monarch/monarch_controller +0 -0
monarch/opaque_module.py +4 -6
monarch/opaque_object.py +3 -3
monarch/proc_mesh.py +6 -309
monarch/python_local_mesh.py +1 -1
monarch/rust_backend_mesh.py +2 -1
monarch/rust_local_mesh.py +4 -2
monarch/sim_mesh.py +10 -19
monarch/simulator/command_history.py +1 -1
monarch/simulator/interface.py +2 -1
monarch/simulator/mock_controller.py +1 -1
monarch/simulator/simulator.py +1 -1
monarch/tensor_engine/__init__.py +23 -0
monarch/tensor_worker_main.py +3 -1
monarch/tools/cli.py +3 -1
monarch/tools/commands.py +129 -47
monarch/tools/components/hyperactor.py +5 -3
monarch/tools/config/__init__.py +18 -1
monarch/tools/config/defaults.py +2 -2
monarch/tools/mesh_spec.py +59 -1
monarch/tools/utils.py +38 -0
monarch/worker/worker.py +1 -1
monarch/world_mesh.py +2 -1
monarch_supervisor/python_executable.py +6 -3
tests/error_test_binary.py +48 -10
tests/test_actor_error.py +370 -21
tests/test_alloc.py +1 -1
tests/test_allocator.py +369 -17
tests/test_controller.py +2 -0
tests/test_debugger.py +416 -0
tests/test_env_before_cuda.py +161 -0
tests/test_python_actors.py +184 -333
tests/test_rdma.py +198 -0
tests/test_remote_functions.py +40 -12
tests/test_rust_backend.py +7 -5
tests/test_sim_backend.py +1 -4
tests/test_tensor_engine.py +81 -1
{torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.26.dist-info}/METADATA +39 -1
{torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.26.dist-info}/RECORD +84 -72
torchmonarch_nightly-2025.7.26.dist-info/entry_points.txt +3 -0
monarch/_monarch/hyperactor/__init__.py +0 -58
monarch/_monarch/worker/debugger.py +0 -117
monarch/_monarch/worker/logging.py +0 -107
monarch/debugger.py +0 -379
monarch/future.py +0 -76
monarch/rdma.py +0 -162
torchmonarch_nightly-2025.7.1.dist-info/entry_points.txt +0 -3
/monarch/{_monarch/worker → _src}/__init__.py +0 -0
/monarch/{common/_device_utils.py → _src/actor/device_utils.py} +0 -0
/monarch/{common → _src/actor}/shape.py +0 -0
/monarch/{_monarch → _src/tensor_engine}/__init__.py +0 -0
{torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.26.dist-info}/WHEEL +0 -0
{torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.26.dist-info}/licenses/LICENSE +0 -0
{torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.26.dist-info}/top_level.txt +0 -0

tests/test_actor_error.py CHANGED

@@ -4,14 +4,16 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
-import asyncio
 import importlib.resources
+import os
 import subprocess
+import sys
 import pytest
 from monarch._rust_bindings.monarch_hyperactor.proc_mesh import ProcEvent
-from monarch.actor_mesh import Actor, ActorError, endpoint, send
-from monarch.proc_mesh import local_proc_mesh, proc_mesh
+from monarch._rust_bindings.monarch_hyperactor.supervision import SupervisionError
+from monarch.actor import Actor, ActorError, endpoint, local_proc_mesh, proc_mesh
 class ExceptionActor(Actor):
@@ -66,16 +68,21 @@ class BrokenPickleClass:
         self.__dict__.update(state)
+@pytest.mark.parametrize(
+    "mesh",
+    [local_proc_mesh, proc_mesh],
+    ids=["local_proc_mesh", "distributed_proc_mesh"],
+)
 @pytest.mark.parametrize(
     "actor_class",
     [ExceptionActor, ExceptionActorSync],
 )
 @pytest.mark.parametrize("num_procs", [1, 2])
-async def test_actor_exception(actor_class, num_procs):
+async def test_actor_exception(mesh, actor_class, num_procs):
     """
     Test that exceptions raised in actor endpoints are propagated to the client.
     """
-    proc = await proc_mesh(gpus=num_procs)
+    proc = await mesh(gpus=num_procs)
     exception_actor = await proc.spawn("exception_actor", actor_class)
     with pytest.raises(ActorError, match="This is a test exception"):
@@ -85,16 +92,21 @@ async def test_actor_exception(actor_class, num_procs):
             await exception_actor.raise_exception.call()
+@pytest.mark.parametrize(
+    "mesh",
+    [local_proc_mesh, proc_mesh],
+    ids=["local_proc_mesh", "distributed_proc_mesh"],
+)
 @pytest.mark.parametrize(
     "actor_class",
     [ExceptionActor, ExceptionActorSync],
 )
 @pytest.mark.parametrize("num_procs", [1, 2])
-def test_actor_exception_sync(actor_class, num_procs):
+def test_actor_exception_sync(mesh, actor_class, num_procs):
     """
     Test that exceptions raised in actor endpoints are propagated to the client.
     """
-    proc = proc_mesh(gpus=num_procs).get()
+    proc = mesh(gpus=num_procs).get()
     exception_actor = proc.spawn("exception_actor", actor_class).get()
     with pytest.raises(ActorError, match="This is a test exception"):
@@ -104,6 +116,7 @@ def test_actor_exception_sync(actor_class, num_procs):
             exception_actor.raise_exception.call().get()
+'''
 # oss_skip: importlib not pulling resource correctly in git CI, needs to be revisited
 @pytest.mark.oss_skip
 @pytest.mark.parametrize("num_procs", [1, 2])
@@ -140,10 +153,11 @@ def test_actor_supervision(num_procs, sync_endpoint, sync_test_impl, endpoint_na
         raise
     # Assert that the subprocess exited with a non-zero code
-    assert "I actually ran" in process.stdout.decode()
+    assert "Started function error_test" in process.stdout.decode()
     assert (
         process.returncode != 0
     ), f"Expected non-zero exit code, got {process.returncode}"
+'''
 # oss_skip: importlib not pulling resource correctly in git CI, needs to be revisited
@@ -170,7 +184,7 @@ def test_proc_mesh_bootstrap_error():
         raise
     # Assert that the subprocess exited with a non-zero code
-    assert "I actually ran" in process.stdout.decode()
+    assert "Started function error_bootstrap" in process.stdout.decode()
     assert (
         process.returncode != 0
     ), f"Expected non-zero exit code, got {process.returncode}"
@@ -213,6 +227,7 @@ async def test_broken_pickle_class(raise_on_getstate, raise_on_setstate, num_pro
             await exception_actor.print_value.call(broken_obj)
+"""
 # oss_skip: importlib not pulling resource correctly in git CI, needs to be revisited
 @pytest.mark.oss_skip
 async def test_exception_after_wait_unmonitored():
@@ -234,23 +249,135 @@ async def test_exception_after_wait_unmonitored():
         raise
     # Assert that the subprocess exited with a non-zero code
-    assert "I actually ran" in process.stdout.decode()
+    assert "Started function _error_unmonitored" in process.stdout.decode()
     assert (
         process.returncode != 0
     ), f"Expected non-zero exit code, got {process.returncode}"
+"""
+# oss_skip: importlib not pulling resource correctly in git CI, needs to be revisited
+@pytest.mark.oss_skip
+def test_python_actor_process_cleanup():
+    """
+    Test that PythonActor processes are cleaned up when the parent process dies.
+    This test spawns an 8 process procmesh and calls an endpoint that returns a normal exception,
+    then verifies that all spawned processes have been cleaned up after the spawned binary dies.
+    """
+    import os
+    import signal
+    import time
+    # Run the error-cleanup test in a subprocess
+    test_bin = importlib.resources.files("monarch.python.tests").joinpath("test_bin")
+    cmd = [
+        str(test_bin),
+        "error-cleanup",
+    ]
+    try:
+        print("running cmd", " ".join(cmd))
+        process = subprocess.run(cmd, capture_output=True, timeout=180, text=True)
+    except subprocess.TimeoutExpired as e:
+        print("timeout expired")
+        if e.stdout is not None:
+            print(e.stdout.decode())
+        if e.stderr is not None:
+            print(e.stderr.decode())
+        raise
+    # Read stdout line by line to get child PIDs
+    assert "Started function _error_cleanup() for parent process" in process.stdout
+    child_pids = set()
+    for line in process.stdout.splitlines():
+        if line.startswith("CHILD_PIDS: "):
+            pids_str = line[len("CHILD_PIDS: ") :]  # noqa
+            child_pids = {
+                int(pid.strip()) for pid in pids_str.split(",") if pid.strip()
+            }
+            print(f"Extracted child PIDs: {child_pids}")
+            break
+    if not child_pids:
+        raise AssertionError("No child PIDs found in output")
+    assert child_pids, "No child PIDs were collected from subprocess output"
+    # Wait for child processes to be cleaned up
+    print("Waiting for child processes to be cleaned up...")
+    cleanup_timeout = 120
+    start_time = time.time()
+    def is_process_running(pid):
+        """Check if a process with the given PID is still running."""
+        try:
+            os.kill(pid, 0)  # Signal 0 doesn't kill, just checks if process exists
+            return True
+        except OSError:
+            return False
+    still_running = set(child_pids)
+    while time.time() - start_time < cleanup_timeout:
+        if not still_running:
+            print("All child processes have been cleaned up!")
+            return
+        still_running = {pid for pid in still_running if is_process_running(pid)}
+        print(f"Still running child PIDs: {still_running}")
+        time.sleep(2)
+    # If we get here, some processes are still running
+    # Try to clean up remaining processes
+    for pid in still_running:
+        try:
+            os.kill(pid, signal.SIGKILL)
+        except OSError:
+            pass
+    raise AssertionError(
+        f"Child processes not cleaned up after {cleanup_timeout}s: {still_running}"
+    )
+class ActorFailureError(BaseException):
+    """Exception to simulate actor failure for supervision testing.
+    Inherits from BaseException in order that supervision be
+    triggered.
+    """
+    pass
 class ErrorActor(Actor):
-    def __init__(self, message):
-        raise RuntimeError("fail on init")
+    @endpoint
+    def fail_with_supervision_error(self) -> None:
+        raise ActorFailureError("Simulated actor failure for supervision testing")
     @endpoint
-    async def check(self) -> None:
-        pass
+    async def fail_with_supervision_error_async(self) -> None:
+        raise ActorFailureError("Simulated actor failure for supervision testing")
+    @endpoint
+    async def check(self) -> str:
+        return "this is a healthy check"
+    @endpoint
+    async def check_with_exception(self) -> None:
+        raise RuntimeError("failed the check with app error")
-async def test_proc_mesh_redundant_monitoring():
-    proc = await local_proc_mesh(hosts=1, gpus=1)
+@pytest.mark.parametrize(
+    "mesh",
+    [local_proc_mesh, proc_mesh],
+    ids=["local_proc_mesh", "distributed_proc_mesh"],
+)
+async def test_proc_mesh_redundant_monitoring(mesh):
+    proc = await mesh(hosts=1, gpus=1)
     await proc.monitor()
     with pytest.raises(
@@ -259,15 +386,237 @@ async def test_proc_mesh_redundant_monitoring():
         await proc.monitor()
-async def test_proc_mesh_monitoring():
-    proc = await local_proc_mesh(hosts=1, gpus=1)
+class Worker(Actor):
+    @endpoint
+    def work(self):
+        raise ValueError("value error")
+class Manager(Actor):
+    @endpoint
+    async def init(self):
+        mesh = await proc_mesh(gpus=1)
+        self.workers = await mesh.spawn("Worker", Worker)
+    @endpoint
+    async def route(self):
+        return await self.workers.work.call_one()
+@pytest.mark.parametrize(
+    "mesh",
+    [local_proc_mesh, proc_mesh],
+    ids=["local_proc_mesh", "distributed_proc_mesh"],
+)
+async def test_errors_propagated(mesh):
+    p_mesh = await mesh(gpus=1)
+    mesh = await p_mesh.spawn("manager", Manager)
+    await mesh.init.call_one()
+    with pytest.raises(ActorError) as err_info:
+        await mesh.route.call_one()
+    assert "value error" in str(err_info.value)
+@pytest.mark.parametrize(
+    "mesh",
+    [local_proc_mesh, proc_mesh],
+    ids=["local_proc_mesh", "distributed_proc_mesh"],
+)
+async def test_proc_mesh_monitoring(mesh):
+    proc = await mesh(hosts=1, gpus=1)
     monitor = await proc.monitor()
+    e = await proc.spawn("error", ErrorActor)
     with pytest.raises(Exception):
-        e = await proc.spawn("error", ErrorActor, "failed to init the actor")
-        await asyncio.wait_for(e.check.call_one(), timeout=15)
+        await e.fail_with_supervision_error.call_one()
     event = await anext(monitor)
     assert isinstance(event, ProcEvent.Crashed)
     assert event[0] == 0  # check rank
-    assert "fail on init" in event[1]  # check error message
+    assert "ActorFailureError" in event[1]  # check error message
+    assert (
+        "Simulated actor failure for supervision testing" in event[1]
+    )  # check error message
+    # should not be able to spawn actors anymore as proc mesh is unhealthy
+    with pytest.raises(SupervisionError, match="proc mesh is stopped with reason"):
+        await proc.spawn("ex", ExceptionActorSync)
+@pytest.mark.parametrize(
+    "mesh",
+    [local_proc_mesh, proc_mesh],
+    ids=["local_proc_mesh", "distributed_proc_mesh"],
+)
+async def test_actor_mesh_supervision_handling(mesh):
+    proc = await mesh(hosts=1, gpus=1)
+    e = await proc.spawn("error", ErrorActor)
+    # first check() call should succeed
+    await e.check.call()
+    # throw an application error
+    with pytest.raises(ActorError, match="failed the check with app error"):
+        await e.check_with_exception.call()
+    # actor mesh should still be healthy
+    await e.check.call()
+    # existing call should fail with supervision error
+    with pytest.raises(SupervisionError, match="supervision error:"):
+        await e.fail_with_supervision_error.call_one()
+    # new call should fail with check of health state of actor mesh
+    with pytest.raises(SupervisionError, match="actor mesh is not in a healthy state"):
+        await e.check.call()
+    # should not be able to spawn actors anymore as proc mesh is unhealthy
+    with pytest.raises(SupervisionError, match="proc mesh is stopped with reason"):
+        await proc.spawn("ex", ExceptionActorSync)
+class HealthyActor(Actor):
+    @endpoint
+    async def check(self):
+        return "this is a healthy check"
+    @endpoint
+    async def check_with_payload(self, payload: str):
+        pass
+class Intermediate(Actor):
+    @endpoint
+    async def init_local_mesh(self):
+        mesh = await local_proc_mesh(gpus=1)
+        self._error_actor = await mesh.spawn("error", ErrorActor)
+        self._healthy_actor = await mesh.spawn("healthy", HealthyActor)
+    @endpoint
+    async def init_proc_mesh(self):
+        mesh = await proc_mesh(gpus=1)
+        self._error_actor = await mesh.spawn("error", ErrorActor)
+        self._healthy_actor = await mesh.spawn("healthy", HealthyActor)
+    @endpoint
+    async def forward_success(self):
+        return await self._error_actor.check.call()
+    @endpoint
+    async def forward_error(self):
+        return await self._error_actor.fail_with_supervision_error.call_one()
+    @endpoint
+    async def forward_healthy_check(self):
+        return await self._healthy_actor.check.call()
+@pytest.mark.parametrize(
+    "mesh", [local_proc_mesh, proc_mesh], ids=["local_proc_mesh", "proc_mesh"]
+)
+async def test_actor_mesh_supervision_handling_chained_error(mesh):
+    proc = await mesh(hosts=1, gpus=1)
+    intermediate_actor = await proc.spawn("intermediate", Intermediate)
+    if mesh is proc_mesh:
+        await intermediate_actor.init_proc_mesh.call()
+    elif mesh is local_proc_mesh:
+        await intermediate_actor.init_local_mesh.call()
+    # first forward() call should succeed
+    await intermediate_actor.forward_success.call()
+    await intermediate_actor.forward_healthy_check.call()
+    # in a chain of client -> Intermediate -> ErrorActor, a supervision error
+    # happening in ErrorActor will be captured by Intermediate and re-raised
+    # as an application error (ActorError).
+    with pytest.raises(ActorError, match="supervision error:"):
+        await intermediate_actor.forward_error.call()
+    # calling success endpoint should fail with ActorError, but with supervision msg.
+    with pytest.raises(ActorError, match="actor mesh is not in a healthy state"):
+        await intermediate_actor.forward_success.call()
+    # healthy actor should still be working
+    await intermediate_actor.forward_healthy_check.call()
+@pytest.mark.parametrize(
+    "mesh", [local_proc_mesh, proc_mesh], ids=["local_proc_mesh", "proc_mesh"]
+)
+@pytest.mark.parametrize(
+    "method_name",
+    ["fail_with_supervision_error", "fail_with_supervision_error_async"],
+)
+async def test_base_exception_handling(mesh, method_name):
+    """Test that BaseException subclasses trigger supervision errors.
+    This test verifies that both synchronous and asynchronous methods
+    that raise ActorFailureError (a BaseException subclass) trigger
+    supervision errors properly.
+    """
+    proc = await mesh(hosts=1, gpus=1)
+    error_actor = await proc.spawn("error", ErrorActor)
+    # Get the method to call based on the parameter
+    method = getattr(error_actor, method_name)
+    # The call should raise a SupervisionError
+    with pytest.raises(SupervisionError, match="supervision error:"):
+        await method.call_one()
+    # Subsequent calls should fail with a health state error
+    with pytest.raises(SupervisionError, match="actor mesh is not in a healthy state"):
+        await error_actor.check.call()
+@pytest.mark.parametrize(
+    "mesh", [local_proc_mesh, proc_mesh], ids=["local_proc_mesh", "proc_mesh"]
+)
+async def test_supervision_with_proc_mesh_stopped(mesh):
+    proc = await mesh(hosts=1, gpus=1)
+    actor_mesh = await proc.spawn("healthy", HealthyActor)
+    await actor_mesh.check.call()
+    await proc.stop()
+    # new call should fail with check of health state of actor mesh
+    with pytest.raises(SupervisionError, match="actor mesh is not in a healthy state"):
+        await actor_mesh.check.call()
+    # proc mesh cannot spawn new actors anymore
+    with pytest.raises(RuntimeError, match="`ProcMesh` has already been stopped"):
+        await proc.spawn("immediate", Intermediate)
+# TODO - re-enable after resolving T232206970
+@pytest.mark.oss_skip
+async def test_supervision_with_sending_error():
+    os.environ["HYPERACTOR_CODEC_MAX_FRAME_LENGTH"] = "9999999999"
+    os.environ["HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT_SECS"] = "1"
+    proc = await proc_mesh(gpus=1)
+    actor_mesh = await proc.spawn("healthy", HealthyActor)
+    await actor_mesh.check.call()
+    # send a small payload to trigger success
+    await actor_mesh.check_with_payload.call(payload="a")
+    # send a large payload to trigger send timeout error
+    with pytest.raises(
+        SupervisionError, match="supervision error:.*message not delivered:"
+    ):
+        await actor_mesh.check_with_payload.call(payload="a" * 5000000000)
+    # new call should fail with check of health state of actor mesh
+    with pytest.raises(SupervisionError, match="actor mesh is not in a healthy state"):
+        await actor_mesh.check.call()
+    with pytest.raises(SupervisionError, match="actor mesh is not in a healthy state"):
+        await actor_mesh.check_with_payload.call(payload="a")

tests/test_alloc.py CHANGED

@@ -9,7 +9,7 @@
 from unittest import IsolatedAsyncioTestCase
 from monarch import ProcessAllocator
-from monarch._rust_bindings.hyperactor_extension.alloc import (  # @manual=//monarch/monarch_extension:monarch_extension
+from monarch._rust_bindings.monarch_hyperactor.alloc import (  # @manual=//monarch/monarch_extension:monarch_extension
     AllocConstraints,
     AllocSpec,
 )