torchmonarch-nightly 2025.8.2__cp312-cp312-manylinux2014_x86_64.whl → 2025.9.3__cp312-cp312-manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- monarch/_rust_bindings.so +0 -0
- monarch/_src/actor/actor_mesh.py +414 -216
- monarch/_src/actor/allocator.py +75 -6
- monarch/_src/actor/bootstrap_main.py +7 -4
- monarch/_src/actor/code_sync/__init__.py +2 -0
- monarch/_src/actor/debugger/__init__.py +7 -0
- monarch/_src/actor/{debugger.py → debugger/debugger.py} +246 -135
- monarch/_src/actor/{pdb_wrapper.py → debugger/pdb_wrapper.py} +62 -23
- monarch/_src/actor/endpoint.py +27 -45
- monarch/_src/actor/future.py +86 -24
- monarch/_src/actor/host_mesh.py +125 -0
- monarch/_src/actor/logging.py +94 -0
- monarch/_src/actor/pickle.py +25 -0
- monarch/_src/actor/proc_mesh.py +423 -156
- monarch/_src/actor/python_extension_methods.py +90 -0
- monarch/_src/actor/shape.py +8 -1
- monarch/_src/actor/source_loader.py +45 -0
- monarch/_src/actor/telemetry/__init__.py +172 -0
- monarch/_src/actor/telemetry/rust_span_tracing.py +6 -39
- monarch/_src/debug_cli/__init__.py +7 -0
- monarch/_src/debug_cli/debug_cli.py +43 -0
- monarch/_src/tensor_engine/rdma.py +64 -9
- monarch/_testing.py +1 -3
- monarch/actor/__init__.py +24 -4
- monarch/common/_C.so +0 -0
- monarch/common/device_mesh.py +14 -0
- monarch/common/future.py +10 -0
- monarch/common/remote.py +14 -25
- monarch/common/tensor.py +12 -0
- monarch/debug_cli/__init__.py +7 -0
- monarch/debug_cli/__main__.py +12 -0
- monarch/fetch.py +2 -2
- monarch/gradient/_gradient_generator.so +0 -0
- monarch/gradient_generator.py +4 -2
- monarch/mesh_controller.py +34 -14
- monarch/monarch_controller +0 -0
- monarch/tools/colors.py +25 -0
- monarch/tools/commands.py +42 -7
- monarch/tools/components/hyperactor.py +1 -1
- monarch/tools/config/__init__.py +31 -4
- monarch/tools/config/defaults.py +13 -3
- monarch/tools/config/environment.py +45 -0
- monarch/tools/config/workspace.py +165 -0
- monarch/tools/mesh_spec.py +2 -0
- monarch/utils/__init__.py +9 -0
- monarch/utils/utils.py +78 -0
- tests/error_test_binary.py +5 -3
- tests/python_actor_test_binary.py +52 -0
- tests/test_actor_error.py +142 -14
- tests/test_alloc.py +1 -1
- tests/test_allocator.py +59 -72
- tests/test_debugger.py +639 -45
- tests/test_env_before_cuda.py +4 -4
- tests/test_mesh_trait.py +38 -0
- tests/test_python_actors.py +965 -75
- tests/test_rdma.py +7 -6
- tests/test_tensor_engine.py +6 -6
- {torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/METADATA +82 -4
- {torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/RECORD +63 -47
- {torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/WHEEL +0 -0
- {torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/entry_points.txt +0 -0
- {torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/licenses/LICENSE +0 -0
- {torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,165 @@
|
|
1
|
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
2
|
+
# All rights reserved.
|
3
|
+
#
|
4
|
+
# This source code is licensed under the BSD-style license found in the
|
5
|
+
# LICENSE file in the root directory of this source tree.
|
6
|
+
|
7
|
+
# pyre-strict
|
8
|
+
|
9
|
+
import shutil
|
10
|
+
from pathlib import Path
|
11
|
+
|
12
|
+
from monarch.tools.config.environment import CondaEnvironment, Environment
|
13
|
+
|
14
|
+
ACTIVE_CONDA_ENV = CondaEnvironment()
|
15
|
+
|
16
|
+
|
17
|
+
class Workspace:
|
18
|
+
"""
|
19
|
+
A workspace is one or more local directories that contains your project(s).
|
20
|
+
Workspaces can specify an "environment" on which projects are developed and run locally.
|
21
|
+
A currently active conda environment is an example of such environment.
|
22
|
+
|
23
|
+
At the time of job submission an ephemeral version of the "image" is built and the
|
24
|
+
new job is configured to run on this image. The "image" is the one specified by
|
25
|
+
`Role.image` attribute in the job's `AppDef`
|
26
|
+
(see `monarch.tools.components.hyperactor.host_mesh()`).
|
27
|
+
|
28
|
+
For example when launching onto Kubernetes, "image" is interpreted as a Docker image (e.g. "name:tag")
|
29
|
+
|
30
|
+
Specifically the ephemeral image contains:
|
31
|
+
|
32
|
+
1. A copy of the workspace directories
|
33
|
+
2. (If Applicable) A copy of the currently active environment
|
34
|
+
|
35
|
+
This effectively one-time mirrors the local codebase and environment on the remote machines.
|
36
|
+
|
37
|
+
Workspaces can also be sync'ed interactively on-demand (post job launch) by using
|
38
|
+
`monarch.actor.proc_mesh.ProcMesh.syncWorkspace(Workspace)`.
|
39
|
+
|
40
|
+
Usage:
|
41
|
+
|
42
|
+
.. doc-test::
|
43
|
+
|
44
|
+
import pathlib
|
45
|
+
from monarch.tools.config import Workspace
|
46
|
+
from monarch.tools.config import Config
|
47
|
+
|
48
|
+
HOME = pathlib.Path().home()
|
49
|
+
|
50
|
+
# 1. single project workspace
|
51
|
+
config = Config(
|
52
|
+
workspace=Workspace(dirs=[HOME / "github" / "torchtitan"]),
|
53
|
+
)
|
54
|
+
|
55
|
+
# 2. multiple projects (useful for cross-project development)
|
56
|
+
config = Config(
|
57
|
+
workspace=Workspace(
|
58
|
+
dirs=[
|
59
|
+
# $HOME/torch (local) -> $WORKSPACE_DIR/torch (remote)
|
60
|
+
# $HOME/github/torchtitan (local) -> $WORKSPACE_DIR/torchtitan (remote)
|
61
|
+
HOME / "torch",
|
62
|
+
HOME / "github" / "torchtitan",
|
63
|
+
]
|
64
|
+
),
|
65
|
+
)
|
66
|
+
|
67
|
+
# 3. with explicit local -> remote mappings
|
68
|
+
config = Config(
|
69
|
+
workspace=Workspace(
|
70
|
+
dirs={
|
71
|
+
# $HOME/torch (local) -> $WORKSPACE_DIR/github/pytorch (remote)
|
72
|
+
# $HOME/github/torchtitan (local) -> $WORKSPACE_DIR/github/torchtitan (remote)
|
73
|
+
HOME / "torch": "github/pytorch",
|
74
|
+
HOME / "github" / "torchtitan": "github/torchtitan",
|
75
|
+
}
|
76
|
+
)
|
77
|
+
)
|
78
|
+
# -- or flat into WORKSPACE_DIR
|
79
|
+
config = Config(
|
80
|
+
workspace=Workspace(
|
81
|
+
# $HOME/github/torchtitan (local) -> $WORKSPACE_DIR/ (remote)
|
82
|
+
dirs={HOME / "github" / "torchtitan": ""},
|
83
|
+
)
|
84
|
+
)
|
85
|
+
|
86
|
+
# 4. no project, everything is installed in my environment (but sync my env)
|
87
|
+
config = Config(
|
88
|
+
workspace=Workspace(),
|
89
|
+
)
|
90
|
+
|
91
|
+
# 5. disable project and environment sync
|
92
|
+
config = Config(
|
93
|
+
workspace=Workspace(env=None),
|
94
|
+
)
|
95
|
+
"""
|
96
|
+
|
97
|
+
def __init__(
|
98
|
+
self,
|
99
|
+
dirs: list[Path | str] | dict[Path | str, str] | None = None,
|
100
|
+
env: Environment | None = ACTIVE_CONDA_ENV,
|
101
|
+
) -> None:
|
102
|
+
self.env = env
|
103
|
+
self.dirs: dict[Path, str] = {} # src -> dst
|
104
|
+
|
105
|
+
if dirs is None:
|
106
|
+
pass
|
107
|
+
elif isinstance(dirs, list):
|
108
|
+
for d in dirs:
|
109
|
+
d = Path(d)
|
110
|
+
self.dirs[d] = d.name
|
111
|
+
else: # dict
|
112
|
+
for src, dst in dirs.items():
|
113
|
+
self.dirs[Path(src)] = dst
|
114
|
+
|
115
|
+
def __eq__(self, other: object) -> bool:
|
116
|
+
if not isinstance(other, Workspace):
|
117
|
+
return False
|
118
|
+
|
119
|
+
return self.env == other.env and self.dirs == other.dirs
|
120
|
+
|
121
|
+
def merge(self, outdir: str | Path) -> None:
|
122
|
+
"""Merges the dirs of this workspace into the given outdir."""
|
123
|
+
|
124
|
+
outdir = Path(outdir)
|
125
|
+
outdir.mkdir(parents=True, exist_ok=True)
|
126
|
+
|
127
|
+
for src, dst in self.dirs.items():
|
128
|
+
shutil.copytree(src, outdir / dst, dirs_exist_ok=True)
|
129
|
+
|
130
|
+
# pyre-ignore[2] skip type-hint to avoid torchx dep
|
131
|
+
def set_env_vars(self, appdef) -> None:
|
132
|
+
"""For each role in the appdef, sets the following env vars (if not already set):
|
133
|
+
|
134
|
+
1. `WORKSPACE_DIR`: the root directory of the remote workspace
|
135
|
+
2. `PYTHONPATH`: include all the remote workspace dirs for all the roles in the appdef
|
136
|
+
(dedups and appends to existing `PYTHONPATH`)
|
137
|
+
3. `CONDA_DIR`: (if env is conda) the remote path to the conda env to activate
|
138
|
+
"""
|
139
|
+
|
140
|
+
# typically this macro comes from torchx.specs.macros.img_root
|
141
|
+
# but we use the str repr instead to avoid taking a dep to torchx from this module
|
142
|
+
# unittest (test_workspace.py) asserts against torchx.specs.macros.img_root
|
143
|
+
# guarding against changes to the macro value
|
144
|
+
img_root_macro = "${img_root}"
|
145
|
+
|
146
|
+
for role in appdef.roles:
|
147
|
+
remote_workspace_root = role.env.setdefault(
|
148
|
+
"WORKSPACE_DIR",
|
149
|
+
f"{img_root_macro}/workspace",
|
150
|
+
)
|
151
|
+
|
152
|
+
PYTHONPATH = [p for p in role.env.get("PYTHONPATH", "").split(":") if p]
|
153
|
+
for dst in self.dirs.values():
|
154
|
+
remote_dir = f"{remote_workspace_root}/{dst}"
|
155
|
+
if remote_dir not in PYTHONPATH:
|
156
|
+
PYTHONPATH.append(remote_dir)
|
157
|
+
role.env["PYTHONPATH"] = ":".join(PYTHONPATH)
|
158
|
+
|
159
|
+
if isinstance(self.env, CondaEnvironment):
|
160
|
+
role.env.setdefault("CONDA_DIR", f"{img_root_macro}/conda")
|
161
|
+
|
162
|
+
@staticmethod
|
163
|
+
def null() -> "Workspace":
|
164
|
+
"""Returns a "null" workspace; a workspace with no project dirs and no environment."""
|
165
|
+
return Workspace(env=None)
|
monarch/tools/mesh_spec.py
CHANGED
@@ -128,6 +128,7 @@ class ServerSpec:
|
|
128
128
|
meshes: list[MeshSpec]
|
129
129
|
scheduler: str
|
130
130
|
namespace: str = ""
|
131
|
+
ui_url: Optional[str] = None
|
131
132
|
|
132
133
|
@property
|
133
134
|
def server_handle(self) -> str:
|
@@ -210,6 +211,7 @@ class ServerSpec:
|
|
210
211
|
return {
|
211
212
|
"name": self.name,
|
212
213
|
"server_handle": self.server_handle,
|
214
|
+
**({"ui_url": self.ui_url} if self.ui_url else {}),
|
213
215
|
"state": self.state.name,
|
214
216
|
"meshes": {
|
215
217
|
mesh.name: {
|
@@ -0,0 +1,9 @@
|
|
1
|
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
2
|
+
# All rights reserved.
|
3
|
+
#
|
4
|
+
# This source code is licensed under the BSD-style license found in the
|
5
|
+
# LICENSE file in the root directory of this source tree.
|
6
|
+
|
7
|
+
from .utils import setup_env_for_distributed
|
8
|
+
|
9
|
+
__all__ = ["setup_env_for_distributed"]
|
monarch/utils/utils.py
ADDED
@@ -0,0 +1,78 @@
|
|
1
|
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
2
|
+
# All rights reserved.
|
3
|
+
#
|
4
|
+
# This source code is licensed under the BSD-style license found in the
|
5
|
+
# LICENSE file in the root directory of this source tree.
|
6
|
+
|
7
|
+
|
8
|
+
# pyre-strict
|
9
|
+
import os
|
10
|
+
import socket
|
11
|
+
|
12
|
+
from monarch.actor import Actor, current_rank, endpoint, ProcMesh
|
13
|
+
|
14
|
+
|
15
|
+
def _find_free_port() -> int:
|
16
|
+
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
17
|
+
s.bind(("localhost", 0))
|
18
|
+
addr = s.getsockname()
|
19
|
+
port = addr[1]
|
20
|
+
return port
|
21
|
+
|
22
|
+
|
23
|
+
class _TorchDistributedInitActor(Actor):
|
24
|
+
def __init__(self) -> None:
|
25
|
+
self.rank: int = current_rank().rank
|
26
|
+
|
27
|
+
@endpoint
|
28
|
+
def get_host_port(self) -> tuple[str, int]:
|
29
|
+
return (socket.gethostname(), _find_free_port())
|
30
|
+
|
31
|
+
@endpoint
|
32
|
+
def setup_env(self, master_addr: str, master_port: int) -> None:
|
33
|
+
cr = current_rank()
|
34
|
+
# Assume last dimension is the local rank.
|
35
|
+
last_label = cr.extent.labels[-1]
|
36
|
+
local_world_size = cr.size(last_label)
|
37
|
+
world_size = cr.extent.nelements
|
38
|
+
global_rank = cr.rank
|
39
|
+
local_rank = min(world_size, global_rank % local_world_size)
|
40
|
+
group_rank = global_rank // local_world_size
|
41
|
+
group_world_size = (world_size + local_world_size - 1) // local_world_size
|
42
|
+
env = {
|
43
|
+
"MASTER_ADDR": master_addr,
|
44
|
+
"MASTER_PORT": str(master_port),
|
45
|
+
"RANK": str(global_rank),
|
46
|
+
"LOCAL_RANK": str(local_rank),
|
47
|
+
"LOCAL_WORLD_SIZE": str(local_world_size),
|
48
|
+
"GROUP_RANK": str(group_rank),
|
49
|
+
"GROUP_WORLD_SIZE": str(group_world_size),
|
50
|
+
"ROLE_RANK": str(global_rank),
|
51
|
+
"ROLE_WORLD_SIZE": str(world_size),
|
52
|
+
"ROLE_NAME": "rank",
|
53
|
+
"WORLD_SIZE": str(world_size),
|
54
|
+
}
|
55
|
+
os.environ.update(env)
|
56
|
+
|
57
|
+
|
58
|
+
async def setup_env_for_distributed(
|
59
|
+
proc_mesh: ProcMesh,
|
60
|
+
master_addr: str | None = None,
|
61
|
+
master_port: int | None = None,
|
62
|
+
) -> None:
|
63
|
+
"""
|
64
|
+
Sets up environment variables for pytorch distributed.
|
65
|
+
It selects the proc at rank 0 of the proc_mesh to be the master node.
|
66
|
+
It sets environment variables like RANK, LOCAL_RANK, WORLD_SIZE, etc.
|
67
|
+
If master_addr and master_port are None, it will automatically select a master node and port.
|
68
|
+
"""
|
69
|
+
assert (
|
70
|
+
(master_addr is None) == (master_port is None)
|
71
|
+
), "Either both master_addr and master_port must be specified or neither must be specified."
|
72
|
+
am = await proc_mesh.spawn("_TorchDistributedInitActor", _TorchDistributedInitActor)
|
73
|
+
if master_addr is None:
|
74
|
+
# We use call instead of call_one because call_one can't handle tuple return types.
|
75
|
+
vm = await am.flatten("rank").slice(rank=0).get_host_port.call()
|
76
|
+
master_addr, master_port = vm.item()
|
77
|
+
assert master_port is not None, "master_port should not be None here."
|
78
|
+
await am.setup_env.call(master_addr, master_port)
|
tests/error_test_binary.py
CHANGED
@@ -4,9 +4,10 @@
|
|
4
4
|
# This source code is licensed under the BSD-style license found in the
|
5
5
|
# LICENSE file in the root directory of this source tree.
|
6
6
|
|
7
|
+
# pyre-unsafe
|
8
|
+
|
7
9
|
import asyncio
|
8
10
|
import ctypes
|
9
|
-
import sys
|
10
11
|
|
11
12
|
import click
|
12
13
|
from monarch._rust_bindings.monarch_extension.blocking import blocking_function
|
@@ -158,8 +159,9 @@ def error_endpoint(num_procs, sync_test_impl, sync_endpoint, endpoint_name):
|
|
158
159
|
@main.command("error-bootstrap")
|
159
160
|
def error_bootstrap():
|
160
161
|
print("Started function error_bootstrap", flush=True)
|
161
|
-
|
162
|
-
|
162
|
+
proc_mesh(
|
163
|
+
gpus=4, env={"MONARCH_ERROR_DURING_BOOTSTRAP_FOR_TESTING": "1"}
|
164
|
+
).initialized.get()
|
163
165
|
|
164
166
|
|
165
167
|
async def _error_unmonitored():
|
@@ -0,0 +1,52 @@
|
|
1
|
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
2
|
+
# All rights reserved.
|
3
|
+
#
|
4
|
+
# This source code is licensed under the BSD-style license found in the
|
5
|
+
# LICENSE file in the root directory of this source tree.
|
6
|
+
|
7
|
+
# pyre-strict
|
8
|
+
|
9
|
+
import asyncio
|
10
|
+
import logging
|
11
|
+
|
12
|
+
import click
|
13
|
+
|
14
|
+
from monarch.actor import Actor, endpoint, proc_mesh
|
15
|
+
|
16
|
+
|
17
|
+
@click.group()
|
18
|
+
def main() -> None:
|
19
|
+
pass
|
20
|
+
|
21
|
+
|
22
|
+
class Printer(Actor):
|
23
|
+
def __init__(self) -> None:
|
24
|
+
self.logger: logging.Logger = logging.getLogger()
|
25
|
+
|
26
|
+
@endpoint
|
27
|
+
async def print(self, content: str) -> None:
|
28
|
+
print(f"{content}", flush=True)
|
29
|
+
|
30
|
+
|
31
|
+
async def _flush_logs() -> None:
|
32
|
+
# Create a lot of processes to stress test the logging
|
33
|
+
pm = await proc_mesh(gpus=32)
|
34
|
+
|
35
|
+
# never flush
|
36
|
+
await pm.logging_option(aggregate_window_sec=1000)
|
37
|
+
am = await pm.spawn("printer", Printer)
|
38
|
+
|
39
|
+
# These should be streamed to client
|
40
|
+
for _ in range(5):
|
41
|
+
await am.print.call("has print streaming")
|
42
|
+
|
43
|
+
await pm.stop()
|
44
|
+
|
45
|
+
|
46
|
+
@main.command("flush-logs")
|
47
|
+
def flush_logs() -> None:
|
48
|
+
asyncio.run(_flush_logs())
|
49
|
+
|
50
|
+
|
51
|
+
if __name__ == "__main__":
|
52
|
+
main()
|
tests/test_actor_error.py
CHANGED
@@ -4,6 +4,7 @@
|
|
4
4
|
# This source code is licensed under the BSD-style license found in the
|
5
5
|
# LICENSE file in the root directory of this source tree.
|
6
6
|
|
7
|
+
# pyre-unsafe
|
7
8
|
|
8
9
|
import importlib.resources
|
9
10
|
import os
|
@@ -34,6 +35,24 @@ class ExceptionActorSync(Actor):
|
|
34
35
|
raise Exception("This is a test exception")
|
35
36
|
|
36
37
|
|
38
|
+
class NestedExceptionActor(Actor):
|
39
|
+
@endpoint
|
40
|
+
async def raise_exception_with_context(self) -> None:
|
41
|
+
try:
|
42
|
+
raise Exception("Inner exception")
|
43
|
+
except Exception:
|
44
|
+
# Don't use from here to set __context__ instead of __cause__
|
45
|
+
raise Exception("Outer exception")
|
46
|
+
|
47
|
+
@endpoint
|
48
|
+
async def raise_exception_with_cause(self) -> None:
|
49
|
+
try:
|
50
|
+
raise Exception("Inner exception")
|
51
|
+
except Exception as e:
|
52
|
+
# Use from here to set __cause__ instead of __context__
|
53
|
+
raise Exception("Outer exception") from e
|
54
|
+
|
55
|
+
|
37
56
|
class BrokenPickleClass:
|
38
57
|
"""A class that can be configured to raise exceptions during pickling/unpickling."""
|
39
58
|
|
@@ -116,6 +135,41 @@ def test_actor_exception_sync(mesh, actor_class, num_procs):
|
|
116
135
|
exception_actor.raise_exception.call().get()
|
117
136
|
|
118
137
|
|
138
|
+
@pytest.mark.parametrize(
|
139
|
+
"mesh",
|
140
|
+
[local_proc_mesh, proc_mesh],
|
141
|
+
ids=["local_proc_mesh", "distributed_proc_mesh"],
|
142
|
+
)
|
143
|
+
async def test_actor_error_message(mesh):
|
144
|
+
"""
|
145
|
+
Test that exceptions raised in actor endpoints capture nested exceptions.
|
146
|
+
"""
|
147
|
+
proc = mesh(gpus=2)
|
148
|
+
exception_actor = await proc.spawn("exception_actor", NestedExceptionActor)
|
149
|
+
|
150
|
+
with pytest.raises(ActorError) as exc_info:
|
151
|
+
await exception_actor.raise_exception_with_cause.call()
|
152
|
+
|
153
|
+
# Make sure both exception messages are present in the message.
|
154
|
+
assert "Inner exception" in str(exc_info.value)
|
155
|
+
assert "Outer exception" in str(exc_info.value)
|
156
|
+
# Make sure the "cause" is set.
|
157
|
+
assert "The above exception was the direct cause of the following exception" in str(
|
158
|
+
exc_info.value
|
159
|
+
)
|
160
|
+
|
161
|
+
with pytest.raises(ActorError) as exc_info:
|
162
|
+
await exception_actor.raise_exception_with_context.call()
|
163
|
+
|
164
|
+
# Make sure both exception messages are present in the message.
|
165
|
+
assert "Inner exception" in str(exc_info.value)
|
166
|
+
assert "Outer exception" in str(exc_info.value)
|
167
|
+
# Make sure the "context" is set.
|
168
|
+
assert "During handling of the above exception, another exception occurred" in str(
|
169
|
+
exc_info.value
|
170
|
+
)
|
171
|
+
|
172
|
+
|
119
173
|
'''
|
120
174
|
# oss_skip: importlib not pulling resource correctly in git CI, needs to be revisited
|
121
175
|
@pytest.mark.oss_skip
|
@@ -436,14 +490,14 @@ async def test_proc_mesh_monitoring(mesh):
|
|
436
490
|
event = await anext(monitor)
|
437
491
|
assert isinstance(event, ProcEvent.Crashed)
|
438
492
|
assert event[0] == 0 # check rank
|
439
|
-
assert "
|
493
|
+
assert "failed: did not handle supervision event" in event[1] # check error message
|
440
494
|
assert (
|
441
495
|
"Simulated actor failure for supervision testing" in event[1]
|
442
496
|
) # check error message
|
443
497
|
|
444
498
|
# should not be able to spawn actors anymore as proc mesh is unhealthy
|
445
499
|
with pytest.raises(SupervisionError, match="proc mesh is stopped with reason"):
|
446
|
-
await proc.spawn("ex", ExceptionActorSync)
|
500
|
+
await proc.spawn("ex", ExceptionActorSync).initialized
|
447
501
|
|
448
502
|
|
449
503
|
@pytest.mark.parametrize(
|
@@ -467,16 +521,19 @@ async def test_actor_mesh_supervision_handling(mesh):
|
|
467
521
|
await e.check.call()
|
468
522
|
|
469
523
|
# existing call should fail with supervision error
|
470
|
-
with pytest.raises(
|
524
|
+
with pytest.raises(
|
525
|
+
SupervisionError,
|
526
|
+
match=".*Actor .* exited because of the following reason",
|
527
|
+
):
|
471
528
|
await e.fail_with_supervision_error.call_one()
|
472
529
|
|
473
530
|
# new call should fail with check of health state of actor mesh
|
474
|
-
with pytest.raises(SupervisionError, match="
|
531
|
+
with pytest.raises(SupervisionError, match="Actor .* is unhealthy with reason"):
|
475
532
|
await e.check.call()
|
476
533
|
|
477
534
|
# should not be able to spawn actors anymore as proc mesh is unhealthy
|
478
535
|
with pytest.raises(SupervisionError, match="proc mesh is stopped with reason"):
|
479
|
-
await proc.spawn("ex", ExceptionActorSync)
|
536
|
+
await proc.spawn("ex", ExceptionActorSync).initialized
|
480
537
|
|
481
538
|
|
482
539
|
class HealthyActor(Actor):
|
@@ -534,11 +591,14 @@ async def test_actor_mesh_supervision_handling_chained_error(mesh):
|
|
534
591
|
# in a chain of client -> Intermediate -> ErrorActor, a supervision error
|
535
592
|
# happening in ErrorActor will be captured by Intermediate and re-raised
|
536
593
|
# as an application error (ActorError).
|
537
|
-
with pytest.raises(
|
594
|
+
with pytest.raises(
|
595
|
+
ActorError,
|
596
|
+
match=".*Actor .* exited because of the following reason",
|
597
|
+
):
|
538
598
|
await intermediate_actor.forward_error.call()
|
539
599
|
|
540
600
|
# calling success endpoint should fail with ActorError, but with supervision msg.
|
541
|
-
with pytest.raises(ActorError, match="
|
601
|
+
with pytest.raises(ActorError, match="Actor .* is unhealthy with reason"):
|
542
602
|
await intermediate_actor.forward_success.call()
|
543
603
|
|
544
604
|
# healthy actor should still be working
|
@@ -567,11 +627,14 @@ async def test_base_exception_handling(mesh, method_name):
|
|
567
627
|
method = getattr(error_actor, method_name)
|
568
628
|
|
569
629
|
# The call should raise a SupervisionError
|
570
|
-
with pytest.raises(
|
630
|
+
with pytest.raises(
|
631
|
+
SupervisionError,
|
632
|
+
match=".*Actor .* exited because of the following reason",
|
633
|
+
):
|
571
634
|
await method.call_one()
|
572
635
|
|
573
636
|
# Subsequent calls should fail with a health state error
|
574
|
-
with pytest.raises(
|
637
|
+
with pytest.raises(RuntimeError, match="Actor .* is unhealthy with reason"):
|
575
638
|
await error_actor.check.call()
|
576
639
|
|
577
640
|
|
@@ -587,18 +650,24 @@ async def test_supervision_with_proc_mesh_stopped(mesh):
|
|
587
650
|
await proc.stop()
|
588
651
|
|
589
652
|
# new call should fail with check of health state of actor mesh
|
590
|
-
with pytest.raises(
|
653
|
+
with pytest.raises(
|
654
|
+
SupervisionError, match="actor mesh is stopped due to proc mesh shutdown"
|
655
|
+
):
|
591
656
|
await actor_mesh.check.call()
|
592
657
|
|
593
658
|
# proc mesh cannot spawn new actors anymore
|
594
659
|
with pytest.raises(RuntimeError, match="`ProcMesh` has already been stopped"):
|
595
|
-
await proc.spawn("immediate", Intermediate)
|
660
|
+
await proc.spawn("immediate", Intermediate).initialized
|
596
661
|
|
597
662
|
|
598
663
|
# TODO - re-enable after resolving T232206970
|
599
664
|
@pytest.mark.oss_skip
|
600
665
|
async def test_supervision_with_sending_error():
|
666
|
+
# Messages of length > this will cause a send error and a returned
|
667
|
+
# undeliverable.
|
601
668
|
os.environ["HYPERACTOR_CODEC_MAX_FRAME_LENGTH"] = "50000000"
|
669
|
+
# Limit retries for sending before giving up.
|
670
|
+
os.environ["HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT_SECS"] = "5"
|
602
671
|
|
603
672
|
proc = await proc_mesh(gpus=1)
|
604
673
|
actor_mesh = await proc.spawn("healthy", HealthyActor)
|
@@ -610,12 +679,71 @@ async def test_supervision_with_sending_error():
|
|
610
679
|
|
611
680
|
# send a large payload to trigger send timeout error
|
612
681
|
with pytest.raises(
|
613
|
-
SupervisionError,
|
682
|
+
SupervisionError,
|
683
|
+
match=".*Actor .* exited because of the following reason",
|
614
684
|
):
|
615
685
|
await actor_mesh.check_with_payload.call(payload="a" * 55000000)
|
616
686
|
|
617
687
|
# new call should fail with check of health state of actor mesh
|
618
|
-
with pytest.raises(SupervisionError, match="
|
688
|
+
with pytest.raises(SupervisionError, match="Actor .* is unhealthy with reason"):
|
619
689
|
await actor_mesh.check.call()
|
620
|
-
with pytest.raises(SupervisionError, match="
|
690
|
+
with pytest.raises(SupervisionError, match="Actor .* is unhealthy with reason"):
|
621
691
|
await actor_mesh.check_with_payload.call(payload="a")
|
692
|
+
|
693
|
+
|
694
|
+
async def test_slice_supervision() -> None:
|
695
|
+
pm = await proc_mesh(gpus=4)
|
696
|
+
healthy_mesh = await pm.spawn("healthy", HealthyActor)
|
697
|
+
error_mesh = await pm.spawn("error", ErrorActor)
|
698
|
+
slice_1 = error_mesh.slice(gpus=slice(2, 4))
|
699
|
+
slice_2 = error_mesh.slice(gpus=2)
|
700
|
+
slice_3 = error_mesh.slice(gpus=3)
|
701
|
+
|
702
|
+
# Trigger supervision error on gpus=3
|
703
|
+
with pytest.raises(SupervisionError, match="did not handle supervision event"):
|
704
|
+
await slice_3.fail_with_supervision_error.call()
|
705
|
+
|
706
|
+
# Mesh containing all gpus is unhealthy
|
707
|
+
with pytest.raises(SupervisionError, match="Actor .* is unhealthy with reason:"):
|
708
|
+
await error_mesh.check.call()
|
709
|
+
|
710
|
+
# Slice containing only gpus=3 is unhealthy
|
711
|
+
with pytest.raises(SupervisionError, match="Actor .* is unhealthy with reason:"):
|
712
|
+
await slice_3.check.call()
|
713
|
+
|
714
|
+
# Slice containing gpus=3 is unhealthy
|
715
|
+
with pytest.raises(SupervisionError, match="Actor .* is unhealthy with reason:"):
|
716
|
+
await slice_1.check.call()
|
717
|
+
|
718
|
+
# Slice not containing gpus=3 is healthy
|
719
|
+
check = await slice_2.check.call()
|
720
|
+
for _, item in check.items():
|
721
|
+
assert item == "this is a healthy check"
|
722
|
+
|
723
|
+
# Other actor mesh on the same proc mesh is healthy
|
724
|
+
check = await healthy_mesh.check.call()
|
725
|
+
for _, item in check.items():
|
726
|
+
assert item == "this is a healthy check"
|
727
|
+
|
728
|
+
|
729
|
+
async def test_mesh_slices_inherit_parent_errors() -> None:
|
730
|
+
pm = await proc_mesh(gpus=4)
|
731
|
+
error_mesh = await pm.spawn("error", ErrorActor)
|
732
|
+
slice_1 = error_mesh.slice(gpus=slice(2, 4))
|
733
|
+
|
734
|
+
# Trigger supervision error on gpus=2, 3
|
735
|
+
with pytest.raises(SupervisionError):
|
736
|
+
await slice_1.fail_with_supervision_error.call()
|
737
|
+
|
738
|
+
# Newly created slice containing gpu=3 is unhealthy
|
739
|
+
slice_2 = error_mesh.slice(gpus=3)
|
740
|
+
with pytest.raises(SupervisionError):
|
741
|
+
await slice_2.check.call()
|
742
|
+
|
743
|
+
# Newly created slice containing gpu=1 is healthy
|
744
|
+
slice_3 = error_mesh.slice(gpus=1)
|
745
|
+
check = await slice_3.check.call()
|
746
|
+
for _, item in check.items():
|
747
|
+
assert item == "this is a healthy check"
|
748
|
+
|
749
|
+
await pm.stop()
|
tests/test_alloc.py
CHANGED