torchmonarch-nightly 2025.8.2-cp313-cp313-manylinux2014_x86_64.whl → 2025.9.3-cp313-cp313-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. monarch/_rust_bindings.so +0 -0
  2. monarch/_src/actor/actor_mesh.py +414 -216
  3. monarch/_src/actor/allocator.py +75 -6
  4. monarch/_src/actor/bootstrap_main.py +7 -4
  5. monarch/_src/actor/code_sync/__init__.py +2 -0
  6. monarch/_src/actor/debugger/__init__.py +7 -0
  7. monarch/_src/actor/{debugger.py → debugger/debugger.py} +246 -135
  8. monarch/_src/actor/{pdb_wrapper.py → debugger/pdb_wrapper.py} +62 -23
  9. monarch/_src/actor/endpoint.py +27 -45
  10. monarch/_src/actor/future.py +86 -24
  11. monarch/_src/actor/host_mesh.py +125 -0
  12. monarch/_src/actor/logging.py +94 -0
  13. monarch/_src/actor/pickle.py +25 -0
  14. monarch/_src/actor/proc_mesh.py +423 -156
  15. monarch/_src/actor/python_extension_methods.py +90 -0
  16. monarch/_src/actor/shape.py +8 -1
  17. monarch/_src/actor/source_loader.py +45 -0
  18. monarch/_src/actor/telemetry/__init__.py +172 -0
  19. monarch/_src/actor/telemetry/rust_span_tracing.py +6 -39
  20. monarch/_src/debug_cli/__init__.py +7 -0
  21. monarch/_src/debug_cli/debug_cli.py +43 -0
  22. monarch/_src/tensor_engine/rdma.py +64 -9
  23. monarch/_testing.py +1 -3
  24. monarch/actor/__init__.py +24 -4
  25. monarch/common/_C.so +0 -0
  26. monarch/common/device_mesh.py +14 -0
  27. monarch/common/future.py +10 -0
  28. monarch/common/remote.py +14 -25
  29. monarch/common/tensor.py +12 -0
  30. monarch/debug_cli/__init__.py +7 -0
  31. monarch/debug_cli/__main__.py +12 -0
  32. monarch/fetch.py +2 -2
  33. monarch/gradient/_gradient_generator.so +0 -0
  34. monarch/gradient_generator.py +4 -2
  35. monarch/mesh_controller.py +34 -14
  36. monarch/monarch_controller +0 -0
  37. monarch/tools/colors.py +25 -0
  38. monarch/tools/commands.py +42 -7
  39. monarch/tools/components/hyperactor.py +1 -1
  40. monarch/tools/config/__init__.py +31 -4
  41. monarch/tools/config/defaults.py +13 -3
  42. monarch/tools/config/environment.py +45 -0
  43. monarch/tools/config/workspace.py +165 -0
  44. monarch/tools/mesh_spec.py +2 -0
  45. monarch/utils/__init__.py +9 -0
  46. monarch/utils/utils.py +78 -0
  47. tests/error_test_binary.py +5 -3
  48. tests/python_actor_test_binary.py +52 -0
  49. tests/test_actor_error.py +142 -14
  50. tests/test_alloc.py +1 -1
  51. tests/test_allocator.py +59 -72
  52. tests/test_debugger.py +639 -45
  53. tests/test_env_before_cuda.py +4 -4
  54. tests/test_mesh_trait.py +38 -0
  55. tests/test_python_actors.py +965 -75
  56. tests/test_rdma.py +7 -6
  57. tests/test_tensor_engine.py +6 -6
  58. {torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/METADATA +82 -4
  59. {torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/RECORD +63 -47
  60. {torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/WHEEL +0 -0
  61. {torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/entry_points.txt +0 -0
  62. {torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/licenses/LICENSE +0 -0
  63. {torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/top_level.txt +0 -0
monarch/tools/config/workspace.py ADDED
@@ -0,0 +1,165 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-strict
+
+ import shutil
+ from pathlib import Path
+
+ from monarch.tools.config.environment import CondaEnvironment, Environment
+
+ ACTIVE_CONDA_ENV = CondaEnvironment()
+
+
+ class Workspace:
+     """
+     A workspace is one or more local directories that contain your project(s).
+     Workspaces can specify an "environment" in which projects are developed and run locally.
+     A currently active conda environment is an example of such an environment.
+
+     At the time of job submission an ephemeral version of the "image" is built and the
+     new job is configured to run on this image. The "image" is the one specified by
+     the `Role.image` attribute in the job's `AppDef`
+     (see `monarch.tools.components.hyperactor.host_mesh()`).
+
+     For example, when launching onto Kubernetes, "image" is interpreted as a Docker image (e.g. "name:tag").
+
+     Specifically, the ephemeral image contains:
+
+     1. A copy of the workspace directories
+     2. (if applicable) A copy of the currently active environment
+
+     This mirrors the local codebase and environment onto the remote machines once, at launch time.
+
+     Workspaces can also be synced interactively on demand (post job launch) by using
+     `monarch.actor.proc_mesh.ProcMesh.syncWorkspace(Workspace)`.
+
+     Usage:
+
+     .. doc-test::
+
+         import pathlib
+         from monarch.tools.config import Workspace
+         from monarch.tools.config import Config
+
+         HOME = pathlib.Path.home()
+
+         # 1. single project workspace
+         config = Config(
+             workspace=Workspace(dirs=[HOME / "github" / "torchtitan"]),
+         )
+
+         # 2. multiple projects (useful for cross-project development)
+         config = Config(
+             workspace=Workspace(
+                 dirs=[
+                     # $HOME/torch (local) -> $WORKSPACE_DIR/torch (remote)
+                     # $HOME/github/torchtitan (local) -> $WORKSPACE_DIR/torchtitan (remote)
+                     HOME / "torch",
+                     HOME / "github" / "torchtitan",
+                 ]
+             ),
+         )
+
+         # 3. with explicit local -> remote mappings
+         config = Config(
+             workspace=Workspace(
+                 dirs={
+                     # $HOME/torch (local) -> $WORKSPACE_DIR/github/pytorch (remote)
+                     # $HOME/github/torchtitan (local) -> $WORKSPACE_DIR/github/torchtitan (remote)
+                     HOME / "torch": "github/pytorch",
+                     HOME / "github" / "torchtitan": "github/torchtitan",
+                 }
+             )
+         )
+         # -- or flat into WORKSPACE_DIR
+         config = Config(
+             workspace=Workspace(
+                 # $HOME/github/torchtitan (local) -> $WORKSPACE_DIR/ (remote)
+                 dirs={HOME / "github" / "torchtitan": ""},
+             )
+         )
+
+         # 4. no project, everything is installed in my environment (but sync my env)
+         config = Config(
+             workspace=Workspace(),
+         )
+
+         # 5. disable project and environment sync
+         config = Config(
+             workspace=Workspace(env=None),
+         )
+     """
+
+     def __init__(
+         self,
+         dirs: list[Path | str] | dict[Path | str, str] | None = None,
+         env: Environment | None = ACTIVE_CONDA_ENV,
+     ) -> None:
+         self.env = env
+         self.dirs: dict[Path, str] = {}  # src -> dst
+
+         if dirs is None:
+             pass
+         elif isinstance(dirs, list):
+             for d in dirs:
+                 d = Path(d)
+                 self.dirs[d] = d.name
+         else:  # dict
+             for src, dst in dirs.items():
+                 self.dirs[Path(src)] = dst
+
+     def __eq__(self, other: object) -> bool:
+         if not isinstance(other, Workspace):
+             return False
+
+         return self.env == other.env and self.dirs == other.dirs
+
+     def merge(self, outdir: str | Path) -> None:
+         """Merges the dirs of this workspace into the given outdir."""
+
+         outdir = Path(outdir)
+         outdir.mkdir(parents=True, exist_ok=True)
+
+         for src, dst in self.dirs.items():
+             shutil.copytree(src, outdir / dst, dirs_exist_ok=True)
+
+     # pyre-ignore[2] skip type-hint to avoid torchx dep
+     def set_env_vars(self, appdef) -> None:
+         """For each role in the appdef, sets the following env vars (if not already set):
+
+         1. `WORKSPACE_DIR`: the root directory of the remote workspace
+         2. `PYTHONPATH`: includes all the remote workspace dirs for all the roles in the appdef
+            (dedups and appends to the existing `PYTHONPATH`)
+         3. `CONDA_DIR`: (if env is conda) the remote path to the conda env to activate
+         """
+
+         # typically this macro comes from torchx.specs.macros.img_root
+         # but we use the str repr instead to avoid taking a dep on torchx from this module
+         # unittest (test_workspace.py) asserts against torchx.specs.macros.img_root
+         # guarding against changes to the macro value
+         img_root_macro = "${img_root}"
+
+         for role in appdef.roles:
+             remote_workspace_root = role.env.setdefault(
+                 "WORKSPACE_DIR",
+                 f"{img_root_macro}/workspace",
+             )
+
+             PYTHONPATH = [p for p in role.env.get("PYTHONPATH", "").split(":") if p]
+             for dst in self.dirs.values():
+                 remote_dir = f"{remote_workspace_root}/{dst}"
+                 if remote_dir not in PYTHONPATH:
+                     PYTHONPATH.append(remote_dir)
+             role.env["PYTHONPATH"] = ":".join(PYTHONPATH)
+
+             if isinstance(self.env, CondaEnvironment):
+                 role.env.setdefault("CONDA_DIR", f"{img_root_macro}/conda")
+
+     @staticmethod
+     def null() -> "Workspace":
+         """Returns a "null" workspace; a workspace with no project dirs and no environment."""
+         return Workspace(env=None)
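
A minimal sketch of how `merge` and `set_env_vars` compose end to end; the `SimpleNamespace` stand-ins for torchx's `AppDef`/`Role` are illustrative assumptions, not part of this diff:

    from pathlib import Path
    from types import SimpleNamespace

    from monarch.tools.config.workspace import Workspace

    # One local project dir mapped under the remote $WORKSPACE_DIR by its basename.
    ws = Workspace(dirs=[Path.home() / "github" / "torchtitan"])

    # Stage a one-time copy of the workspace dirs (e.g. into an image build context).
    # Note: copytree requires the source dirs to actually exist locally.
    ws.merge("/tmp/workspace-staging")

    # set_env_vars expects a torchx-style AppDef; SimpleNamespace mimics its shape here.
    role = SimpleNamespace(env={})
    ws.set_env_vars(SimpleNamespace(roles=[role]))

    assert role.env["WORKSPACE_DIR"] == "${img_root}/workspace"
    assert role.env["PYTHONPATH"] == "${img_root}/workspace/torchtitan"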
monarch/tools/mesh_spec.py CHANGED
@@ -128,6 +128,7 @@ class ServerSpec:
      meshes: list[MeshSpec]
      scheduler: str
      namespace: str = ""
+     ui_url: Optional[str] = None
 
      @property
      def server_handle(self) -> str:
@@ -210,6 +211,7 @@ class ServerSpec:
          return {
              "name": self.name,
              "server_handle": self.server_handle,
+             **({"ui_url": self.ui_url} if self.ui_url else {}),
              "state": self.state.name,
              "meshes": {
                  mesh.name: {
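
The `**({...} if ... else {})` spread in `to_json` above is the standard idiom for emitting a JSON key only when it has a value. A self-contained sketch (the `summary` helper is illustrative, not from the package):

    from typing import Optional

    def summary(name: str, ui_url: Optional[str] = None) -> dict:
        # "ui_url" appears in the output only when set, mirroring ServerSpec.to_json().
        return {
            "name": name,
            **({"ui_url": ui_url} if ui_url else {}),
        }

    assert "ui_url" not in summary("job")
    assert summary("job", "https://example.com/ui")["ui_url"] == "https://example.com/ui"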
monarch/utils/__init__.py ADDED
@@ -0,0 +1,9 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ from .utils import setup_env_for_distributed
+
+ __all__ = ["setup_env_for_distributed"]
monarch/utils/utils.py ADDED
@@ -0,0 +1,78 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+
+ # pyre-strict
+ import os
+ import socket
+
+ from monarch.actor import Actor, current_rank, endpoint, ProcMesh
+
+
+ def _find_free_port() -> int:
+     with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+         s.bind(("localhost", 0))
+         addr = s.getsockname()
+         port = addr[1]
+         return port
+
+
+ class _TorchDistributedInitActor(Actor):
+     def __init__(self) -> None:
+         self.rank: int = current_rank().rank
+
+     @endpoint
+     def get_host_port(self) -> tuple[str, int]:
+         return (socket.gethostname(), _find_free_port())
+
+     @endpoint
+     def setup_env(self, master_addr: str, master_port: int) -> None:
+         cr = current_rank()
+         # Assume the last dimension is the local rank.
+         last_label = cr.extent.labels[-1]
+         local_world_size = cr.size(last_label)
+         world_size = cr.extent.nelements
+         global_rank = cr.rank
+         local_rank = min(world_size, global_rank % local_world_size)
+         group_rank = global_rank // local_world_size
+         group_world_size = (world_size + local_world_size - 1) // local_world_size
+         env = {
+             "MASTER_ADDR": master_addr,
+             "MASTER_PORT": str(master_port),
+             "RANK": str(global_rank),
+             "LOCAL_RANK": str(local_rank),
+             "LOCAL_WORLD_SIZE": str(local_world_size),
+             "GROUP_RANK": str(group_rank),
+             "GROUP_WORLD_SIZE": str(group_world_size),
+             "ROLE_RANK": str(global_rank),
+             "ROLE_WORLD_SIZE": str(world_size),
+             "ROLE_NAME": "rank",
+             "WORLD_SIZE": str(world_size),
+         }
+         os.environ.update(env)
+
+
+ async def setup_env_for_distributed(
+     proc_mesh: ProcMesh,
+     master_addr: str | None = None,
+     master_port: int | None = None,
+ ) -> None:
+     """
+     Sets up environment variables for PyTorch distributed.
+     It selects one proc in the proc_mesh to be the master node.
+     It sets environment variables like RANK, LOCAL_RANK, WORLD_SIZE, etc.
+     If master_addr and master_port are None, it will automatically select a master node and port.
+     """
+     assert (
+         (master_addr is None) == (master_port is None)
+     ), "Either both master_addr and master_port must be specified or neither must be specified."
+     am = await proc_mesh.spawn("_TorchDistributedInitActor", _TorchDistributedInitActor)
+     if master_addr is None:
+         # We use call instead of call_one because call_one can't handle tuple return types.
+         vm = await am.flatten("rank").slice(rank=0).get_host_port.call()
+         master_addr, master_port = vm.item()
+     assert master_port is not None, "master_port should not be None here."
+     await am.setup_env.call(master_addr, master_port)
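
A hedged sketch of how `setup_env_for_distributed` might be used to bootstrap `torch.distributed` across a proc mesh; the `Trainer` actor and the `gloo` backend choice are illustrative assumptions, not part of this diff:

    import asyncio

    import torch.distributed as dist
    from monarch.actor import Actor, endpoint, proc_mesh
    from monarch.utils import setup_env_for_distributed


    class Trainer(Actor):
        @endpoint
        async def init_pg(self) -> int:
            # MASTER_ADDR/MASTER_PORT/RANK/WORLD_SIZE/... were populated on this proc
            # by setup_env_for_distributed, so init_process_group can read them.
            dist.init_process_group(backend="gloo")
            return dist.get_rank()


    async def main() -> None:
        pm = await proc_mesh(gpus=4)
        await setup_env_for_distributed(pm)  # must run before any init_process_group
        trainers = await pm.spawn("trainer", Trainer)
        print(await trainers.init_pg.call())


    asyncio.run(main())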
tests/error_test_binary.py CHANGED
@@ -4,9 +4,10 @@
  # This source code is licensed under the BSD-style license found in the
  # LICENSE file in the root directory of this source tree.
 
+ # pyre-unsafe
+
  import asyncio
  import ctypes
- import sys
 
  import click
  from monarch._rust_bindings.monarch_extension.blocking import blocking_function
@@ -158,8 +159,9 @@ def error_endpoint(num_procs, sync_test_impl, sync_endpoint, endpoint_name):
  @main.command("error-bootstrap")
  def error_bootstrap():
      print("Started function error_bootstrap", flush=True)
-
-     proc_mesh(gpus=4, env={"MONARCH_ERROR_DURING_BOOTSTRAP_FOR_TESTING": "1"}).get()
+     proc_mesh(
+         gpus=4, env={"MONARCH_ERROR_DURING_BOOTSTRAP_FOR_TESTING": "1"}
+     ).initialized.get()
 
 
  async def _error_unmonitored():
tests/python_actor_test_binary.py ADDED
@@ -0,0 +1,52 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-strict
+
+ import asyncio
+ import logging
+
+ import click
+
+ from monarch.actor import Actor, endpoint, proc_mesh
+
+
+ @click.group()
+ def main() -> None:
+     pass
+
+
+ class Printer(Actor):
+     def __init__(self) -> None:
+         self.logger: logging.Logger = logging.getLogger()
+
+     @endpoint
+     async def print(self, content: str) -> None:
+         print(f"{content}", flush=True)
+
+
+ async def _flush_logs() -> None:
+     # Create a lot of processes to stress test the logging
+     pm = await proc_mesh(gpus=32)
+
+     # never flush
+     await pm.logging_option(aggregate_window_sec=1000)
+     am = await pm.spawn("printer", Printer)
+
+     # These should be streamed to client
+     for _ in range(5):
+         await am.print.call("has print streaming")
+
+     await pm.stop()
+
+
+ @main.command("flush-logs")
+ def flush_logs() -> None:
+     asyncio.run(_flush_logs())
+
+
+ if __name__ == "__main__":
+     main()
tests/test_actor_error.py CHANGED
@@ -4,6 +4,7 @@
  # This source code is licensed under the BSD-style license found in the
  # LICENSE file in the root directory of this source tree.
 
+ # pyre-unsafe
 
  import importlib.resources
  import os
@@ -34,6 +35,24 @@ class ExceptionActorSync(Actor):
          raise Exception("This is a test exception")
 
 
+ class NestedExceptionActor(Actor):
+     @endpoint
+     async def raise_exception_with_context(self) -> None:
+         try:
+             raise Exception("Inner exception")
+         except Exception:
+             # Don't use `from` here to set __context__ instead of __cause__
+             raise Exception("Outer exception")
+
+     @endpoint
+     async def raise_exception_with_cause(self) -> None:
+         try:
+             raise Exception("Inner exception")
+         except Exception as e:
+             # Use `from` here to set __cause__ instead of __context__
+             raise Exception("Outer exception") from e
+
+
  class BrokenPickleClass:
      """A class that can be configured to raise exceptions during pickling/unpickling."""
 
@@ -116,6 +135,41 @@ def test_actor_exception_sync(mesh, actor_class, num_procs):
      exception_actor.raise_exception.call().get()
 
 
+ @pytest.mark.parametrize(
+     "mesh",
+     [local_proc_mesh, proc_mesh],
+     ids=["local_proc_mesh", "distributed_proc_mesh"],
+ )
+ async def test_actor_error_message(mesh):
+     """
+     Test that exceptions raised in actor endpoints capture nested exceptions.
+     """
+     proc = mesh(gpus=2)
+     exception_actor = await proc.spawn("exception_actor", NestedExceptionActor)
+
+     with pytest.raises(ActorError) as exc_info:
+         await exception_actor.raise_exception_with_cause.call()
+
+     # Make sure both exception messages are present in the message.
+     assert "Inner exception" in str(exc_info.value)
+     assert "Outer exception" in str(exc_info.value)
+     # Make sure the "cause" is set.
+     assert "The above exception was the direct cause of the following exception" in str(
+         exc_info.value
+     )
+
+     with pytest.raises(ActorError) as exc_info:
+         await exception_actor.raise_exception_with_context.call()
+
+     # Make sure both exception messages are present in the message.
+     assert "Inner exception" in str(exc_info.value)
+     assert "Outer exception" in str(exc_info.value)
+     # Make sure the "context" is set.
+     assert "During handling of the above exception, another exception occurred" in str(
+         exc_info.value
+     )
+
+
  '''
  # oss_skip: importlib not pulling resource correctly in git CI, needs to be revisited
  @pytest.mark.oss_skip
@@ -436,14 +490,14 @@ async def test_proc_mesh_monitoring(mesh):
      event = await anext(monitor)
      assert isinstance(event, ProcEvent.Crashed)
      assert event[0] == 0  # check rank
-     assert "ActorFailureError" in event[1]  # check error message
+     assert "failed: did not handle supervision event" in event[1]  # check error message
      assert (
          "Simulated actor failure for supervision testing" in event[1]
      )  # check error message
 
      # should not be able to spawn actors anymore as proc mesh is unhealthy
      with pytest.raises(SupervisionError, match="proc mesh is stopped with reason"):
-         await proc.spawn("ex", ExceptionActorSync)
+         await proc.spawn("ex", ExceptionActorSync).initialized
 
 
@@ -467,16 +521,19 @@ async def test_actor_mesh_supervision_handling(mesh):
      await e.check.call()
 
      # existing call should fail with supervision error
-     with pytest.raises(SupervisionError, match="supervision error:"):
+     with pytest.raises(
+         SupervisionError,
+         match=".*Actor .* exited because of the following reason",
+     ):
          await e.fail_with_supervision_error.call_one()
 
      # new call should fail with check of health state of actor mesh
-     with pytest.raises(SupervisionError, match="actor mesh is not in a healthy state"):
+     with pytest.raises(SupervisionError, match="Actor .* is unhealthy with reason"):
          await e.check.call()
 
      # should not be able to spawn actors anymore as proc mesh is unhealthy
      with pytest.raises(SupervisionError, match="proc mesh is stopped with reason"):
-         await proc.spawn("ex", ExceptionActorSync)
+         await proc.spawn("ex", ExceptionActorSync).initialized
 
 
  class HealthyActor(Actor):
@@ -534,11 +591,14 @@ async def test_actor_mesh_supervision_handling_chained_error(mesh):
      # in a chain of client -> Intermediate -> ErrorActor, a supervision error
      # happening in ErrorActor will be captured by Intermediate and re-raised
      # as an application error (ActorError).
-     with pytest.raises(ActorError, match="supervision error:"):
+     with pytest.raises(
+         ActorError,
+         match=".*Actor .* exited because of the following reason",
+     ):
          await intermediate_actor.forward_error.call()
 
      # calling success endpoint should fail with ActorError, but with supervision msg.
-     with pytest.raises(ActorError, match="actor mesh is not in a healthy state"):
+     with pytest.raises(ActorError, match="Actor .* is unhealthy with reason"):
          await intermediate_actor.forward_success.call()
 
      # healthy actor should still be working
@@ -567,11 +627,14 @@ async def test_base_exception_handling(mesh, method_name):
      method = getattr(error_actor, method_name)
 
      # The call should raise a SupervisionError
-     with pytest.raises(SupervisionError, match="supervision error:"):
+     with pytest.raises(
+         SupervisionError,
+         match=".*Actor .* exited because of the following reason",
+     ):
          await method.call_one()
 
      # Subsequent calls should fail with a health state error
-     with pytest.raises(SupervisionError, match="actor mesh is not in a healthy state"):
+     with pytest.raises(RuntimeError, match="Actor .* is unhealthy with reason"):
          await error_actor.check.call()
 
 
@@ -587,18 +650,24 @@ async def test_supervision_with_proc_mesh_stopped(mesh):
      await proc.stop()
 
      # new call should fail with check of health state of actor mesh
-     with pytest.raises(SupervisionError, match="actor mesh is not in a healthy state"):
+     with pytest.raises(
+         SupervisionError, match="actor mesh is stopped due to proc mesh shutdown"
+     ):
          await actor_mesh.check.call()
 
      # proc mesh cannot spawn new actors anymore
      with pytest.raises(RuntimeError, match="`ProcMesh` has already been stopped"):
-         await proc.spawn("immediate", Intermediate)
+         await proc.spawn("immediate", Intermediate).initialized
 
 
  # TODO - re-enable after resolving T232206970
  @pytest.mark.oss_skip
  async def test_supervision_with_sending_error():
+     # Messages of length > this will cause a send error and a returned
+     # undeliverable.
      os.environ["HYPERACTOR_CODEC_MAX_FRAME_LENGTH"] = "50000000"
+     # Limit retries for sending before giving up.
+     os.environ["HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT_SECS"] = "5"
 
      proc = await proc_mesh(gpus=1)
      actor_mesh = await proc.spawn("healthy", HealthyActor)
@@ -610,12 +679,71 @@ async def test_supervision_with_sending_error():
      # send a large payload to trigger send timeout error
      with pytest.raises(
-         SupervisionError, match="supervision error:.*actor mesh is stopped"
+         SupervisionError,
+         match=".*Actor .* exited because of the following reason",
      ):
          await actor_mesh.check_with_payload.call(payload="a" * 55000000)
 
      # new call should fail with check of health state of actor mesh
-     with pytest.raises(SupervisionError, match="actor mesh is not in a healthy state"):
+     with pytest.raises(SupervisionError, match="Actor .* is unhealthy with reason"):
          await actor_mesh.check.call()
-     with pytest.raises(SupervisionError, match="actor mesh is not in a healthy state"):
+     with pytest.raises(SupervisionError, match="Actor .* is unhealthy with reason"):
          await actor_mesh.check_with_payload.call(payload="a")
+
+
+ async def test_slice_supervision() -> None:
+     pm = await proc_mesh(gpus=4)
+     healthy_mesh = await pm.spawn("healthy", HealthyActor)
+     error_mesh = await pm.spawn("error", ErrorActor)
+     slice_1 = error_mesh.slice(gpus=slice(2, 4))
+     slice_2 = error_mesh.slice(gpus=2)
+     slice_3 = error_mesh.slice(gpus=3)
+
+     # Trigger supervision error on gpus=3
+     with pytest.raises(SupervisionError, match="did not handle supervision event"):
+         await slice_3.fail_with_supervision_error.call()
+
+     # Mesh containing all gpus is unhealthy
+     with pytest.raises(SupervisionError, match="Actor .* is unhealthy with reason:"):
+         await error_mesh.check.call()
+
+     # Slice containing only gpus=3 is unhealthy
+     with pytest.raises(SupervisionError, match="Actor .* is unhealthy with reason:"):
+         await slice_3.check.call()
+
+     # Slice containing gpus=3 is unhealthy
+     with pytest.raises(SupervisionError, match="Actor .* is unhealthy with reason:"):
+         await slice_1.check.call()
+
+     # Slice not containing gpus=3 is healthy
+     check = await slice_2.check.call()
+     for _, item in check.items():
+         assert item == "this is a healthy check"
+
+     # Other actor mesh on the same proc mesh is healthy
+     check = await healthy_mesh.check.call()
+     for _, item in check.items():
+         assert item == "this is a healthy check"
+
+
+ async def test_mesh_slices_inherit_parent_errors() -> None:
+     pm = await proc_mesh(gpus=4)
+     error_mesh = await pm.spawn("error", ErrorActor)
+     slice_1 = error_mesh.slice(gpus=slice(2, 4))
+
+     # Trigger supervision error on gpus=2 and 3
+     with pytest.raises(SupervisionError):
+         await slice_1.fail_with_supervision_error.call()
+
+     # Newly created slice containing gpus=3 is unhealthy
+     slice_2 = error_mesh.slice(gpus=3)
+     with pytest.raises(SupervisionError):
+         await slice_2.check.call()
+
+     # Newly created slice containing gpus=1 is healthy
+     slice_3 = error_mesh.slice(gpus=1)
+     check = await slice_3.check.call()
+     for _, item in check.items():
+         assert item == "this is a healthy check"
+
+     await pm.stop()
tests/test_alloc.py CHANGED
@@ -20,6 +20,6 @@ class TestAlloc(IsolatedAsyncioTestCase):
          cmd = "echo hello"
          allocator = ProcessAllocator(cmd)
          spec = AllocSpec(AllocConstraints(), replica=2)
-         alloc = await allocator.allocate(spec)
+         alloc = allocator.allocate(spec)
 
          print(alloc)