torchmonarch-nightly 2025.7.1__cp311-cp311-manylinux2014_x86_64.whl → 2025.7.25__cp311-cp311-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. monarch/__init__.py +13 -9
  2. monarch/_rust_bindings.so +0 -0
  3. monarch/{_monarch/selection → _src/actor}/__init__.py +3 -7
  4. monarch/_src/actor/actor_mesh.py +874 -0
  5. monarch/{allocator.py → _src/actor/allocator.py} +26 -17
  6. monarch/_src/actor/bootstrap_main.py +73 -0
  7. monarch/{code_sync.py → _src/actor/code_sync/__init__.py} +3 -1
  8. monarch/_src/actor/code_sync/auto_reload.py +223 -0
  9. monarch/_src/actor/debugger.py +565 -0
  10. monarch/_src/actor/endpoint.py +270 -0
  11. monarch/_src/actor/event_loop.py +97 -0
  12. monarch/_src/actor/future.py +100 -0
  13. monarch/{pdb_wrapper.py → _src/actor/pdb_wrapper.py} +47 -46
  14. monarch/{common/pickle_flatten.py → _src/actor/pickle.py} +26 -2
  15. monarch/_src/actor/proc_mesh.py +500 -0
  16. monarch/_src/actor/sync_state.py +18 -0
  17. monarch/{telemetry.py → _src/actor/telemetry/__init__.py} +1 -1
  18. monarch/_src/actor/telemetry/rust_span_tracing.py +159 -0
  19. monarch/_src/actor/tensor_engine_shim.py +56 -0
  20. monarch/_src/tensor_engine/rdma.py +180 -0
  21. monarch/_testing.py +3 -2
  22. monarch/actor/__init__.py +51 -0
  23. monarch/actor_mesh.py +6 -765
  24. monarch/bootstrap_main.py +8 -47
  25. monarch/common/client.py +1 -1
  26. monarch/common/controller_api.py +2 -1
  27. monarch/common/device_mesh.py +12 -2
  28. monarch/common/messages.py +12 -1
  29. monarch/common/recording.py +4 -3
  30. monarch/common/remote.py +135 -52
  31. monarch/common/tensor.py +2 -1
  32. monarch/controller/backend.py +2 -2
  33. monarch/controller/controller.py +2 -1
  34. monarch/controller/rust_backend/controller.py +2 -1
  35. monarch/fetch.py +3 -5
  36. monarch/mesh_controller.py +201 -139
  37. monarch/monarch_controller +0 -0
  38. monarch/opaque_module.py +4 -6
  39. monarch/opaque_object.py +3 -3
  40. monarch/proc_mesh.py +6 -309
  41. monarch/python_local_mesh.py +1 -1
  42. monarch/rust_backend_mesh.py +2 -1
  43. monarch/rust_local_mesh.py +4 -2
  44. monarch/sim_mesh.py +10 -19
  45. monarch/simulator/command_history.py +1 -1
  46. monarch/simulator/interface.py +2 -1
  47. monarch/simulator/mock_controller.py +1 -1
  48. monarch/simulator/simulator.py +1 -1
  49. monarch/tensor_engine/__init__.py +23 -0
  50. monarch/tensor_worker_main.py +3 -1
  51. monarch/tools/cli.py +3 -1
  52. monarch/tools/commands.py +95 -35
  53. monarch/tools/mesh_spec.py +55 -0
  54. monarch/tools/utils.py +38 -0
  55. monarch/worker/worker.py +1 -1
  56. monarch/world_mesh.py +2 -1
  57. monarch_supervisor/python_executable.py +6 -3
  58. tests/error_test_binary.py +48 -10
  59. tests/test_actor_error.py +370 -21
  60. tests/test_alloc.py +1 -1
  61. tests/test_allocator.py +373 -17
  62. tests/test_controller.py +2 -0
  63. tests/test_debugger.py +416 -0
  64. tests/test_env_before_cuda.py +162 -0
  65. tests/test_python_actors.py +184 -333
  66. tests/test_rdma.py +198 -0
  67. tests/test_remote_functions.py +40 -12
  68. tests/test_rust_backend.py +7 -5
  69. tests/test_sim_backend.py +1 -4
  70. tests/test_tensor_engine.py +55 -1
  71. {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/METADATA +6 -1
  72. {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/RECORD +80 -68
  73. torchmonarch_nightly-2025.7.25.dist-info/entry_points.txt +3 -0
  74. monarch/_monarch/hyperactor/__init__.py +0 -58
  75. monarch/_monarch/worker/debugger.py +0 -117
  76. monarch/_monarch/worker/logging.py +0 -107
  77. monarch/debugger.py +0 -379
  78. monarch/future.py +0 -76
  79. monarch/rdma.py +0 -162
  80. torchmonarch_nightly-2025.7.1.dist-info/entry_points.txt +0 -3
  81. /monarch/{_monarch/worker → _src}/__init__.py +0 -0
  82. /monarch/{common/_device_utils.py → _src/actor/device_utils.py} +0 -0
  83. /monarch/{common → _src/actor}/shape.py +0 -0
  84. /monarch/{_monarch → _src/tensor_engine}/__init__.py +0 -0
  85. {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/WHEEL +0 -0
  86. {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/licenses/LICENSE +0 -0
  87. {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/top_level.txt +0 -0
monarch/tools/commands.py CHANGED
@@ -7,7 +7,6 @@
7
7
  # pyre-strict
8
8
 
9
9
  import argparse
10
- import functools
11
10
  import inspect
12
11
  import logging
13
12
  import os
@@ -21,8 +20,8 @@ from monarch.tools.config import ( # @manual=//monarch/python/monarch/tools/con
21
20
  )
22
21
 
23
22
  from monarch.tools.mesh_spec import mesh_spec_from_metadata, ServerSpec
24
- from torchx.runner import Runner
25
- from torchx.specs import AppDef, AppDryRunInfo, AppState, CfgVal
23
+ from torchx.runner import Runner # @manual=//torchx/runner:lib_core
24
+ from torchx.specs import AppDef, AppDryRunInfo, AppState, CfgVal, parse_app_handle
26
25
  from torchx.specs.builders import parse_args
27
26
  from torchx.util.types import decode, decode_optional
28
27
 
@@ -84,14 +83,10 @@ def component_args_from_cli(
84
83
 
85
84
  def create(
86
85
  config: Config,
87
- component_fn: Optional[Callable[..., AppDef]] = None,
88
- ) -> Callable[..., Union[str, AppDryRunInfo]]:
86
+ appdef: AppDef,
87
+ ) -> Union[str, AppDryRunInfo]:
89
88
  """Creates a monarch server by submitting it as a job to the target scheduler.
90
89
 
91
- Note that this function returns a `Callable` that has to be called with the
92
- same arguments that one would call the `component_fn` to actually submit
93
- the job that runs the monarch server.
94
-
95
90
  Usage:
96
91
 
97
92
  .. doc-test::
@@ -99,6 +94,8 @@ def create(
99
94
  from monarch.tools.config import defaults
100
95
 
101
96
  config = defaults.config(scheduler="slurm")
97
+ appdef = defaults.component_fn(scheduler=config.scheduler)()
98
+
102
99
  config.scheduler_args.update(
103
100
  {
104
101
  "partition": "prod",
@@ -108,7 +105,7 @@ def create(
108
105
  )
109
106
  config.dryrun = True
110
107
 
111
- create(default_config)(host_type="gpu.medium", num_hosts=4)
108
+ create(config, appdef)
112
109
 
113
110
 
114
111
  Args:
@@ -120,33 +117,26 @@ def create(
120
117
  """
121
118
  scheduler: str = config.scheduler
122
119
  cfg: Mapping[str, CfgVal] = config.scheduler_args
123
- component: Callable[..., AppDef] = component_fn or defaults.component_fn(scheduler)
124
-
125
- @functools.wraps(component)
126
- def _run(*args: Any, **kwargs: Any) -> Union[str, AppDryRunInfo]:
127
- # for logging call-site context in application metadata
128
- os.environ["TORCHX_CONTEXT_NAME"] = os.getenv("TORCHX_CONTEXT_NAME", "monarch")
129
120
 
130
- appdef = component(*args, **kwargs)
121
+ # for logging call-site context in application metadata
122
+ os.environ["TORCHX_CONTEXT_NAME"] = os.getenv("TORCHX_CONTEXT_NAME", "monarch")
131
123
 
132
- with torchx_runner() as runner:
133
- info = runner.dryrun(appdef, scheduler, cfg, config.workspace)
134
-
135
- info_json_fmt = AppDryRunInfo(
136
- info.request,
137
- fmt=defaults.dryrun_info_formatter(info),
138
- )
139
- info_json_fmt._app = info._app
140
- info_json_fmt._cfg = info._cfg
141
- info_json_fmt._scheduler = info._scheduler
124
+ with torchx_runner() as runner:
125
+ info = runner.dryrun(appdef, scheduler, cfg, config.workspace)
142
126
 
143
- if config.dryrun:
144
- return info_json_fmt
145
- else:
146
- server_handle = runner.schedule(info)
147
- return server_handle
127
+ info_json_fmt = AppDryRunInfo(
128
+ info.request,
129
+ fmt=defaults.dryrun_info_formatter(info),
130
+ )
131
+ info_json_fmt._app = info._app
132
+ info_json_fmt._cfg = info._cfg
133
+ info_json_fmt._scheduler = info._scheduler
148
134
 
149
- return _run
135
+ if config.dryrun:
136
+ return info_json_fmt
137
+ else:
138
+ server_handle = runner.schedule(info)
139
+ return server_handle
150
140
 
151
141
 
152
142
  def info(server_handle: str) -> Optional[ServerSpec]:
@@ -183,14 +173,22 @@ def info(server_handle: str) -> Optional[ServerSpec]:
183
173
 
184
174
  mesh_specs.append(spec)
185
175
 
186
- return ServerSpec(name=appdef.name, state=status.state, meshes=mesh_specs)
176
+ scheduler, namespace, _ = parse_app_handle(server_handle)
177
+ return ServerSpec(
178
+ name=appdef.name,
179
+ state=status.state,
180
+ meshes=mesh_specs,
181
+ scheduler=scheduler,
182
+ namespace=namespace,
183
+ )
187
184
 
188
185
 
189
186
  _5_SECONDS = timedelta(seconds=5)
190
187
 
191
188
 
192
189
  async def server_ready(
193
- server_handle: str, check_interval: timedelta = _5_SECONDS
190
+ server_handle: str,
191
+ check_interval: timedelta = _5_SECONDS,
194
192
  ) -> Optional[ServerSpec]:
195
193
  """Waits until the server's job is in RUNNING state to returns the server spec.
196
194
  Returns `None` if the server does not exist.
@@ -236,6 +234,68 @@ async def server_ready(
236
234
  return server_spec
237
235
 
238
236
 
237
+ async def get_or_create(
238
+ name: str,
239
+ config: Config,
240
+ appdef: AppDef,
241
+ check_interval: timedelta = _5_SECONDS,
242
+ ) -> ServerSpec:
243
+ """Waits for the server called `name` in the scheduler specified in the `config`
244
+ to be ready (e.g. RUNNING). If the server is not found then this function creates one
245
+ per the `appdef` spec, and waits for the server to be ready before returning.
246
+
247
+ Usage:
248
+
249
+ .. code-block:: python
250
+
251
+ import getpass
252
+ from monarch.tools.config import defaults
253
+
254
+ USER = getpass.getuser()
255
+ config = defaults.config(scheduler)
256
+ appdef = defaults.component_fn(config.scheduler)()
257
+
258
+ server_info = await get_or_create(f"{USER}_monarch", config, appdef)
259
+ server_handle = server_info.server_handle
260
+
261
+ Returns: A `ServerSpec` containing information about either the existing or the newly
262
+ created server.
263
+
264
+ """
265
+ assert not config.dryrun, "dryrun is not supported for get_or_create(), for dryrun use the create() API instead"
266
+
267
+ server_handle = f"{config.scheduler}:///{name}"
268
+ server_info = await server_ready(server_handle, check_interval)
269
+
270
+ if not server_info or not server_info.is_running: # then create one
271
+ logger.info(
272
+ "no existing RUNNING server `%s` creating new one...", server_handle
273
+ )
274
+
275
+ # no dryrun (see assertion above) support so will always be a handle (str)
276
+ new_server_handle = str(create(config, appdef))
277
+
278
+ logger.info(f"created new `{new_server_handle}` waiting for it to be ready...")
279
+
280
+ server_info = await server_ready(new_server_handle, check_interval)
281
+
282
+ if not server_info:
283
+ raise RuntimeError(
284
+ f"the new server `{new_server_handle}` went missing (should never happen)"
285
+ )
286
+
287
+ if not server_info.is_running:
288
+ raise RuntimeError(
289
+ f"the new server `{new_server_handle}` has {server_info.state}"
290
+ )
291
+
292
+ logger.info(f"server `{new_server_handle}` is: {server_info.state}")
293
+ return server_info
294
+ else:
295
+ logger.info("found existing RUNNING server `%s`", server_handle)
296
+ return server_info
297
+
298
+
239
299
  def kill(server_handle: str) -> None:
240
300
  with torchx_runner() as runner:
241
301
  runner.cancel(server_handle)
@@ -11,6 +11,7 @@ from typing import Any, Optional
11
11
 
12
12
  from monarch.tools.network import get_sockaddr
13
13
  from torchx import specs
14
+ from torchx.specs.api import is_terminal
14
15
 
15
16
  DEFAULT_REMOTE_ALLOCATOR_PORT = 26600
16
17
 
@@ -122,11 +123,64 @@ class ServerSpec:
122
123
  name: str
123
124
  state: specs.AppState
124
125
  meshes: list[MeshSpec]
126
+ scheduler: str
127
+ namespace: str = ""
128
+
129
+ @property
130
+ def server_handle(self) -> str:
131
+ return f"{self.scheduler}://{self.namespace}/{self.name}"
125
132
 
126
133
  @property
127
134
  def is_running(self) -> bool:
128
135
  return self.state == specs.AppState.RUNNING
129
136
 
137
+ def host0(self, mesh_name: str) -> str:
138
+ """The hostname of the first node in the given mesh.
139
+ The return value of this method can be used to set `MASTER_ADDR` env var for torch.distributed.
140
+
141
+ NOTE: the state of this server must be RUNNING for this method to return a valid value.
142
+
143
+ Usage:
144
+
145
+ .. code-block:: python
146
+ from monarch.tools.commands import get_or_create
147
+
148
+ server_info = await get_or_create(...)
149
+ assert server_info.is_running
150
+
151
+ # allocate proc mesh -> create actor (code omitted for brevity)...
152
+
153
+ trainer_actor.call(
154
+ MASTER_ADDR=server_info.host0("trainer"),  # trainer mesh's 1st host
155
+ MASTER_PORT=29500,
156
+ ...
157
+ )
158
+
159
+ NOTE: The ordering of the hostnames is exactly the same as what comes back from the underlying
160
+ scheduler's `describe_job` or `list_*` API. Please find the exact semantics in the
161
+ respective scheduler's implementation in https://github.com/pytorch/torchx/tree/main/torchx/schedulers.
162
+ """
163
+ mesh_spec = self.get_mesh_spec(mesh_name)
164
+ if self.is_running:
165
+ # hostnames are only valid when the server is RUNNING
166
+ if not mesh_spec.hostnames:
167
+ raise RuntimeError(f"{self.server_handle} does not have any hosts")
168
+ return mesh_spec.hostnames[0]
169
+ elif self.state in [specs.AppState.SUBMITTED, specs.AppState.PENDING]:
170
+ raise RuntimeError(
171
+ f"{self.server_handle} is {self.state}."
172
+ f" Use `monarch.tools.commands.server_ready()` to wait for the server to be {specs.AppState.RUNNING}"
173
+ )
174
+ elif is_terminal(self.state):
175
+ raise RuntimeError(
176
+ f"{self.server_handle} is {self.state}."
177
+ " Use `monarch.tools.commands.get_or_create()` to create a new server"
178
+ )
179
+ else:
180
+ raise RuntimeError(
181
+ f"{self.server_handle} is in an invalid state: {self.state}. Please report this as a bug"
182
+ )
183
+
130
184
  def get_mesh_spec(self, mesh_name: str) -> MeshSpec:
131
185
  for mesh_spec in self.meshes:
132
186
  if mesh_spec.name == mesh_name:
@@ -152,6 +206,7 @@ class ServerSpec:
152
206
 
153
207
  return {
154
208
  "name": self.name,
209
+ "server_handle": self.server_handle,
155
210
  "state": self.state.name,
156
211
  "meshes": {
157
212
  mesh.name: {
monarch/tools/utils.py ADDED
@@ -0,0 +1,38 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # pyre-strict
8
+ import os
9
+ from typing import Optional
10
+
11
+
12
+ class conda:
13
+ """Conda related util functions."""
14
+
15
+ @staticmethod
16
+ def active_env_dir() -> Optional[str]:
17
+ """
18
+ Returns the currently active conda environment's directory.
19
+ `None` if run outside of a conda environment.
20
+ """
21
+ return os.getenv("CONDA_PREFIX")
22
+
23
+ @staticmethod
24
+ def active_env_name() -> Optional[str]:
25
+ """
26
+ Returns the currently active conda environment name.
27
+ `None` if run outside of a conda environment.
28
+ """
29
+ env_name = os.getenv("CONDA_DEFAULT_ENV")
30
+
31
+ if not env_name:
32
+ # conda envs activated with metaconda don't set CONDA_DEFAULT_ENV so
33
+ # fallback to CONDA_PREFIX which points to the path of the currently active conda environment
34
+ # e.g./home/$USER/.conda/envs/{env_name}
35
+ if env_dir := conda.active_env_dir():
36
+ env_name = os.path.basename(env_dir)
37
+
38
+ return env_name
monarch/worker/worker.py CHANGED
@@ -37,13 +37,13 @@ import torch.distributed
37
37
  import torch.fx
38
38
  import zmq
39
39
  import zmq.asyncio
40
+ from monarch._src.actor.shape import NDSlice
40
41
 
41
42
  from monarch.common import messages
42
43
  from monarch.common.function import ResolvableFunction
43
44
  from monarch.common.messages import DependentOnError, Dims
44
45
  from monarch.common.process_group import SingleControllerProcessGroupWrapper
45
46
  from monarch.common.reference import Ref, Referenceable
46
- from monarch.common.shape import NDSlice
47
47
  from monarch.common.tensor_factory import TensorFactory
48
48
  from monarch.common.tree import flatten, flattener
49
49
  from monarch_supervisor import get_message_queue, Letter
monarch/world_mesh.py CHANGED
@@ -8,10 +8,11 @@
8
8
 
9
9
  from typing import List
10
10
 
11
+ from monarch._src.actor.shape import NDSlice
12
+
11
13
  from monarch.common.client import Client
12
14
 
13
15
  from monarch.common.device_mesh import DeviceMesh
14
- from monarch.common.shape import NDSlice
15
16
 
16
17
  from monarch.controller.backend import ProcessBackend
17
18
 
@@ -11,7 +11,10 @@ import sys
11
11
  try:
12
12
  from __manifest__ import fbmake # noqa
13
13
 
14
- IN_PAR = True
14
+ # simply checking for the existence of __manifest__ is not enough to tell if we are in a PAR
15
+ # because monarch wheels include a dummy __manifest__ (see fbcode//monarch/python/monarch/session/meta/__manifest__.py)
16
+ # so that we can use libfb programmatically. Hence additionally check if the `par_style` key is not null/empty
17
+ IN_PAR = bool(fbmake.get("par_style"))
15
18
  except ImportError:
16
19
  IN_PAR = False
17
20
 
@@ -26,8 +29,8 @@ if IN_PAR:
26
29
  PYTHON_EXECUTABLE = os.environ["FB_XAR_INVOKED_NAME"]
27
30
  else:
28
31
  try:
29
- with importlib.resources.path(
30
- "monarch_tensor_worker_env", "worker_env"
32
+ with importlib.resources.as_file(
33
+ importlib.resources.files("monarch_tensor_worker_env") / "worker_env"
31
34
  ) as path:
32
35
  if not path.exists():
33
36
  raise ImportError()
@@ -13,8 +13,7 @@ from monarch._rust_bindings.monarch_extension.blocking import blocking_function
13
13
 
14
14
  from monarch._rust_bindings.monarch_extension.panic import panicking_function
15
15
 
16
- from monarch.actor_mesh import Actor, endpoint, send
17
- from monarch.proc_mesh import proc_mesh
16
+ from monarch.actor import Actor, endpoint, proc_mesh, send
18
17
 
19
18
 
20
19
  class ErrorActor(Actor):
@@ -48,6 +47,13 @@ class ErrorActor(Actor):
48
47
  await asyncio.sleep(0.1)
49
48
  raise RuntimeError("oh noez")
50
49
 
50
+ @endpoint
51
+ async def get_pid(self) -> int:
52
+ """Endpoint that returns the process PID."""
53
+ import os
54
+
55
+ return os.getpid()
56
+
51
57
 
52
58
  class ErrorActorSync(Actor):
53
59
  """An actor that has endpoints cause segfaults."""
@@ -79,8 +85,7 @@ def _run_error_test_sync(num_procs, sync_endpoint, endpoint_name):
79
85
  error_actor = proc.spawn("error_actor", actor_class).get()
80
86
 
81
87
  # This output is checked in the test to make sure that the process actually got here
82
- print("I actually ran")
83
- sys.stdout.flush()
88
+ print("Started function error_test", flush=True)
84
89
 
85
90
  if endpoint_name == "cause_segfault":
86
91
  endpoint = error_actor.cause_segfault
@@ -110,8 +115,7 @@ def _run_error_test(num_procs, sync_endpoint, endpoint_name):
110
115
  error_actor = await proc.spawn("error_actor", actor_class)
111
116
 
112
117
  # This output is checked in the test to make sure that the process actually got here
113
- print("I actually ran")
114
- sys.stdout.flush()
118
+ print("Started function error_test", flush=True)
115
119
 
116
120
  if endpoint_name == "cause_segfault":
117
121
  endpoint = error_actor.cause_segfault
@@ -153,15 +157,13 @@ def error_endpoint(num_procs, sync_test_impl, sync_endpoint, endpoint_name):
153
157
 
154
158
  @main.command("error-bootstrap")
155
159
  def error_bootstrap():
156
- print("I actually ran")
157
- sys.stdout.flush()
160
+ print("Started function error_bootstrap", flush=True)
158
161
 
159
162
  proc_mesh(gpus=4, env={"MONARCH_ERROR_DURING_BOOTSTRAP_FOR_TESTING": "1"}).get()
160
163
 
161
164
 
162
165
  async def _error_unmonitored():
163
- print("I actually ran")
164
- sys.stdout.flush()
166
+ print("Started function _error_unmonitored", flush=True)
165
167
 
166
168
  proc = await proc_mesh(gpus=1)
167
169
  actor = await proc.spawn("error_actor", ErrorActor)
@@ -204,5 +206,41 @@ def error_unmonitored():
204
206
  asyncio.run(_error_unmonitored())
205
207
 
206
208
 
209
+ async def _error_cleanup():
210
+ """Test function that spawns an 8 process procmesh and calls an endpoint that returns a normal exception."""
211
+ print("Started function _error_cleanup() for parent process", flush=True)
212
+
213
+ # Spawn an 8 process procmesh
214
+ proc = await proc_mesh(gpus=8)
215
+ error_actor = await proc.spawn("error_actor", ErrorActor)
216
+
217
+ print("Procmesh spawned, collecting child PIDs from actors", flush=True)
218
+
219
+ # Get PIDs from all actor processes
220
+ try:
221
+ # Call get_pid endpoint on all actors to collect their PIDs
222
+ pids = await error_actor.get_pid.call()
223
+ child_pids = [str(pid) for _, pid in pids]
224
+ print(f"CHILD_PIDS: {','.join(child_pids)}", flush=True)
225
+ except Exception as e:
226
+ print(f"Error getting child PIDs from actors: {e}", flush=True)
227
+
228
+ print("About to call endpoint that raises exception", flush=True)
229
+
230
+ # Call an endpoint that raises a normal exception
231
+ try:
232
+ await error_actor.await_then_error.call()
233
+ except Exception as e:
234
+ print(f"Expected exception caught: {e}", flush=True)
235
+ # Re-raise to cause the process to exit with non-zero code
236
+ raise
237
+
238
+
239
+ @main.command("error-cleanup")
240
+ def error_cleanup():
241
+ """Command that spawns an 8 process procmesh and calls an endpoint that returns a normal exception."""
242
+ asyncio.run(_error_cleanup())
243
+
244
+
207
245
  if __name__ == "__main__":
208
246
  main()