torchmonarch-nightly 2025.6.30__cp312-cp312-manylinux2014_x86_64.whl → 2025.7.25__cp312-cp312-manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- monarch/__init__.py +13 -9
- monarch/_rust_bindings.so +0 -0
- monarch/{_monarch/selection → _src/actor}/__init__.py +3 -7
- monarch/_src/actor/actor_mesh.py +874 -0
- monarch/{allocator.py → _src/actor/allocator.py} +26 -17
- monarch/_src/actor/bootstrap_main.py +73 -0
- monarch/{code_sync.py → _src/actor/code_sync/__init__.py} +3 -1
- monarch/_src/actor/code_sync/auto_reload.py +223 -0
- monarch/_src/actor/debugger.py +565 -0
- monarch/_src/actor/endpoint.py +270 -0
- monarch/_src/actor/event_loop.py +97 -0
- monarch/_src/actor/future.py +100 -0
- monarch/{pdb_wrapper.py → _src/actor/pdb_wrapper.py} +47 -46
- monarch/{common/pickle_flatten.py → _src/actor/pickle.py} +26 -2
- monarch/_src/actor/proc_mesh.py +500 -0
- monarch/_src/actor/sync_state.py +18 -0
- monarch/{telemetry.py → _src/actor/telemetry/__init__.py} +1 -1
- monarch/_src/actor/telemetry/rust_span_tracing.py +159 -0
- monarch/_src/actor/tensor_engine_shim.py +56 -0
- monarch/_src/tensor_engine/rdma.py +180 -0
- monarch/_testing.py +3 -2
- monarch/actor/__init__.py +51 -0
- monarch/actor_mesh.py +6 -752
- monarch/bootstrap_main.py +8 -47
- monarch/common/client.py +1 -1
- monarch/common/controller_api.py +2 -1
- monarch/common/device_mesh.py +12 -2
- monarch/common/messages.py +12 -1
- monarch/common/recording.py +4 -3
- monarch/common/remote.py +135 -52
- monarch/common/tensor.py +2 -1
- monarch/controller/backend.py +2 -2
- monarch/controller/controller.py +2 -1
- monarch/controller/rust_backend/controller.py +2 -1
- monarch/fetch.py +3 -5
- monarch/mesh_controller.py +201 -139
- monarch/monarch_controller +0 -0
- monarch/opaque_module.py +4 -6
- monarch/opaque_object.py +3 -3
- monarch/proc_mesh.py +6 -309
- monarch/python_local_mesh.py +1 -1
- monarch/rust_backend_mesh.py +2 -1
- monarch/rust_local_mesh.py +4 -2
- monarch/sim_mesh.py +10 -19
- monarch/simulator/command_history.py +1 -1
- monarch/simulator/interface.py +2 -1
- monarch/simulator/mock_controller.py +1 -1
- monarch/simulator/simulator.py +1 -1
- monarch/tensor_engine/__init__.py +23 -0
- monarch/tensor_worker_main.py +3 -1
- monarch/tools/cli.py +3 -1
- monarch/tools/commands.py +95 -35
- monarch/tools/mesh_spec.py +55 -0
- monarch/tools/utils.py +38 -0
- monarch/worker/worker.py +1 -1
- monarch/world_mesh.py +2 -1
- monarch_supervisor/python_executable.py +6 -3
- tests/error_test_binary.py +75 -9
- tests/test_actor_error.py +370 -21
- tests/test_alloc.py +1 -1
- tests/test_allocator.py +373 -17
- tests/test_controller.py +2 -0
- tests/test_debugger.py +416 -0
- tests/test_env_before_cuda.py +162 -0
- tests/test_python_actors.py +184 -332
- tests/test_rdma.py +198 -0
- tests/test_remote_functions.py +40 -12
- tests/test_rust_backend.py +7 -5
- tests/test_sim_backend.py +1 -4
- tests/test_tensor_engine.py +55 -1
- {torchmonarch_nightly-2025.6.30.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/METADATA +6 -1
- {torchmonarch_nightly-2025.6.30.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/RECORD +80 -68
- torchmonarch_nightly-2025.7.25.dist-info/entry_points.txt +3 -0
- monarch/_monarch/hyperactor/__init__.py +0 -58
- monarch/_monarch/worker/debugger.py +0 -117
- monarch/_monarch/worker/logging.py +0 -107
- monarch/debugger.py +0 -379
- monarch/future.py +0 -76
- monarch/rdma.py +0 -162
- torchmonarch_nightly-2025.6.30.dist-info/entry_points.txt +0 -3
- /monarch/{_monarch/worker → _src}/__init__.py +0 -0
- /monarch/{common/_device_utils.py → _src/actor/device_utils.py} +0 -0
- /monarch/{common → _src/actor}/shape.py +0 -0
- /monarch/{_monarch → _src/tensor_engine}/__init__.py +0 -0
- {torchmonarch_nightly-2025.6.30.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/WHEEL +0 -0
- {torchmonarch_nightly-2025.6.30.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/licenses/LICENSE +0 -0
- {torchmonarch_nightly-2025.6.30.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/top_level.txt +0 -0
monarch/tools/commands.py
CHANGED
@@ -7,7 +7,6 @@
|
|
7
7
|
# pyre-strict
|
8
8
|
|
9
9
|
import argparse
|
10
|
-
import functools
|
11
10
|
import inspect
|
12
11
|
import logging
|
13
12
|
import os
|
@@ -21,8 +20,8 @@ from monarch.tools.config import ( # @manual=//monarch/python/monarch/tools/con
|
|
21
20
|
)
|
22
21
|
|
23
22
|
from monarch.tools.mesh_spec import mesh_spec_from_metadata, ServerSpec
|
24
|
-
from torchx.runner import Runner
|
25
|
-
from torchx.specs import AppDef, AppDryRunInfo, AppState, CfgVal
|
23
|
+
from torchx.runner import Runner # @manual=//torchx/runner:lib_core
|
24
|
+
from torchx.specs import AppDef, AppDryRunInfo, AppState, CfgVal, parse_app_handle
|
26
25
|
from torchx.specs.builders import parse_args
|
27
26
|
from torchx.util.types import decode, decode_optional
|
28
27
|
|
@@ -84,14 +83,10 @@ def component_args_from_cli(
|
|
84
83
|
|
85
84
|
def create(
|
86
85
|
config: Config,
|
87
|
-
|
88
|
-
) ->
|
86
|
+
appdef: AppDef,
|
87
|
+
) -> Union[str, AppDryRunInfo]:
|
89
88
|
"""Creates a monarch server by submitting it as a job to the target scheduler.
|
90
89
|
|
91
|
-
Note that this function returns a `Callable` that has to be called with the
|
92
|
-
same arguments that one would call the `component_fn` to actually submit
|
93
|
-
the job that runs the monarch server.
|
94
|
-
|
95
90
|
Usage:
|
96
91
|
|
97
92
|
.. doc-test::
|
@@ -99,6 +94,8 @@ def create(
|
|
99
94
|
from monarch.tools.config import defaults
|
100
95
|
|
101
96
|
config = defaults.config(scheduler="slurm")
|
97
|
+
appdef = defaults.component_fn(scheduler=config.scheduler)()
|
98
|
+
|
102
99
|
config.scheduler_args.update(
|
103
100
|
{
|
104
101
|
"partition": "prod",
|
@@ -108,7 +105,7 @@ def create(
|
|
108
105
|
)
|
109
106
|
config.dryrun = True
|
110
107
|
|
111
|
-
create(
|
108
|
+
create(config, appdef)
|
112
109
|
|
113
110
|
|
114
111
|
Args:
|
@@ -120,33 +117,26 @@ def create(
|
|
120
117
|
"""
|
121
118
|
scheduler: str = config.scheduler
|
122
119
|
cfg: Mapping[str, CfgVal] = config.scheduler_args
|
123
|
-
component: Callable[..., AppDef] = component_fn or defaults.component_fn(scheduler)
|
124
|
-
|
125
|
-
@functools.wraps(component)
|
126
|
-
def _run(*args: Any, **kwargs: Any) -> Union[str, AppDryRunInfo]:
|
127
|
-
# for logging call-site context in application metadata
|
128
|
-
os.environ["TORCHX_CONTEXT_NAME"] = os.getenv("TORCHX_CONTEXT_NAME", "monarch")
|
129
120
|
|
130
|
-
|
121
|
+
# for logging call-site context in application metadata
|
122
|
+
os.environ["TORCHX_CONTEXT_NAME"] = os.getenv("TORCHX_CONTEXT_NAME", "monarch")
|
131
123
|
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
info_json_fmt = AppDryRunInfo(
|
136
|
-
info.request,
|
137
|
-
fmt=defaults.dryrun_info_formatter(info),
|
138
|
-
)
|
139
|
-
info_json_fmt._app = info._app
|
140
|
-
info_json_fmt._cfg = info._cfg
|
141
|
-
info_json_fmt._scheduler = info._scheduler
|
124
|
+
with torchx_runner() as runner:
|
125
|
+
info = runner.dryrun(appdef, scheduler, cfg, config.workspace)
|
142
126
|
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
127
|
+
info_json_fmt = AppDryRunInfo(
|
128
|
+
info.request,
|
129
|
+
fmt=defaults.dryrun_info_formatter(info),
|
130
|
+
)
|
131
|
+
info_json_fmt._app = info._app
|
132
|
+
info_json_fmt._cfg = info._cfg
|
133
|
+
info_json_fmt._scheduler = info._scheduler
|
148
134
|
|
149
|
-
|
135
|
+
if config.dryrun:
|
136
|
+
return info_json_fmt
|
137
|
+
else:
|
138
|
+
server_handle = runner.schedule(info)
|
139
|
+
return server_handle
|
150
140
|
|
151
141
|
|
152
142
|
def info(server_handle: str) -> Optional[ServerSpec]:
|
@@ -183,14 +173,22 @@ def info(server_handle: str) -> Optional[ServerSpec]:
|
|
183
173
|
|
184
174
|
mesh_specs.append(spec)
|
185
175
|
|
186
|
-
|
176
|
+
scheduler, namespace, _ = parse_app_handle(server_handle)
|
177
|
+
return ServerSpec(
|
178
|
+
name=appdef.name,
|
179
|
+
state=status.state,
|
180
|
+
meshes=mesh_specs,
|
181
|
+
scheduler=scheduler,
|
182
|
+
namespace=namespace,
|
183
|
+
)
|
187
184
|
|
188
185
|
|
189
186
|
_5_SECONDS = timedelta(seconds=5)
|
190
187
|
|
191
188
|
|
192
189
|
async def server_ready(
|
193
|
-
server_handle: str,
|
190
|
+
server_handle: str,
|
191
|
+
check_interval: timedelta = _5_SECONDS,
|
194
192
|
) -> Optional[ServerSpec]:
|
195
193
|
"""Waits until the server's job is in RUNNING state, then returns the server spec.
|
196
194
|
Returns `None` if the server does not exist.
|
@@ -236,6 +234,68 @@ async def server_ready(
|
|
236
234
|
return server_spec
|
237
235
|
|
238
236
|
|
237
|
+
async def get_or_create(
|
238
|
+
name: str,
|
239
|
+
config: Config,
|
240
|
+
appdef: AppDef,
|
241
|
+
check_interval: timedelta = _5_SECONDS,
|
242
|
+
) -> ServerSpec:
|
243
|
+
"""Waits for the server called `name` in the scheduler specified in the `config`
|
244
|
+
to be ready (e.g. RUNNING). If the server is not found then this function creates one
|
245
|
+
per the `appdef` spec, and waits for the server to be ready before returning.
|
246
|
+
|
247
|
+
Usage:
|
248
|
+
|
249
|
+
.. code-block:: python
|
250
|
+
|
251
|
+
import getpass
|
252
|
+
from monarch.tools.config import defaults
|
253
|
+
|
254
|
+
USER = getpass.getuser()
|
255
|
+
config = defaults.config(scheduler)
|
256
|
+
appdef = defaults.component_fn(config.scheduler)()
|
257
|
+
|
258
|
+
server_handle = get_or_create(f"{USER}_monarch", config, appdef)
|
259
|
+
server_info = info(server_handle)
|
260
|
+
|
261
|
+
Returns: A `ServerSpec` containing information about either the existing or the newly
|
262
|
+
created server.
|
263
|
+
|
264
|
+
"""
|
265
|
+
assert not config.dryrun, "dryrun is not supported for get_or_create(), for dryrun use the create() API instead"
|
266
|
+
|
267
|
+
server_handle = f"{config.scheduler}:///{name}"
|
268
|
+
server_info = await server_ready(server_handle, check_interval)
|
269
|
+
|
270
|
+
if not server_info or not server_info.is_running: # then create one
|
271
|
+
logger.info(
|
272
|
+
"no existing RUNNING server `%s` creating new one...", server_handle
|
273
|
+
)
|
274
|
+
|
275
|
+
# no dryrun (see assertion above) support so will always be a handle (str)
|
276
|
+
new_server_handle = str(create(config, appdef))
|
277
|
+
|
278
|
+
logger.info(f"created new `{new_server_handle}` waiting for it to be ready...")
|
279
|
+
|
280
|
+
server_info = await server_ready(new_server_handle, check_interval)
|
281
|
+
|
282
|
+
if not server_info:
|
283
|
+
raise RuntimeError(
|
284
|
+
f"the new server `{new_server_handle}` went missing (should never happen)"
|
285
|
+
)
|
286
|
+
|
287
|
+
if not server_info.is_running:
|
288
|
+
raise RuntimeError(
|
289
|
+
f"the new server `{new_server_handle}` has {server_info.state}"
|
290
|
+
)
|
291
|
+
|
292
|
+
logger.info(f"server `{new_server_handle}` is: {server_info.state}")
|
293
|
+
return server_info
|
294
|
+
else:
|
295
|
+
logger.info("found existing RUNNING server `%s`", server_handle)
|
296
|
+
return server_info
|
297
|
+
|
298
|
+
|
239
299
|
def kill(server_handle: str) -> None:
|
240
300
|
with torchx_runner() as runner:
|
241
301
|
runner.cancel(server_handle)
|
monarch/tools/mesh_spec.py
CHANGED
@@ -11,6 +11,7 @@ from typing import Any, Optional
|
|
11
11
|
|
12
12
|
from monarch.tools.network import get_sockaddr
|
13
13
|
from torchx import specs
|
14
|
+
from torchx.specs.api import is_terminal
|
14
15
|
|
15
16
|
DEFAULT_REMOTE_ALLOCATOR_PORT = 26600
|
16
17
|
|
@@ -122,11 +123,64 @@ class ServerSpec:
|
|
122
123
|
name: str
|
123
124
|
state: specs.AppState
|
124
125
|
meshes: list[MeshSpec]
|
126
|
+
scheduler: str
|
127
|
+
namespace: str = ""
|
128
|
+
|
129
|
+
@property
|
130
|
+
def server_handle(self) -> str:
|
131
|
+
return f"{self.scheduler}://{self.namespace}/{self.name}"
|
125
132
|
|
126
133
|
@property
|
127
134
|
def is_running(self) -> bool:
|
128
135
|
return self.state == specs.AppState.RUNNING
|
129
136
|
|
137
|
+
def host0(self, mesh_name: str) -> str:
|
138
|
+
"""The hostname of the first node in the given mesh.
|
139
|
+
The return value of this method can be used to set `MASTER_ADDR` env var for torch.distributed.
|
140
|
+
|
141
|
+
NOTE: the state of this server must be RUNNING for this method to return a valid value.
|
142
|
+
|
143
|
+
Usage:
|
144
|
+
|
145
|
+
.. code-block:: python
|
146
|
+
from monarch.tools.commands import get_or_create
|
147
|
+
|
148
|
+
server_info = await get_or_create(...)
|
149
|
+
assert server_info.is_running
|
150
|
+
|
151
|
+
# allocate proc mesh -> create actor (code omitted for brevity)...
|
152
|
+
|
153
|
+
trainer_actor.call(
|
154
|
+
MASTER_ADDR=server_info.host0("trainer"),  # trainer mesh's 1st host
|
155
|
+
MASTER_PORT=29500,
|
156
|
+
...
|
157
|
+
)
|
158
|
+
|
159
|
+
NOTE: The ordering of the hostnames is exactly the same as what comes back from the underlying
|
160
|
+
scheduler's `describe_job` or `list_*` API. Please find the exact semantics in the
|
161
|
+
respective scheduler's implementation in https://github.com/pytorch/torchx/tree/main/torchx/schedulers.
|
162
|
+
"""
|
163
|
+
mesh_spec = self.get_mesh_spec(mesh_name)
|
164
|
+
if self.is_running:
|
165
|
+
# hostnames are only valid when the server is RUNNING
|
166
|
+
if not mesh_spec.hostnames:
|
167
|
+
raise RuntimeError(f"{self.server_handle} does not have any hosts")
|
168
|
+
return mesh_spec.hostnames[0]
|
169
|
+
elif self.state in [specs.AppState.SUBMITTED, specs.AppState.PENDING]:
|
170
|
+
raise RuntimeError(
|
171
|
+
f"{self.server_handle} is {self.state}."
|
172
|
+
f" Use `monarch.tools.commands.server_ready()` to wait for the server to be {specs.AppState.RUNNING}"
|
173
|
+
)
|
174
|
+
elif is_terminal(self.state):
|
175
|
+
raise RuntimeError(
|
176
|
+
f"{self.server_handle} is {self.state}."
|
177
|
+
" Use `monarch.tools.commands.get_or_create()` to create a new server"
|
178
|
+
)
|
179
|
+
else:
|
180
|
+
raise RuntimeError(
|
181
|
+
f"{self.server_handle} is in an invalid state: {self.state}. Please report this as a bug"
|
182
|
+
)
|
183
|
+
|
130
184
|
def get_mesh_spec(self, mesh_name: str) -> MeshSpec:
|
131
185
|
for mesh_spec in self.meshes:
|
132
186
|
if mesh_spec.name == mesh_name:
|
@@ -152,6 +206,7 @@ class ServerSpec:
|
|
152
206
|
|
153
207
|
return {
|
154
208
|
"name": self.name,
|
209
|
+
"server_handle": self.server_handle,
|
155
210
|
"state": self.state.name,
|
156
211
|
"meshes": {
|
157
212
|
mesh.name: {
|
monarch/tools/utils.py
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
2
|
+
# All rights reserved.
|
3
|
+
#
|
4
|
+
# This source code is licensed under the BSD-style license found in the
|
5
|
+
# LICENSE file in the root directory of this source tree.
|
6
|
+
|
7
|
+
# pyre-strict
|
8
|
+
import os
|
9
|
+
from typing import Optional
|
10
|
+
|
11
|
+
|
12
|
+
class conda:
|
13
|
+
"""Conda related util functions."""
|
14
|
+
|
15
|
+
@staticmethod
|
16
|
+
def active_env_dir() -> Optional[str]:
|
17
|
+
"""
|
18
|
+
Returns the currently active conda environment's directory.
|
19
|
+
`None` if run outside of a conda environment.
|
20
|
+
"""
|
21
|
+
return os.getenv("CONDA_PREFIX")
|
22
|
+
|
23
|
+
@staticmethod
|
24
|
+
def active_env_name() -> Optional[str]:
|
25
|
+
"""
|
26
|
+
Returns the currently active conda environment name.
|
27
|
+
`None` if run outside of a conda environment.
|
28
|
+
"""
|
29
|
+
env_name = os.getenv("CONDA_DEFAULT_ENV")
|
30
|
+
|
31
|
+
if not env_name:
|
32
|
+
# conda envs activated with metaconda don't set CONDA_DEFAULT_ENV so
|
33
|
+
# fallback to CONDA_PREFIX which points to the path of the currently active conda environment
|
34
|
+
# e.g. /home/$USER/.conda/envs/{env_name}
|
35
|
+
if env_dir := conda.active_env_dir():
|
36
|
+
env_name = os.path.basename(env_dir)
|
37
|
+
|
38
|
+
return env_name
|
monarch/worker/worker.py
CHANGED
@@ -37,13 +37,13 @@ import torch.distributed
|
|
37
37
|
import torch.fx
|
38
38
|
import zmq
|
39
39
|
import zmq.asyncio
|
40
|
+
from monarch._src.actor.shape import NDSlice
|
40
41
|
|
41
42
|
from monarch.common import messages
|
42
43
|
from monarch.common.function import ResolvableFunction
|
43
44
|
from monarch.common.messages import DependentOnError, Dims
|
44
45
|
from monarch.common.process_group import SingleControllerProcessGroupWrapper
|
45
46
|
from monarch.common.reference import Ref, Referenceable
|
46
|
-
from monarch.common.shape import NDSlice
|
47
47
|
from monarch.common.tensor_factory import TensorFactory
|
48
48
|
from monarch.common.tree import flatten, flattener
|
49
49
|
from monarch_supervisor import get_message_queue, Letter
|
monarch/world_mesh.py
CHANGED
@@ -8,10 +8,11 @@
|
|
8
8
|
|
9
9
|
from typing import List
|
10
10
|
|
11
|
+
from monarch._src.actor.shape import NDSlice
|
12
|
+
|
11
13
|
from monarch.common.client import Client
|
12
14
|
|
13
15
|
from monarch.common.device_mesh import DeviceMesh
|
14
|
-
from monarch.common.shape import NDSlice
|
15
16
|
|
16
17
|
from monarch.controller.backend import ProcessBackend
|
17
18
|
|
@@ -11,7 +11,10 @@ import sys
|
|
11
11
|
try:
|
12
12
|
from __manifest__ import fbmake # noqa
|
13
13
|
|
14
|
-
|
14
|
+
# simply checking for the existence of __manifest__ is not enough to tell if we are in a PAR
|
15
|
+
# because monarch wheels include a dummy __manifest__ (see fbcode//monarch/python/monarch/session/meta/__manifest__.py)
|
16
|
+
# so that we can use libfb programmatically. Hence additionally check if the `par_style` key is not null/empty
|
17
|
+
IN_PAR = bool(fbmake.get("par_style"))
|
15
18
|
except ImportError:
|
16
19
|
IN_PAR = False
|
17
20
|
|
@@ -26,8 +29,8 @@ if IN_PAR:
|
|
26
29
|
PYTHON_EXECUTABLE = os.environ["FB_XAR_INVOKED_NAME"]
|
27
30
|
else:
|
28
31
|
try:
|
29
|
-
with importlib.resources.
|
30
|
-
"monarch_tensor_worker_env"
|
32
|
+
with importlib.resources.as_file(
|
33
|
+
importlib.resources.files("monarch_tensor_worker_env") / "worker_env"
|
31
34
|
) as path:
|
32
35
|
if not path.exists():
|
33
36
|
raise ImportError()
|
tests/error_test_binary.py
CHANGED
@@ -9,11 +9,11 @@ import ctypes
|
|
9
9
|
import sys
|
10
10
|
|
11
11
|
import click
|
12
|
+
from monarch._rust_bindings.monarch_extension.blocking import blocking_function
|
12
13
|
|
13
14
|
from monarch._rust_bindings.monarch_extension.panic import panicking_function
|
14
15
|
|
15
|
-
from monarch.
|
16
|
-
from monarch.proc_mesh import proc_mesh
|
16
|
+
from monarch.actor import Actor, endpoint, proc_mesh, send
|
17
17
|
|
18
18
|
|
19
19
|
class ErrorActor(Actor):
|
@@ -36,12 +36,24 @@ class ErrorActor(Actor):
|
|
36
36
|
"""Endpoint that calls a Rust function that panics."""
|
37
37
|
panicking_function()
|
38
38
|
|
39
|
+
@endpoint
|
40
|
+
async def cause_stuck(self) -> None:
|
41
|
+
"""Endpoint that causes the process to hang indefinitely."""
|
42
|
+
blocking_function()
|
43
|
+
|
39
44
|
@endpoint
|
40
45
|
async def await_then_error(self) -> None:
|
41
46
|
await asyncio.sleep(0.1)
|
42
47
|
await asyncio.sleep(0.1)
|
43
48
|
raise RuntimeError("oh noez")
|
44
49
|
|
50
|
+
@endpoint
|
51
|
+
async def get_pid(self) -> int:
|
52
|
+
"""Endpoint that returns the process PID."""
|
53
|
+
import os
|
54
|
+
|
55
|
+
return os.getpid()
|
56
|
+
|
45
57
|
|
46
58
|
class ErrorActorSync(Actor):
|
47
59
|
"""An actor that has endpoints cause segfaults."""
|
@@ -73,8 +85,7 @@ def _run_error_test_sync(num_procs, sync_endpoint, endpoint_name):
|
|
73
85
|
error_actor = proc.spawn("error_actor", actor_class).get()
|
74
86
|
|
75
87
|
# This output is checked in the test to make sure that the process actually got here
|
76
|
-
print("
|
77
|
-
sys.stdout.flush()
|
88
|
+
print("Started function error_test", flush=True)
|
78
89
|
|
79
90
|
if endpoint_name == "cause_segfault":
|
80
91
|
endpoint = error_actor.cause_segfault
|
@@ -104,8 +115,7 @@ def _run_error_test(num_procs, sync_endpoint, endpoint_name):
|
|
104
115
|
error_actor = await proc.spawn("error_actor", actor_class)
|
105
116
|
|
106
117
|
# This output is checked in the test to make sure that the process actually got here
|
107
|
-
print("
|
108
|
-
sys.stdout.flush()
|
118
|
+
print("Started function error_test", flush=True)
|
109
119
|
|
110
120
|
if endpoint_name == "cause_segfault":
|
111
121
|
endpoint = error_actor.cause_segfault
|
@@ -147,12 +157,30 @@ def error_endpoint(num_procs, sync_test_impl, sync_endpoint, endpoint_name):
|
|
147
157
|
|
148
158
|
@main.command("error-bootstrap")
|
149
159
|
def error_bootstrap():
|
150
|
-
print("
|
151
|
-
sys.stdout.flush()
|
160
|
+
print("Started function error_bootstrap", flush=True)
|
152
161
|
|
153
162
|
proc_mesh(gpus=4, env={"MONARCH_ERROR_DURING_BOOTSTRAP_FOR_TESTING": "1"}).get()
|
154
163
|
|
155
164
|
|
165
|
+
async def _error_unmonitored():
|
166
|
+
print("Started function _error_unmonitored", flush=True)
|
167
|
+
|
168
|
+
proc = await proc_mesh(gpus=1)
|
169
|
+
actor = await proc.spawn("error_actor", ErrorActor)
|
170
|
+
|
171
|
+
# fire and forget
|
172
|
+
send(actor.await_then_error, (), {}, None, "all")
|
173
|
+
|
174
|
+
# Wait. Eventually a supervision event will get propagated and the process
|
175
|
+
# will exit.
|
176
|
+
#
|
177
|
+
# If an event is not delivered, the test will time out before this sleep
|
178
|
+
# finishes.
|
179
|
+
await asyncio.sleep(300)
|
180
|
+
|
181
|
+
|
182
|
+
"""
|
183
|
+
TODO: This test should be enabled when stop() is fully implemented.
|
156
184
|
async def _error_unmonitored():
|
157
185
|
print("I actually ran")
|
158
186
|
sys.stdout.flush()
|
@@ -161,7 +189,8 @@ async def _error_unmonitored():
|
|
161
189
|
actor = await proc.spawn("error_actor", ErrorActor)
|
162
190
|
|
163
191
|
# fire and forget
|
164
|
-
send(actor.
|
192
|
+
send(actor.cause_stuck, (), {}, None, "all")
|
193
|
+
proc_mesh.stop()
|
165
194
|
|
166
195
|
# Wait. Eventually a supervision event will get propagated and the process
|
167
196
|
# will exit.
|
@@ -169,6 +198,7 @@ async def _error_unmonitored():
|
|
169
198
|
# If an event is not delivered, the test will time out before this sleep
|
170
199
|
# finishes.
|
171
200
|
await asyncio.sleep(300)
|
201
|
+
"""
|
172
202
|
|
173
203
|
|
174
204
|
@main.command("error-unmonitored")
|
@@ -176,5 +206,41 @@ def error_unmonitored():
|
|
176
206
|
asyncio.run(_error_unmonitored())
|
177
207
|
|
178
208
|
|
209
|
+
async def _error_cleanup():
|
210
|
+
"""Test function that spawns an 8 process procmesh and calls an endpoint that returns a normal exception."""
|
211
|
+
print("Started function _error_cleanup() for parent process", flush=True)
|
212
|
+
|
213
|
+
# Spawn an 8 process procmesh
|
214
|
+
proc = await proc_mesh(gpus=8)
|
215
|
+
error_actor = await proc.spawn("error_actor", ErrorActor)
|
216
|
+
|
217
|
+
print("Procmesh spawned, collecting child PIDs from actors", flush=True)
|
218
|
+
|
219
|
+
# Get PIDs from all actor processes
|
220
|
+
try:
|
221
|
+
# Call get_pid endpoint on all actors to collect their PIDs
|
222
|
+
pids = await error_actor.get_pid.call()
|
223
|
+
child_pids = [str(pid) for _, pid in pids]
|
224
|
+
print(f"CHILD_PIDS: {','.join(child_pids)}", flush=True)
|
225
|
+
except Exception as e:
|
226
|
+
print(f"Error getting child PIDs from actors: {e}", flush=True)
|
227
|
+
|
228
|
+
print("About to call endpoint that raises exception", flush=True)
|
229
|
+
|
230
|
+
# Call an endpoint that raises a normal exception
|
231
|
+
try:
|
232
|
+
await error_actor.await_then_error.call()
|
233
|
+
except Exception as e:
|
234
|
+
print(f"Expected exception caught: {e}", flush=True)
|
235
|
+
# Re-raise to cause the process to exit with non-zero code
|
236
|
+
raise
|
237
|
+
|
238
|
+
|
239
|
+
@main.command("error-cleanup")
|
240
|
+
def error_cleanup():
|
241
|
+
"""Command that spawns an 8 process procmesh and calls an endpoint that returns a normal exception."""
|
242
|
+
asyncio.run(_error_cleanup())
|
243
|
+
|
244
|
+
|
179
245
|
if __name__ == "__main__":
|
180
246
|
main()
|