torchmonarch-nightly 2025.7.1__cp312-cp312-manylinux2014_x86_64.whl → 2025.7.26__cp312-cp312-manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- monarch/__init__.py +13 -9
- monarch/_rust_bindings.so +0 -0
- monarch/{_monarch/selection → _src/actor}/__init__.py +3 -7
- monarch/_src/actor/actor_mesh.py +878 -0
- monarch/{allocator.py → _src/actor/allocator.py} +26 -17
- monarch/_src/actor/bootstrap_main.py +73 -0
- monarch/{code_sync.py → _src/actor/code_sync/__init__.py} +3 -1
- monarch/_src/actor/code_sync/auto_reload.py +223 -0
- monarch/_src/actor/debugger.py +565 -0
- monarch/_src/actor/endpoint.py +303 -0
- monarch/_src/actor/event_loop.py +97 -0
- monarch/_src/actor/future.py +100 -0
- monarch/{pdb_wrapper.py → _src/actor/pdb_wrapper.py} +47 -46
- monarch/{common/pickle_flatten.py → _src/actor/pickle.py} +26 -2
- monarch/_src/actor/proc_mesh.py +508 -0
- monarch/_src/actor/sync_state.py +18 -0
- monarch/{telemetry.py → _src/actor/telemetry/__init__.py} +1 -1
- monarch/_src/actor/telemetry/rust_span_tracing.py +159 -0
- monarch/_src/actor/tensor_engine_shim.py +59 -0
- monarch/_src/tensor_engine/rdma.py +180 -0
- monarch/_testing.py +3 -2
- monarch/actor/__init__.py +53 -0
- monarch/actor_mesh.py +6 -765
- monarch/bootstrap_main.py +8 -47
- monarch/common/client.py +1 -1
- monarch/common/controller_api.py +2 -1
- monarch/common/device_mesh.py +12 -2
- monarch/common/messages.py +21 -1
- monarch/common/recording.py +4 -3
- monarch/common/remote.py +135 -52
- monarch/common/tensor.py +2 -1
- monarch/controller/backend.py +2 -2
- monarch/controller/controller.py +2 -1
- monarch/controller/rust_backend/controller.py +2 -1
- monarch/fetch.py +3 -5
- monarch/gradient/_gradient_generator.so +0 -0
- monarch/mesh_controller.py +263 -139
- monarch/monarch_controller +0 -0
- monarch/opaque_module.py +4 -6
- monarch/opaque_object.py +3 -3
- monarch/proc_mesh.py +6 -309
- monarch/python_local_mesh.py +1 -1
- monarch/rust_backend_mesh.py +2 -1
- monarch/rust_local_mesh.py +4 -2
- monarch/sim_mesh.py +10 -19
- monarch/simulator/command_history.py +1 -1
- monarch/simulator/interface.py +2 -1
- monarch/simulator/mock_controller.py +1 -1
- monarch/simulator/simulator.py +1 -1
- monarch/tensor_engine/__init__.py +23 -0
- monarch/tensor_worker_main.py +3 -1
- monarch/tools/cli.py +3 -1
- monarch/tools/commands.py +129 -47
- monarch/tools/components/hyperactor.py +5 -3
- monarch/tools/config/__init__.py +18 -1
- monarch/tools/config/defaults.py +2 -2
- monarch/tools/mesh_spec.py +59 -1
- monarch/tools/utils.py +38 -0
- monarch/worker/worker.py +1 -1
- monarch/world_mesh.py +2 -1
- monarch_supervisor/python_executable.py +6 -3
- tests/error_test_binary.py +48 -10
- tests/test_actor_error.py +370 -21
- tests/test_alloc.py +1 -1
- tests/test_allocator.py +369 -17
- tests/test_controller.py +2 -0
- tests/test_debugger.py +416 -0
- tests/test_env_before_cuda.py +161 -0
- tests/test_python_actors.py +184 -333
- tests/test_rdma.py +198 -0
- tests/test_remote_functions.py +40 -12
- tests/test_rust_backend.py +7 -5
- tests/test_sim_backend.py +1 -4
- tests/test_tensor_engine.py +81 -1
- {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.26.dist-info}/METADATA +39 -1
- {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.26.dist-info}/RECORD +84 -72
- torchmonarch_nightly-2025.7.26.dist-info/entry_points.txt +3 -0
- monarch/_monarch/hyperactor/__init__.py +0 -58
- monarch/_monarch/worker/debugger.py +0 -117
- monarch/_monarch/worker/logging.py +0 -107
- monarch/debugger.py +0 -379
- monarch/future.py +0 -76
- monarch/rdma.py +0 -162
- torchmonarch_nightly-2025.7.1.dist-info/entry_points.txt +0 -3
- /monarch/{_monarch/worker → _src}/__init__.py +0 -0
- /monarch/{common/_device_utils.py → _src/actor/device_utils.py} +0 -0
- /monarch/{common → _src/actor}/shape.py +0 -0
- /monarch/{_monarch → _src/tensor_engine}/__init__.py +0 -0
- {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.26.dist-info}/WHEEL +0 -0
- {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.26.dist-info}/licenses/LICENSE +0 -0
- {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.26.dist-info}/top_level.txt +0 -0
tests/test_actor_error.py
CHANGED
@@ -4,14 +4,16 @@
|
|
4
4
|
# This source code is licensed under the BSD-style license found in the
|
5
5
|
# LICENSE file in the root directory of this source tree.
|
6
6
|
|
7
|
-
|
7
|
+
|
8
8
|
import importlib.resources
|
9
|
+
import os
|
9
10
|
import subprocess
|
11
|
+
import sys
|
10
12
|
|
11
13
|
import pytest
|
12
14
|
from monarch._rust_bindings.monarch_hyperactor.proc_mesh import ProcEvent
|
13
|
-
from monarch.
|
14
|
-
from monarch.
|
15
|
+
from monarch._rust_bindings.monarch_hyperactor.supervision import SupervisionError
|
16
|
+
from monarch.actor import Actor, ActorError, endpoint, local_proc_mesh, proc_mesh
|
15
17
|
|
16
18
|
|
17
19
|
class ExceptionActor(Actor):
|
@@ -66,16 +68,21 @@ class BrokenPickleClass:
|
|
66
68
|
self.__dict__.update(state)
|
67
69
|
|
68
70
|
|
71
|
+
@pytest.mark.parametrize(
|
72
|
+
"mesh",
|
73
|
+
[local_proc_mesh, proc_mesh],
|
74
|
+
ids=["local_proc_mesh", "distributed_proc_mesh"],
|
75
|
+
)
|
69
76
|
@pytest.mark.parametrize(
|
70
77
|
"actor_class",
|
71
78
|
[ExceptionActor, ExceptionActorSync],
|
72
79
|
)
|
73
80
|
@pytest.mark.parametrize("num_procs", [1, 2])
|
74
|
-
async def test_actor_exception(actor_class, num_procs):
|
81
|
+
async def test_actor_exception(mesh, actor_class, num_procs):
|
75
82
|
"""
|
76
83
|
Test that exceptions raised in actor endpoints are propagated to the client.
|
77
84
|
"""
|
78
|
-
proc = await
|
85
|
+
proc = await mesh(gpus=num_procs)
|
79
86
|
exception_actor = await proc.spawn("exception_actor", actor_class)
|
80
87
|
|
81
88
|
with pytest.raises(ActorError, match="This is a test exception"):
|
@@ -85,16 +92,21 @@ async def test_actor_exception(actor_class, num_procs):
|
|
85
92
|
await exception_actor.raise_exception.call()
|
86
93
|
|
87
94
|
|
95
|
+
@pytest.mark.parametrize(
|
96
|
+
"mesh",
|
97
|
+
[local_proc_mesh, proc_mesh],
|
98
|
+
ids=["local_proc_mesh", "distributed_proc_mesh"],
|
99
|
+
)
|
88
100
|
@pytest.mark.parametrize(
|
89
101
|
"actor_class",
|
90
102
|
[ExceptionActor, ExceptionActorSync],
|
91
103
|
)
|
92
104
|
@pytest.mark.parametrize("num_procs", [1, 2])
|
93
|
-
def test_actor_exception_sync(actor_class, num_procs):
|
105
|
+
def test_actor_exception_sync(mesh, actor_class, num_procs):
|
94
106
|
"""
|
95
107
|
Test that exceptions raised in actor endpoints are propagated to the client.
|
96
108
|
"""
|
97
|
-
proc =
|
109
|
+
proc = mesh(gpus=num_procs).get()
|
98
110
|
exception_actor = proc.spawn("exception_actor", actor_class).get()
|
99
111
|
|
100
112
|
with pytest.raises(ActorError, match="This is a test exception"):
|
@@ -104,6 +116,7 @@ def test_actor_exception_sync(actor_class, num_procs):
|
|
104
116
|
exception_actor.raise_exception.call().get()
|
105
117
|
|
106
118
|
|
119
|
+
'''
|
107
120
|
# oss_skip: importlib not pulling resource correctly in git CI, needs to be revisited
|
108
121
|
@pytest.mark.oss_skip
|
109
122
|
@pytest.mark.parametrize("num_procs", [1, 2])
|
@@ -140,10 +153,11 @@ def test_actor_supervision(num_procs, sync_endpoint, sync_test_impl, endpoint_na
|
|
140
153
|
raise
|
141
154
|
|
142
155
|
# Assert that the subprocess exited with a non-zero code
|
143
|
-
assert "
|
156
|
+
assert "Started function error_test" in process.stdout.decode()
|
144
157
|
assert (
|
145
158
|
process.returncode != 0
|
146
159
|
), f"Expected non-zero exit code, got {process.returncode}"
|
160
|
+
'''
|
147
161
|
|
148
162
|
|
149
163
|
# oss_skip: importlib not pulling resource correctly in git CI, needs to be revisited
|
@@ -170,7 +184,7 @@ def test_proc_mesh_bootstrap_error():
|
|
170
184
|
raise
|
171
185
|
|
172
186
|
# Assert that the subprocess exited with a non-zero code
|
173
|
-
assert "
|
187
|
+
assert "Started function error_bootstrap" in process.stdout.decode()
|
174
188
|
assert (
|
175
189
|
process.returncode != 0
|
176
190
|
), f"Expected non-zero exit code, got {process.returncode}"
|
@@ -213,6 +227,7 @@ async def test_broken_pickle_class(raise_on_getstate, raise_on_setstate, num_pro
|
|
213
227
|
await exception_actor.print_value.call(broken_obj)
|
214
228
|
|
215
229
|
|
230
|
+
"""
|
216
231
|
# oss_skip: importlib not pulling resource correctly in git CI, needs to be revisited
|
217
232
|
@pytest.mark.oss_skip
|
218
233
|
async def test_exception_after_wait_unmonitored():
|
@@ -234,23 +249,135 @@ async def test_exception_after_wait_unmonitored():
|
|
234
249
|
raise
|
235
250
|
|
236
251
|
# Assert that the subprocess exited with a non-zero code
|
237
|
-
assert "
|
252
|
+
assert "Started function _error_unmonitored" in process.stdout.decode()
|
238
253
|
assert (
|
239
254
|
process.returncode != 0
|
240
255
|
), f"Expected non-zero exit code, got {process.returncode}"
|
256
|
+
"""
|
257
|
+
|
258
|
+
|
259
|
+
# oss_skip: importlib not pulling resource correctly in git CI, needs to be revisited
|
260
|
+
@pytest.mark.oss_skip
|
261
|
+
def test_python_actor_process_cleanup():
|
262
|
+
"""
|
263
|
+
Test that PythonActor processes are cleaned up when the parent process dies.
|
264
|
+
|
265
|
+
This test spawns an 8 process procmesh and calls an endpoint that returns a normal exception,
|
266
|
+
then verifies that all spawned processes have been cleaned up after the spawned binary dies.
|
267
|
+
"""
|
268
|
+
import os
|
269
|
+
import signal
|
270
|
+
import time
|
271
|
+
|
272
|
+
# Run the error-cleanup test in a subprocess
|
273
|
+
test_bin = importlib.resources.files("monarch.python.tests").joinpath("test_bin")
|
274
|
+
cmd = [
|
275
|
+
str(test_bin),
|
276
|
+
"error-cleanup",
|
277
|
+
]
|
278
|
+
|
279
|
+
try:
|
280
|
+
print("running cmd", " ".join(cmd))
|
281
|
+
process = subprocess.run(cmd, capture_output=True, timeout=180, text=True)
|
282
|
+
except subprocess.TimeoutExpired as e:
|
283
|
+
print("timeout expired")
|
284
|
+
if e.stdout is not None:
|
285
|
+
print(e.stdout.decode())
|
286
|
+
if e.stderr is not None:
|
287
|
+
print(e.stderr.decode())
|
288
|
+
raise
|
289
|
+
|
290
|
+
# Read stdout line by line to get child PIDs
|
291
|
+
assert "Started function _error_cleanup() for parent process" in process.stdout
|
292
|
+
|
293
|
+
child_pids = set()
|
294
|
+
for line in process.stdout.splitlines():
|
295
|
+
if line.startswith("CHILD_PIDS: "):
|
296
|
+
pids_str = line[len("CHILD_PIDS: ") :] # noqa
|
297
|
+
child_pids = {
|
298
|
+
int(pid.strip()) for pid in pids_str.split(",") if pid.strip()
|
299
|
+
}
|
300
|
+
print(f"Extracted child PIDs: {child_pids}")
|
301
|
+
break
|
302
|
+
|
303
|
+
if not child_pids:
|
304
|
+
raise AssertionError("No child PIDs found in output")
|
305
|
+
|
306
|
+
assert child_pids, "No child PIDs were collected from subprocess output"
|
307
|
+
|
308
|
+
# Wait for child processes to be cleaned up
|
309
|
+
print("Waiting for child processes to be cleaned up...")
|
310
|
+
cleanup_timeout = 120
|
311
|
+
start_time = time.time()
|
312
|
+
|
313
|
+
def is_process_running(pid):
|
314
|
+
"""Check if a process with the given PID is still running."""
|
315
|
+
try:
|
316
|
+
os.kill(pid, 0) # Signal 0 doesn't kill, just checks if process exists
|
317
|
+
return True
|
318
|
+
except OSError:
|
319
|
+
return False
|
320
|
+
|
321
|
+
still_running = set(child_pids)
|
322
|
+
|
323
|
+
while time.time() - start_time < cleanup_timeout:
|
324
|
+
if not still_running:
|
325
|
+
print("All child processes have been cleaned up!")
|
326
|
+
return
|
327
|
+
|
328
|
+
still_running = {pid for pid in still_running if is_process_running(pid)}
|
329
|
+
|
330
|
+
print(f"Still running child PIDs: {still_running}")
|
331
|
+
time.sleep(2)
|
332
|
+
|
333
|
+
# If we get here, some processes are still running
|
334
|
+
# Try to clean up remaining processes
|
335
|
+
for pid in still_running:
|
336
|
+
try:
|
337
|
+
os.kill(pid, signal.SIGKILL)
|
338
|
+
except OSError:
|
339
|
+
pass
|
340
|
+
raise AssertionError(
|
341
|
+
f"Child processes not cleaned up after {cleanup_timeout}s: {still_running}"
|
342
|
+
)
|
343
|
+
|
344
|
+
|
345
|
+
class ActorFailureError(BaseException):
|
346
|
+
"""Exception to simulate actor failure for supervision testing.
|
347
|
+
|
348
|
+
Inherits from BaseException in order that supervision be
|
349
|
+
triggered.
|
350
|
+
|
351
|
+
"""
|
352
|
+
|
353
|
+
pass
|
241
354
|
|
242
355
|
|
243
356
|
class ErrorActor(Actor):
|
244
|
-
|
245
|
-
|
357
|
+
@endpoint
|
358
|
+
def fail_with_supervision_error(self) -> None:
|
359
|
+
raise ActorFailureError("Simulated actor failure for supervision testing")
|
246
360
|
|
247
361
|
@endpoint
|
248
|
-
async def
|
249
|
-
|
362
|
+
async def fail_with_supervision_error_async(self) -> None:
|
363
|
+
raise ActorFailureError("Simulated actor failure for supervision testing")
|
364
|
+
|
365
|
+
@endpoint
|
366
|
+
async def check(self) -> str:
|
367
|
+
return "this is a healthy check"
|
368
|
+
|
369
|
+
@endpoint
|
370
|
+
async def check_with_exception(self) -> None:
|
371
|
+
raise RuntimeError("failed the check with app error")
|
250
372
|
|
251
373
|
|
252
|
-
|
253
|
-
|
374
|
+
@pytest.mark.parametrize(
|
375
|
+
"mesh",
|
376
|
+
[local_proc_mesh, proc_mesh],
|
377
|
+
ids=["local_proc_mesh", "distributed_proc_mesh"],
|
378
|
+
)
|
379
|
+
async def test_proc_mesh_redundant_monitoring(mesh):
|
380
|
+
proc = await mesh(hosts=1, gpus=1)
|
254
381
|
await proc.monitor()
|
255
382
|
|
256
383
|
with pytest.raises(
|
@@ -259,15 +386,237 @@ async def test_proc_mesh_redundant_monitoring():
|
|
259
386
|
await proc.monitor()
|
260
387
|
|
261
388
|
|
262
|
-
|
263
|
-
|
389
|
+
class Worker(Actor):
|
390
|
+
@endpoint
|
391
|
+
def work(self):
|
392
|
+
raise ValueError("value error")
|
393
|
+
|
394
|
+
|
395
|
+
class Manager(Actor):
|
396
|
+
@endpoint
|
397
|
+
async def init(self):
|
398
|
+
mesh = await proc_mesh(gpus=1)
|
399
|
+
self.workers = await mesh.spawn("Worker", Worker)
|
400
|
+
|
401
|
+
@endpoint
|
402
|
+
async def route(self):
|
403
|
+
return await self.workers.work.call_one()
|
404
|
+
|
405
|
+
|
406
|
+
@pytest.mark.parametrize(
|
407
|
+
"mesh",
|
408
|
+
[local_proc_mesh, proc_mesh],
|
409
|
+
ids=["local_proc_mesh", "distributed_proc_mesh"],
|
410
|
+
)
|
411
|
+
async def test_errors_propagated(mesh):
|
412
|
+
p_mesh = await mesh(gpus=1)
|
413
|
+
mesh = await p_mesh.spawn("manager", Manager)
|
414
|
+
|
415
|
+
await mesh.init.call_one()
|
416
|
+
|
417
|
+
with pytest.raises(ActorError) as err_info:
|
418
|
+
await mesh.route.call_one()
|
419
|
+
assert "value error" in str(err_info.value)
|
420
|
+
|
421
|
+
|
422
|
+
@pytest.mark.parametrize(
|
423
|
+
"mesh",
|
424
|
+
[local_proc_mesh, proc_mesh],
|
425
|
+
ids=["local_proc_mesh", "distributed_proc_mesh"],
|
426
|
+
)
|
427
|
+
async def test_proc_mesh_monitoring(mesh):
|
428
|
+
proc = await mesh(hosts=1, gpus=1)
|
264
429
|
monitor = await proc.monitor()
|
265
430
|
|
431
|
+
e = await proc.spawn("error", ErrorActor)
|
432
|
+
|
266
433
|
with pytest.raises(Exception):
|
267
|
-
|
268
|
-
await asyncio.wait_for(e.check.call_one(), timeout=15)
|
434
|
+
await e.fail_with_supervision_error.call_one()
|
269
435
|
|
270
436
|
event = await anext(monitor)
|
271
437
|
assert isinstance(event, ProcEvent.Crashed)
|
272
438
|
assert event[0] == 0 # check rank
|
273
|
-
assert "
|
439
|
+
assert "ActorFailureError" in event[1] # check error message
|
440
|
+
assert (
|
441
|
+
"Simulated actor failure for supervision testing" in event[1]
|
442
|
+
) # check error message
|
443
|
+
|
444
|
+
# should not be able to spawn actors anymore as proc mesh is unhealthy
|
445
|
+
with pytest.raises(SupervisionError, match="proc mesh is stopped with reason"):
|
446
|
+
await proc.spawn("ex", ExceptionActorSync)
|
447
|
+
|
448
|
+
|
449
|
+
@pytest.mark.parametrize(
|
450
|
+
"mesh",
|
451
|
+
[local_proc_mesh, proc_mesh],
|
452
|
+
ids=["local_proc_mesh", "distributed_proc_mesh"],
|
453
|
+
)
|
454
|
+
async def test_actor_mesh_supervision_handling(mesh):
|
455
|
+
proc = await mesh(hosts=1, gpus=1)
|
456
|
+
|
457
|
+
e = await proc.spawn("error", ErrorActor)
|
458
|
+
|
459
|
+
# first check() call should succeed
|
460
|
+
await e.check.call()
|
461
|
+
|
462
|
+
# throw an application error
|
463
|
+
with pytest.raises(ActorError, match="failed the check with app error"):
|
464
|
+
await e.check_with_exception.call()
|
465
|
+
|
466
|
+
# actor mesh should still be healthy
|
467
|
+
await e.check.call()
|
468
|
+
|
469
|
+
# existing call should fail with supervision error
|
470
|
+
with pytest.raises(SupervisionError, match="supervision error:"):
|
471
|
+
await e.fail_with_supervision_error.call_one()
|
472
|
+
|
473
|
+
# new call should fail with check of health state of actor mesh
|
474
|
+
with pytest.raises(SupervisionError, match="actor mesh is not in a healthy state"):
|
475
|
+
await e.check.call()
|
476
|
+
|
477
|
+
# should not be able to spawn actors anymore as proc mesh is unhealthy
|
478
|
+
with pytest.raises(SupervisionError, match="proc mesh is stopped with reason"):
|
479
|
+
await proc.spawn("ex", ExceptionActorSync)
|
480
|
+
|
481
|
+
|
482
|
+
class HealthyActor(Actor):
|
483
|
+
@endpoint
|
484
|
+
async def check(self):
|
485
|
+
return "this is a healthy check"
|
486
|
+
|
487
|
+
@endpoint
|
488
|
+
async def check_with_payload(self, payload: str):
|
489
|
+
pass
|
490
|
+
|
491
|
+
|
492
|
+
class Intermediate(Actor):
|
493
|
+
@endpoint
|
494
|
+
async def init_local_mesh(self):
|
495
|
+
mesh = await local_proc_mesh(gpus=1)
|
496
|
+
self._error_actor = await mesh.spawn("error", ErrorActor)
|
497
|
+
self._healthy_actor = await mesh.spawn("healthy", HealthyActor)
|
498
|
+
|
499
|
+
@endpoint
|
500
|
+
async def init_proc_mesh(self):
|
501
|
+
mesh = await proc_mesh(gpus=1)
|
502
|
+
self._error_actor = await mesh.spawn("error", ErrorActor)
|
503
|
+
self._healthy_actor = await mesh.spawn("healthy", HealthyActor)
|
504
|
+
|
505
|
+
@endpoint
|
506
|
+
async def forward_success(self):
|
507
|
+
return await self._error_actor.check.call()
|
508
|
+
|
509
|
+
@endpoint
|
510
|
+
async def forward_error(self):
|
511
|
+
return await self._error_actor.fail_with_supervision_error.call_one()
|
512
|
+
|
513
|
+
@endpoint
|
514
|
+
async def forward_healthy_check(self):
|
515
|
+
return await self._healthy_actor.check.call()
|
516
|
+
|
517
|
+
|
518
|
+
@pytest.mark.parametrize(
|
519
|
+
"mesh", [local_proc_mesh, proc_mesh], ids=["local_proc_mesh", "proc_mesh"]
|
520
|
+
)
|
521
|
+
async def test_actor_mesh_supervision_handling_chained_error(mesh):
|
522
|
+
proc = await mesh(hosts=1, gpus=1)
|
523
|
+
|
524
|
+
intermediate_actor = await proc.spawn("intermediate", Intermediate)
|
525
|
+
if mesh is proc_mesh:
|
526
|
+
await intermediate_actor.init_proc_mesh.call()
|
527
|
+
elif mesh is local_proc_mesh:
|
528
|
+
await intermediate_actor.init_local_mesh.call()
|
529
|
+
|
530
|
+
# first forward() call should succeed
|
531
|
+
await intermediate_actor.forward_success.call()
|
532
|
+
await intermediate_actor.forward_healthy_check.call()
|
533
|
+
|
534
|
+
# in a chain of client -> Intermediate -> ErrorActor, a supervision error
|
535
|
+
# happening in ErrorActor will be captured by Intermediate and re-raised
|
536
|
+
# as an application error (ActorError).
|
537
|
+
with pytest.raises(ActorError, match="supervision error:"):
|
538
|
+
await intermediate_actor.forward_error.call()
|
539
|
+
|
540
|
+
# calling success endpoint should fail with ActorError, but with supervision msg.
|
541
|
+
with pytest.raises(ActorError, match="actor mesh is not in a healthy state"):
|
542
|
+
await intermediate_actor.forward_success.call()
|
543
|
+
|
544
|
+
# healthy actor should still be working
|
545
|
+
await intermediate_actor.forward_healthy_check.call()
|
546
|
+
|
547
|
+
|
548
|
+
@pytest.mark.parametrize(
|
549
|
+
"mesh", [local_proc_mesh, proc_mesh], ids=["local_proc_mesh", "proc_mesh"]
|
550
|
+
)
|
551
|
+
@pytest.mark.parametrize(
|
552
|
+
"method_name",
|
553
|
+
["fail_with_supervision_error", "fail_with_supervision_error_async"],
|
554
|
+
)
|
555
|
+
async def test_base_exception_handling(mesh, method_name):
|
556
|
+
"""Test that BaseException subclasses trigger supervision errors.
|
557
|
+
|
558
|
+
This test verifies that both synchronous and asynchronous methods
|
559
|
+
that raise ActorFailureError (a BaseException subclass) trigger
|
560
|
+
supervision errors properly.
|
561
|
+
|
562
|
+
"""
|
563
|
+
proc = await mesh(hosts=1, gpus=1)
|
564
|
+
error_actor = await proc.spawn("error", ErrorActor)
|
565
|
+
|
566
|
+
# Get the method to call based on the parameter
|
567
|
+
method = getattr(error_actor, method_name)
|
568
|
+
|
569
|
+
# The call should raise a SupervisionError
|
570
|
+
with pytest.raises(SupervisionError, match="supervision error:"):
|
571
|
+
await method.call_one()
|
572
|
+
|
573
|
+
# Subsequent calls should fail with a health state error
|
574
|
+
with pytest.raises(SupervisionError, match="actor mesh is not in a healthy state"):
|
575
|
+
await error_actor.check.call()
|
576
|
+
|
577
|
+
|
578
|
+
@pytest.mark.parametrize(
|
579
|
+
"mesh", [local_proc_mesh, proc_mesh], ids=["local_proc_mesh", "proc_mesh"]
|
580
|
+
)
|
581
|
+
async def test_supervision_with_proc_mesh_stopped(mesh):
|
582
|
+
proc = await mesh(hosts=1, gpus=1)
|
583
|
+
actor_mesh = await proc.spawn("healthy", HealthyActor)
|
584
|
+
|
585
|
+
await actor_mesh.check.call()
|
586
|
+
|
587
|
+
await proc.stop()
|
588
|
+
|
589
|
+
# new call should fail with check of health state of actor mesh
|
590
|
+
with pytest.raises(SupervisionError, match="actor mesh is not in a healthy state"):
|
591
|
+
await actor_mesh.check.call()
|
592
|
+
|
593
|
+
# proc mesh cannot spawn new actors anymore
|
594
|
+
with pytest.raises(RuntimeError, match="`ProcMesh` has already been stopped"):
|
595
|
+
await proc.spawn("immediate", Intermediate)
|
596
|
+
|
597
|
+
|
598
|
+
# TODO - re-enable after resolving T232206970
|
599
|
+
@pytest.mark.oss_skip
|
600
|
+
async def test_supervision_with_sending_error():
|
601
|
+
os.environ["HYPERACTOR_CODEC_MAX_FRAME_LENGTH"] = "9999999999"
|
602
|
+
os.environ["HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT_SECS"] = "1"
|
603
|
+
|
604
|
+
proc = await proc_mesh(gpus=1)
|
605
|
+
actor_mesh = await proc.spawn("healthy", HealthyActor)
|
606
|
+
|
607
|
+
await actor_mesh.check.call()
|
608
|
+
|
609
|
+
# send a small payload to trigger success
|
610
|
+
await actor_mesh.check_with_payload.call(payload="a")
|
611
|
+
|
612
|
+
# send a large payload to trigger send timeout error
|
613
|
+
with pytest.raises(
|
614
|
+
SupervisionError, match="supervision error:.*message not delivered:"
|
615
|
+
):
|
616
|
+
await actor_mesh.check_with_payload.call(payload="a" * 5000000000)
|
617
|
+
|
618
|
+
# new call should fail with check of health state of actor mesh
|
619
|
+
with pytest.raises(SupervisionError, match="actor mesh is not in a healthy state"):
|
620
|
+
await actor_mesh.check.call()
|
621
|
+
with pytest.raises(SupervisionError, match="actor mesh is not in a healthy state"):
|
622
|
+
await actor_mesh.check_with_payload.call(payload="a")
|
tests/test_alloc.py
CHANGED
@@ -9,7 +9,7 @@
|
|
9
9
|
from unittest import IsolatedAsyncioTestCase
|
10
10
|
|
11
11
|
from monarch import ProcessAllocator
|
12
|
-
from monarch._rust_bindings.
|
12
|
+
from monarch._rust_bindings.monarch_hyperactor.alloc import ( # @manual=//monarch/monarch_extension:monarch_extension
|
13
13
|
AllocConstraints,
|
14
14
|
AllocSpec,
|
15
15
|
)
|