torchmonarch-nightly 2025.7.1__cp311-cp311-manylinux2014_x86_64.whl → 2025.7.26__cp311-cp311-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91)
  1. monarch/__init__.py +13 -9
  2. monarch/_rust_bindings.so +0 -0
  3. monarch/{_monarch/selection → _src/actor}/__init__.py +3 -7
  4. monarch/_src/actor/actor_mesh.py +878 -0
  5. monarch/{allocator.py → _src/actor/allocator.py} +26 -17
  6. monarch/_src/actor/bootstrap_main.py +73 -0
  7. monarch/{code_sync.py → _src/actor/code_sync/__init__.py} +3 -1
  8. monarch/_src/actor/code_sync/auto_reload.py +223 -0
  9. monarch/_src/actor/debugger.py +565 -0
  10. monarch/_src/actor/endpoint.py +303 -0
  11. monarch/_src/actor/event_loop.py +97 -0
  12. monarch/_src/actor/future.py +100 -0
  13. monarch/{pdb_wrapper.py → _src/actor/pdb_wrapper.py} +47 -46
  14. monarch/{common/pickle_flatten.py → _src/actor/pickle.py} +26 -2
  15. monarch/_src/actor/proc_mesh.py +508 -0
  16. monarch/_src/actor/sync_state.py +18 -0
  17. monarch/{telemetry.py → _src/actor/telemetry/__init__.py} +1 -1
  18. monarch/_src/actor/telemetry/rust_span_tracing.py +159 -0
  19. monarch/_src/actor/tensor_engine_shim.py +59 -0
  20. monarch/_src/tensor_engine/rdma.py +180 -0
  21. monarch/_testing.py +3 -2
  22. monarch/actor/__init__.py +53 -0
  23. monarch/actor_mesh.py +6 -765
  24. monarch/bootstrap_main.py +8 -47
  25. monarch/common/client.py +1 -1
  26. monarch/common/controller_api.py +2 -1
  27. monarch/common/device_mesh.py +12 -2
  28. monarch/common/messages.py +21 -1
  29. monarch/common/recording.py +4 -3
  30. monarch/common/remote.py +135 -52
  31. monarch/common/tensor.py +2 -1
  32. monarch/controller/backend.py +2 -2
  33. monarch/controller/controller.py +2 -1
  34. monarch/controller/rust_backend/controller.py +2 -1
  35. monarch/fetch.py +3 -5
  36. monarch/gradient/_gradient_generator.so +0 -0
  37. monarch/mesh_controller.py +263 -139
  38. monarch/monarch_controller +0 -0
  39. monarch/opaque_module.py +4 -6
  40. monarch/opaque_object.py +3 -3
  41. monarch/proc_mesh.py +6 -309
  42. monarch/python_local_mesh.py +1 -1
  43. monarch/rust_backend_mesh.py +2 -1
  44. monarch/rust_local_mesh.py +4 -2
  45. monarch/sim_mesh.py +10 -19
  46. monarch/simulator/command_history.py +1 -1
  47. monarch/simulator/interface.py +2 -1
  48. monarch/simulator/mock_controller.py +1 -1
  49. monarch/simulator/simulator.py +1 -1
  50. monarch/tensor_engine/__init__.py +23 -0
  51. monarch/tensor_worker_main.py +3 -1
  52. monarch/tools/cli.py +3 -1
  53. monarch/tools/commands.py +129 -47
  54. monarch/tools/components/hyperactor.py +5 -3
  55. monarch/tools/config/__init__.py +18 -1
  56. monarch/tools/config/defaults.py +2 -2
  57. monarch/tools/mesh_spec.py +59 -1
  58. monarch/tools/utils.py +38 -0
  59. monarch/worker/worker.py +1 -1
  60. monarch/world_mesh.py +2 -1
  61. monarch_supervisor/python_executable.py +6 -3
  62. tests/error_test_binary.py +48 -10
  63. tests/test_actor_error.py +370 -21
  64. tests/test_alloc.py +1 -1
  65. tests/test_allocator.py +369 -17
  66. tests/test_controller.py +2 -0
  67. tests/test_debugger.py +416 -0
  68. tests/test_env_before_cuda.py +161 -0
  69. tests/test_python_actors.py +184 -333
  70. tests/test_rdma.py +198 -0
  71. tests/test_remote_functions.py +40 -12
  72. tests/test_rust_backend.py +7 -5
  73. tests/test_sim_backend.py +1 -4
  74. tests/test_tensor_engine.py +81 -1
  75. {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.26.dist-info}/METADATA +39 -1
  76. {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.26.dist-info}/RECORD +84 -72
  77. torchmonarch_nightly-2025.7.26.dist-info/entry_points.txt +3 -0
  78. monarch/_monarch/hyperactor/__init__.py +0 -58
  79. monarch/_monarch/worker/debugger.py +0 -117
  80. monarch/_monarch/worker/logging.py +0 -107
  81. monarch/debugger.py +0 -379
  82. monarch/future.py +0 -76
  83. monarch/rdma.py +0 -162
  84. torchmonarch_nightly-2025.7.1.dist-info/entry_points.txt +0 -3
  85. /monarch/{_monarch/worker → _src}/__init__.py +0 -0
  86. /monarch/{common/_device_utils.py → _src/actor/device_utils.py} +0 -0
  87. /monarch/{common → _src/actor}/shape.py +0 -0
  88. /monarch/{_monarch → _src/tensor_engine}/__init__.py +0 -0
  89. {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.26.dist-info}/WHEEL +0 -0
  90. {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.26.dist-info}/licenses/LICENSE +0 -0
  91. {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.26.dist-info}/top_level.txt +0 -0
tests/test_actor_error.py CHANGED
@@ -4,14 +4,16 @@
4
4
  # This source code is licensed under the BSD-style license found in the
5
5
  # LICENSE file in the root directory of this source tree.
6
6
 
7
- import asyncio
7
+
8
8
  import importlib.resources
9
+ import os
9
10
  import subprocess
11
+ import sys
10
12
 
11
13
  import pytest
12
14
  from monarch._rust_bindings.monarch_hyperactor.proc_mesh import ProcEvent
13
- from monarch.actor_mesh import Actor, ActorError, endpoint, send
14
- from monarch.proc_mesh import local_proc_mesh, proc_mesh
15
+ from monarch._rust_bindings.monarch_hyperactor.supervision import SupervisionError
16
+ from monarch.actor import Actor, ActorError, endpoint, local_proc_mesh, proc_mesh
15
17
 
16
18
 
17
19
  class ExceptionActor(Actor):
@@ -66,16 +68,21 @@ class BrokenPickleClass:
66
68
  self.__dict__.update(state)
67
69
 
68
70
 
71
+ @pytest.mark.parametrize(
72
+ "mesh",
73
+ [local_proc_mesh, proc_mesh],
74
+ ids=["local_proc_mesh", "distributed_proc_mesh"],
75
+ )
69
76
  @pytest.mark.parametrize(
70
77
  "actor_class",
71
78
  [ExceptionActor, ExceptionActorSync],
72
79
  )
73
80
  @pytest.mark.parametrize("num_procs", [1, 2])
74
- async def test_actor_exception(actor_class, num_procs):
81
+ async def test_actor_exception(mesh, actor_class, num_procs):
75
82
  """
76
83
  Test that exceptions raised in actor endpoints are propagated to the client.
77
84
  """
78
- proc = await proc_mesh(gpus=num_procs)
85
+ proc = await mesh(gpus=num_procs)
79
86
  exception_actor = await proc.spawn("exception_actor", actor_class)
80
87
 
81
88
  with pytest.raises(ActorError, match="This is a test exception"):
@@ -85,16 +92,21 @@ async def test_actor_exception(actor_class, num_procs):
85
92
  await exception_actor.raise_exception.call()
86
93
 
87
94
 
95
+ @pytest.mark.parametrize(
96
+ "mesh",
97
+ [local_proc_mesh, proc_mesh],
98
+ ids=["local_proc_mesh", "distributed_proc_mesh"],
99
+ )
88
100
  @pytest.mark.parametrize(
89
101
  "actor_class",
90
102
  [ExceptionActor, ExceptionActorSync],
91
103
  )
92
104
  @pytest.mark.parametrize("num_procs", [1, 2])
93
- def test_actor_exception_sync(actor_class, num_procs):
105
+ def test_actor_exception_sync(mesh, actor_class, num_procs):
94
106
  """
95
107
  Test that exceptions raised in actor endpoints are propagated to the client.
96
108
  """
97
- proc = proc_mesh(gpus=num_procs).get()
109
+ proc = mesh(gpus=num_procs).get()
98
110
  exception_actor = proc.spawn("exception_actor", actor_class).get()
99
111
 
100
112
  with pytest.raises(ActorError, match="This is a test exception"):
@@ -104,6 +116,7 @@ def test_actor_exception_sync(actor_class, num_procs):
104
116
  exception_actor.raise_exception.call().get()
105
117
 
106
118
 
119
+ '''
107
120
  # oss_skip: importlib not pulling resource correctly in git CI, needs to be revisited
108
121
  @pytest.mark.oss_skip
109
122
  @pytest.mark.parametrize("num_procs", [1, 2])
@@ -140,10 +153,11 @@ def test_actor_supervision(num_procs, sync_endpoint, sync_test_impl, endpoint_na
140
153
  raise
141
154
 
142
155
  # Assert that the subprocess exited with a non-zero code
143
- assert "I actually ran" in process.stdout.decode()
156
+ assert "Started function error_test" in process.stdout.decode()
144
157
  assert (
145
158
  process.returncode != 0
146
159
  ), f"Expected non-zero exit code, got {process.returncode}"
160
+ '''
147
161
 
148
162
 
149
163
  # oss_skip: importlib not pulling resource correctly in git CI, needs to be revisited
@@ -170,7 +184,7 @@ def test_proc_mesh_bootstrap_error():
170
184
  raise
171
185
 
172
186
  # Assert that the subprocess exited with a non-zero code
173
- assert "I actually ran" in process.stdout.decode()
187
+ assert "Started function error_bootstrap" in process.stdout.decode()
174
188
  assert (
175
189
  process.returncode != 0
176
190
  ), f"Expected non-zero exit code, got {process.returncode}"
@@ -213,6 +227,7 @@ async def test_broken_pickle_class(raise_on_getstate, raise_on_setstate, num_pro
213
227
  await exception_actor.print_value.call(broken_obj)
214
228
 
215
229
 
230
+ """
216
231
  # oss_skip: importlib not pulling resource correctly in git CI, needs to be revisited
217
232
  @pytest.mark.oss_skip
218
233
  async def test_exception_after_wait_unmonitored():
@@ -234,23 +249,135 @@ async def test_exception_after_wait_unmonitored():
234
249
  raise
235
250
 
236
251
  # Assert that the subprocess exited with a non-zero code
237
- assert "I actually ran" in process.stdout.decode()
252
+ assert "Started function _error_unmonitored" in process.stdout.decode()
238
253
  assert (
239
254
  process.returncode != 0
240
255
  ), f"Expected non-zero exit code, got {process.returncode}"
256
+ """
257
+
258
+
259
+ # oss_skip: importlib not pulling resource correctly in git CI, needs to be revisited
260
+ @pytest.mark.oss_skip
261
+ def test_python_actor_process_cleanup():
262
+ """
263
+ Test that PythonActor processes are cleaned up when the parent process dies.
264
+
265
+ This test spawns an 8 process procmesh and calls an endpoint that returns a normal exception,
266
+ then verifies that all spawned processes have been cleaned up after the spawned binary dies.
267
+ """
268
+ import os
269
+ import signal
270
+ import time
271
+
272
+ # Run the error-cleanup test in a subprocess
273
+ test_bin = importlib.resources.files("monarch.python.tests").joinpath("test_bin")
274
+ cmd = [
275
+ str(test_bin),
276
+ "error-cleanup",
277
+ ]
278
+
279
+ try:
280
+ print("running cmd", " ".join(cmd))
281
+ process = subprocess.run(cmd, capture_output=True, timeout=180, text=True)
282
+ except subprocess.TimeoutExpired as e:
283
+ print("timeout expired")
284
+ if e.stdout is not None:
285
+ print(e.stdout.decode())
286
+ if e.stderr is not None:
287
+ print(e.stderr.decode())
288
+ raise
289
+
290
+ # Read stdout line by line to get child PIDs
291
+ assert "Started function _error_cleanup() for parent process" in process.stdout
292
+
293
+ child_pids = set()
294
+ for line in process.stdout.splitlines():
295
+ if line.startswith("CHILD_PIDS: "):
296
+ pids_str = line[len("CHILD_PIDS: ") :] # noqa
297
+ child_pids = {
298
+ int(pid.strip()) for pid in pids_str.split(",") if pid.strip()
299
+ }
300
+ print(f"Extracted child PIDs: {child_pids}")
301
+ break
302
+
303
+ if not child_pids:
304
+ raise AssertionError("No child PIDs found in output")
305
+
306
+ assert child_pids, "No child PIDs were collected from subprocess output"
307
+
308
+ # Wait for child processes to be cleaned up
309
+ print("Waiting for child processes to be cleaned up...")
310
+ cleanup_timeout = 120
311
+ start_time = time.time()
312
+
313
+ def is_process_running(pid):
314
+ """Check if a process with the given PID is still running."""
315
+ try:
316
+ os.kill(pid, 0) # Signal 0 doesn't kill, just checks if process exists
317
+ return True
318
+ except OSError:
319
+ return False
320
+
321
+ still_running = set(child_pids)
322
+
323
+ while time.time() - start_time < cleanup_timeout:
324
+ if not still_running:
325
+ print("All child processes have been cleaned up!")
326
+ return
327
+
328
+ still_running = {pid for pid in still_running if is_process_running(pid)}
329
+
330
+ print(f"Still running child PIDs: {still_running}")
331
+ time.sleep(2)
332
+
333
+ # If we get here, some processes are still running
334
+ # Try to clean up remaining processes
335
+ for pid in still_running:
336
+ try:
337
+ os.kill(pid, signal.SIGKILL)
338
+ except OSError:
339
+ pass
340
+ raise AssertionError(
341
+ f"Child processes not cleaned up after {cleanup_timeout}s: {still_running}"
342
+ )
343
+
344
+
345
+ class ActorFailureError(BaseException):
346
+ """Exception to simulate actor failure for supervision testing.
347
+
348
+ Inherits from BaseException in order that supervision be
349
+ triggered.
350
+
351
+ """
352
+
353
+ pass
241
354
 
242
355
 
243
356
  class ErrorActor(Actor):
244
- def __init__(self, message):
245
- raise RuntimeError("fail on init")
357
+ @endpoint
358
+ def fail_with_supervision_error(self) -> None:
359
+ raise ActorFailureError("Simulated actor failure for supervision testing")
246
360
 
247
361
  @endpoint
248
- async def check(self) -> None:
249
- pass
362
+ async def fail_with_supervision_error_async(self) -> None:
363
+ raise ActorFailureError("Simulated actor failure for supervision testing")
364
+
365
+ @endpoint
366
+ async def check(self) -> str:
367
+ return "this is a healthy check"
368
+
369
+ @endpoint
370
+ async def check_with_exception(self) -> None:
371
+ raise RuntimeError("failed the check with app error")
250
372
 
251
373
 
252
- async def test_proc_mesh_redundant_monitoring():
253
- proc = await local_proc_mesh(hosts=1, gpus=1)
374
+ @pytest.mark.parametrize(
375
+ "mesh",
376
+ [local_proc_mesh, proc_mesh],
377
+ ids=["local_proc_mesh", "distributed_proc_mesh"],
378
+ )
379
+ async def test_proc_mesh_redundant_monitoring(mesh):
380
+ proc = await mesh(hosts=1, gpus=1)
254
381
  await proc.monitor()
255
382
 
256
383
  with pytest.raises(
@@ -259,15 +386,237 @@ async def test_proc_mesh_redundant_monitoring():
259
386
  await proc.monitor()
260
387
 
261
388
 
262
- async def test_proc_mesh_monitoring():
263
- proc = await local_proc_mesh(hosts=1, gpus=1)
389
+ class Worker(Actor):
390
+ @endpoint
391
+ def work(self):
392
+ raise ValueError("value error")
393
+
394
+
395
+ class Manager(Actor):
396
+ @endpoint
397
+ async def init(self):
398
+ mesh = await proc_mesh(gpus=1)
399
+ self.workers = await mesh.spawn("Worker", Worker)
400
+
401
+ @endpoint
402
+ async def route(self):
403
+ return await self.workers.work.call_one()
404
+
405
+
406
+ @pytest.mark.parametrize(
407
+ "mesh",
408
+ [local_proc_mesh, proc_mesh],
409
+ ids=["local_proc_mesh", "distributed_proc_mesh"],
410
+ )
411
+ async def test_errors_propagated(mesh):
412
+ p_mesh = await mesh(gpus=1)
413
+ mesh = await p_mesh.spawn("manager", Manager)
414
+
415
+ await mesh.init.call_one()
416
+
417
+ with pytest.raises(ActorError) as err_info:
418
+ await mesh.route.call_one()
419
+ assert "value error" in str(err_info.value)
420
+
421
+
422
+ @pytest.mark.parametrize(
423
+ "mesh",
424
+ [local_proc_mesh, proc_mesh],
425
+ ids=["local_proc_mesh", "distributed_proc_mesh"],
426
+ )
427
+ async def test_proc_mesh_monitoring(mesh):
428
+ proc = await mesh(hosts=1, gpus=1)
264
429
  monitor = await proc.monitor()
265
430
 
431
+ e = await proc.spawn("error", ErrorActor)
432
+
266
433
  with pytest.raises(Exception):
267
- e = await proc.spawn("error", ErrorActor, "failed to init the actor")
268
- await asyncio.wait_for(e.check.call_one(), timeout=15)
434
+ await e.fail_with_supervision_error.call_one()
269
435
 
270
436
  event = await anext(monitor)
271
437
  assert isinstance(event, ProcEvent.Crashed)
272
438
  assert event[0] == 0 # check rank
273
- assert "fail on init" in event[1] # check error message
439
+ assert "ActorFailureError" in event[1] # check error message
440
+ assert (
441
+ "Simulated actor failure for supervision testing" in event[1]
442
+ ) # check error message
443
+
444
+ # should not be able to spawn actors anymore as proc mesh is unhealthy
445
+ with pytest.raises(SupervisionError, match="proc mesh is stopped with reason"):
446
+ await proc.spawn("ex", ExceptionActorSync)
447
+
448
+
449
+ @pytest.mark.parametrize(
450
+ "mesh",
451
+ [local_proc_mesh, proc_mesh],
452
+ ids=["local_proc_mesh", "distributed_proc_mesh"],
453
+ )
454
+ async def test_actor_mesh_supervision_handling(mesh):
455
+ proc = await mesh(hosts=1, gpus=1)
456
+
457
+ e = await proc.spawn("error", ErrorActor)
458
+
459
+ # first check() call should succeed
460
+ await e.check.call()
461
+
462
+ # throw an application error
463
+ with pytest.raises(ActorError, match="failed the check with app error"):
464
+ await e.check_with_exception.call()
465
+
466
+ # actor mesh should still be healthy
467
+ await e.check.call()
468
+
469
+ # existing call should fail with supervision error
470
+ with pytest.raises(SupervisionError, match="supervision error:"):
471
+ await e.fail_with_supervision_error.call_one()
472
+
473
+ # new call should fail with check of health state of actor mesh
474
+ with pytest.raises(SupervisionError, match="actor mesh is not in a healthy state"):
475
+ await e.check.call()
476
+
477
+ # should not be able to spawn actors anymore as proc mesh is unhealthy
478
+ with pytest.raises(SupervisionError, match="proc mesh is stopped with reason"):
479
+ await proc.spawn("ex", ExceptionActorSync)
480
+
481
+
482
+ class HealthyActor(Actor):
483
+ @endpoint
484
+ async def check(self):
485
+ return "this is a healthy check"
486
+
487
+ @endpoint
488
+ async def check_with_payload(self, payload: str):
489
+ pass
490
+
491
+
492
+ class Intermediate(Actor):
493
+ @endpoint
494
+ async def init_local_mesh(self):
495
+ mesh = await local_proc_mesh(gpus=1)
496
+ self._error_actor = await mesh.spawn("error", ErrorActor)
497
+ self._healthy_actor = await mesh.spawn("healthy", HealthyActor)
498
+
499
+ @endpoint
500
+ async def init_proc_mesh(self):
501
+ mesh = await proc_mesh(gpus=1)
502
+ self._error_actor = await mesh.spawn("error", ErrorActor)
503
+ self._healthy_actor = await mesh.spawn("healthy", HealthyActor)
504
+
505
+ @endpoint
506
+ async def forward_success(self):
507
+ return await self._error_actor.check.call()
508
+
509
+ @endpoint
510
+ async def forward_error(self):
511
+ return await self._error_actor.fail_with_supervision_error.call_one()
512
+
513
+ @endpoint
514
+ async def forward_healthy_check(self):
515
+ return await self._healthy_actor.check.call()
516
+
517
+
518
+ @pytest.mark.parametrize(
519
+ "mesh", [local_proc_mesh, proc_mesh], ids=["local_proc_mesh", "proc_mesh"]
520
+ )
521
+ async def test_actor_mesh_supervision_handling_chained_error(mesh):
522
+ proc = await mesh(hosts=1, gpus=1)
523
+
524
+ intermediate_actor = await proc.spawn("intermediate", Intermediate)
525
+ if mesh is proc_mesh:
526
+ await intermediate_actor.init_proc_mesh.call()
527
+ elif mesh is local_proc_mesh:
528
+ await intermediate_actor.init_local_mesh.call()
529
+
530
+ # first forward() call should succeed
531
+ await intermediate_actor.forward_success.call()
532
+ await intermediate_actor.forward_healthy_check.call()
533
+
534
+ # in a chain of client -> Intermediate -> ErrorActor, a supervision error
535
+ # happening in ErrorActor will be captured by Intermediate and re-raised
536
+ # as an application error (ActorError).
537
+ with pytest.raises(ActorError, match="supervision error:"):
538
+ await intermediate_actor.forward_error.call()
539
+
540
+ # calling success endpoint should fail with ActorError, but with supervision msg.
541
+ with pytest.raises(ActorError, match="actor mesh is not in a healthy state"):
542
+ await intermediate_actor.forward_success.call()
543
+
544
+ # healthy actor should still be working
545
+ await intermediate_actor.forward_healthy_check.call()
546
+
547
+
548
+ @pytest.mark.parametrize(
549
+ "mesh", [local_proc_mesh, proc_mesh], ids=["local_proc_mesh", "proc_mesh"]
550
+ )
551
+ @pytest.mark.parametrize(
552
+ "method_name",
553
+ ["fail_with_supervision_error", "fail_with_supervision_error_async"],
554
+ )
555
+ async def test_base_exception_handling(mesh, method_name):
556
+ """Test that BaseException subclasses trigger supervision errors.
557
+
558
+ This test verifies that both synchronous and asynchronous methods
559
+ that raise ActorFailureError (a BaseException subclass) trigger
560
+ supervision errors properly.
561
+
562
+ """
563
+ proc = await mesh(hosts=1, gpus=1)
564
+ error_actor = await proc.spawn("error", ErrorActor)
565
+
566
+ # Get the method to call based on the parameter
567
+ method = getattr(error_actor, method_name)
568
+
569
+ # The call should raise a SupervisionError
570
+ with pytest.raises(SupervisionError, match="supervision error:"):
571
+ await method.call_one()
572
+
573
+ # Subsequent calls should fail with a health state error
574
+ with pytest.raises(SupervisionError, match="actor mesh is not in a healthy state"):
575
+ await error_actor.check.call()
576
+
577
+
578
+ @pytest.mark.parametrize(
579
+ "mesh", [local_proc_mesh, proc_mesh], ids=["local_proc_mesh", "proc_mesh"]
580
+ )
581
+ async def test_supervision_with_proc_mesh_stopped(mesh):
582
+ proc = await mesh(hosts=1, gpus=1)
583
+ actor_mesh = await proc.spawn("healthy", HealthyActor)
584
+
585
+ await actor_mesh.check.call()
586
+
587
+ await proc.stop()
588
+
589
+ # new call should fail with check of health state of actor mesh
590
+ with pytest.raises(SupervisionError, match="actor mesh is not in a healthy state"):
591
+ await actor_mesh.check.call()
592
+
593
+ # proc mesh cannot spawn new actors anymore
594
+ with pytest.raises(RuntimeError, match="`ProcMesh` has already been stopped"):
595
+ await proc.spawn("immediate", Intermediate)
596
+
597
+
598
+ # TODO - re-enable after resolving T232206970
599
+ @pytest.mark.oss_skip
600
+ async def test_supervision_with_sending_error():
601
+ os.environ["HYPERACTOR_CODEC_MAX_FRAME_LENGTH"] = "9999999999"
602
+ os.environ["HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT_SECS"] = "1"
603
+
604
+ proc = await proc_mesh(gpus=1)
605
+ actor_mesh = await proc.spawn("healthy", HealthyActor)
606
+
607
+ await actor_mesh.check.call()
608
+
609
+ # send a small payload to trigger success
610
+ await actor_mesh.check_with_payload.call(payload="a")
611
+
612
+ # send a large payload to trigger send timeout error
613
+ with pytest.raises(
614
+ SupervisionError, match="supervision error:.*message not delivered:"
615
+ ):
616
+ await actor_mesh.check_with_payload.call(payload="a" * 5000000000)
617
+
618
+ # new call should fail with check of health state of actor mesh
619
+ with pytest.raises(SupervisionError, match="actor mesh is not in a healthy state"):
620
+ await actor_mesh.check.call()
621
+ with pytest.raises(SupervisionError, match="actor mesh is not in a healthy state"):
622
+ await actor_mesh.check_with_payload.call(payload="a")
tests/test_alloc.py CHANGED
@@ -9,7 +9,7 @@
9
9
  from unittest import IsolatedAsyncioTestCase
10
10
 
11
11
  from monarch import ProcessAllocator
12
- from monarch._rust_bindings.hyperactor_extension.alloc import ( # @manual=//monarch/monarch_extension:monarch_extension
12
+ from monarch._rust_bindings.monarch_hyperactor.alloc import ( # @manual=//monarch/monarch_extension:monarch_extension
13
13
  AllocConstraints,
14
14
  AllocSpec,
15
15
  )