torchmonarch-nightly 2025.8.1__cp312-cp312-manylinux2014_x86_64.whl → 2025.9.3__cp312-cp312-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. monarch/_rust_bindings.so +0 -0
  2. monarch/_src/actor/actor_mesh.py +414 -216
  3. monarch/_src/actor/allocator.py +75 -6
  4. monarch/_src/actor/bootstrap_main.py +7 -4
  5. monarch/_src/actor/code_sync/__init__.py +2 -0
  6. monarch/_src/actor/debugger/__init__.py +7 -0
  7. monarch/_src/actor/{debugger.py → debugger/debugger.py} +246 -135
  8. monarch/_src/actor/{pdb_wrapper.py → debugger/pdb_wrapper.py} +62 -23
  9. monarch/_src/actor/endpoint.py +27 -45
  10. monarch/_src/actor/future.py +86 -24
  11. monarch/_src/actor/host_mesh.py +125 -0
  12. monarch/_src/actor/logging.py +94 -0
  13. monarch/_src/actor/pickle.py +25 -0
  14. monarch/_src/actor/proc_mesh.py +423 -156
  15. monarch/_src/actor/python_extension_methods.py +90 -0
  16. monarch/_src/actor/shape.py +8 -1
  17. monarch/_src/actor/source_loader.py +45 -0
  18. monarch/_src/actor/telemetry/__init__.py +172 -0
  19. monarch/_src/actor/telemetry/rust_span_tracing.py +6 -39
  20. monarch/_src/debug_cli/__init__.py +7 -0
  21. monarch/_src/debug_cli/debug_cli.py +43 -0
  22. monarch/_src/tensor_engine/rdma.py +64 -9
  23. monarch/_testing.py +1 -3
  24. monarch/actor/__init__.py +24 -4
  25. monarch/common/_C.so +0 -0
  26. monarch/common/device_mesh.py +14 -0
  27. monarch/common/future.py +10 -0
  28. monarch/common/remote.py +14 -25
  29. monarch/common/tensor.py +12 -0
  30. monarch/debug_cli/__init__.py +7 -0
  31. monarch/debug_cli/__main__.py +12 -0
  32. monarch/fetch.py +2 -2
  33. monarch/gradient/_gradient_generator.so +0 -0
  34. monarch/gradient_generator.py +4 -2
  35. monarch/mesh_controller.py +34 -14
  36. monarch/monarch_controller +0 -0
  37. monarch/tools/colors.py +25 -0
  38. monarch/tools/commands.py +42 -7
  39. monarch/tools/components/hyperactor.py +1 -1
  40. monarch/tools/config/__init__.py +31 -4
  41. monarch/tools/config/defaults.py +13 -3
  42. monarch/tools/config/environment.py +45 -0
  43. monarch/tools/config/workspace.py +165 -0
  44. monarch/tools/mesh_spec.py +2 -0
  45. monarch/utils/__init__.py +9 -0
  46. monarch/utils/utils.py +78 -0
  47. tests/error_test_binary.py +5 -3
  48. tests/python_actor_test_binary.py +52 -0
  49. tests/test_actor_error.py +142 -14
  50. tests/test_alloc.py +1 -1
  51. tests/test_allocator.py +59 -72
  52. tests/test_coalescing.py +1 -1
  53. tests/test_debugger.py +639 -45
  54. tests/test_env_before_cuda.py +4 -4
  55. tests/test_mesh_trait.py +38 -0
  56. tests/test_python_actors.py +979 -75
  57. tests/test_rdma.py +7 -6
  58. tests/test_tensor_engine.py +6 -6
  59. {torchmonarch_nightly-2025.8.1.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/METADATA +82 -4
  60. {torchmonarch_nightly-2025.8.1.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/RECORD +64 -48
  61. {torchmonarch_nightly-2025.8.1.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/WHEEL +0 -0
  62. {torchmonarch_nightly-2025.8.1.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/entry_points.txt +0 -0
  63. {torchmonarch_nightly-2025.8.1.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/licenses/LICENSE +0 -0
  64. {torchmonarch_nightly-2025.8.1.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/top_level.txt +0 -0
tests/test_allocator.py CHANGED
@@ -15,13 +15,11 @@ import os
15
15
  import subprocess
16
16
  import sys
17
17
  import unittest
18
- from datetime import timedelta
19
18
  from time import sleep
20
19
  from typing import Generator, Optional
21
20
  from unittest import mock
22
21
 
23
22
  import cloudpickle
24
- import pytest
25
23
 
26
24
  import torch
27
25
  import torch.distributed as dist
@@ -55,8 +53,6 @@ from monarch.tools.network import get_sockaddr
55
53
  from torch.distributed.elastic.utils.distributed import get_free_port
56
54
  from torchx.specs import AppState
57
55
 
58
- _100_MILLISECONDS = timedelta(milliseconds=100)
59
-
60
56
  SERVER_READY = "monarch.tools.commands.server_ready"
61
57
  UNUSED = "__UNUSED__"
62
58
 
@@ -170,10 +166,9 @@ class TestSetupActorInAllocator(unittest.IsolatedAsyncioTestCase):
170
166
 
171
167
  spec = AllocSpec(AllocConstraints(), gpus=1, hosts=1)
172
168
  allocator = LocalAllocator()
173
- alloc = await allocator.allocate(spec)
174
-
175
- proc_mesh = await ProcMesh.from_alloc(alloc, setup=setup_multiple_env_vars)
169
+ alloc = allocator.allocate(spec)
176
170
 
171
+ proc_mesh = ProcMesh.from_alloc(alloc, setup=setup_multiple_env_vars)
177
172
  try:
178
173
  actor = await proc_mesh.spawn("env_check", EnvCheckActor)
179
174
 
@@ -197,9 +192,9 @@ class TestSetupActorInAllocator(unittest.IsolatedAsyncioTestCase):
197
192
 
198
193
  spec = AllocSpec(AllocConstraints(), gpus=1, hosts=1)
199
194
  allocator = LocalAllocator()
200
- alloc = await allocator.allocate(spec)
195
+ alloc = allocator.allocate(spec)
201
196
 
202
- proc_mesh = await ProcMesh.from_alloc(alloc, setup=setup_with_rank)
197
+ proc_mesh = ProcMesh.from_alloc(alloc, setup=setup_with_rank)
203
198
 
204
199
  try:
205
200
  actor = await proc_mesh.spawn("env_check", EnvCheckActor)
@@ -253,10 +248,11 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
253
248
  allocator = RemoteAllocator(
254
249
  world_id="test_remote_allocator",
255
250
  initializer=StaticRemoteAllocInitializer(host1, host2),
256
- heartbeat_interval=_100_MILLISECONDS,
257
251
  )
258
- alloc = await allocator.allocate(spec)
259
- await ProcMesh.from_alloc(alloc)
252
+ alloc = allocator.allocate(spec)
253
+ await alloc.initialized
254
+ pm = ProcMesh.from_alloc(alloc)
255
+ await pm.initialized
260
256
 
261
257
  async def test_call_allocate_twice(self) -> None:
262
258
  class DeletingAllocInitializer(StaticRemoteAllocInitializer):
@@ -275,15 +271,17 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
275
271
  allocator = RemoteAllocator(
276
272
  world_id="test_remote_allocator",
277
273
  initializer=initializer,
278
- heartbeat_interval=_100_MILLISECONDS,
279
274
  )
280
275
 
281
276
  spec = AllocSpec(AllocConstraints(), host=1, gpu=1)
282
277
 
283
- await allocator.allocate(spec)
278
+ alloc = allocator.allocate(spec)
279
+ await alloc.initialized
280
+
284
281
  self.assertEqual([host1], initializer.addrs)
285
282
 
286
- await allocator.allocate(spec)
283
+ alloc = allocator.allocate(spec)
284
+ await alloc.initialized
287
285
  self.assertEqual([], initializer.addrs)
288
286
 
289
287
  async def test_throws_when_initializer_returns_empty_addrs(self) -> None:
@@ -301,9 +299,10 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
301
299
  allocator = RemoteAllocator(
302
300
  world_id="test_remote_allocator",
303
301
  initializer=empty_initializer,
304
- heartbeat_interval=_100_MILLISECONDS,
305
302
  )
306
- await allocator.allocate(AllocSpec(AllocConstraints(), host=1, gpu=1))
303
+ await allocator.allocate(
304
+ AllocSpec(AllocConstraints(), host=1, gpu=1)
305
+ ).initialized
307
306
 
308
307
  async def test_allocate_2d_mesh(self) -> None:
309
308
  hosts = 2
@@ -316,10 +315,9 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
316
315
  allocator = RemoteAllocator(
317
316
  world_id="test_remote_allocator",
318
317
  initializer=StaticRemoteAllocInitializer(host1, host2),
319
- heartbeat_interval=_100_MILLISECONDS,
320
318
  )
321
- alloc = await allocator.allocate(spec)
322
- proc_mesh = await ProcMesh.from_alloc(alloc)
319
+ alloc = allocator.allocate(spec)
320
+ proc_mesh = ProcMesh.from_alloc(alloc)
323
321
  actor = await proc_mesh.spawn("test_actor", TestActor)
324
322
 
325
323
  values = await actor.compute_world_size.call(
@@ -335,11 +333,10 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
335
333
  allocator = RemoteAllocator(
336
334
  world_id="test_remote_allocator",
337
335
  initializer=StaticRemoteAllocInitializer(host1, host2),
338
- heartbeat_interval=_100_MILLISECONDS,
339
336
  )
340
337
 
341
- alloc = await allocator.allocate(spec)
342
- proc_mesh = await ProcMesh.from_alloc(alloc)
338
+ alloc = allocator.allocate(spec)
339
+ proc_mesh = ProcMesh.from_alloc(alloc)
343
340
  # XXX - it is not clear why this trying to use
344
341
  # async code in a sync context.
345
342
  with fake_sync_state():
@@ -348,7 +345,7 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
348
345
  with self.assertRaises(
349
346
  RuntimeError, msg="`ProcMesh` has already been stopped"
350
347
  ):
351
- proc_mesh.spawn("test_actor", TestActor).get()
348
+ proc_mesh.spawn("test_actor", TestActor).initialized.get()
352
349
  del actor
353
350
 
354
351
  async def test_wrong_address(self) -> None:
@@ -362,14 +359,14 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
362
359
  allocator = RemoteAllocator(
363
360
  world_id="test_remote_allocator",
364
361
  initializer=StaticRemoteAllocInitializer(wrong_host),
365
- heartbeat_interval=_100_MILLISECONDS,
366
362
  )
367
- alloc = await allocator.allocate(spec)
363
+ alloc = allocator.allocate(spec)
364
+ await alloc.initialized
368
365
 
369
366
  with self.assertRaisesRegex(
370
367
  Exception, r"no process has ever been allocated.*"
371
368
  ):
372
- await ProcMesh.from_alloc(alloc)
369
+ await ProcMesh.from_alloc(alloc).initialized
373
370
 
374
371
  async def test_init_failure(self) -> None:
375
372
  class FailInitActor(Actor):
@@ -385,10 +382,9 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
385
382
  allocator = RemoteAllocator(
386
383
  world_id="helloworld",
387
384
  initializer=StaticRemoteAllocInitializer(host1, host2),
388
- heartbeat_interval=_100_MILLISECONDS,
389
385
  )
390
386
  spec = AllocSpec(AllocConstraints(), host=2, gpu=2)
391
- proc_mesh = await ProcMesh.from_alloc(await allocator.allocate(spec))
387
+ proc_mesh = ProcMesh.from_alloc(allocator.allocate(spec))
392
388
  actor_mesh = await proc_mesh.spawn("actor", FailInitActor)
393
389
 
394
390
  with self.assertRaisesRegex(
@@ -405,10 +401,9 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
405
401
  allocator = RemoteAllocator(
406
402
  world_id="test_remote_allocator",
407
403
  initializer=StaticRemoteAllocInitializer(host1, host2),
408
- heartbeat_interval=_100_MILLISECONDS,
409
404
  )
410
- alloc = await allocator.allocate(spec)
411
- proc_mesh = await ProcMesh.from_alloc(alloc)
405
+ alloc = allocator.allocate(spec)
406
+ proc_mesh = ProcMesh.from_alloc(alloc)
412
407
  actor = await proc_mesh.spawn("test_actor", TestActor)
413
408
 
414
409
  await proc_mesh.stop()
@@ -416,7 +411,7 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
416
411
  with self.assertRaises(
417
412
  RuntimeError, msg="`ProcMesh` has already been stopped"
418
413
  ):
419
- await proc_mesh.spawn("test_actor", TestActor)
414
+ await proc_mesh.spawn("test_actor", TestActor).initialized
420
415
 
421
416
  # TODO(agallagher): It'd be nice to test that this just fails
422
417
  # immediately, trying to access the wrapped actor mesh, but right
@@ -431,10 +426,9 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
431
426
  allocator = RemoteAllocator(
432
427
  world_id="test_remote_allocator",
433
428
  initializer=StaticRemoteAllocInitializer(host1, host2),
434
- heartbeat_interval=_100_MILLISECONDS,
435
429
  )
436
- alloc = await allocator.allocate(spec)
437
- proc_mesh = await ProcMesh.from_alloc(alloc)
430
+ alloc = allocator.allocate(spec)
431
+ proc_mesh = ProcMesh.from_alloc(alloc)
438
432
  with self.assertRaises(ValueError, msg="foo"):
439
433
  async with proc_mesh:
440
434
  actor = await proc_mesh.spawn("test_actor", TestActor)
@@ -444,7 +438,7 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
444
438
  with self.assertRaises(
445
439
  RuntimeError, msg="`ProcMesh` has already been stopped"
446
440
  ):
447
- await proc_mesh.spawn("test_actor", TestActor)
441
+ await proc_mesh.spawn("test_actor", TestActor).initialized
448
442
 
449
443
  # TODO(agallagher): It'd be nice to test that this just fails
450
444
  # immediately, trying to access the wrapped actor mesh, but right
@@ -467,11 +461,10 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
467
461
  allocator = RemoteAllocator(
468
462
  world_id="test_remote_allocator",
469
463
  initializer=StaticRemoteAllocInitializer(host1, host2),
470
- heartbeat_interval=_100_MILLISECONDS,
471
464
  )
472
- alloc = await allocator.allocate(spec)
473
- proc_mesh = await ProcMesh.from_alloc(alloc, setup=setup_env_vars)
474
-
465
+ alloc = allocator.allocate(spec)
466
+ proc_mesh = ProcMesh.from_alloc(alloc, setup=setup_env_vars)
467
+ await proc_mesh.initialized
475
468
  try:
476
469
  actor = await proc_mesh.spawn("env_check", EnvCheckActor)
477
470
 
@@ -494,10 +487,9 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
494
487
  allocator = RemoteAllocator(
495
488
  world_id="test_remote_allocator",
496
489
  initializer=StaticRemoteAllocInitializer(host1, host2),
497
- heartbeat_interval=_100_MILLISECONDS,
498
490
  )
499
- alloc = await allocator.allocate(spec)
500
- proc_mesh = await ProcMesh.from_alloc(alloc)
491
+ alloc = allocator.allocate(spec)
492
+ proc_mesh = ProcMesh.from_alloc(alloc)
501
493
  # We can nest multiple context managers on the same mesh, the innermost
502
494
  # one closes the mesh and it cannot be used after that.
503
495
  async with proc_mesh:
@@ -507,7 +499,7 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
507
499
  with self.assertRaises(
508
500
  RuntimeError, msg="`ProcMesh` has already been stopped"
509
501
  ):
510
- await proc_mesh.spawn("test_actor", TestActor)
502
+ await proc_mesh.spawn("test_actor", TestActor).initialized
511
503
  # Exiting a second time should not raise an error.
512
504
 
513
505
  # TODO(agallagher): It'd be nice to test that this just fails
@@ -524,13 +516,12 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
524
516
  allocator = RemoteAllocator(
525
517
  world_id="test_remote_allocator",
526
518
  initializer=StaticRemoteAllocInitializer(host1),
527
- heartbeat_interval=_100_MILLISECONDS,
528
519
  )
529
520
  with self.assertRaisesRegex(
530
521
  Exception, "no process has ever been allocated on"
531
522
  ):
532
- alloc = await allocator.allocate(spec)
533
- await ProcMesh.from_alloc(alloc)
523
+ alloc = allocator.allocate(spec)
524
+ await ProcMesh.from_alloc(alloc).initialized
534
525
 
535
526
  async def test_stacked_1d_meshes(self) -> None:
536
527
  # create two stacked actor meshes on the same host
@@ -540,19 +531,17 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
540
531
  allocator_a = RemoteAllocator(
541
532
  world_id="a",
542
533
  initializer=StaticRemoteAllocInitializer(host1_a),
543
- heartbeat_interval=_100_MILLISECONDS,
544
534
  )
545
535
  allocator_b = RemoteAllocator(
546
536
  world_id="b",
547
537
  initializer=StaticRemoteAllocInitializer(host1_b),
548
- heartbeat_interval=_100_MILLISECONDS,
549
538
  )
550
539
 
551
540
  spec_a = AllocSpec(AllocConstraints(), host=1, gpu=2)
552
541
  spec_b = AllocSpec(AllocConstraints(), host=1, gpu=6)
553
542
 
554
- proc_mesh_a = await ProcMesh.from_alloc(await allocator_a.allocate(spec_a))
555
- proc_mesh_b = await ProcMesh.from_alloc(await allocator_b.allocate(spec_b))
543
+ proc_mesh_a = ProcMesh.from_alloc(allocator_a.allocate(spec_a))
544
+ proc_mesh_b = ProcMesh.from_alloc(allocator_b.allocate(spec_b))
556
545
 
557
546
  actor_a = await proc_mesh_a.spawn("actor_a", TestActor)
558
547
  actor_b = await proc_mesh_b.spawn("actor_b", TestActor)
@@ -576,7 +565,9 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
576
565
  RuntimeError,
577
566
  r"slurm:///123 does not exist or is in a terminal state",
578
567
  ):
579
- await allocator.allocate(AllocSpec(AllocConstraints(), host=1, gpu=1))
568
+ await allocator.allocate(
569
+ AllocSpec(AllocConstraints(), host=1, gpu=1)
570
+ ).initialized
580
571
 
581
572
  async def test_torchx_remote_alloc_initializer_no_match_label_gt_1_meshes(
582
573
  self,
@@ -599,9 +590,10 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
599
590
  RuntimeError,
600
591
  r"2 proc meshes in slurm:///123, please specify the mesh name as a match label `procmesh.monarch.meta.com/name`",
601
592
  ):
602
- await allocator.allocate(AllocSpec(AllocConstraints(), host=1, gpu=1))
593
+ await allocator.allocate(
594
+ AllocSpec(AllocConstraints(), host=1, gpu=1)
595
+ ).initialized
603
596
 
604
- @pytest.mark.oss_skip # pyre-ignore[56] TODO T228752279
605
597
  async def test_torchx_remote_alloc_initializer_no_match_label_1_mesh(self) -> None:
606
598
  server = ServerSpec(
607
599
  name=UNUSED,
@@ -612,30 +604,26 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
612
604
  name="x",
613
605
  num_hosts=1,
614
606
  transport="tcp",
615
- hostnames=["localhost"],
607
+ hostnames=["0.0.0.0"],
616
608
  )
617
609
  ],
618
610
  )
619
611
  port = get_free_port()
620
- with remote_process_allocator(addr=f"tcp!{get_sockaddr('localhost', port)}"):
612
+ with remote_process_allocator(addr=f"tcp!{get_sockaddr('0.0.0.0', port)}"):
621
613
  with mock.patch(SERVER_READY, return_value=server):
622
614
  initializer = TorchXRemoteAllocInitializer("local:///test", port=port)
623
615
  allocator = RemoteAllocator(
624
616
  world_id="test",
625
617
  initializer=initializer,
626
- heartbeat_interval=_100_MILLISECONDS,
627
- )
628
- alloc = await allocator.allocate(
629
- AllocSpec(AllocConstraints(), host=1, gpu=4)
630
618
  )
631
- proc_mesh = await ProcMesh.from_alloc(alloc)
619
+ alloc = allocator.allocate(AllocSpec(AllocConstraints(), host=1, gpu=4))
620
+ proc_mesh = ProcMesh.from_alloc(alloc)
632
621
  actor = await proc_mesh.spawn("test_actor", TestActor)
633
622
  results = await actor.compute_world_size.call(
634
623
  master_addr="0.0.0.0", master_port=get_free_port()
635
624
  )
636
625
  self.assert_computed_world_size(results, 4) # 1x4 mesh
637
626
 
638
- @pytest.mark.oss_skip # pyre-ignore[56] TODO T228752279
639
627
  async def test_torchx_remote_alloc_initializer_with_match_label(self) -> None:
640
628
  server = ServerSpec(
641
629
  name=UNUSED,
@@ -646,20 +634,19 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
646
634
  name="x",
647
635
  num_hosts=1,
648
636
  transport="tcp",
649
- hostnames=["localhost"],
637
+ hostnames=["0.0.0.0"],
650
638
  )
651
639
  ],
652
640
  )
653
641
  port = get_free_port()
654
- with remote_process_allocator(addr=f"tcp!{get_sockaddr('localhost', port)}"):
642
+ with remote_process_allocator(addr=f"tcp!{get_sockaddr('0.0.0.0', port)}"):
655
643
  with mock.patch(SERVER_READY, return_value=server):
656
644
  initializer = TorchXRemoteAllocInitializer("local:///test", port=port)
657
645
  allocator = RemoteAllocator(
658
646
  world_id="test",
659
647
  initializer=initializer,
660
- heartbeat_interval=_100_MILLISECONDS,
661
648
  )
662
- alloc = await allocator.allocate(
649
+ alloc = allocator.allocate(
663
650
  AllocSpec(
664
651
  AllocConstraints(
665
652
  match_labels={ALLOC_LABEL_PROC_MESH_NAME: "x"}
@@ -668,7 +655,7 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
668
655
  gpu=3,
669
656
  )
670
657
  )
671
- proc_mesh = await ProcMesh.from_alloc(alloc)
658
+ proc_mesh = ProcMesh.from_alloc(alloc)
672
659
  actor = await proc_mesh.spawn("test_actor", TestActor)
673
660
  results = await actor.compute_world_size.call(
674
661
  master_addr="0.0.0.0", master_port=get_free_port()
@@ -698,7 +685,7 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
698
685
  with self.assertRaisesRegex(RuntimeError, r"'y' not found in job: test"):
699
686
  initializer = TorchXRemoteAllocInitializer("local:///test")
700
687
  allocator = RemoteAllocator(world_id="test", initializer=initializer)
701
- alloc = await allocator.allocate(
688
+ alloc = allocator.allocate(
702
689
  AllocSpec(
703
690
  AllocConstraints(
704
691
  match_labels={ALLOC_LABEL_PROC_MESH_NAME: "y"}
@@ -707,7 +694,8 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
707
694
  gpu=1,
708
695
  )
709
696
  )
710
- await ProcMesh.from_alloc(alloc)
697
+ await alloc.initialized
698
+ await ProcMesh.from_alloc(alloc).initialized
711
699
 
712
700
  async def test_log(self) -> None:
713
701
  # create a mesh to log to both stdout and stderr
@@ -716,12 +704,11 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
716
704
  allocator = RemoteAllocator(
717
705
  world_id="test_actor_logger",
718
706
  initializer=StaticRemoteAllocInitializer(host),
719
- heartbeat_interval=_100_MILLISECONDS,
720
707
  )
721
708
 
722
709
  spec = AllocSpec(AllocConstraints(), host=1, gpu=2)
723
710
 
724
- proc_mesh = await ProcMesh.from_alloc(await allocator.allocate(spec))
711
+ proc_mesh = ProcMesh.from_alloc(allocator.allocate(spec))
725
712
 
726
713
  # Generate aggregated log every 1 second.
727
714
  await proc_mesh.logging_option(True, 1)
tests/test_coalescing.py CHANGED
@@ -78,7 +78,7 @@ class TestCoalescing:
78
78
  num_hosts,
79
79
  gpu_per_host,
80
80
  activate,
81
- backend=str(backend_type),
81
+ backend=backend_type.value,
82
82
  )
83
83
 
84
84
  @property