torchmonarch-nightly 2025.8.2__cp313-cp313-manylinux2014_x86_64.whl → 2025.9.4__cp313-cp313-manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- monarch/_rust_bindings.so +0 -0
- monarch/_src/actor/actor_mesh.py +504 -218
- monarch/_src/actor/allocator.py +75 -6
- monarch/_src/actor/bootstrap_main.py +7 -4
- monarch/_src/actor/code_sync/__init__.py +2 -0
- monarch/_src/actor/debugger/__init__.py +7 -0
- monarch/_src/actor/{debugger.py → debugger/debugger.py} +246 -135
- monarch/_src/actor/{pdb_wrapper.py → debugger/pdb_wrapper.py} +62 -23
- monarch/_src/actor/endpoint.py +27 -45
- monarch/_src/actor/future.py +86 -24
- monarch/_src/actor/host_mesh.py +125 -0
- monarch/_src/actor/logging.py +94 -0
- monarch/_src/actor/pickle.py +25 -0
- monarch/_src/actor/proc_mesh.py +423 -156
- monarch/_src/actor/python_extension_methods.py +90 -0
- monarch/_src/actor/shape.py +8 -1
- monarch/_src/actor/source_loader.py +45 -0
- monarch/_src/actor/telemetry/__init__.py +172 -0
- monarch/_src/actor/telemetry/rust_span_tracing.py +6 -39
- monarch/_src/debug_cli/__init__.py +7 -0
- monarch/_src/debug_cli/debug_cli.py +43 -0
- monarch/_src/tensor_engine/rdma.py +64 -9
- monarch/_testing.py +1 -3
- monarch/actor/__init__.py +28 -4
- monarch/common/_C.so +0 -0
- monarch/common/device_mesh.py +14 -0
- monarch/common/future.py +10 -0
- monarch/common/remote.py +14 -25
- monarch/common/tensor.py +12 -0
- monarch/debug_cli/__init__.py +7 -0
- monarch/debug_cli/__main__.py +12 -0
- monarch/fetch.py +2 -2
- monarch/gradient/_gradient_generator.so +0 -0
- monarch/gradient_generator.py +4 -2
- monarch/mesh_controller.py +34 -14
- monarch/monarch_controller +0 -0
- monarch/tools/colors.py +25 -0
- monarch/tools/commands.py +42 -7
- monarch/tools/components/hyperactor.py +6 -4
- monarch/tools/config/__init__.py +35 -12
- monarch/tools/config/defaults.py +15 -5
- monarch/tools/config/environment.py +45 -0
- monarch/tools/config/workspace.py +165 -0
- monarch/tools/mesh_spec.py +3 -3
- monarch/utils/__init__.py +9 -0
- monarch/utils/utils.py +78 -0
- tests/error_test_binary.py +5 -3
- tests/python_actor_test_binary.py +52 -0
- tests/test_actor_error.py +142 -14
- tests/test_alloc.py +1 -1
- tests/test_allocator.py +59 -72
- tests/test_debugger.py +639 -45
- tests/test_env_before_cuda.py +4 -4
- tests/test_mesh_trait.py +38 -0
- tests/test_python_actors.py +965 -75
- tests/test_rdma.py +7 -6
- tests/test_tensor_engine.py +6 -6
- {torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.4.dist-info}/METADATA +82 -4
- {torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.4.dist-info}/RECORD +63 -47
- {torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.4.dist-info}/WHEEL +0 -0
- {torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.4.dist-info}/entry_points.txt +0 -0
- {torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.4.dist-info}/licenses/LICENSE +0 -0
- {torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.4.dist-info}/top_level.txt +0 -0
tests/test_allocator.py
CHANGED
@@ -15,13 +15,11 @@ import os
|
|
15
15
|
import subprocess
|
16
16
|
import sys
|
17
17
|
import unittest
|
18
|
-
from datetime import timedelta
|
19
18
|
from time import sleep
|
20
19
|
from typing import Generator, Optional
|
21
20
|
from unittest import mock
|
22
21
|
|
23
22
|
import cloudpickle
|
24
|
-
import pytest
|
25
23
|
|
26
24
|
import torch
|
27
25
|
import torch.distributed as dist
|
@@ -55,8 +53,6 @@ from monarch.tools.network import get_sockaddr
|
|
55
53
|
from torch.distributed.elastic.utils.distributed import get_free_port
|
56
54
|
from torchx.specs import AppState
|
57
55
|
|
58
|
-
_100_MILLISECONDS = timedelta(milliseconds=100)
|
59
|
-
|
60
56
|
SERVER_READY = "monarch.tools.commands.server_ready"
|
61
57
|
UNUSED = "__UNUSED__"
|
62
58
|
|
@@ -170,10 +166,9 @@ class TestSetupActorInAllocator(unittest.IsolatedAsyncioTestCase):
|
|
170
166
|
|
171
167
|
spec = AllocSpec(AllocConstraints(), gpus=1, hosts=1)
|
172
168
|
allocator = LocalAllocator()
|
173
|
-
alloc =
|
174
|
-
|
175
|
-
proc_mesh = await ProcMesh.from_alloc(alloc, setup=setup_multiple_env_vars)
|
169
|
+
alloc = allocator.allocate(spec)
|
176
170
|
|
171
|
+
proc_mesh = ProcMesh.from_alloc(alloc, setup=setup_multiple_env_vars)
|
177
172
|
try:
|
178
173
|
actor = await proc_mesh.spawn("env_check", EnvCheckActor)
|
179
174
|
|
@@ -197,9 +192,9 @@ class TestSetupActorInAllocator(unittest.IsolatedAsyncioTestCase):
|
|
197
192
|
|
198
193
|
spec = AllocSpec(AllocConstraints(), gpus=1, hosts=1)
|
199
194
|
allocator = LocalAllocator()
|
200
|
-
alloc =
|
195
|
+
alloc = allocator.allocate(spec)
|
201
196
|
|
202
|
-
proc_mesh =
|
197
|
+
proc_mesh = ProcMesh.from_alloc(alloc, setup=setup_with_rank)
|
203
198
|
|
204
199
|
try:
|
205
200
|
actor = await proc_mesh.spawn("env_check", EnvCheckActor)
|
@@ -253,10 +248,11 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
|
|
253
248
|
allocator = RemoteAllocator(
|
254
249
|
world_id="test_remote_allocator",
|
255
250
|
initializer=StaticRemoteAllocInitializer(host1, host2),
|
256
|
-
heartbeat_interval=_100_MILLISECONDS,
|
257
251
|
)
|
258
|
-
alloc =
|
259
|
-
await
|
252
|
+
alloc = allocator.allocate(spec)
|
253
|
+
await alloc.initialized
|
254
|
+
pm = ProcMesh.from_alloc(alloc)
|
255
|
+
await pm.initialized
|
260
256
|
|
261
257
|
async def test_call_allocate_twice(self) -> None:
|
262
258
|
class DeletingAllocInitializer(StaticRemoteAllocInitializer):
|
@@ -275,15 +271,17 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
|
|
275
271
|
allocator = RemoteAllocator(
|
276
272
|
world_id="test_remote_allocator",
|
277
273
|
initializer=initializer,
|
278
|
-
heartbeat_interval=_100_MILLISECONDS,
|
279
274
|
)
|
280
275
|
|
281
276
|
spec = AllocSpec(AllocConstraints(), host=1, gpu=1)
|
282
277
|
|
283
|
-
|
278
|
+
alloc = allocator.allocate(spec)
|
279
|
+
await alloc.initialized
|
280
|
+
|
284
281
|
self.assertEqual([host1], initializer.addrs)
|
285
282
|
|
286
|
-
|
283
|
+
alloc = allocator.allocate(spec)
|
284
|
+
await alloc.initialized
|
287
285
|
self.assertEqual([], initializer.addrs)
|
288
286
|
|
289
287
|
async def test_throws_when_initializer_returns_empty_addrs(self) -> None:
|
@@ -301,9 +299,10 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
|
|
301
299
|
allocator = RemoteAllocator(
|
302
300
|
world_id="test_remote_allocator",
|
303
301
|
initializer=empty_initializer,
|
304
|
-
heartbeat_interval=_100_MILLISECONDS,
|
305
302
|
)
|
306
|
-
await allocator.allocate(
|
303
|
+
await allocator.allocate(
|
304
|
+
AllocSpec(AllocConstraints(), host=1, gpu=1)
|
305
|
+
).initialized
|
307
306
|
|
308
307
|
async def test_allocate_2d_mesh(self) -> None:
|
309
308
|
hosts = 2
|
@@ -316,10 +315,9 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
|
|
316
315
|
allocator = RemoteAllocator(
|
317
316
|
world_id="test_remote_allocator",
|
318
317
|
initializer=StaticRemoteAllocInitializer(host1, host2),
|
319
|
-
heartbeat_interval=_100_MILLISECONDS,
|
320
318
|
)
|
321
|
-
alloc =
|
322
|
-
proc_mesh =
|
319
|
+
alloc = allocator.allocate(spec)
|
320
|
+
proc_mesh = ProcMesh.from_alloc(alloc)
|
323
321
|
actor = await proc_mesh.spawn("test_actor", TestActor)
|
324
322
|
|
325
323
|
values = await actor.compute_world_size.call(
|
@@ -335,11 +333,10 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
|
|
335
333
|
allocator = RemoteAllocator(
|
336
334
|
world_id="test_remote_allocator",
|
337
335
|
initializer=StaticRemoteAllocInitializer(host1, host2),
|
338
|
-
heartbeat_interval=_100_MILLISECONDS,
|
339
336
|
)
|
340
337
|
|
341
|
-
alloc =
|
342
|
-
proc_mesh =
|
338
|
+
alloc = allocator.allocate(spec)
|
339
|
+
proc_mesh = ProcMesh.from_alloc(alloc)
|
343
340
|
# XXX - it is not clear why this trying to use
|
344
341
|
# async code in a sync context.
|
345
342
|
with fake_sync_state():
|
@@ -348,7 +345,7 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
|
|
348
345
|
with self.assertRaises(
|
349
346
|
RuntimeError, msg="`ProcMesh` has already been stopped"
|
350
347
|
):
|
351
|
-
proc_mesh.spawn("test_actor", TestActor).get()
|
348
|
+
proc_mesh.spawn("test_actor", TestActor).initialized.get()
|
352
349
|
del actor
|
353
350
|
|
354
351
|
async def test_wrong_address(self) -> None:
|
@@ -362,14 +359,14 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
|
|
362
359
|
allocator = RemoteAllocator(
|
363
360
|
world_id="test_remote_allocator",
|
364
361
|
initializer=StaticRemoteAllocInitializer(wrong_host),
|
365
|
-
heartbeat_interval=_100_MILLISECONDS,
|
366
362
|
)
|
367
|
-
alloc =
|
363
|
+
alloc = allocator.allocate(spec)
|
364
|
+
await alloc.initialized
|
368
365
|
|
369
366
|
with self.assertRaisesRegex(
|
370
367
|
Exception, r"no process has ever been allocated.*"
|
371
368
|
):
|
372
|
-
await ProcMesh.from_alloc(alloc)
|
369
|
+
await ProcMesh.from_alloc(alloc).initialized
|
373
370
|
|
374
371
|
async def test_init_failure(self) -> None:
|
375
372
|
class FailInitActor(Actor):
|
@@ -385,10 +382,9 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
|
|
385
382
|
allocator = RemoteAllocator(
|
386
383
|
world_id="helloworld",
|
387
384
|
initializer=StaticRemoteAllocInitializer(host1, host2),
|
388
|
-
heartbeat_interval=_100_MILLISECONDS,
|
389
385
|
)
|
390
386
|
spec = AllocSpec(AllocConstraints(), host=2, gpu=2)
|
391
|
-
proc_mesh =
|
387
|
+
proc_mesh = ProcMesh.from_alloc(allocator.allocate(spec))
|
392
388
|
actor_mesh = await proc_mesh.spawn("actor", FailInitActor)
|
393
389
|
|
394
390
|
with self.assertRaisesRegex(
|
@@ -405,10 +401,9 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
|
|
405
401
|
allocator = RemoteAllocator(
|
406
402
|
world_id="test_remote_allocator",
|
407
403
|
initializer=StaticRemoteAllocInitializer(host1, host2),
|
408
|
-
heartbeat_interval=_100_MILLISECONDS,
|
409
404
|
)
|
410
|
-
alloc =
|
411
|
-
proc_mesh =
|
405
|
+
alloc = allocator.allocate(spec)
|
406
|
+
proc_mesh = ProcMesh.from_alloc(alloc)
|
412
407
|
actor = await proc_mesh.spawn("test_actor", TestActor)
|
413
408
|
|
414
409
|
await proc_mesh.stop()
|
@@ -416,7 +411,7 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
|
|
416
411
|
with self.assertRaises(
|
417
412
|
RuntimeError, msg="`ProcMesh` has already been stopped"
|
418
413
|
):
|
419
|
-
await proc_mesh.spawn("test_actor", TestActor)
|
414
|
+
await proc_mesh.spawn("test_actor", TestActor).initialized
|
420
415
|
|
421
416
|
# TODO(agallagher): It'd be nice to test that this just fails
|
422
417
|
# immediately, trying to access the wrapped actor mesh, but right
|
@@ -431,10 +426,9 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
|
|
431
426
|
allocator = RemoteAllocator(
|
432
427
|
world_id="test_remote_allocator",
|
433
428
|
initializer=StaticRemoteAllocInitializer(host1, host2),
|
434
|
-
heartbeat_interval=_100_MILLISECONDS,
|
435
429
|
)
|
436
|
-
alloc =
|
437
|
-
proc_mesh =
|
430
|
+
alloc = allocator.allocate(spec)
|
431
|
+
proc_mesh = ProcMesh.from_alloc(alloc)
|
438
432
|
with self.assertRaises(ValueError, msg="foo"):
|
439
433
|
async with proc_mesh:
|
440
434
|
actor = await proc_mesh.spawn("test_actor", TestActor)
|
@@ -444,7 +438,7 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
|
|
444
438
|
with self.assertRaises(
|
445
439
|
RuntimeError, msg="`ProcMesh` has already been stopped"
|
446
440
|
):
|
447
|
-
await proc_mesh.spawn("test_actor", TestActor)
|
441
|
+
await proc_mesh.spawn("test_actor", TestActor).initialized
|
448
442
|
|
449
443
|
# TODO(agallagher): It'd be nice to test that this just fails
|
450
444
|
# immediately, trying to access the wrapped actor mesh, but right
|
@@ -467,11 +461,10 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
|
|
467
461
|
allocator = RemoteAllocator(
|
468
462
|
world_id="test_remote_allocator",
|
469
463
|
initializer=StaticRemoteAllocInitializer(host1, host2),
|
470
|
-
heartbeat_interval=_100_MILLISECONDS,
|
471
464
|
)
|
472
|
-
alloc =
|
473
|
-
proc_mesh =
|
474
|
-
|
465
|
+
alloc = allocator.allocate(spec)
|
466
|
+
proc_mesh = ProcMesh.from_alloc(alloc, setup=setup_env_vars)
|
467
|
+
await proc_mesh.initialized
|
475
468
|
try:
|
476
469
|
actor = await proc_mesh.spawn("env_check", EnvCheckActor)
|
477
470
|
|
@@ -494,10 +487,9 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
|
|
494
487
|
allocator = RemoteAllocator(
|
495
488
|
world_id="test_remote_allocator",
|
496
489
|
initializer=StaticRemoteAllocInitializer(host1, host2),
|
497
|
-
heartbeat_interval=_100_MILLISECONDS,
|
498
490
|
)
|
499
|
-
alloc =
|
500
|
-
proc_mesh =
|
491
|
+
alloc = allocator.allocate(spec)
|
492
|
+
proc_mesh = ProcMesh.from_alloc(alloc)
|
501
493
|
# We can nest multiple context managers on the same mesh, the innermost
|
502
494
|
# one closes the mesh and it cannot be used after that.
|
503
495
|
async with proc_mesh:
|
@@ -507,7 +499,7 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
|
|
507
499
|
with self.assertRaises(
|
508
500
|
RuntimeError, msg="`ProcMesh` has already been stopped"
|
509
501
|
):
|
510
|
-
await proc_mesh.spawn("test_actor", TestActor)
|
502
|
+
await proc_mesh.spawn("test_actor", TestActor).initialized
|
511
503
|
# Exiting a second time should not raise an error.
|
512
504
|
|
513
505
|
# TODO(agallagher): It'd be nice to test that this just fails
|
@@ -524,13 +516,12 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
|
|
524
516
|
allocator = RemoteAllocator(
|
525
517
|
world_id="test_remote_allocator",
|
526
518
|
initializer=StaticRemoteAllocInitializer(host1),
|
527
|
-
heartbeat_interval=_100_MILLISECONDS,
|
528
519
|
)
|
529
520
|
with self.assertRaisesRegex(
|
530
521
|
Exception, "no process has ever been allocated on"
|
531
522
|
):
|
532
|
-
alloc =
|
533
|
-
await ProcMesh.from_alloc(alloc)
|
523
|
+
alloc = allocator.allocate(spec)
|
524
|
+
await ProcMesh.from_alloc(alloc).initialized
|
534
525
|
|
535
526
|
async def test_stacked_1d_meshes(self) -> None:
|
536
527
|
# create two stacked actor meshes on the same host
|
@@ -540,19 +531,17 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
|
|
540
531
|
allocator_a = RemoteAllocator(
|
541
532
|
world_id="a",
|
542
533
|
initializer=StaticRemoteAllocInitializer(host1_a),
|
543
|
-
heartbeat_interval=_100_MILLISECONDS,
|
544
534
|
)
|
545
535
|
allocator_b = RemoteAllocator(
|
546
536
|
world_id="b",
|
547
537
|
initializer=StaticRemoteAllocInitializer(host1_b),
|
548
|
-
heartbeat_interval=_100_MILLISECONDS,
|
549
538
|
)
|
550
539
|
|
551
540
|
spec_a = AllocSpec(AllocConstraints(), host=1, gpu=2)
|
552
541
|
spec_b = AllocSpec(AllocConstraints(), host=1, gpu=6)
|
553
542
|
|
554
|
-
proc_mesh_a =
|
555
|
-
proc_mesh_b =
|
543
|
+
proc_mesh_a = ProcMesh.from_alloc(allocator_a.allocate(spec_a))
|
544
|
+
proc_mesh_b = ProcMesh.from_alloc(allocator_b.allocate(spec_b))
|
556
545
|
|
557
546
|
actor_a = await proc_mesh_a.spawn("actor_a", TestActor)
|
558
547
|
actor_b = await proc_mesh_b.spawn("actor_b", TestActor)
|
@@ -576,7 +565,9 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
|
|
576
565
|
RuntimeError,
|
577
566
|
r"slurm:///123 does not exist or is in a terminal state",
|
578
567
|
):
|
579
|
-
await allocator.allocate(
|
568
|
+
await allocator.allocate(
|
569
|
+
AllocSpec(AllocConstraints(), host=1, gpu=1)
|
570
|
+
).initialized
|
580
571
|
|
581
572
|
async def test_torchx_remote_alloc_initializer_no_match_label_gt_1_meshes(
|
582
573
|
self,
|
@@ -599,9 +590,10 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
|
|
599
590
|
RuntimeError,
|
600
591
|
r"2 proc meshes in slurm:///123, please specify the mesh name as a match label `procmesh.monarch.meta.com/name`",
|
601
592
|
):
|
602
|
-
await allocator.allocate(
|
593
|
+
await allocator.allocate(
|
594
|
+
AllocSpec(AllocConstraints(), host=1, gpu=1)
|
595
|
+
).initialized
|
603
596
|
|
604
|
-
@pytest.mark.oss_skip # pyre-ignore[56] TODO T228752279
|
605
597
|
async def test_torchx_remote_alloc_initializer_no_match_label_1_mesh(self) -> None:
|
606
598
|
server = ServerSpec(
|
607
599
|
name=UNUSED,
|
@@ -612,30 +604,26 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
|
|
612
604
|
name="x",
|
613
605
|
num_hosts=1,
|
614
606
|
transport="tcp",
|
615
|
-
hostnames=["
|
607
|
+
hostnames=["0.0.0.0"],
|
616
608
|
)
|
617
609
|
],
|
618
610
|
)
|
619
611
|
port = get_free_port()
|
620
|
-
with remote_process_allocator(addr=f"tcp!{get_sockaddr('
|
612
|
+
with remote_process_allocator(addr=f"tcp!{get_sockaddr('0.0.0.0', port)}"):
|
621
613
|
with mock.patch(SERVER_READY, return_value=server):
|
622
614
|
initializer = TorchXRemoteAllocInitializer("local:///test", port=port)
|
623
615
|
allocator = RemoteAllocator(
|
624
616
|
world_id="test",
|
625
617
|
initializer=initializer,
|
626
|
-
heartbeat_interval=_100_MILLISECONDS,
|
627
|
-
)
|
628
|
-
alloc = await allocator.allocate(
|
629
|
-
AllocSpec(AllocConstraints(), host=1, gpu=4)
|
630
618
|
)
|
631
|
-
|
619
|
+
alloc = allocator.allocate(AllocSpec(AllocConstraints(), host=1, gpu=4))
|
620
|
+
proc_mesh = ProcMesh.from_alloc(alloc)
|
632
621
|
actor = await proc_mesh.spawn("test_actor", TestActor)
|
633
622
|
results = await actor.compute_world_size.call(
|
634
623
|
master_addr="0.0.0.0", master_port=get_free_port()
|
635
624
|
)
|
636
625
|
self.assert_computed_world_size(results, 4) # 1x4 mesh
|
637
626
|
|
638
|
-
@pytest.mark.oss_skip # pyre-ignore[56] TODO T228752279
|
639
627
|
async def test_torchx_remote_alloc_initializer_with_match_label(self) -> None:
|
640
628
|
server = ServerSpec(
|
641
629
|
name=UNUSED,
|
@@ -646,20 +634,19 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
|
|
646
634
|
name="x",
|
647
635
|
num_hosts=1,
|
648
636
|
transport="tcp",
|
649
|
-
hostnames=["
|
637
|
+
hostnames=["0.0.0.0"],
|
650
638
|
)
|
651
639
|
],
|
652
640
|
)
|
653
641
|
port = get_free_port()
|
654
|
-
with remote_process_allocator(addr=f"tcp!{get_sockaddr('
|
642
|
+
with remote_process_allocator(addr=f"tcp!{get_sockaddr('0.0.0.0', port)}"):
|
655
643
|
with mock.patch(SERVER_READY, return_value=server):
|
656
644
|
initializer = TorchXRemoteAllocInitializer("local:///test", port=port)
|
657
645
|
allocator = RemoteAllocator(
|
658
646
|
world_id="test",
|
659
647
|
initializer=initializer,
|
660
|
-
heartbeat_interval=_100_MILLISECONDS,
|
661
648
|
)
|
662
|
-
alloc =
|
649
|
+
alloc = allocator.allocate(
|
663
650
|
AllocSpec(
|
664
651
|
AllocConstraints(
|
665
652
|
match_labels={ALLOC_LABEL_PROC_MESH_NAME: "x"}
|
@@ -668,7 +655,7 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
|
|
668
655
|
gpu=3,
|
669
656
|
)
|
670
657
|
)
|
671
|
-
proc_mesh =
|
658
|
+
proc_mesh = ProcMesh.from_alloc(alloc)
|
672
659
|
actor = await proc_mesh.spawn("test_actor", TestActor)
|
673
660
|
results = await actor.compute_world_size.call(
|
674
661
|
master_addr="0.0.0.0", master_port=get_free_port()
|
@@ -698,7 +685,7 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
|
|
698
685
|
with self.assertRaisesRegex(RuntimeError, r"'y' not found in job: test"):
|
699
686
|
initializer = TorchXRemoteAllocInitializer("local:///test")
|
700
687
|
allocator = RemoteAllocator(world_id="test", initializer=initializer)
|
701
|
-
alloc =
|
688
|
+
alloc = allocator.allocate(
|
702
689
|
AllocSpec(
|
703
690
|
AllocConstraints(
|
704
691
|
match_labels={ALLOC_LABEL_PROC_MESH_NAME: "y"}
|
@@ -707,7 +694,8 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
|
|
707
694
|
gpu=1,
|
708
695
|
)
|
709
696
|
)
|
710
|
-
await
|
697
|
+
await alloc.initialized
|
698
|
+
await ProcMesh.from_alloc(alloc).initialized
|
711
699
|
|
712
700
|
async def test_log(self) -> None:
|
713
701
|
# create a mesh to log to both stdout and stderr
|
@@ -716,12 +704,11 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
|
|
716
704
|
allocator = RemoteAllocator(
|
717
705
|
world_id="test_actor_logger",
|
718
706
|
initializer=StaticRemoteAllocInitializer(host),
|
719
|
-
heartbeat_interval=_100_MILLISECONDS,
|
720
707
|
)
|
721
708
|
|
722
709
|
spec = AllocSpec(AllocConstraints(), host=1, gpu=2)
|
723
710
|
|
724
|
-
proc_mesh =
|
711
|
+
proc_mesh = ProcMesh.from_alloc(allocator.allocate(spec))
|
725
712
|
|
726
713
|
# Generate aggregated log every 1 second.
|
727
714
|
await proc_mesh.logging_option(True, 1)
|