torchmonarch-nightly 2025.7.1__cp313-cp313-manylinux2014_x86_64.whl → 2025.7.26__cp313-cp313-manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- monarch/__init__.py +13 -9
- monarch/_rust_bindings.so +0 -0
- monarch/{_monarch/selection → _src/actor}/__init__.py +3 -7
- monarch/_src/actor/actor_mesh.py +878 -0
- monarch/{allocator.py → _src/actor/allocator.py} +26 -17
- monarch/_src/actor/bootstrap_main.py +73 -0
- monarch/{code_sync.py → _src/actor/code_sync/__init__.py} +3 -1
- monarch/_src/actor/code_sync/auto_reload.py +223 -0
- monarch/_src/actor/debugger.py +565 -0
- monarch/_src/actor/endpoint.py +303 -0
- monarch/_src/actor/event_loop.py +97 -0
- monarch/_src/actor/future.py +100 -0
- monarch/{pdb_wrapper.py → _src/actor/pdb_wrapper.py} +47 -46
- monarch/{common/pickle_flatten.py → _src/actor/pickle.py} +26 -2
- monarch/_src/actor/proc_mesh.py +508 -0
- monarch/_src/actor/sync_state.py +18 -0
- monarch/{telemetry.py → _src/actor/telemetry/__init__.py} +1 -1
- monarch/_src/actor/telemetry/rust_span_tracing.py +159 -0
- monarch/_src/actor/tensor_engine_shim.py +59 -0
- monarch/_src/tensor_engine/rdma.py +180 -0
- monarch/_testing.py +3 -2
- monarch/actor/__init__.py +53 -0
- monarch/actor_mesh.py +6 -765
- monarch/bootstrap_main.py +8 -47
- monarch/common/client.py +1 -1
- monarch/common/controller_api.py +2 -1
- monarch/common/device_mesh.py +12 -2
- monarch/common/messages.py +21 -1
- monarch/common/recording.py +4 -3
- monarch/common/remote.py +135 -52
- monarch/common/tensor.py +2 -1
- monarch/controller/backend.py +2 -2
- monarch/controller/controller.py +2 -1
- monarch/controller/rust_backend/controller.py +2 -1
- monarch/fetch.py +3 -5
- monarch/gradient/_gradient_generator.so +0 -0
- monarch/mesh_controller.py +263 -139
- monarch/monarch_controller +0 -0
- monarch/opaque_module.py +4 -6
- monarch/opaque_object.py +3 -3
- monarch/proc_mesh.py +6 -309
- monarch/python_local_mesh.py +1 -1
- monarch/rust_backend_mesh.py +2 -1
- monarch/rust_local_mesh.py +4 -2
- monarch/sim_mesh.py +10 -19
- monarch/simulator/command_history.py +1 -1
- monarch/simulator/interface.py +2 -1
- monarch/simulator/mock_controller.py +1 -1
- monarch/simulator/simulator.py +1 -1
- monarch/tensor_engine/__init__.py +23 -0
- monarch/tensor_worker_main.py +3 -1
- monarch/tools/cli.py +3 -1
- monarch/tools/commands.py +129 -47
- monarch/tools/components/hyperactor.py +5 -3
- monarch/tools/config/__init__.py +18 -1
- monarch/tools/config/defaults.py +2 -2
- monarch/tools/mesh_spec.py +59 -1
- monarch/tools/utils.py +38 -0
- monarch/worker/worker.py +1 -1
- monarch/world_mesh.py +2 -1
- monarch_supervisor/python_executable.py +6 -3
- tests/error_test_binary.py +48 -10
- tests/test_actor_error.py +370 -21
- tests/test_alloc.py +1 -1
- tests/test_allocator.py +369 -17
- tests/test_controller.py +2 -0
- tests/test_debugger.py +416 -0
- tests/test_env_before_cuda.py +161 -0
- tests/test_python_actors.py +184 -333
- tests/test_rdma.py +198 -0
- tests/test_remote_functions.py +40 -12
- tests/test_rust_backend.py +7 -5
- tests/test_sim_backend.py +1 -4
- tests/test_tensor_engine.py +81 -1
- {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.26.dist-info}/METADATA +39 -1
- {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.26.dist-info}/RECORD +84 -72
- torchmonarch_nightly-2025.7.26.dist-info/entry_points.txt +3 -0
- monarch/_monarch/hyperactor/__init__.py +0 -58
- monarch/_monarch/worker/debugger.py +0 -117
- monarch/_monarch/worker/logging.py +0 -107
- monarch/debugger.py +0 -379
- monarch/future.py +0 -76
- monarch/rdma.py +0 -162
- torchmonarch_nightly-2025.7.1.dist-info/entry_points.txt +0 -3
- /monarch/{_monarch/worker → _src}/__init__.py +0 -0
- /monarch/{common/_device_utils.py → _src/actor/device_utils.py} +0 -0
- /monarch/{common → _src/actor}/shape.py +0 -0
- /monarch/{_monarch → _src/tensor_engine}/__init__.py +0 -0
- {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.26.dist-info}/WHEEL +0 -0
- {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.26.dist-info}/licenses/LICENSE +0 -0
- {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.26.dist-info}/top_level.txt +0 -0
tests/test_allocator.py
CHANGED
@@ -6,14 +6,17 @@
|
|
6
6
|
|
7
7
|
# pyre-strict
|
8
8
|
|
9
|
+
import asyncio
|
9
10
|
import contextlib
|
10
11
|
import importlib.resources
|
12
|
+
import logging
|
11
13
|
import math
|
12
14
|
import os
|
13
15
|
import subprocess
|
14
16
|
import sys
|
15
17
|
import unittest
|
16
18
|
from datetime import timedelta
|
19
|
+
from time import sleep
|
17
20
|
from typing import Generator, Optional
|
18
21
|
from unittest import mock
|
19
22
|
|
@@ -24,22 +27,28 @@ import torch
|
|
24
27
|
import torch.distributed as dist
|
25
28
|
import torch.nn.functional as F
|
26
29
|
|
27
|
-
from monarch._rust_bindings.
|
28
|
-
AllocConstraints,
|
29
|
-
AllocSpec,
|
30
|
-
)
|
30
|
+
from monarch._rust_bindings.monarch_hyperactor.alloc import AllocConstraints, AllocSpec
|
31
31
|
from monarch._rust_bindings.monarch_hyperactor.channel import (
|
32
32
|
ChannelAddr,
|
33
33
|
ChannelTransport,
|
34
34
|
)
|
35
|
-
|
36
|
-
from monarch.allocator import (
|
35
|
+
|
36
|
+
from monarch._src.actor.allocator import (
|
37
37
|
ALLOC_LABEL_PROC_MESH_NAME,
|
38
|
+
LocalAllocator,
|
38
39
|
RemoteAllocator,
|
39
40
|
StaticRemoteAllocInitializer,
|
40
41
|
TorchXRemoteAllocInitializer,
|
41
42
|
)
|
42
|
-
from monarch.
|
43
|
+
from monarch._src.actor.sync_state import fake_sync_state
|
44
|
+
from monarch.actor import (
|
45
|
+
Actor,
|
46
|
+
current_rank,
|
47
|
+
current_size,
|
48
|
+
endpoint,
|
49
|
+
ProcMesh,
|
50
|
+
ValueMesh,
|
51
|
+
)
|
43
52
|
from monarch.tools.mesh_spec import MeshSpec, ServerSpec
|
44
53
|
from monarch.tools.network import get_sockaddr
|
45
54
|
|
@@ -49,6 +58,19 @@ from torchx.specs import AppState
|
|
49
58
|
_100_MILLISECONDS = timedelta(milliseconds=100)
|
50
59
|
|
51
60
|
SERVER_READY = "monarch.tools.commands.server_ready"
|
61
|
+
UNUSED = "__UNUSED__"
|
62
|
+
|
63
|
+
|
64
|
+
class EnvCheckActor(Actor):
|
65
|
+
"""Actor that checks for the presence of an environment variable"""
|
66
|
+
|
67
|
+
def __init__(self) -> None:
|
68
|
+
pass
|
69
|
+
|
70
|
+
@endpoint
|
71
|
+
async def get_env_var(self, var_name: str) -> str:
|
72
|
+
"""Return the value of the specified environment variable or 'NOT_SET' if not found"""
|
73
|
+
return os.environ.get(var_name, "NOT_SET")
|
52
74
|
|
53
75
|
|
54
76
|
class TestActor(Actor):
|
@@ -57,6 +79,8 @@ class TestActor(Actor):
|
|
57
79
|
def __init__(self) -> None:
|
58
80
|
self.rank: int = current_rank().rank
|
59
81
|
self.world_size: int = math.prod(current_size().values())
|
82
|
+
self.logger: logging.Logger = logging.getLogger("test_actor")
|
83
|
+
self.logger.setLevel(logging.INFO)
|
60
84
|
|
61
85
|
@endpoint
|
62
86
|
async def compute_world_size(self, master_addr: str, master_port: int) -> int:
|
@@ -71,17 +95,33 @@ class TestActor(Actor):
|
|
71
95
|
finally:
|
72
96
|
dist.destroy_process_group()
|
73
97
|
|
98
|
+
@endpoint
|
99
|
+
async def log(self, message: str) -> None:
|
100
|
+
print(f"Stdout LogMessage from print: {message}")
|
101
|
+
sys.stderr.write(f"Stderr LogMessage from print: {message}\n")
|
102
|
+
self.logger.info(f"LogMessage from logger: {message}")
|
103
|
+
|
74
104
|
|
75
105
|
@contextlib.contextmanager
|
76
|
-
def remote_process_allocator(
|
77
|
-
|
106
|
+
def remote_process_allocator(
|
107
|
+
addr: Optional[str] = None, timeout: Optional[int] = None
|
108
|
+
) -> Generator[str, None, None]:
|
109
|
+
"""Start a remote process allocator on addr. If timeout is not None, have it
|
110
|
+
time out after that many seconds if no messages come in"""
|
111
|
+
|
112
|
+
with importlib.resources.as_file(
|
113
|
+
importlib.resources.files(__package__)
|
114
|
+
) as package_path:
|
78
115
|
addr = addr or ChannelAddr.any(ChannelTransport.Unix)
|
116
|
+
args = [
|
117
|
+
"process_allocator",
|
118
|
+
f"--addr={addr}",
|
119
|
+
]
|
120
|
+
if timeout is not None:
|
121
|
+
args.append(f"--timeout-sec={timeout}")
|
79
122
|
|
80
123
|
process_allocator = subprocess.Popen(
|
81
|
-
args=
|
82
|
-
"process_allocator",
|
83
|
-
f"--addr={addr}",
|
84
|
-
],
|
124
|
+
args=args,
|
85
125
|
env={
|
86
126
|
# prefix PATH with this test module's directory to
|
87
127
|
# give 'process_allocator' and 'monarch_bootstrap' binary resources
|
@@ -102,6 +142,79 @@ def remote_process_allocator(addr: Optional[str] = None) -> Generator[str, None,
|
|
102
142
|
process_allocator.kill()
|
103
143
|
|
104
144
|
|
145
|
+
class TestSetupActorInAllocator(unittest.IsolatedAsyncioTestCase):
|
146
|
+
@classmethod
|
147
|
+
def setUpClass(cls) -> None:
|
148
|
+
cloudpickle.register_pickle_by_value(sys.modules[TestActor.__module__])
|
149
|
+
|
150
|
+
@classmethod
|
151
|
+
def tearDownClass(cls) -> None:
|
152
|
+
cloudpickle.unregister_pickle_by_value(sys.modules[TestActor.__module__])
|
153
|
+
|
154
|
+
async def test_setup_lambda_with_multiple_env_vars(self) -> None:
|
155
|
+
"""Test that the setup lambda can set multiple environment variables"""
|
156
|
+
env_vars: dict[str, str] = {
|
157
|
+
"TEST_ENV_VAR_1": "value_1",
|
158
|
+
"TEST_ENV_VAR_2": "value_2",
|
159
|
+
"TEST_ENV_VAR_3": "value_3",
|
160
|
+
}
|
161
|
+
|
162
|
+
def setup_multiple_env_vars() -> None:
|
163
|
+
for name, value in env_vars.items():
|
164
|
+
os.environ[name] = value
|
165
|
+
|
166
|
+
spec = AllocSpec(AllocConstraints(), gpus=1, hosts=1)
|
167
|
+
allocator = LocalAllocator()
|
168
|
+
alloc = await allocator.allocate(spec)
|
169
|
+
|
170
|
+
proc_mesh = await ProcMesh.from_alloc(alloc, setup=setup_multiple_env_vars)
|
171
|
+
|
172
|
+
try:
|
173
|
+
actor = await proc_mesh.spawn("env_check", EnvCheckActor)
|
174
|
+
|
175
|
+
for name, expected_value in env_vars.items():
|
176
|
+
actual_value = await actor.get_env_var.call_one(name)
|
177
|
+
self.assertEqual(
|
178
|
+
actual_value,
|
179
|
+
expected_value,
|
180
|
+
f"Environment variable {name} was not set correctly",
|
181
|
+
)
|
182
|
+
finally:
|
183
|
+
await proc_mesh.stop()
|
184
|
+
|
185
|
+
async def test_setup_lambda_with_context_info(self) -> None:
|
186
|
+
"""Test that the setup lambda can access rank information"""
|
187
|
+
context_var_name: str = "PROC_MESH_RANK_INFO"
|
188
|
+
|
189
|
+
def setup_with_rank() -> None:
|
190
|
+
context_info = f"point_rank:{current_rank().rank}"
|
191
|
+
os.environ[context_var_name] = context_info
|
192
|
+
|
193
|
+
spec = AllocSpec(AllocConstraints(), gpus=1, hosts=1)
|
194
|
+
allocator = LocalAllocator()
|
195
|
+
alloc = await allocator.allocate(spec)
|
196
|
+
|
197
|
+
proc_mesh = await ProcMesh.from_alloc(alloc, setup=setup_with_rank)
|
198
|
+
|
199
|
+
try:
|
200
|
+
actor = await proc_mesh.spawn("env_check", EnvCheckActor)
|
201
|
+
|
202
|
+
rank_info = await actor.get_env_var.call_one(context_var_name)
|
203
|
+
|
204
|
+
self.assertNotEqual(
|
205
|
+
rank_info,
|
206
|
+
"NOT_SET",
|
207
|
+
"Context information was not stored in the environment variable",
|
208
|
+
)
|
209
|
+
self.assertIn(
|
210
|
+
"point_rank:0",
|
211
|
+
rank_info,
|
212
|
+
f"Context information {rank_info} does not contain point_rank",
|
213
|
+
)
|
214
|
+
finally:
|
215
|
+
await proc_mesh.stop()
|
216
|
+
|
217
|
+
|
105
218
|
class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
|
106
219
|
@classmethod
|
107
220
|
def setUpClass(cls) -> None:
|
@@ -153,7 +266,7 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
|
|
153
266
|
"""test initializer that returns an empty list of addresses"""
|
154
267
|
|
155
268
|
async def initialize_alloc(self, match_labels: dict[str, str]) -> list[str]:
|
156
|
-
_ = match_labels
|
269
|
+
_ = match_labels
|
157
270
|
return []
|
158
271
|
|
159
272
|
empty_initializer = EmptyAllocInitializer()
|
@@ -191,6 +304,209 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
|
|
191
304
|
|
192
305
|
self.assert_computed_world_size(values, world_size)
|
193
306
|
|
307
|
+
async def test_stop_proc_mesh_blocking(self) -> None:
|
308
|
+
spec = AllocSpec(AllocConstraints(), host=2, gpu=4)
|
309
|
+
with remote_process_allocator() as host1, remote_process_allocator() as host2:
|
310
|
+
allocator = RemoteAllocator(
|
311
|
+
world_id="test_remote_allocator",
|
312
|
+
initializer=StaticRemoteAllocInitializer(host1, host2),
|
313
|
+
heartbeat_interval=_100_MILLISECONDS,
|
314
|
+
)
|
315
|
+
|
316
|
+
alloc = await allocator.allocate(spec)
|
317
|
+
proc_mesh = await ProcMesh.from_alloc(alloc)
|
318
|
+
# XXX - it is not clear why this is trying to use
|
319
|
+
# async code in a sync context.
|
320
|
+
with fake_sync_state():
|
321
|
+
actor = proc_mesh.spawn("test_actor", TestActor).get()
|
322
|
+
proc_mesh.stop().get()
|
323
|
+
with self.assertRaises(
|
324
|
+
RuntimeError, msg="`ProcMesh` has already been stopped"
|
325
|
+
):
|
326
|
+
proc_mesh.spawn("test_actor", TestActor).get()
|
327
|
+
del actor
|
328
|
+
|
329
|
+
async def test_wrong_address(self) -> None:
|
330
|
+
hosts = 1
|
331
|
+
gpus = 1
|
332
|
+
spec = AllocSpec(AllocConstraints(), host=hosts, gpu=gpus)
|
333
|
+
|
334
|
+
# create one process-allocator on its own bind address, then target a different (wrong) address
|
335
|
+
with remote_process_allocator():
|
336
|
+
wrong_host = ChannelAddr.any(ChannelTransport.Unix)
|
337
|
+
allocator = RemoteAllocator(
|
338
|
+
world_id="test_remote_allocator",
|
339
|
+
initializer=StaticRemoteAllocInitializer(wrong_host),
|
340
|
+
heartbeat_interval=_100_MILLISECONDS,
|
341
|
+
)
|
342
|
+
alloc = await allocator.allocate(spec)
|
343
|
+
|
344
|
+
with self.assertRaisesRegex(
|
345
|
+
Exception, r"no process has ever been allocated.*"
|
346
|
+
):
|
347
|
+
await ProcMesh.from_alloc(alloc)
|
348
|
+
|
349
|
+
async def test_init_failure(self) -> None:
|
350
|
+
class FailInitActor(Actor):
|
351
|
+
def __init__(self) -> None:
|
352
|
+
if current_rank().rank == 0:
|
353
|
+
raise RuntimeError("fail on init")
|
354
|
+
|
355
|
+
@endpoint
|
356
|
+
def dummy(self) -> None:
|
357
|
+
pass
|
358
|
+
|
359
|
+
with remote_process_allocator() as host1, remote_process_allocator() as host2:
|
360
|
+
allocator = RemoteAllocator(
|
361
|
+
world_id="helloworld",
|
362
|
+
initializer=StaticRemoteAllocInitializer(host1, host2),
|
363
|
+
heartbeat_interval=_100_MILLISECONDS,
|
364
|
+
)
|
365
|
+
spec = AllocSpec(AllocConstraints(), host=2, gpu=2)
|
366
|
+
proc_mesh = await ProcMesh.from_alloc(await allocator.allocate(spec))
|
367
|
+
actor_mesh = await proc_mesh.spawn("actor", FailInitActor)
|
368
|
+
|
369
|
+
with self.assertRaisesRegex(
|
370
|
+
Exception,
|
371
|
+
r"(?s)fail on init",
|
372
|
+
):
|
373
|
+
await actor_mesh.dummy.call()
|
374
|
+
|
375
|
+
async def test_stop_proc_mesh(self) -> None:
|
376
|
+
spec = AllocSpec(AllocConstraints(), host=2, gpu=4)
|
377
|
+
|
378
|
+
# create 2x process-allocators (on their own bind addresses) to simulate 2 hosts
|
379
|
+
with remote_process_allocator() as host1, remote_process_allocator() as host2:
|
380
|
+
allocator = RemoteAllocator(
|
381
|
+
world_id="test_remote_allocator",
|
382
|
+
initializer=StaticRemoteAllocInitializer(host1, host2),
|
383
|
+
heartbeat_interval=_100_MILLISECONDS,
|
384
|
+
)
|
385
|
+
alloc = await allocator.allocate(spec)
|
386
|
+
proc_mesh = await ProcMesh.from_alloc(alloc)
|
387
|
+
actor = await proc_mesh.spawn("test_actor", TestActor)
|
388
|
+
|
389
|
+
await proc_mesh.stop()
|
390
|
+
|
391
|
+
with self.assertRaises(
|
392
|
+
RuntimeError, msg="`ProcMesh` has already been stopped"
|
393
|
+
):
|
394
|
+
await proc_mesh.spawn("test_actor", TestActor)
|
395
|
+
|
396
|
+
# TODO(agallagher): It'd be nice to test that this just fails
|
397
|
+
# immediately, trying to access the wrapped actor mesh, but right
|
398
|
+
# now we are doing casting without accessing the wrapped type.
|
399
|
+
del actor
|
400
|
+
|
401
|
+
async def test_stop_proc_mesh_context_manager(self) -> None:
|
402
|
+
spec = AllocSpec(AllocConstraints(), host=2, gpu=4)
|
403
|
+
|
404
|
+
# create 2x process-allocators (on their own bind addresses) to simulate 2 hosts
|
405
|
+
with remote_process_allocator() as host1, remote_process_allocator() as host2:
|
406
|
+
allocator = RemoteAllocator(
|
407
|
+
world_id="test_remote_allocator",
|
408
|
+
initializer=StaticRemoteAllocInitializer(host1, host2),
|
409
|
+
heartbeat_interval=_100_MILLISECONDS,
|
410
|
+
)
|
411
|
+
alloc = await allocator.allocate(spec)
|
412
|
+
proc_mesh = await ProcMesh.from_alloc(alloc)
|
413
|
+
with self.assertRaises(ValueError, msg="foo"):
|
414
|
+
async with proc_mesh:
|
415
|
+
actor = await proc_mesh.spawn("test_actor", TestActor)
|
416
|
+
# Ensure that proc mesh is stopped when context manager exits.
|
417
|
+
raise ValueError("foo")
|
418
|
+
|
419
|
+
with self.assertRaises(
|
420
|
+
RuntimeError, msg="`ProcMesh` has already been stopped"
|
421
|
+
):
|
422
|
+
await proc_mesh.spawn("test_actor", TestActor)
|
423
|
+
|
424
|
+
# TODO(agallagher): It'd be nice to test that this just fails
|
425
|
+
# immediately, trying to access the wrapped actor mesh, but right
|
426
|
+
# now we are doing casting without accessing the wrapped type.
|
427
|
+
del actor
|
428
|
+
|
429
|
+
async def test_setup_lambda_sets_env_vars(self) -> None:
|
430
|
+
"""Test that the setup lambda can set environment variables during proc_mesh allocation"""
|
431
|
+
test_var_name: str = "TEST_ENV_VAR_FOR_PROC_MESH"
|
432
|
+
test_var_value: str = "test_value_123"
|
433
|
+
|
434
|
+
def setup_env_vars() -> None:
|
435
|
+
os.environ[test_var_name] = test_var_value
|
436
|
+
|
437
|
+
hosts = 2
|
438
|
+
gpus = 4
|
439
|
+
spec = AllocSpec(AllocConstraints(), host=hosts, gpu=gpus)
|
440
|
+
|
441
|
+
with remote_process_allocator() as host1, remote_process_allocator() as host2:
|
442
|
+
allocator = RemoteAllocator(
|
443
|
+
world_id="test_remote_allocator",
|
444
|
+
initializer=StaticRemoteAllocInitializer(host1, host2),
|
445
|
+
heartbeat_interval=_100_MILLISECONDS,
|
446
|
+
)
|
447
|
+
alloc = await allocator.allocate(spec)
|
448
|
+
proc_mesh = await ProcMesh.from_alloc(alloc, setup=setup_env_vars)
|
449
|
+
|
450
|
+
try:
|
451
|
+
actor = await proc_mesh.spawn("env_check", EnvCheckActor)
|
452
|
+
|
453
|
+
env_var_values = await actor.get_env_var.call(test_var_name)
|
454
|
+
env_var_value = env_var_values.item(host=0, gpu=0)
|
455
|
+
|
456
|
+
self.assertEqual(
|
457
|
+
env_var_value,
|
458
|
+
test_var_value,
|
459
|
+
f"Environment variable {test_var_name} was not set correctly",
|
460
|
+
)
|
461
|
+
finally:
|
462
|
+
await proc_mesh.stop()
|
463
|
+
|
464
|
+
async def test_stop_proc_mesh_context_manager_multiple_times(self) -> None:
|
465
|
+
spec = AllocSpec(AllocConstraints(), host=2, gpu=4)
|
466
|
+
|
467
|
+
# create 2x process-allocators (on their own bind addresses) to simulate 2 hosts
|
468
|
+
with remote_process_allocator() as host1, remote_process_allocator() as host2:
|
469
|
+
allocator = RemoteAllocator(
|
470
|
+
world_id="test_remote_allocator",
|
471
|
+
initializer=StaticRemoteAllocInitializer(host1, host2),
|
472
|
+
heartbeat_interval=_100_MILLISECONDS,
|
473
|
+
)
|
474
|
+
alloc = await allocator.allocate(spec)
|
475
|
+
proc_mesh = await ProcMesh.from_alloc(alloc)
|
476
|
+
# We can nest multiple context managers on the same mesh, the innermost
|
477
|
+
# one closes the mesh and it cannot be used after that.
|
478
|
+
async with proc_mesh:
|
479
|
+
async with proc_mesh:
|
480
|
+
actor = await proc_mesh.spawn("test_actor", TestActor)
|
481
|
+
|
482
|
+
with self.assertRaises(
|
483
|
+
RuntimeError, msg="`ProcMesh` has already been stopped"
|
484
|
+
):
|
485
|
+
await proc_mesh.spawn("test_actor", TestActor)
|
486
|
+
# Exiting a second time should not raise an error.
|
487
|
+
|
488
|
+
# TODO(agallagher): It'd be nice to test that this just fails
|
489
|
+
# immediately, trying to access the wrapped actor mesh, but right
|
490
|
+
# now we are doing casting without accessing the wrapped type.
|
491
|
+
del actor
|
492
|
+
|
493
|
+
async def test_remote_allocator_with_no_connection(self) -> None:
|
494
|
+
spec = AllocSpec(AllocConstraints(), host=1, gpu=4)
|
495
|
+
|
496
|
+
with remote_process_allocator(timeout=1) as host1:
|
497
|
+
# Wait 3 seconds without making any processes, make sure it dies.
|
498
|
+
await asyncio.sleep(3)
|
499
|
+
allocator = RemoteAllocator(
|
500
|
+
world_id="test_remote_allocator",
|
501
|
+
initializer=StaticRemoteAllocInitializer(host1),
|
502
|
+
heartbeat_interval=_100_MILLISECONDS,
|
503
|
+
)
|
504
|
+
with self.assertRaisesRegex(
|
505
|
+
Exception, "no process has ever been allocated on"
|
506
|
+
):
|
507
|
+
alloc = await allocator.allocate(spec)
|
508
|
+
await ProcMesh.from_alloc(alloc)
|
509
|
+
|
194
510
|
async def test_stacked_1d_meshes(self) -> None:
|
195
511
|
# create two stacked actor meshes on the same host
|
196
512
|
# each actor mesh running on separate process-allocators
|
@@ -244,7 +560,8 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
|
|
244
560
|
# but there are more than 1 mesh (hence ambiguous which mesh to allocate on)
|
245
561
|
|
246
562
|
server = ServerSpec(
|
247
|
-
name=
|
563
|
+
name=UNUSED,
|
564
|
+
scheduler=UNUSED,
|
248
565
|
state=AppState.RUNNING,
|
249
566
|
meshes=[MeshSpec(name="x", num_hosts=1), MeshSpec(name="y", num_hosts=1)],
|
250
567
|
)
|
@@ -262,7 +579,8 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
|
|
262
579
|
@pytest.mark.oss_skip # pyre-ignore[56] TODO T228752279
|
263
580
|
async def test_torchx_remote_alloc_initializer_no_match_label_1_mesh(self) -> None:
|
264
581
|
server = ServerSpec(
|
265
|
-
name=
|
582
|
+
name=UNUSED,
|
583
|
+
scheduler=UNUSED,
|
266
584
|
state=AppState.RUNNING,
|
267
585
|
meshes=[
|
268
586
|
MeshSpec(
|
@@ -295,7 +613,8 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
|
|
295
613
|
@pytest.mark.oss_skip # pyre-ignore[56] TODO T228752279
|
296
614
|
async def test_torchx_remote_alloc_initializer_with_match_label(self) -> None:
|
297
615
|
server = ServerSpec(
|
298
|
-
name=
|
616
|
+
name=UNUSED,
|
617
|
+
scheduler=UNUSED,
|
299
618
|
state=AppState.RUNNING,
|
300
619
|
meshes=[
|
301
620
|
MeshSpec(
|
@@ -338,6 +657,7 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
|
|
338
657
|
|
339
658
|
server = ServerSpec(
|
340
659
|
name="test",
|
660
|
+
scheduler=UNUSED,
|
341
661
|
state=AppState.RUNNING,
|
342
662
|
meshes=[
|
343
663
|
MeshSpec(
|
@@ -363,3 +683,35 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
|
|
363
683
|
)
|
364
684
|
)
|
365
685
|
await ProcMesh.from_alloc(alloc)
|
686
|
+
|
687
|
+
async def test_log(self) -> None:
|
688
|
+
# create a mesh to log to both stdout and stderr
|
689
|
+
|
690
|
+
with remote_process_allocator() as host:
|
691
|
+
allocator = RemoteAllocator(
|
692
|
+
world_id="test_actor_logger",
|
693
|
+
initializer=StaticRemoteAllocInitializer(host),
|
694
|
+
heartbeat_interval=_100_MILLISECONDS,
|
695
|
+
)
|
696
|
+
|
697
|
+
spec = AllocSpec(AllocConstraints(), host=1, gpu=2)
|
698
|
+
|
699
|
+
proc_mesh = await ProcMesh.from_alloc(await allocator.allocate(spec))
|
700
|
+
|
701
|
+
# Generate aggregated log every 1 second.
|
702
|
+
await proc_mesh.logging_option(True, 1)
|
703
|
+
actor = await proc_mesh.spawn("actor", TestActor)
|
704
|
+
# Run for 4 seconds, every second generates 5 logs, so we expect to see
|
705
|
+
# 2 actors x 5 logs/actor/sec * 1 sec = 10 logs per aggregation.
|
706
|
+
for _ in range(20):
|
707
|
+
await actor.log.call("Expect to see [10 processes]")
|
708
|
+
sleep(0.2)
|
709
|
+
# Generate aggregated log every 2 seconds.
|
710
|
+
await proc_mesh.logging_option(True, 2)
|
711
|
+
# Run for 8 seconds, every second generates 5 logs, so we expect to see
|
712
|
+
# 2 actors x 5 logs/actor/sec * 2 sec = 20 logs per aggregation.
|
713
|
+
for _ in range(40):
|
714
|
+
await actor.log.call("Expect to see [20 processes]")
|
715
|
+
sleep(0.2)
|
716
|
+
|
717
|
+
print("======== All Done ========")
|
tests/test_controller.py
CHANGED