torchmonarch-nightly 2025.7.1__cp312-cp312-manylinux2014_x86_64.whl → 2025.7.25__cp312-cp312-manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- monarch/__init__.py +13 -9
- monarch/_rust_bindings.so +0 -0
- monarch/{_monarch/selection → _src/actor}/__init__.py +3 -7
- monarch/_src/actor/actor_mesh.py +874 -0
- monarch/{allocator.py → _src/actor/allocator.py} +26 -17
- monarch/_src/actor/bootstrap_main.py +73 -0
- monarch/{code_sync.py → _src/actor/code_sync/__init__.py} +3 -1
- monarch/_src/actor/code_sync/auto_reload.py +223 -0
- monarch/_src/actor/debugger.py +565 -0
- monarch/_src/actor/endpoint.py +270 -0
- monarch/_src/actor/event_loop.py +97 -0
- monarch/_src/actor/future.py +100 -0
- monarch/{pdb_wrapper.py → _src/actor/pdb_wrapper.py} +47 -46
- monarch/{common/pickle_flatten.py → _src/actor/pickle.py} +26 -2
- monarch/_src/actor/proc_mesh.py +500 -0
- monarch/_src/actor/sync_state.py +18 -0
- monarch/{telemetry.py → _src/actor/telemetry/__init__.py} +1 -1
- monarch/_src/actor/telemetry/rust_span_tracing.py +159 -0
- monarch/_src/actor/tensor_engine_shim.py +56 -0
- monarch/_src/tensor_engine/rdma.py +180 -0
- monarch/_testing.py +3 -2
- monarch/actor/__init__.py +51 -0
- monarch/actor_mesh.py +6 -765
- monarch/bootstrap_main.py +8 -47
- monarch/common/client.py +1 -1
- monarch/common/controller_api.py +2 -1
- monarch/common/device_mesh.py +12 -2
- monarch/common/messages.py +12 -1
- monarch/common/recording.py +4 -3
- monarch/common/remote.py +135 -52
- monarch/common/tensor.py +2 -1
- monarch/controller/backend.py +2 -2
- monarch/controller/controller.py +2 -1
- monarch/controller/rust_backend/controller.py +2 -1
- monarch/fetch.py +3 -5
- monarch/mesh_controller.py +201 -139
- monarch/monarch_controller +0 -0
- monarch/opaque_module.py +4 -6
- monarch/opaque_object.py +3 -3
- monarch/proc_mesh.py +6 -309
- monarch/python_local_mesh.py +1 -1
- monarch/rust_backend_mesh.py +2 -1
- monarch/rust_local_mesh.py +4 -2
- monarch/sim_mesh.py +10 -19
- monarch/simulator/command_history.py +1 -1
- monarch/simulator/interface.py +2 -1
- monarch/simulator/mock_controller.py +1 -1
- monarch/simulator/simulator.py +1 -1
- monarch/tensor_engine/__init__.py +23 -0
- monarch/tensor_worker_main.py +3 -1
- monarch/tools/cli.py +3 -1
- monarch/tools/commands.py +95 -35
- monarch/tools/mesh_spec.py +55 -0
- monarch/tools/utils.py +38 -0
- monarch/worker/worker.py +1 -1
- monarch/world_mesh.py +2 -1
- monarch_supervisor/python_executable.py +6 -3
- tests/error_test_binary.py +48 -10
- tests/test_actor_error.py +370 -21
- tests/test_alloc.py +1 -1
- tests/test_allocator.py +373 -17
- tests/test_controller.py +2 -0
- tests/test_debugger.py +416 -0
- tests/test_env_before_cuda.py +162 -0
- tests/test_python_actors.py +184 -333
- tests/test_rdma.py +198 -0
- tests/test_remote_functions.py +40 -12
- tests/test_rust_backend.py +7 -5
- tests/test_sim_backend.py +1 -4
- tests/test_tensor_engine.py +55 -1
- {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/METADATA +6 -1
- {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/RECORD +80 -68
- torchmonarch_nightly-2025.7.25.dist-info/entry_points.txt +3 -0
- monarch/_monarch/hyperactor/__init__.py +0 -58
- monarch/_monarch/worker/debugger.py +0 -117
- monarch/_monarch/worker/logging.py +0 -107
- monarch/debugger.py +0 -379
- monarch/future.py +0 -76
- monarch/rdma.py +0 -162
- torchmonarch_nightly-2025.7.1.dist-info/entry_points.txt +0 -3
- /monarch/{_monarch/worker → _src}/__init__.py +0 -0
- /monarch/{common/_device_utils.py → _src/actor/device_utils.py} +0 -0
- /monarch/{common → _src/actor}/shape.py +0 -0
- /monarch/{_monarch → _src/tensor_engine}/__init__.py +0 -0
- {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/WHEEL +0 -0
- {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/licenses/LICENSE +0 -0
- {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/top_level.txt +0 -0
tests/test_allocator.py
CHANGED
@@ -6,14 +6,17 @@
|
|
6
6
|
|
7
7
|
# pyre-strict
|
8
8
|
|
9
|
+
import asyncio
|
9
10
|
import contextlib
|
10
11
|
import importlib.resources
|
12
|
+
import logging
|
11
13
|
import math
|
12
14
|
import os
|
13
15
|
import subprocess
|
14
16
|
import sys
|
15
17
|
import unittest
|
16
18
|
from datetime import timedelta
|
19
|
+
from time import sleep
|
17
20
|
from typing import Generator, Optional
|
18
21
|
from unittest import mock
|
19
22
|
|
@@ -24,22 +27,29 @@ import torch
|
|
24
27
|
import torch.distributed as dist
|
25
28
|
import torch.nn.functional as F
|
26
29
|
|
27
|
-
from monarch._rust_bindings.
|
28
|
-
AllocConstraints,
|
29
|
-
AllocSpec,
|
30
|
-
)
|
30
|
+
from monarch._rust_bindings.monarch_hyperactor.alloc import AllocConstraints, AllocSpec
|
31
31
|
from monarch._rust_bindings.monarch_hyperactor.channel import (
|
32
32
|
ChannelAddr,
|
33
33
|
ChannelTransport,
|
34
34
|
)
|
35
|
-
|
36
|
-
from monarch.
|
35
|
+
|
36
|
+
from monarch._src.actor.actor_mesh import MonarchContext
|
37
|
+
from monarch._src.actor.allocator import (
|
37
38
|
ALLOC_LABEL_PROC_MESH_NAME,
|
39
|
+
LocalAllocator,
|
38
40
|
RemoteAllocator,
|
39
41
|
StaticRemoteAllocInitializer,
|
40
42
|
TorchXRemoteAllocInitializer,
|
41
43
|
)
|
42
|
-
from monarch.
|
44
|
+
from monarch._src.actor.sync_state import fake_sync_state
|
45
|
+
from monarch.actor import (
|
46
|
+
Actor,
|
47
|
+
current_rank,
|
48
|
+
current_size,
|
49
|
+
endpoint,
|
50
|
+
ProcMesh,
|
51
|
+
ValueMesh,
|
52
|
+
)
|
43
53
|
from monarch.tools.mesh_spec import MeshSpec, ServerSpec
|
44
54
|
from monarch.tools.network import get_sockaddr
|
45
55
|
|
@@ -49,6 +59,19 @@ from torchx.specs import AppState
|
|
49
59
|
_100_MILLISECONDS = timedelta(milliseconds=100)
|
50
60
|
|
51
61
|
SERVER_READY = "monarch.tools.commands.server_ready"
|
62
|
+
UNUSED = "__UNUSED__"
|
63
|
+
|
64
|
+
|
65
|
+
class EnvCheckActor(Actor):
|
66
|
+
"""Actor that checks for the presence of an environment variable"""
|
67
|
+
|
68
|
+
def __init__(self) -> None:
|
69
|
+
pass
|
70
|
+
|
71
|
+
@endpoint
|
72
|
+
async def get_env_var(self, var_name: str) -> str:
|
73
|
+
"""Return the value of the specified environment variable or 'NOT_SET' if not found"""
|
74
|
+
return os.environ.get(var_name, "NOT_SET")
|
52
75
|
|
53
76
|
|
54
77
|
class TestActor(Actor):
|
@@ -57,6 +80,8 @@ class TestActor(Actor):
|
|
57
80
|
def __init__(self) -> None:
|
58
81
|
self.rank: int = current_rank().rank
|
59
82
|
self.world_size: int = math.prod(current_size().values())
|
83
|
+
self.logger: logging.Logger = logging.getLogger("test_actor")
|
84
|
+
self.logger.setLevel(logging.INFO)
|
60
85
|
|
61
86
|
@endpoint
|
62
87
|
async def compute_world_size(self, master_addr: str, master_port: int) -> int:
|
@@ -71,17 +96,33 @@ class TestActor(Actor):
|
|
71
96
|
finally:
|
72
97
|
dist.destroy_process_group()
|
73
98
|
|
99
|
+
@endpoint
|
100
|
+
async def log(self, message: str) -> None:
|
101
|
+
print(f"Stdout LogMessage from print: {message}")
|
102
|
+
sys.stderr.write(f"Stderr LogMessage from print: {message}\n")
|
103
|
+
self.logger.info(f"LogMessage from logger: {message}")
|
104
|
+
|
74
105
|
|
75
106
|
@contextlib.contextmanager
|
76
|
-
def remote_process_allocator(
|
77
|
-
|
107
|
+
def remote_process_allocator(
|
108
|
+
addr: Optional[str] = None, timeout: Optional[int] = None
|
109
|
+
) -> Generator[str, None, None]:
|
110
|
+
"""Start a remote process allocator on addr. If timeout is not None, have it
|
111
|
+
timeout after that many seconds if no messages come in"""
|
112
|
+
|
113
|
+
with importlib.resources.as_file(
|
114
|
+
importlib.resources.files(__package__)
|
115
|
+
) as package_path:
|
78
116
|
addr = addr or ChannelAddr.any(ChannelTransport.Unix)
|
117
|
+
args = [
|
118
|
+
"process_allocator",
|
119
|
+
f"--addr={addr}",
|
120
|
+
]
|
121
|
+
if timeout is not None:
|
122
|
+
args.append(f"--timeout-sec={timeout}")
|
79
123
|
|
80
124
|
process_allocator = subprocess.Popen(
|
81
|
-
args=
|
82
|
-
"process_allocator",
|
83
|
-
f"--addr={addr}",
|
84
|
-
],
|
125
|
+
args=args,
|
85
126
|
env={
|
86
127
|
# prefix PATH with this test module's directory to
|
87
128
|
# give 'process_allocator' and 'monarch_bootstrap' binary resources
|
@@ -102,6 +143,82 @@ def remote_process_allocator(addr: Optional[str] = None) -> Generator[str, None,
|
|
102
143
|
process_allocator.kill()
|
103
144
|
|
104
145
|
|
146
|
+
class TestSetupActorInAllocator(unittest.IsolatedAsyncioTestCase):
|
147
|
+
@classmethod
|
148
|
+
def setUpClass(cls) -> None:
|
149
|
+
cloudpickle.register_pickle_by_value(sys.modules[TestActor.__module__])
|
150
|
+
|
151
|
+
@classmethod
|
152
|
+
def tearDownClass(cls) -> None:
|
153
|
+
cloudpickle.unregister_pickle_by_value(sys.modules[TestActor.__module__])
|
154
|
+
|
155
|
+
async def test_setup_lambda_with_multiple_env_vars(self) -> None:
|
156
|
+
"""Test that the setup lambda can set multiple environment variables"""
|
157
|
+
env_vars: dict[str, str] = {
|
158
|
+
"TEST_ENV_VAR_1": "value_1",
|
159
|
+
"TEST_ENV_VAR_2": "value_2",
|
160
|
+
"TEST_ENV_VAR_3": "value_3",
|
161
|
+
}
|
162
|
+
|
163
|
+
def setup_multiple_env_vars(ctx: MonarchContext) -> None:
|
164
|
+
for name, value in env_vars.items():
|
165
|
+
os.environ[name] = value
|
166
|
+
|
167
|
+
spec = AllocSpec(AllocConstraints(), gpus=1, hosts=1)
|
168
|
+
allocator = LocalAllocator()
|
169
|
+
alloc = await allocator.allocate(spec)
|
170
|
+
|
171
|
+
proc_mesh = await ProcMesh.from_alloc(alloc, setup=setup_multiple_env_vars)
|
172
|
+
|
173
|
+
try:
|
174
|
+
actor = await proc_mesh.spawn("env_check", EnvCheckActor)
|
175
|
+
|
176
|
+
for name, expected_value in env_vars.items():
|
177
|
+
actual_value = await actor.get_env_var.call_one(name)
|
178
|
+
self.assertEqual(
|
179
|
+
actual_value,
|
180
|
+
expected_value,
|
181
|
+
f"Environment variable {name} was not set correctly",
|
182
|
+
)
|
183
|
+
finally:
|
184
|
+
await proc_mesh.stop()
|
185
|
+
|
186
|
+
async def test_setup_lambda_with_context_info(self) -> None:
|
187
|
+
"""Test that the setup lambda can access context information"""
|
188
|
+
context_var_name: str = "PROC_MESH_CONTEXT_INFO"
|
189
|
+
|
190
|
+
def setup_with_context(ctx: MonarchContext) -> None:
|
191
|
+
context_info = f"proc_id:{ctx.proc_id},point_rank:{ctx.point.rank}"
|
192
|
+
os.environ[context_var_name] = context_info
|
193
|
+
|
194
|
+
spec = AllocSpec(AllocConstraints(), gpus=1, hosts=1)
|
195
|
+
allocator = LocalAllocator()
|
196
|
+
alloc = await allocator.allocate(spec)
|
197
|
+
|
198
|
+
proc_mesh = await ProcMesh.from_alloc(alloc, setup=setup_with_context)
|
199
|
+
|
200
|
+
try:
|
201
|
+
actor = await proc_mesh.spawn("env_check", EnvCheckActor)
|
202
|
+
|
203
|
+
context_info = await actor.get_env_var.call_one(context_var_name)
|
204
|
+
|
205
|
+
self.assertNotEqual(
|
206
|
+
context_info,
|
207
|
+
"NOT_SET",
|
208
|
+
"Context information was not stored in the environment variable",
|
209
|
+
)
|
210
|
+
self.assertIn(
|
211
|
+
"proc_id:", context_info, "Context information does not contain proc_id"
|
212
|
+
)
|
213
|
+
self.assertIn(
|
214
|
+
"point_rank:0",
|
215
|
+
context_info,
|
216
|
+
f"Context information {context_info} does not contain point_rank",
|
217
|
+
)
|
218
|
+
finally:
|
219
|
+
await proc_mesh.stop()
|
220
|
+
|
221
|
+
|
105
222
|
class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
|
106
223
|
@classmethod
|
107
224
|
def setUpClass(cls) -> None:
|
@@ -153,7 +270,7 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
|
|
153
270
|
"""test initializer that returns an empty list of addresses"""
|
154
271
|
|
155
272
|
async def initialize_alloc(self, match_labels: dict[str, str]) -> list[str]:
|
156
|
-
_ = match_labels
|
273
|
+
_ = match_labels
|
157
274
|
return []
|
158
275
|
|
159
276
|
empty_initializer = EmptyAllocInitializer()
|
@@ -191,6 +308,209 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
|
|
191
308
|
|
192
309
|
self.assert_computed_world_size(values, world_size)
|
193
310
|
|
311
|
+
async def test_stop_proc_mesh_blocking(self) -> None:
|
312
|
+
spec = AllocSpec(AllocConstraints(), host=2, gpu=4)
|
313
|
+
with remote_process_allocator() as host1, remote_process_allocator() as host2:
|
314
|
+
allocator = RemoteAllocator(
|
315
|
+
world_id="test_remote_allocator",
|
316
|
+
initializer=StaticRemoteAllocInitializer(host1, host2),
|
317
|
+
heartbeat_interval=_100_MILLISECONDS,
|
318
|
+
)
|
319
|
+
|
320
|
+
alloc = await allocator.allocate(spec)
|
321
|
+
proc_mesh = await ProcMesh.from_alloc(alloc)
|
322
|
+
# XXX - it is not clear why this is trying to use
|
323
|
+
# async code in a sync context.
|
324
|
+
with fake_sync_state():
|
325
|
+
actor = proc_mesh.spawn("test_actor", TestActor).get()
|
326
|
+
proc_mesh.stop().get()
|
327
|
+
with self.assertRaises(
|
328
|
+
RuntimeError, msg="`ProcMesh` has already been stopped"
|
329
|
+
):
|
330
|
+
proc_mesh.spawn("test_actor", TestActor).get()
|
331
|
+
del actor
|
332
|
+
|
333
|
+
async def test_wrong_address(self) -> None:
|
334
|
+
hosts = 1
|
335
|
+
gpus = 1
|
336
|
+
spec = AllocSpec(AllocConstraints(), host=hosts, gpu=gpus)
|
337
|
+
|
338
|
+
# start one process-allocator, then point the RemoteAllocator at a different (wrong) address
|
339
|
+
with remote_process_allocator():
|
340
|
+
wrong_host = ChannelAddr.any(ChannelTransport.Unix)
|
341
|
+
allocator = RemoteAllocator(
|
342
|
+
world_id="test_remote_allocator",
|
343
|
+
initializer=StaticRemoteAllocInitializer(wrong_host),
|
344
|
+
heartbeat_interval=_100_MILLISECONDS,
|
345
|
+
)
|
346
|
+
alloc = await allocator.allocate(spec)
|
347
|
+
|
348
|
+
with self.assertRaisesRegex(
|
349
|
+
Exception, r"no process has ever been allocated.*"
|
350
|
+
):
|
351
|
+
await ProcMesh.from_alloc(alloc)
|
352
|
+
|
353
|
+
async def test_init_failure(self) -> None:
|
354
|
+
class FailInitActor(Actor):
|
355
|
+
def __init__(self) -> None:
|
356
|
+
if current_rank().rank == 0:
|
357
|
+
raise RuntimeError("fail on init")
|
358
|
+
|
359
|
+
@endpoint
|
360
|
+
def dummy(self) -> None:
|
361
|
+
pass
|
362
|
+
|
363
|
+
with remote_process_allocator() as host1, remote_process_allocator() as host2:
|
364
|
+
allocator = RemoteAllocator(
|
365
|
+
world_id="helloworld",
|
366
|
+
initializer=StaticRemoteAllocInitializer(host1, host2),
|
367
|
+
heartbeat_interval=_100_MILLISECONDS,
|
368
|
+
)
|
369
|
+
spec = AllocSpec(AllocConstraints(), host=2, gpu=2)
|
370
|
+
proc_mesh = await ProcMesh.from_alloc(await allocator.allocate(spec))
|
371
|
+
actor_mesh = await proc_mesh.spawn("actor", FailInitActor)
|
372
|
+
|
373
|
+
with self.assertRaisesRegex(
|
374
|
+
Exception,
|
375
|
+
r"(?s)fail on init",
|
376
|
+
):
|
377
|
+
await actor_mesh.dummy.call()
|
378
|
+
|
379
|
+
async def test_stop_proc_mesh(self) -> None:
|
380
|
+
spec = AllocSpec(AllocConstraints(), host=2, gpu=4)
|
381
|
+
|
382
|
+
# create 2x process-allocators (on their own bind addresses) to simulate 2 hosts
|
383
|
+
with remote_process_allocator() as host1, remote_process_allocator() as host2:
|
384
|
+
allocator = RemoteAllocator(
|
385
|
+
world_id="test_remote_allocator",
|
386
|
+
initializer=StaticRemoteAllocInitializer(host1, host2),
|
387
|
+
heartbeat_interval=_100_MILLISECONDS,
|
388
|
+
)
|
389
|
+
alloc = await allocator.allocate(spec)
|
390
|
+
proc_mesh = await ProcMesh.from_alloc(alloc)
|
391
|
+
actor = await proc_mesh.spawn("test_actor", TestActor)
|
392
|
+
|
393
|
+
await proc_mesh.stop()
|
394
|
+
|
395
|
+
with self.assertRaises(
|
396
|
+
RuntimeError, msg="`ProcMesh` has already been stopped"
|
397
|
+
):
|
398
|
+
await proc_mesh.spawn("test_actor", TestActor)
|
399
|
+
|
400
|
+
# TODO(agallagher): It'd be nice to test that this just fails
|
401
|
+
# immediately, trying to access the wrapped actor mesh, but right
|
402
|
+
# now we are doing casting without accessing the wrapped type.
|
403
|
+
del actor
|
404
|
+
|
405
|
+
async def test_stop_proc_mesh_context_manager(self) -> None:
|
406
|
+
spec = AllocSpec(AllocConstraints(), host=2, gpu=4)
|
407
|
+
|
408
|
+
# create 2x process-allocators (on their own bind addresses) to simulate 2 hosts
|
409
|
+
with remote_process_allocator() as host1, remote_process_allocator() as host2:
|
410
|
+
allocator = RemoteAllocator(
|
411
|
+
world_id="test_remote_allocator",
|
412
|
+
initializer=StaticRemoteAllocInitializer(host1, host2),
|
413
|
+
heartbeat_interval=_100_MILLISECONDS,
|
414
|
+
)
|
415
|
+
alloc = await allocator.allocate(spec)
|
416
|
+
proc_mesh = await ProcMesh.from_alloc(alloc)
|
417
|
+
with self.assertRaises(ValueError, msg="foo"):
|
418
|
+
async with proc_mesh:
|
419
|
+
actor = await proc_mesh.spawn("test_actor", TestActor)
|
420
|
+
# Ensure that proc mesh is stopped when context manager exits.
|
421
|
+
raise ValueError("foo")
|
422
|
+
|
423
|
+
with self.assertRaises(
|
424
|
+
RuntimeError, msg="`ProcMesh` has already been stopped"
|
425
|
+
):
|
426
|
+
await proc_mesh.spawn("test_actor", TestActor)
|
427
|
+
|
428
|
+
# TODO(agallagher): It'd be nice to test that this just fails
|
429
|
+
# immediately, trying to access the wrapped actor mesh, but right
|
430
|
+
# now we are doing casting without accessing the wrapped type.
|
431
|
+
del actor
|
432
|
+
|
433
|
+
async def test_setup_lambda_sets_env_vars(self) -> None:
|
434
|
+
"""Test that the setup lambda can set environment variables during proc_mesh allocation"""
|
435
|
+
test_var_name: str = "TEST_ENV_VAR_FOR_PROC_MESH"
|
436
|
+
test_var_value: str = "test_value_123"
|
437
|
+
|
438
|
+
def setup_env_vars(ctx: MonarchContext) -> None:
|
439
|
+
os.environ[test_var_name] = test_var_value
|
440
|
+
|
441
|
+
hosts = 2
|
442
|
+
gpus = 4
|
443
|
+
spec = AllocSpec(AllocConstraints(), host=hosts, gpu=gpus)
|
444
|
+
|
445
|
+
with remote_process_allocator() as host1, remote_process_allocator() as host2:
|
446
|
+
allocator = RemoteAllocator(
|
447
|
+
world_id="test_remote_allocator",
|
448
|
+
initializer=StaticRemoteAllocInitializer(host1, host2),
|
449
|
+
heartbeat_interval=_100_MILLISECONDS,
|
450
|
+
)
|
451
|
+
alloc = await allocator.allocate(spec)
|
452
|
+
proc_mesh = await ProcMesh.from_alloc(alloc, setup=setup_env_vars)
|
453
|
+
|
454
|
+
try:
|
455
|
+
actor = await proc_mesh.spawn("env_check", EnvCheckActor)
|
456
|
+
|
457
|
+
env_var_values = await actor.get_env_var.call(test_var_name)
|
458
|
+
env_var_value = env_var_values.item(host=0, gpu=0)
|
459
|
+
|
460
|
+
self.assertEqual(
|
461
|
+
env_var_value,
|
462
|
+
test_var_value,
|
463
|
+
f"Environment variable {test_var_name} was not set correctly",
|
464
|
+
)
|
465
|
+
finally:
|
466
|
+
await proc_mesh.stop()
|
467
|
+
|
468
|
+
async def test_stop_proc_mesh_context_manager_multiple_times(self) -> None:
|
469
|
+
spec = AllocSpec(AllocConstraints(), host=2, gpu=4)
|
470
|
+
|
471
|
+
# create 2x process-allocators (on their own bind addresses) to simulate 2 hosts
|
472
|
+
with remote_process_allocator() as host1, remote_process_allocator() as host2:
|
473
|
+
allocator = RemoteAllocator(
|
474
|
+
world_id="test_remote_allocator",
|
475
|
+
initializer=StaticRemoteAllocInitializer(host1, host2),
|
476
|
+
heartbeat_interval=_100_MILLISECONDS,
|
477
|
+
)
|
478
|
+
alloc = await allocator.allocate(spec)
|
479
|
+
proc_mesh = await ProcMesh.from_alloc(alloc)
|
480
|
+
# We can nest multiple context managers on the same mesh, the innermost
|
481
|
+
# one closes the mesh and it cannot be used after that.
|
482
|
+
async with proc_mesh:
|
483
|
+
async with proc_mesh:
|
484
|
+
actor = await proc_mesh.spawn("test_actor", TestActor)
|
485
|
+
|
486
|
+
with self.assertRaises(
|
487
|
+
RuntimeError, msg="`ProcMesh` has already been stopped"
|
488
|
+
):
|
489
|
+
await proc_mesh.spawn("test_actor", TestActor)
|
490
|
+
# Exiting a second time should not raise an error.
|
491
|
+
|
492
|
+
# TODO(agallagher): It'd be nice to test that this just fails
|
493
|
+
# immediately, trying to access the wrapped actor mesh, but right
|
494
|
+
# now we are doing casting without accessing the wrapped type.
|
495
|
+
del actor
|
496
|
+
|
497
|
+
async def test_remote_allocator_with_no_connection(self) -> None:
|
498
|
+
spec = AllocSpec(AllocConstraints(), host=1, gpu=4)
|
499
|
+
|
500
|
+
with remote_process_allocator(timeout=1) as host1:
|
501
|
+
# Wait 3 seconds without making any processes, make sure it dies.
|
502
|
+
await asyncio.sleep(3)
|
503
|
+
allocator = RemoteAllocator(
|
504
|
+
world_id="test_remote_allocator",
|
505
|
+
initializer=StaticRemoteAllocInitializer(host1),
|
506
|
+
heartbeat_interval=_100_MILLISECONDS,
|
507
|
+
)
|
508
|
+
with self.assertRaisesRegex(
|
509
|
+
Exception, "no process has ever been allocated on"
|
510
|
+
):
|
511
|
+
alloc = await allocator.allocate(spec)
|
512
|
+
await ProcMesh.from_alloc(alloc)
|
513
|
+
|
194
514
|
async def test_stacked_1d_meshes(self) -> None:
|
195
515
|
# create two stacked actor meshes on the same host
|
196
516
|
# each actor mesh running on separate process-allocators
|
@@ -244,7 +564,8 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
|
|
244
564
|
# but there are more than 1 mesh (hence ambiguous which mesh to allocate on)
|
245
565
|
|
246
566
|
server = ServerSpec(
|
247
|
-
name=
|
567
|
+
name=UNUSED,
|
568
|
+
scheduler=UNUSED,
|
248
569
|
state=AppState.RUNNING,
|
249
570
|
meshes=[MeshSpec(name="x", num_hosts=1), MeshSpec(name="y", num_hosts=1)],
|
250
571
|
)
|
@@ -262,7 +583,8 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
|
|
262
583
|
@pytest.mark.oss_skip # pyre-ignore[56] TODO T228752279
|
263
584
|
async def test_torchx_remote_alloc_initializer_no_match_label_1_mesh(self) -> None:
|
264
585
|
server = ServerSpec(
|
265
|
-
name=
|
586
|
+
name=UNUSED,
|
587
|
+
scheduler=UNUSED,
|
266
588
|
state=AppState.RUNNING,
|
267
589
|
meshes=[
|
268
590
|
MeshSpec(
|
@@ -295,7 +617,8 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
|
|
295
617
|
@pytest.mark.oss_skip # pyre-ignore[56] TODO T228752279
|
296
618
|
async def test_torchx_remote_alloc_initializer_with_match_label(self) -> None:
|
297
619
|
server = ServerSpec(
|
298
|
-
name=
|
620
|
+
name=UNUSED,
|
621
|
+
scheduler=UNUSED,
|
299
622
|
state=AppState.RUNNING,
|
300
623
|
meshes=[
|
301
624
|
MeshSpec(
|
@@ -338,6 +661,7 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
|
|
338
661
|
|
339
662
|
server = ServerSpec(
|
340
663
|
name="test",
|
664
|
+
scheduler=UNUSED,
|
341
665
|
state=AppState.RUNNING,
|
342
666
|
meshes=[
|
343
667
|
MeshSpec(
|
@@ -363,3 +687,35 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
|
|
363
687
|
)
|
364
688
|
)
|
365
689
|
await ProcMesh.from_alloc(alloc)
|
690
|
+
|
691
|
+
async def test_log(self) -> None:
|
692
|
+
# create a mesh to log to both stdout and stderr
|
693
|
+
|
694
|
+
with remote_process_allocator() as host:
|
695
|
+
allocator = RemoteAllocator(
|
696
|
+
world_id="test_actor_logger",
|
697
|
+
initializer=StaticRemoteAllocInitializer(host),
|
698
|
+
heartbeat_interval=_100_MILLISECONDS,
|
699
|
+
)
|
700
|
+
|
701
|
+
spec = AllocSpec(AllocConstraints(), host=1, gpu=2)
|
702
|
+
|
703
|
+
proc_mesh = await ProcMesh.from_alloc(await allocator.allocate(spec))
|
704
|
+
|
705
|
+
# Generate aggregated log every 1 second.
|
706
|
+
await proc_mesh.logging_option(True, 1)
|
707
|
+
actor = await proc_mesh.spawn("actor", TestActor)
|
708
|
+
# Run for 4 seconds, every second generates 5 logs, so we expect to see
|
709
|
+
# 2 actors x 5 logs/actor/sec * 1 sec = 10 logs per aggregation.
|
710
|
+
for _ in range(20):
|
711
|
+
await actor.log.call("Expect to see [10 processes]")
|
712
|
+
sleep(0.2)
|
713
|
+
# Generate aggregated log every 2 seconds.
|
714
|
+
await proc_mesh.logging_option(True, 2)
|
715
|
+
# Run for 8 seconds, every second generates 5 logs, so we expect to see
|
716
|
+
# 2 actors x 5 logs/actor/sec * 2 sec = 20 logs per aggregation.
|
717
|
+
for _ in range(40):
|
718
|
+
await actor.log.call("Expect to see [20 processes]")
|
719
|
+
sleep(0.2)
|
720
|
+
|
721
|
+
print("======== All Done ========")
|
tests/test_controller.py
CHANGED