torchmonarch-nightly 2025.7.28__cp313-cp313-manylinux2014_x86_64.whl → 2025.7.30__cp313-cp313-manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- monarch/_rust_bindings.so +0 -0
- monarch/_src/actor/actor_mesh.py +9 -5
- monarch/_src/actor/allocator.py +18 -47
- monarch/_src/actor/debugger.py +159 -98
- monarch/_src/actor/endpoint.py +15 -4
- monarch/_src/actor/future.py +79 -32
- monarch/_src/actor/pdb_wrapper.py +10 -4
- monarch/_src/actor/proc_mesh.py +82 -114
- monarch/_src/actor/shape.py +32 -33
- monarch/_src/tensor_engine/rdma.py +12 -6
- monarch/mesh_controller.py +37 -4
- monarch/monarch_controller +0 -0
- monarch/tools/components/hyperactor.py +1 -1
- monarch/tools/config/__init__.py +1 -1
- monarch/tools/config/defaults.py +1 -1
- monarch/tools/utils.py +27 -0
- tests/test_actor_error.py +3 -4
- tests/test_actor_shape.py +114 -0
- tests/test_debugger.py +406 -178
- tests/test_python_actors.py +67 -67
- {torchmonarch_nightly-2025.7.28.dist-info → torchmonarch_nightly-2025.7.30.dist-info}/METADATA +1 -1
- {torchmonarch_nightly-2025.7.28.dist-info → torchmonarch_nightly-2025.7.30.dist-info}/RECORD +26 -25
- {torchmonarch_nightly-2025.7.28.dist-info → torchmonarch_nightly-2025.7.30.dist-info}/WHEEL +0 -0
- {torchmonarch_nightly-2025.7.28.dist-info → torchmonarch_nightly-2025.7.30.dist-info}/entry_points.txt +0 -0
- {torchmonarch_nightly-2025.7.28.dist-info → torchmonarch_nightly-2025.7.30.dist-info}/licenses/LICENSE +0 -0
- {torchmonarch_nightly-2025.7.28.dist-info → torchmonarch_nightly-2025.7.30.dist-info}/top_level.txt +0 -0
tests/test_debugger.py
CHANGED
@@ -8,7 +8,8 @@
|
|
8
8
|
import asyncio
|
9
9
|
import re
|
10
10
|
import sys
|
11
|
-
from
|
11
|
+
from typing import cast, List
|
12
|
+
from unittest.mock import AsyncMock, patch
|
12
13
|
|
13
14
|
import monarch
|
14
15
|
import monarch.actor as actor
|
@@ -17,14 +18,15 @@ import pytest
|
|
17
18
|
|
18
19
|
import torch
|
19
20
|
|
20
|
-
from monarch._src.actor.actor_mesh import Actor, current_rank
|
21
|
+
from monarch._src.actor.actor_mesh import Actor, ActorError, current_rank
|
21
22
|
from monarch._src.actor.debugger import (
|
22
23
|
Attach,
|
23
24
|
Cast,
|
24
25
|
Continue,
|
25
|
-
DebugClient,
|
26
26
|
DebugCommand,
|
27
27
|
DebugSession,
|
28
|
+
DebugSessionInfo,
|
29
|
+
DebugSessions,
|
28
30
|
Help,
|
29
31
|
ListCommand,
|
30
32
|
Quit,
|
@@ -73,6 +75,18 @@ class DebugeeActor(Actor):
|
|
73
75
|
return _debugee_actor_internal(rank)
|
74
76
|
|
75
77
|
|
78
|
+
async def _wait_for_breakpoints(debug_client, n_breakpoints) -> List[DebugSessionInfo]:
|
79
|
+
breakpoints: List[DebugSessionInfo] = []
|
80
|
+
for i in range(10):
|
81
|
+
breakpoints = await debug_client.list.call_one()
|
82
|
+
if len(breakpoints) == n_breakpoints:
|
83
|
+
break
|
84
|
+
await asyncio.sleep(1)
|
85
|
+
if i == 9:
|
86
|
+
raise RuntimeError("timed out waiting for breakpoints")
|
87
|
+
return breakpoints
|
88
|
+
|
89
|
+
|
76
90
|
@pytest.mark.skipif(
|
77
91
|
torch.cuda.device_count() < 2,
|
78
92
|
reason="Not enough GPUs, this test requires at least 2 GPUs",
|
@@ -80,30 +94,30 @@ class DebugeeActor(Actor):
|
|
80
94
|
async def test_debug() -> None:
|
81
95
|
input_mock = AsyncMock()
|
82
96
|
input_mock.side_effect = [
|
83
|
-
"attach 1",
|
97
|
+
"attach debugee 1",
|
84
98
|
"n",
|
85
99
|
"n",
|
86
100
|
"n",
|
87
101
|
"n",
|
88
102
|
"detach",
|
89
|
-
"attach 1",
|
103
|
+
"attach debugee 1",
|
90
104
|
"detach",
|
91
105
|
"quit",
|
92
|
-
"cast ranks(0,3) n",
|
93
|
-
"cast ranks(0,3) n",
|
106
|
+
"cast debugee ranks(0,3) n",
|
107
|
+
"cast debugee ranks(0,3) n",
|
94
108
|
# Attaching to 0 and 3 ensures that when we call "list"
|
95
109
|
# the next time, their function/lineno info will be
|
96
110
|
# up-to-date.
|
97
|
-
"attach 0",
|
111
|
+
"attach debugee 0",
|
98
112
|
"detach",
|
99
|
-
"attach 3",
|
113
|
+
"attach debugee 3",
|
100
114
|
"detach",
|
101
115
|
"quit",
|
102
|
-
"attach 2",
|
116
|
+
"attach debugee 2",
|
103
117
|
"c",
|
104
118
|
"detach",
|
105
119
|
"quit",
|
106
|
-
"attach 2",
|
120
|
+
"attach debugee 2",
|
107
121
|
"bt",
|
108
122
|
"c",
|
109
123
|
"quit",
|
@@ -125,23 +139,16 @@ async def test_debug() -> None:
|
|
125
139
|
|
126
140
|
fut = debugee.to_debug.call()
|
127
141
|
await debug_client.wait_pending_session.call_one()
|
128
|
-
breakpoints =
|
129
|
-
for i in range(10):
|
130
|
-
breakpoints = await debug_client.list.call_one()
|
131
|
-
if len(breakpoints) == 4:
|
132
|
-
break
|
133
|
-
await asyncio.sleep(1)
|
134
|
-
if i == 9:
|
135
|
-
raise RuntimeError("timed out waiting for breakpoints")
|
142
|
+
breakpoints = await _wait_for_breakpoints(debug_client, 4)
|
136
143
|
|
137
144
|
initial_linenos = {}
|
138
145
|
for i in range(len(breakpoints)):
|
139
|
-
|
140
|
-
initial_linenos[rank] = lineno
|
141
|
-
assert rank == i
|
142
|
-
assert coords == {"hosts": rank // 2, "gpus": rank % 2}
|
143
|
-
assert function == "test_debugger._debugee_actor_internal"
|
144
|
-
assert lineno == breakpoints[0]
|
146
|
+
info = breakpoints[i]
|
147
|
+
initial_linenos[info.rank] = info.lineno
|
148
|
+
assert info.rank == i
|
149
|
+
assert info.coords == {"hosts": info.rank // 2, "gpus": info.rank % 2}
|
150
|
+
assert info.function == "test_debugger._debugee_actor_internal"
|
151
|
+
assert info.lineno == cast(int, breakpoints[0].lineno) + 5 * info.rank
|
145
152
|
|
146
153
|
await debug_client.enter.call_one()
|
147
154
|
|
@@ -163,30 +170,36 @@ async def test_debug() -> None:
|
|
163
170
|
breakpoints = await debug_client.list.call_one()
|
164
171
|
for i in range(len(breakpoints)):
|
165
172
|
if i == 1:
|
166
|
-
assert breakpoints[i]
|
173
|
+
assert breakpoints[i].function == "test_debugger.to_debug"
|
167
174
|
else:
|
168
|
-
assert
|
169
|
-
|
175
|
+
assert (
|
176
|
+
breakpoints[i].function == "test_debugger._debugee_actor_internal"
|
177
|
+
)
|
178
|
+
assert breakpoints[i].lineno == initial_linenos[i]
|
170
179
|
|
171
180
|
await debug_client.enter.call_one()
|
172
181
|
|
173
182
|
breakpoints = await debug_client.list.call_one()
|
174
183
|
for i in range(len(breakpoints)):
|
175
184
|
if i == 1:
|
176
|
-
assert breakpoints[i]
|
185
|
+
assert breakpoints[i].function == "test_debugger.to_debug"
|
177
186
|
elif i in (0, 3):
|
178
|
-
assert
|
179
|
-
|
187
|
+
assert (
|
188
|
+
breakpoints[i].function == "test_debugger._debugee_actor_internal"
|
189
|
+
)
|
190
|
+
assert breakpoints[i].lineno == initial_linenos[i] + 2
|
180
191
|
else:
|
181
|
-
assert
|
182
|
-
|
192
|
+
assert (
|
193
|
+
breakpoints[i].function == "test_debugger._debugee_actor_internal"
|
194
|
+
)
|
195
|
+
assert breakpoints[i].lineno == initial_linenos[i]
|
183
196
|
|
184
197
|
await debug_client.enter.call_one()
|
185
198
|
|
186
199
|
breakpoints = await debug_client.list.call_one()
|
187
200
|
assert len(breakpoints) == 4
|
188
201
|
# Expect post-mortem debugging for rank 2
|
189
|
-
assert breakpoints[2]
|
202
|
+
assert breakpoints[2].function == "test_debugger._bad_rank"
|
190
203
|
|
191
204
|
await debug_client.enter.call_one()
|
192
205
|
|
@@ -206,7 +219,7 @@ async def test_debug() -> None:
|
|
206
219
|
breakpoints = await debug_client.list.call_one()
|
207
220
|
assert len(breakpoints) == 3
|
208
221
|
for i, rank in enumerate((0, 1, 3)):
|
209
|
-
assert breakpoints[i]
|
222
|
+
assert breakpoints[i].rank == rank
|
210
223
|
|
211
224
|
await debug_client.enter.call_one()
|
212
225
|
breakpoints = await debug_client.list.call_one()
|
@@ -218,122 +231,288 @@ async def test_debug() -> None:
|
|
218
231
|
await fut
|
219
232
|
|
220
233
|
|
221
|
-
|
222
|
-
|
234
|
+
@pytest.mark.skipif(
|
235
|
+
torch.cuda.device_count() < 2,
|
236
|
+
reason="Not enough GPUs, this test requires at least 2 GPUs",
|
237
|
+
)
|
238
|
+
async def test_debug_multi_actor() -> None:
|
239
|
+
input_mock = AsyncMock()
|
240
|
+
input_mock.side_effect = [
|
241
|
+
"attach debugee_2 2",
|
242
|
+
"n",
|
243
|
+
"detach",
|
244
|
+
"attach debugee_1 1",
|
245
|
+
"n",
|
246
|
+
"detach",
|
247
|
+
"quit",
|
248
|
+
"cast debugee_1 ranks(:) c",
|
249
|
+
"cast debugee_2 ranks(:) c",
|
250
|
+
"attach debugee_2 2",
|
251
|
+
"c",
|
252
|
+
"quit",
|
253
|
+
"continue",
|
254
|
+
]
|
255
|
+
|
256
|
+
with patch("monarch._src.actor.debugger._debugger_input", side_effect=input_mock):
|
257
|
+
proc = await proc_mesh(hosts=2, gpus=2)
|
258
|
+
debugee_1 = await proc.spawn("debugee_1", DebugeeActor)
|
259
|
+
debugee_2 = await proc.spawn("debugee_2", DebugeeActor)
|
260
|
+
debug_client = actor.debug_client()
|
223
261
|
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
rank = host * 8 + gpu
|
228
|
-
mock_session = MagicMock(spec=DebugSession)
|
229
|
-
mock_session.attach = AsyncMock()
|
230
|
-
mock_session.rank = rank
|
231
|
-
mock_session.coords = {"hosts": host, "gpus": gpu}
|
232
|
-
mock_sessions[rank] = mock_session
|
262
|
+
fut_1 = debugee_1.to_debug.call()
|
263
|
+
fut_2 = debugee_2.to_debug.call()
|
264
|
+
await debug_client.wait_pending_session.call_one()
|
233
265
|
|
234
|
-
|
266
|
+
breakpoints = await _wait_for_breakpoints(debug_client, 8)
|
235
267
|
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
268
|
+
initial_linenos = {}
|
269
|
+
for i in range(len(breakpoints)):
|
270
|
+
info = breakpoints[i]
|
271
|
+
initial_linenos[info.rank] = info.lineno
|
272
|
+
assert info.rank == i % 4
|
273
|
+
assert info.actor_name == "debugee_1" if i < 4 else "debugee_2"
|
274
|
+
assert info.coords == {"hosts": info.rank // 2, "gpus": info.rank % 2}
|
275
|
+
assert info.function == "test_debugger._debugee_actor_internal"
|
276
|
+
assert info.lineno == cast(int, breakpoints[0].lineno) + 5 * info.rank
|
242
277
|
|
243
|
-
|
244
|
-
session.attach.reset_mock()
|
278
|
+
await debug_client.enter.call_one()
|
245
279
|
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
280
|
+
breakpoints = await _wait_for_breakpoints(debug_client, 8)
|
281
|
+
for i in range(len(breakpoints)):
|
282
|
+
if i == 1:
|
283
|
+
assert breakpoints[i].actor_name == "debugee_1"
|
284
|
+
assert breakpoints[i].rank == 1
|
285
|
+
assert breakpoints[i].lineno == initial_linenos[breakpoints[i].rank] + 1
|
286
|
+
elif i == 6:
|
287
|
+
assert breakpoints[i].actor_name == "debugee_2"
|
288
|
+
assert breakpoints[i].rank == 2
|
289
|
+
assert breakpoints[i].lineno == initial_linenos[breakpoints[i].rank] + 1
|
290
|
+
else:
|
291
|
+
assert (
|
292
|
+
breakpoints[i].actor_name == "debugee_1" if i < 4 else "debugee_2"
|
293
|
+
)
|
294
|
+
assert breakpoints[i].rank == i % 4
|
295
|
+
assert breakpoints[i].lineno == initial_linenos[breakpoints[i].rank]
|
254
296
|
|
255
|
-
|
256
|
-
session.attach.reset_mock()
|
297
|
+
await debug_client.enter.call_one()
|
257
298
|
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
if rank not in ranks:
|
265
|
-
session.attach.assert_not_called()
|
266
|
-
|
267
|
-
for session in mock_sessions.values():
|
268
|
-
session.attach.reset_mock()
|
269
|
-
|
270
|
-
# Cast to all ranks
|
271
|
-
await debug_client._cast_input_and_wait("n", None)
|
272
|
-
for session in mock_sessions.values():
|
273
|
-
session.attach.assert_called_once_with("n", suppress_output=True)
|
274
|
-
|
275
|
-
for session in mock_sessions.values():
|
276
|
-
session.attach.reset_mock()
|
277
|
-
|
278
|
-
# Cast using dimension filtering with a single value
|
279
|
-
await debug_client._cast_input_and_wait("n", {"hosts": 1})
|
280
|
-
for session in mock_sessions.values():
|
281
|
-
if session.coords["hosts"] == 1:
|
282
|
-
session.attach.assert_called_once_with("n", suppress_output=True)
|
283
|
-
else:
|
284
|
-
session.attach.assert_not_called()
|
285
|
-
|
286
|
-
for session in mock_sessions.values():
|
287
|
-
session.attach.reset_mock()
|
288
|
-
|
289
|
-
# Cast using dimension filtering with a list
|
290
|
-
await debug_client._cast_input_and_wait("n", {"hosts": [0, 2]})
|
291
|
-
for _rank, session in mock_sessions.items():
|
292
|
-
if session.coords["hosts"] in [0, 2]:
|
293
|
-
session.attach.assert_called_once_with("n", suppress_output=True)
|
294
|
-
else:
|
295
|
-
session.attach.assert_not_called()
|
296
|
-
|
297
|
-
for session in mock_sessions.values():
|
298
|
-
session.attach.reset_mock()
|
299
|
-
|
300
|
-
# Cast using dimension filtering with a range
|
301
|
-
await debug_client._cast_input_and_wait("n", {"gpus": range(5, 8)})
|
302
|
-
for session in mock_sessions.values():
|
303
|
-
if session.coords["gpus"] in range(5, 8):
|
304
|
-
session.attach.assert_called_once_with("n", suppress_output=True)
|
305
|
-
else:
|
306
|
-
session.attach.assert_not_called()
|
307
|
-
|
308
|
-
for session in mock_sessions.values():
|
309
|
-
session.attach.reset_mock()
|
310
|
-
|
311
|
-
# Cast using multiple dimension filters
|
312
|
-
await debug_client._cast_input_and_wait(
|
313
|
-
"n", {"hosts": [1, 3], "gpus": range(0, sys.maxsize, 3)}
|
314
|
-
)
|
315
|
-
for session in mock_sessions.values():
|
316
|
-
if session.coords["hosts"] in [1, 3] and session.coords["gpus"] in range(
|
317
|
-
0, sys.maxsize, 3
|
318
|
-
):
|
319
|
-
session.attach.assert_called_once_with("n", suppress_output=True)
|
320
|
-
else:
|
321
|
-
session.attach.assert_not_called()
|
299
|
+
breakpoints = await _wait_for_breakpoints(debug_client, 1)
|
300
|
+
with pytest.raises(ActorError, match="ValueError: bad rank"):
|
301
|
+
await fut_2
|
302
|
+
assert breakpoints[0].actor_name == "debugee_1"
|
303
|
+
assert breakpoints[0].rank == 2
|
304
|
+
assert breakpoints[0].function == "test_debugger._bad_rank"
|
322
305
|
|
323
|
-
|
324
|
-
session.attach.reset_mock()
|
306
|
+
await debug_client.enter.call_one()
|
325
307
|
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
308
|
+
breakpoints = await _wait_for_breakpoints(debug_client, 0)
|
309
|
+
with pytest.raises(ActorError, match="ValueError: bad rank"):
|
310
|
+
await fut_1
|
311
|
+
|
312
|
+
|
313
|
+
async def test_debug_sessions_insert_get_remove() -> None:
|
314
|
+
mock_sessions = []
|
315
|
+
for actor_name in ("actor_a", "actor_b"):
|
316
|
+
for rank in range(2):
|
317
|
+
mock_session = DebugSession(rank, {}, "", actor_name)
|
318
|
+
mock_sessions.append(mock_session)
|
319
|
+
|
320
|
+
debug_sessions = DebugSessions()
|
321
|
+
|
322
|
+
with pytest.raises(ValueError, match="No debug sessions for actor actor_a"):
|
323
|
+
debug_sessions.get("actor_a", 0)
|
324
|
+
debug_sessions.insert(mock_sessions[0])
|
325
|
+
assert debug_sessions.get("actor_a", 0) is mock_sessions[0]
|
326
|
+
assert ("actor_a", 0) in debug_sessions
|
327
|
+
with pytest.raises(
|
328
|
+
ValueError, match="Debug session for rank 0 already exists for actor actor_a"
|
329
|
+
):
|
330
|
+
debug_sessions.insert(mock_sessions[0])
|
331
|
+
|
332
|
+
with pytest.raises(
|
333
|
+
ValueError, match="No debug session for rank 1 for actor actor_a"
|
334
|
+
):
|
335
|
+
debug_sessions.get("actor_a", 1)
|
336
|
+
debug_sessions.insert(mock_sessions[1])
|
337
|
+
assert debug_sessions.get("actor_a", 1) is mock_sessions[1]
|
338
|
+
assert ("actor_a", 1) in debug_sessions
|
339
|
+
with pytest.raises(
|
340
|
+
ValueError, match="Debug session for rank 1 already exists for actor actor_a"
|
341
|
+
):
|
342
|
+
debug_sessions.insert(mock_sessions[1])
|
343
|
+
|
344
|
+
with pytest.raises(ValueError, match="No debug sessions for actor actor_b"):
|
345
|
+
debug_sessions.get("actor_b", 0)
|
346
|
+
debug_sessions.insert(mock_sessions[2])
|
347
|
+
assert debug_sessions.get("actor_b", 0) is mock_sessions[2]
|
348
|
+
assert ("actor_b", 0) in debug_sessions
|
349
|
+
with pytest.raises(
|
350
|
+
ValueError, match="Debug session for rank 0 already exists for actor actor_b"
|
351
|
+
):
|
352
|
+
debug_sessions.insert(mock_sessions[2])
|
353
|
+
|
354
|
+
with pytest.raises(
|
355
|
+
ValueError, match="No debug session for rank 1 for actor actor_b"
|
356
|
+
):
|
357
|
+
debug_sessions.get("actor_b", 1)
|
358
|
+
debug_sessions.insert(mock_sessions[3])
|
359
|
+
assert debug_sessions.get("actor_b", 1) is mock_sessions[3]
|
360
|
+
assert ("actor_b", 1) in debug_sessions
|
361
|
+
with pytest.raises(
|
362
|
+
ValueError, match="Debug session for rank 1 already exists for actor actor_b"
|
363
|
+
):
|
364
|
+
debug_sessions.insert(mock_sessions[3])
|
365
|
+
|
366
|
+
assert len(debug_sessions) == 4
|
367
|
+
|
368
|
+
assert debug_sessions.remove("actor_a", 0) is mock_sessions[0]
|
369
|
+
assert len(debug_sessions) == 3
|
370
|
+
assert ("actor_a", 0) not in debug_sessions
|
371
|
+
with pytest.raises(
|
372
|
+
ValueError, match="No debug session for rank 0 for actor actor_a"
|
373
|
+
):
|
374
|
+
debug_sessions.remove("actor_a", 0)
|
375
|
+
|
376
|
+
assert debug_sessions.remove("actor_a", 1) is mock_sessions[1]
|
377
|
+
assert len(debug_sessions) == 2
|
378
|
+
assert ("actor_a", 1) not in debug_sessions
|
379
|
+
with pytest.raises(ValueError, match="No debug sessions for actor actor_a"):
|
380
|
+
debug_sessions.remove("actor_a", 1)
|
381
|
+
|
382
|
+
assert debug_sessions.remove("actor_b", 0) is mock_sessions[2]
|
383
|
+
assert len(debug_sessions) == 1
|
384
|
+
assert ("actor_b", 0) not in debug_sessions
|
385
|
+
with pytest.raises(
|
386
|
+
ValueError, match="No debug session for rank 0 for actor actor_b"
|
387
|
+
):
|
388
|
+
debug_sessions.remove("actor_b", 0)
|
389
|
+
|
390
|
+
assert debug_sessions.remove("actor_b", 1) is mock_sessions[3]
|
391
|
+
assert len(debug_sessions) == 0
|
392
|
+
assert ("actor_b", 1) not in debug_sessions
|
393
|
+
with pytest.raises(ValueError, match="No debug sessions for actor actor_b"):
|
394
|
+
debug_sessions.remove("actor_b", 1)
|
395
|
+
|
396
|
+
|
397
|
+
async def test_debug_sessions_iter() -> None:
|
398
|
+
debug_sessions = DebugSessions()
|
399
|
+
mock_sessions = []
|
400
|
+
|
401
|
+
for actor_name in ("actor_a", "actor_b"):
|
402
|
+
for host in range(3):
|
403
|
+
for gpu in range(8):
|
404
|
+
rank = host * 8 + gpu
|
405
|
+
mock_session = DebugSession(
|
406
|
+
rank, {"hosts": host, "gpus": gpu}, "", actor_name
|
407
|
+
)
|
408
|
+
mock_sessions.append(mock_session)
|
409
|
+
debug_sessions.insert(mock_session)
|
410
|
+
|
411
|
+
# Single rank
|
412
|
+
for i, actor_name in enumerate(("actor_a", "actor_b")):
|
413
|
+
sessions = list(debug_sessions.iter((actor_name, 2)))
|
414
|
+
assert len(sessions) == 1
|
415
|
+
assert sessions[0] is mock_sessions[i * 24 + 2]
|
416
|
+
|
417
|
+
# List of ranks
|
418
|
+
ranks = [1, 3, 5]
|
419
|
+
for i, actor_name in enumerate(("actor_a", "actor_b")):
|
420
|
+
sessions = sorted(
|
421
|
+
debug_sessions.iter((actor_name, ranks)), key=lambda s: s.get_info()
|
422
|
+
)
|
423
|
+
assert len(sessions) == 3
|
424
|
+
for j in range(3):
|
425
|
+
assert sessions[j] is mock_sessions[i * 24 + ranks[j]]
|
426
|
+
|
427
|
+
# Range of ranks
|
428
|
+
ranks = range(2, 24, 3)
|
429
|
+
for i, actor_name in enumerate(("actor_a", "actor_b")):
|
430
|
+
sessions = sorted(
|
431
|
+
debug_sessions.iter((actor_name, ranks)), key=lambda s: s.get_info()
|
432
|
+
)
|
433
|
+
ranks = list(ranks)
|
434
|
+
assert len(sessions) == len(ranks)
|
435
|
+
for j in range(len(ranks)):
|
436
|
+
assert sessions[j] is mock_sessions[i * 24 + ranks[j]]
|
437
|
+
|
438
|
+
# All ranks
|
439
|
+
for i, actor_name in enumerate(("actor_a", "actor_b")):
|
440
|
+
sessions = sorted(
|
441
|
+
debug_sessions.iter((actor_name, None)), key=lambda s: s.get_info()
|
442
|
+
)
|
443
|
+
assert len(sessions) == 24
|
444
|
+
for j in range(24):
|
445
|
+
assert sessions[j] is mock_sessions[i * 24 + j]
|
446
|
+
|
447
|
+
# All ranks, all actors
|
448
|
+
sessions = sorted(debug_sessions.iter(None), key=lambda s: s.get_info())
|
449
|
+
assert len(sessions) == 48
|
450
|
+
for i in range(48):
|
451
|
+
assert sessions[i] is mock_sessions[i]
|
452
|
+
|
453
|
+
# Dimension filtering with a single value
|
454
|
+
for i, actor_name in enumerate(("actor_a", "actor_b")):
|
455
|
+
sessions = sorted(
|
456
|
+
debug_sessions.iter((actor_name, {"hosts": 1})), key=lambda s: s.get_info()
|
457
|
+
)
|
458
|
+
assert len(sessions) == 8
|
459
|
+
for j in range(8):
|
460
|
+
assert sessions[j] is mock_sessions[i * 24 + 8 + j]
|
461
|
+
|
462
|
+
# Dimension filtering with a list
|
463
|
+
for i, actor_name in enumerate(("actor_a", "actor_b")):
|
464
|
+
sessions = sorted(
|
465
|
+
debug_sessions.iter((actor_name, {"hosts": [0, 2]})),
|
466
|
+
key=lambda s: s.get_info(),
|
467
|
+
)
|
468
|
+
assert len(sessions) == 16
|
469
|
+
j = 0
|
470
|
+
for host in (0, 2):
|
471
|
+
for gpu in range(8):
|
472
|
+
assert sessions[j] is mock_sessions[i * 24 + host * 8 + gpu]
|
473
|
+
j += 1
|
474
|
+
|
475
|
+
# Dimension filtering with a range
|
476
|
+
for i, actor_name in enumerate(("actor_a", "actor_b")):
|
477
|
+
sessions = sorted(
|
478
|
+
debug_sessions.iter((actor_name, {"gpus": range(5, 8)})),
|
479
|
+
key=lambda s: s.get_info(),
|
480
|
+
)
|
481
|
+
assert len(sessions) == 9
|
482
|
+
j = 0
|
483
|
+
for host in range(3):
|
484
|
+
for gpu in range(5, 8):
|
485
|
+
assert sessions[j] is mock_sessions[i * 24 + host * 8 + gpu]
|
486
|
+
j += 1
|
487
|
+
|
488
|
+
# Multiple dimension filters
|
489
|
+
for i, actor_name in enumerate(("actor_a", "actor_b")):
|
490
|
+
sessions = sorted(
|
491
|
+
debug_sessions.iter(
|
492
|
+
(actor_name, {"hosts": [1, 3], "gpus": range(0, sys.maxsize, 3)})
|
493
|
+
),
|
494
|
+
key=lambda s: s.get_info(),
|
495
|
+
)
|
496
|
+
assert len(sessions) == 3
|
497
|
+
j = 0
|
498
|
+
for gpu in range(0, 8, 3):
|
499
|
+
assert sessions[j] is mock_sessions[i * 24 + 8 + gpu]
|
500
|
+
j += 1
|
501
|
+
|
502
|
+
# Non-existent dimension
|
503
|
+
for actor_name in ("actor_a", "actor_b"):
|
504
|
+
sessions = sorted(
|
505
|
+
debug_sessions.iter((actor_name, {"hosts": 0, "gpus": 0, "foo": 0})),
|
506
|
+
key=lambda s: s.get_info(),
|
507
|
+
)
|
508
|
+
assert len(sessions) == 0
|
330
509
|
|
331
510
|
|
332
511
|
@pytest.mark.parametrize(
|
333
512
|
["user_input", "expected_output"],
|
334
513
|
[
|
335
|
-
("attach 1", Attach(1)),
|
336
|
-
("a 100", Attach(100)),
|
514
|
+
("attach debugee 1", Attach("debugee", 1)),
|
515
|
+
("a my_awesome_actor 100", Attach("my_awesome_actor", 100)),
|
337
516
|
("list", ListCommand()),
|
338
517
|
("l", ListCommand()),
|
339
518
|
("help", Help()),
|
@@ -342,33 +521,74 @@ async def test_cast_input_and_wait() -> None:
|
|
342
521
|
("q", Quit()),
|
343
522
|
("continue", Continue()),
|
344
523
|
("c", Continue()),
|
345
|
-
("cast ranks(123) b 25", Cast(ranks=123, command="b 25")),
|
346
|
-
("cast ranks(12,34,56) b 25", Cast(ranks=[12, 34, 56], command="b 25")),
|
347
|
-
("cast ranks(:) b 25", Cast(ranks=range(0, sys.maxsize), command="b 25")),
|
348
|
-
("cast ranks(:123) b 25", Cast(ranks=range(0, 123), command="b 25")),
|
349
|
-
("cast ranks(123:) b 25", Cast(ranks=range(123, sys.maxsize), command="b 25")),
|
350
|
-
("cast ranks(123:456) b 25", Cast(ranks=range(123, 456), command="b 25")),
|
351
|
-
("cast ranks(::) b 25", Cast(ranks=range(0, sys.maxsize), command="b 25")),
|
352
524
|
(
|
353
|
-
"cast ranks(
|
354
|
-
Cast(
|
525
|
+
"cast debugee ranks(123) b 25",
|
526
|
+
Cast(actor_name="debugee", ranks=123, command="b 25"),
|
527
|
+
),
|
528
|
+
(
|
529
|
+
"cast my_awesome_actor ranks(12,34,56) b 25",
|
530
|
+
Cast(actor_name="my_awesome_actor", ranks=[12, 34, 56], command="b 25"),
|
531
|
+
),
|
532
|
+
(
|
533
|
+
"cast debugee ranks(:) b 25",
|
534
|
+
Cast(actor_name="debugee", ranks=range(0, sys.maxsize), command="b 25"),
|
535
|
+
),
|
536
|
+
(
|
537
|
+
"cast debugee ranks(:123) b 25",
|
538
|
+
Cast(actor_name="debugee", ranks=range(0, 123), command="b 25"),
|
539
|
+
),
|
540
|
+
(
|
541
|
+
"cast debugee ranks(123:) b 25",
|
542
|
+
Cast(actor_name="debugee", ranks=range(123, sys.maxsize), command="b 25"),
|
543
|
+
),
|
544
|
+
(
|
545
|
+
"cast debugee ranks(123:456) b 25",
|
546
|
+
Cast(actor_name="debugee", ranks=range(123, 456), command="b 25"),
|
547
|
+
),
|
548
|
+
(
|
549
|
+
"cast debugee ranks(::) b 25",
|
550
|
+
Cast(actor_name="debugee", ranks=range(0, sys.maxsize), command="b 25"),
|
551
|
+
),
|
552
|
+
(
|
553
|
+
"cast debugee ranks(::123) b 25",
|
554
|
+
Cast(
|
555
|
+
actor_name="debugee", ranks=range(0, sys.maxsize, 123), command="b 25"
|
556
|
+
),
|
557
|
+
),
|
558
|
+
(
|
559
|
+
"cast debugee ranks(123::) b 25",
|
560
|
+
Cast(actor_name="debugee", ranks=range(123, sys.maxsize), command="b 25"),
|
561
|
+
),
|
562
|
+
(
|
563
|
+
"cast debugee ranks(:123:) b 25",
|
564
|
+
Cast(actor_name="debugee", ranks=range(0, 123), command="b 25"),
|
565
|
+
),
|
566
|
+
(
|
567
|
+
"cast debugee ranks(:456:123) b 25",
|
568
|
+
Cast(actor_name="debugee", ranks=range(0, 456, 123), command="b 25"),
|
569
|
+
),
|
570
|
+
(
|
571
|
+
"cast debugee ranks(456::123) b 25",
|
572
|
+
Cast(
|
573
|
+
actor_name="debugee", ranks=range(456, sys.maxsize, 123), command="b 25"
|
574
|
+
),
|
575
|
+
),
|
576
|
+
(
|
577
|
+
"cast debugee ranks(123:456:) b 25",
|
578
|
+
Cast(actor_name="debugee", ranks=range(123, 456), command="b 25"),
|
355
579
|
),
|
356
|
-
("cast ranks(123::) b 25", Cast(ranks=range(123, sys.maxsize), command="b 25")),
|
357
|
-
("cast ranks(:123:) b 25", Cast(ranks=range(0, 123), command="b 25")),
|
358
|
-
("cast ranks(:456:123) b 25", Cast(ranks=range(0, 456, 123), command="b 25")),
|
359
580
|
(
|
360
|
-
"cast ranks(456
|
361
|
-
Cast(ranks=range(456,
|
581
|
+
"cast debugee ranks(456:789:123) b 25",
|
582
|
+
Cast(actor_name="debugee", ranks=range(456, 789, 123), command="b 25"),
|
362
583
|
),
|
363
|
-
("cast ranks(123:456:) b 25", Cast(ranks=range(123, 456), command="b 25")),
|
364
584
|
(
|
365
|
-
"cast ranks(
|
366
|
-
Cast(
|
585
|
+
"cast debugee ranks(dim1=123) up 2",
|
586
|
+
Cast(actor_name="debugee", ranks={"dim1": 123}, command="up 2"),
|
367
587
|
),
|
368
|
-
("cast ranks(dim1=123) up 2", Cast(ranks={"dim1": 123}, command="up 2")),
|
369
588
|
(
|
370
|
-
"cast ranks(dim1=123, dim2=(12,34,56), dim3=15::2) up 2",
|
589
|
+
"cast debugee ranks(dim1=123, dim2=(12,34,56), dim3=15::2) up 2",
|
371
590
|
Cast(
|
591
|
+
actor_name="debugee",
|
372
592
|
ranks={
|
373
593
|
"dim1": 123,
|
374
594
|
"dim2": [12, 34, 56],
|
@@ -387,29 +607,37 @@ async def test_debug_command_parser_valid_inputs(user_input, expected_output):
|
|
387
607
|
"invalid_input",
|
388
608
|
[
|
389
609
|
"",
|
390
|
-
"
|
610
|
+
"a",
|
391
611
|
"attach",
|
392
|
-
"
|
393
|
-
"
|
394
|
-
"
|
395
|
-
"
|
396
|
-
"
|
397
|
-
"cast ranks(
|
398
|
-
"cast
|
399
|
-
"
|
400
|
-
"cast
|
401
|
-
"cast ranks(
|
402
|
-
"cast ranks(
|
403
|
-
"cast ranks(
|
404
|
-
"cast ranks(:
|
405
|
-
"cast ranks(1
|
406
|
-
"cast ranks(
|
407
|
-
"cast ranks(
|
408
|
-
"cast ranks(
|
409
|
-
"cast ranks(
|
410
|
-
"cast ranks(
|
411
|
-
"cast ranks(
|
412
|
-
"cast ranks(
|
612
|
+
"a actor",
|
613
|
+
"attach actor",
|
614
|
+
"attacha actor 1" "attch actor 1",
|
615
|
+
"attach actor 1abc",
|
616
|
+
"attach actor 1 a",
|
617
|
+
"cast ranks(123) b 25",
|
618
|
+
"cast ranks(123) b 25",
|
619
|
+
"castactor ranks(123) b 25",
|
620
|
+
"cast actor rnks(123) b 25",
|
621
|
+
"cast actor ranks() b 25",
|
622
|
+
"cast actor ranks(1ab) b 25",
|
623
|
+
"cast actor ranks(1,a,3) b 25",
|
624
|
+
"cast actor ranks(a:2:4) b 25",
|
625
|
+
"cast actor ranks(1,2,3",
|
626
|
+
"cast actor ranks(1,2,3)) b 25",
|
627
|
+
"cast actor ranks(1,) b 25",
|
628
|
+
"cast actor ranks(1,2,) b 25",
|
629
|
+
"cast actor ranks(,1,2) b 25",
|
630
|
+
"cast actor ranks(1,,2) b 25",
|
631
|
+
"cast actor ranks(:::) b 25",
|
632
|
+
"cast actor ranks(:123::) b 25",
|
633
|
+
"cast actor ranks(1:2:3,4) b 25",
|
634
|
+
"cast actor ranks(dim1=) b 25",
|
635
|
+
"cast actor ranks(dim1=123, dim2=) b 25",
|
636
|
+
"cast actor ranks(dim1=123, dim2=(12,34,56) b 25",
|
637
|
+
"cast actor ranks(dim1=123, dim2=(,12,34,56) b 25",
|
638
|
+
"cast actor ranks(dim1=123, dim2=(12,,34,56) b 25",
|
639
|
+
"cast actor ranks(dim1=123, dim2=(12,34,56), dim3=15::2 b 25",
|
640
|
+
"cast actor ranks(dim1=123,) b 25",
|
413
641
|
],
|
414
642
|
)
|
415
643
|
async def test_debug_command_parser_invalid_inputs(invalid_input):
|