torchmonarch-nightly 2025.6.12__cp310-cp310-manylinux2014_x86_64.whl → 2025.6.13__cp310-cp310-manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- monarch/_rust_bindings.so +0 -0
- monarch/_testing.py +50 -18
- monarch/actor_mesh.py +27 -1
- monarch/bootstrap_main.py +1 -20
- monarch/builtins/random.py +4 -5
- monarch/common/client.py +15 -1
- monarch/debugger.py +377 -0
- monarch/mesh_controller.py +71 -13
- monarch/monarch_controller +0 -0
- monarch/pdb_wrapper.py +135 -0
- monarch/telemetry.py +19 -0
- tests/test_coalescing.py +1 -1
- tests/test_controller.py +12 -2
- tests/test_python_actors.py +150 -0
- tests/test_remote_functions.py +1 -1
- {torchmonarch_nightly-2025.6.12.dist-info → torchmonarch_nightly-2025.6.13.dist-info}/METADATA +1 -1
- {torchmonarch_nightly-2025.6.12.dist-info → torchmonarch_nightly-2025.6.13.dist-info}/RECORD +21 -18
- {torchmonarch_nightly-2025.6.12.dist-info → torchmonarch_nightly-2025.6.13.dist-info}/WHEEL +0 -0
- {torchmonarch_nightly-2025.6.12.dist-info → torchmonarch_nightly-2025.6.13.dist-info}/entry_points.txt +0 -0
- {torchmonarch_nightly-2025.6.12.dist-info → torchmonarch_nightly-2025.6.13.dist-info}/licenses/LICENSE +0 -0
- {torchmonarch_nightly-2025.6.12.dist-info → torchmonarch_nightly-2025.6.13.dist-info}/top_level.txt +0 -0
monarch/_rust_bindings.so
CHANGED
Binary file
monarch/_testing.py
CHANGED

@@ -10,7 +10,7 @@ import logging
 import tempfile
 import time
 from contextlib import contextmanager, ExitStack
-from typing import Callable, Generator, Optional
+from typing import Any, Callable, Dict, Generator, Literal, Optional

 import monarch_supervisor
 from monarch.common.client import Client
@@ -18,6 +18,8 @@ from monarch.common.device_mesh import DeviceMesh
 from monarch.common.invocation import DeviceException, RemoteException
 from monarch.common.shape import NDSlice
 from monarch.controller.backend import ProcessBackend
+from monarch.mesh_controller import spawn_tensor_engine
+from monarch.proc_mesh import proc_mesh, ProcMesh
 from monarch.python_local_mesh import PythonLocalContext
 from monarch.rust_local_mesh import (
     local_mesh,
@@ -50,6 +52,7 @@ class TestingContext:
         self.cleanup = ExitStack()
         self._py_process_cache = {}
         self._rust_process_cache = None
+        self._proc_mesh_cache: Dict[Any, ProcMesh] = {}

     @contextmanager
     def _get_context(self, num_hosts, gpu_per_host):
@@ -75,16 +78,14 @@ class TestingContext:

     @contextmanager
     def local_py_device_mesh(
-        self,
+        self,
+        num_hosts,
+        gpu_per_host,
     ) -> Generator[DeviceMesh, None, None]:
         ctx, hosts, processes = self._processes(num_hosts, gpu_per_host)
         dm = world_mesh(ctx, hosts, gpu_per_host, _processes=processes)
         try:
-            if activate:
-                with dm.activate():
-                    yield dm
-            else:
-                yield dm
+            yield dm
             dm.client.shutdown(destroy_pg=False)
         except Exception:
             # abnormal exit, so we just make sure we do not try to communicate in destructors,
@@ -97,7 +98,6 @@ class TestingContext:
         self,
         num_hosts,
         gpu_per_host,
-        activate: bool = True,
         controller_params=None,
     ) -> Generator[DeviceMesh, None, None]:
         # Create a new system and mesh for test.
@@ -115,11 +115,7 @@ class TestingContext:
             controller_params=controller_params,
         ) as dm:
             try:
-                if activate:
-                    with dm.activate():
-                        yield dm
-                else:
-                    yield dm
+                yield dm
                 dm.exit()
             except Exception:
                 dm.client._shutdown = True
@@ -129,21 +125,57 @@ class TestingContext:
             # pyre-ignore: Undefined attribute
            dm.client.inner._actor.stop()

+    @contextmanager
+    def local_engine_on_proc_mesh(
+        self,
+        num_hosts,
+        gpu_per_host,
+    ) -> Generator[DeviceMesh, None, None]:
+        key = (num_hosts, gpu_per_host)
+        if key not in self._proc_mesh_cache:
+            self._proc_mesh_cache[key] = proc_mesh(
+                hosts=num_hosts, gpus=gpu_per_host
+            ).get()
+
+        dm = spawn_tensor_engine(self._proc_mesh_cache[key])
+        dm = dm.rename(hosts="host", gpus="gpu")
+        try:
+            yield dm
+            dm.exit()
+        except Exception as e:
+            # abnormal exit, so we just make sure we do not try to communicate in destructors,
+            # but we do notn wait for workers to exit since we do not know what state they are in.
+            dm.client._shutdown = True
+            raise
+
     @contextmanager
     def local_device_mesh(
-        self,
+        self,
+        num_hosts,
+        gpu_per_host,
+        activate=True,
+        backend: Literal["py", "rs", "mesh"] = "py",
+        controller_params=None,
     ) -> Generator[DeviceMesh, None, None]:
         start = time.time()
-        if
+        if backend == "rs":
             generator = self.local_rust_device_mesh(
-                num_hosts, gpu_per_host,
+                num_hosts, gpu_per_host, controller_params=controller_params
             )
+        elif backend == "py":
+            generator = self.local_py_device_mesh(num_hosts, gpu_per_host)
+        elif backend == "mesh":
+            generator = self.local_engine_on_proc_mesh(num_hosts, gpu_per_host)
         else:
-
+            raise ValueError(f"invalid backend: {backend}")
         with generator as dm:
             end = time.time()
             logging.info("initialized mesh in {:.2f}s".format(end - start))
-
+            if activate:
+                with dm.activate():
+                    yield dm
+            else:
+                yield dm
             start = time.time()
             end = time.time()
             logging.info("shutdown mesh in {:.2f}s".format(end - start))
monarch/actor_mesh.py
CHANGED

@@ -15,6 +15,7 @@ import inspect
 import itertools
 import logging
 import random
+import sys
 import traceback

 from dataclasses import dataclass
@@ -37,6 +38,7 @@ from typing import (
     ParamSpec,
     Tuple,
     Type,
+    TYPE_CHECKING,
     TypeVar,
 )

@@ -57,6 +59,10 @@ from monarch._rust_bindings.monarch_hyperactor.shape import Point as HyPoint, Shape

 from monarch.common.pickle_flatten import flatten, unflatten
 from monarch.common.shape import MeshTrait, NDSlice
+from monarch.pdb_wrapper import remote_breakpointhook
+
+if TYPE_CHECKING:
+    from monarch.debugger import DebugClient

 logger: logging.Logger = logging.getLogger(__name__)

@@ -519,7 +525,14 @@ class _Actor:
             enter_span(
                 the_method.__module__, message.method, str(ctx.mailbox.actor_id)
             )
-            result = await the_method(self.instance, *args, **kwargs)
+            try:
+                result = await the_method(self.instance, *args, **kwargs)
+            except Exception as e:
+                logging.critical(
+                    "Unahndled exception in actor endpoint",
+                    exc_info=e,
+                )
+                raise e
             exit_span()
             return result

@@ -624,6 +637,19 @@ class Actor(MeshTrait):
         "actor implementations are not meshes, but we can't convince the typechecker of it..."
     )

+    @endpoint
+    async def _set_debug_client(self, client: "DebugClient") -> None:
+        point = MonarchContext.get().point
+        # For some reason, using a lambda instead of functools.partial
+        # confuses the pdb wrapper implementation.
+        sys.breakpointhook = functools.partial(  # pyre-ignore
+            remote_breakpointhook,
+            point.rank,
+            point.shape.coordinates(point.rank),
+            MonarchContext.get().mailbox.actor_id,
+            client,
+        )
+

 class ActorMeshRef(MeshTrait):
     def __init__(
monarch/bootstrap_main.py
CHANGED

@@ -30,28 +30,9 @@ def invoke_main():
     # behavior of std out as if it were a terminal.
     sys.stdout.reconfigure(line_buffering=True)
     global bootstrap_main
-    from monarch._rust_bindings.hyperactor_extension.telemetry import (  # @manual=//monarch/monarch_extension:monarch_extension
-        forward_to_tracing,
-    )

     # TODO: figure out what from worker_main.py we should reproduce here.
-
-    class TracingForwarder(logging.Handler):
-        def emit(self, record: logging.LogRecord) -> None:
-            try:
-                forward_to_tracing(
-                    record.getMessage(),
-                    record.filename or "",
-                    record.lineno or 0,
-                    record.levelno,
-                )
-            except AttributeError:
-                forward_to_tracing(
-                    record.__str__(),
-                    record.filename or "",
-                    record.lineno or 0,
-                    record.levelno,
-                )
+    from monarch.telemetry import TracingForwarder

     if os.environ.get("MONARCH_ERROR_DURING_BOOTSTRAP_FOR_TESTING") == "1":
         raise RuntimeError("Error during bootstrap for testing")
monarch/builtins/random.py
CHANGED

@@ -16,11 +16,6 @@ def set_manual_seed_remote(seed: int, process_idx: int = 0) -> None:
     torch.manual_seed(seed ^ process_idx)


-@remote(propagate=lambda: 0)
-def initial_seed_remote() -> int:
-    return torch.initial_seed()
-
-
 @remote(propagate=lambda: torch.zeros(1))
 def get_rng_state_remote() -> torch.Tensor:
     return torch.get_rng_state()
@@ -67,3 +62,7 @@ def get_rng_state_all_cuda_remote() -> list[torch.Tensor]:
 @remote(propagate="inspect")
 def set_rng_state_all_cuda_remote(states: list[torch.Tensor]) -> None:
     torch.cuda.set_rng_state_all(states)
+
+
+# initial_seed may sometimes return a uint64 which currenly can't be unwrapped by the framework
+# def initial_seed_remote() -> int: ...
monarch/common/client.py
CHANGED

@@ -103,6 +103,13 @@ class Client:
         # workers.
         self.last_processed_seq = -1

+        # an error that we have received but know for certain has not
+        # been propagated to a future. This will be reported on shutdown
+        # to avoid hiding the error. This is best effort: we only keep
+        # the error until the point the a future is dependent on
+        # _any_ error, not particularly the tracked one.
+        self._pending_shutdown_error = None
+
         self.recorder = Recorder()

         self.pending_results: Dict[
@@ -174,6 +181,8 @@ class Client:
         destroy_pg: bool = True,
         error_reason: Optional[RemoteException | DeviceException | Exception] = None,
     ) -> None:
+        if self.has_shutdown:
+            return
         logger.info("shutting down the client gracefully")

         atexit.unregister(self._atexit)
@@ -303,6 +312,7 @@ class Client:

         if error is not None:
             logging.info("Received error for seq %s: %s", seq, error)
+            self._pending_shutdown_error = error
             # We should not have set result if we have an error.
             assert result is None
             if not isinstance(error, RemoteException):
@@ -326,7 +336,11 @@ class Client:

         fut, _ = self.pending_results[seq]
         if fut is not None:
-
+            if error is None:
+                fut._set_result(result)
+            else:
+                fut._set_result(error)
+                self._pending_shutdown_error = None
         elif result is not None:
             logger.debug(f"{seq}: unused result {result}")
         elif error is not None:
monarch/debugger.py
ADDED

@@ -0,0 +1,377 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import asyncio
+import logging
+import sys
+from dataclasses import dataclass
+from typing import Dict, List, Tuple, Union
+
+from monarch._rust_bindings.monarch_hyperactor.proc import ActorId
+from monarch.actor_mesh import Actor, endpoint
+
+from monarch.pdb_wrapper import DebuggerWrite
+
+from monarch.proc_mesh import local_proc_mesh
+from tabulate import tabulate
+
+
+logger = logging.getLogger(__name__)
+
+
+CANCEL_TOKEN = object()
+
+
+async def _debugger_input(prompt=""):
+    return await asyncio.to_thread(input, prompt)
+
+
+def _debugger_output(msg):
+    sys.stdout.write(msg)
+    sys.stdout.flush()
+
+
+@dataclass
+class DebugSessionInfo:
+    rank: int
+    coords: Dict[str, int]
+    hostname: str
+    actor_id: ActorId
+    function: str | None
+    lineno: int | None
+
+
+class DebugSession:
+    """Represents a single session with a remote debugger."""
+
+    def __init__(
+        self, rank: int, coords: Dict[str, int], hostname: str, actor_id: ActorId
+    ):
+        self.rank = rank
+        self.coords = coords
+        self.hostname = hostname
+        self.actor_id = actor_id
+        self._active = False
+        self._message_queue = asyncio.Queue()
+        self._task = None
+        self._pending_send_to_actor = asyncio.Queue()
+        self._outputs_since_last_input = []
+        self._function_lineno = None
+        self._need_read = False
+
+    async def _event_loop(self, line=None, suppress_output=False):
+        if not suppress_output:
+            # If the user had previously attached to this debug session,
+            # then it would have printed various messages from the
+            # message queue. When the user re-attaches, we want to
+            # print out all of the output that was printed since the
+            # last command sent to this session.
+            for output in self._outputs_since_last_input:
+                _debugger_output(output.payload.decode())
+
+        while True:
+            # When the user inputs "detach", it uses up a "read" message
+            # without actually responding to the actor being debugged. We
+            # can't manually reinsert the "read" message into the message queue,
+            # so instead the self._need_read flag indicates there's an additional
+            # "read" that we need to respond to.
+            if self._need_read:
+                self._need_read = False
+                message = "read"
+            else:
+                message = await self._message_queue.get()
+            if message == "detach":
+                # Return to the main outer debug loop.
+                break
+            elif message == "read":
+                break_after = False
+                if line is not None:
+                    break_after = True
+                else:
+                    line = await _debugger_input()
+                if line.strip("\n") == "detach":
+                    self._need_read = True
+                    break
+                else:
+                    self._outputs_since_last_input = []
+                    await self._pending_send_to_actor.put((line + "\n").encode())
+                    line = None
+                    if break_after:
+                        break
+            elif message[0] == "write":
+                output = message[1]
+                # If the user sees this output but then detaches from the session,
+                # its useful to store all outputs since the last input so that
+                # they can be printed again when the user re-attaches.
+                self._outputs_since_last_input.append(output)
+                if not suppress_output:
+                    _debugger_output(output.payload.decode())
+
+        if not suppress_output:
+            print(
+                f"Detaching from debug session for rank {self.rank} ({self.hostname})"
+            )
+
+    def get_info(self):
+        function = lineno = None
+        if self._function_lineno is not None:
+            function, lineno = self._function_lineno
+        return DebugSessionInfo(
+            self.rank, self.coords, self.hostname, self.actor_id, function, lineno
+        )
+
+    async def attach(self, line=None, suppress_output=False):
+        self._active = True
+        if not suppress_output:
+            print(f"Attached to debug session for rank {self.rank} ({self.hostname})")
+        self._task = asyncio.create_task(self._event_loop(line, suppress_output))
+        await self._task
+        if not suppress_output:
+            print(f"Detached from debug session for rank {self.rank} ({self.hostname})")
+        self._active = False
+
+    async def detach(self):
+        if self._active:
+            await self._message_queue.put("detach")
+
+    async def debugger_read(self, size: int) -> DebuggerWrite:
+        await self._message_queue.put("read")
+        input_data = await self._pending_send_to_actor.get()
+        if len(input_data) > size:
+            input_data = input_data[:size]
+        return DebuggerWrite(input_data, None, None)
+
+    async def debugger_write(self, write: DebuggerWrite) -> None:
+        if write.function is not None and write.lineno is not None:
+            self._function_lineno = (write.function, write.lineno)
+        await self._message_queue.put(("write", write))
+
+
+class DebugCommand:
+    @staticmethod
+    def parse(line: str) -> Union["DebugCommand", None]:
+        parts = line.strip("\n").split(" ")
+        if len(parts) == 0:
+            return None
+        command = parts[0]
+        match command:
+            case "attach":
+                return Attach._parse(parts)
+            case "list":
+                return ListCommand()
+            case "quit":
+                return Quit()
+            case "cast":
+                return Cast._parse(parts)
+            case "help":
+                return Help()
+            case "continue":
+                return Continue()
+            case _:
+                print(
+                    f"Unknown command {command}. Expected: attach | list | quit | cast | continue | help"
+                )
+                return None
+
+
+@dataclass
+class Attach(DebugCommand):
+    rank: int
+
+    @classmethod
+    def _parse(cls, parts: List[str]) -> "Attach":
+        if len(parts) != 2:
+            raise ValueError("Invalid attach command. Expected: attach <rank>")
+        try:
+            rank = int(parts[1])
+        except ValueError:
+            raise ValueError(f"Invalid rank {parts[1]}. Expected: int")
+        return cls(rank)
+
+
+class ListCommand(DebugCommand):
+    pass
+
+
+class Quit(DebugCommand):
+    pass
+
+
+class Help(DebugCommand):
+    pass
+
+
+class Continue(DebugCommand):
+    pass
+
+
+@dataclass
+class Cast(DebugCommand):
+    ranks: List[int] | None
+    command: str
+
+    @classmethod
+    def _parse(cls, parts: List[str]) -> "Cast":
+        if len(parts) < 3:
+            raise ValueError(
+                "Invalid cast command. Expected: cast {<r0,r1,...> | *} <command>"
+            )
+        str_ranks = parts[1]
+        command = " ".join(parts[2:])
+        if str_ranks == "*":
+            return cls(None, command)
+        else:
+            str_ranks = str_ranks.split(",")
+            if len(str_ranks) == 0:
+                raise ValueError(
+                    "Invalid rank list for cast. Expected at least one rank."
+                )
+            ranks = []
+            for rank in str_ranks:
+                try:
+                    ranks.append(int(rank))
+                except ValueError:
+                    raise ValueError(f"Invalid rank {rank}. Expected: int")
+            return cls(ranks, command)
+
+
+class DebugClient(Actor):
+    """
+    Single actor for both remote debuggers and users to talk to.
+
+    Handles multiple sessions simultanesouly
+    """
+
+    def __init__(self) -> None:
+        self.sessions = {}  # rank -> DebugSession
+
+    @endpoint
+    async def wait_pending_session(self):
+        while len(self.sessions) == 0:
+            await asyncio.sleep(1)
+
+    @endpoint
+    async def list(self) -> List[Tuple[int, Dict[str, int], str, ActorId, str, int]]:
+        table_data = []
+        for _, session in self.sessions.items():
+            info = session.get_info()
+            table_data.append(
+                (
+                    info.rank,
+                    info.coords,
+                    info.hostname,
+                    info.actor_id,
+                    info.function,
+                    info.lineno,
+                )
+            )
+        table_data = sorted(table_data, key=lambda r: r[0])
+
+        headers = ["Rank", "Coords", "Hostname", "Actor ID", "Function", "Line No."]
+        print(tabulate(table_data, headers=headers, tablefmt="grid"))
+
+        return table_data
+
+    @endpoint
+    async def enter(self) -> None:
+        # pyre-ignore
+        await getattr(self, "list")._method(self)  # noqa
+
+        while True:
+            try:
+                user_input = await _debugger_input("monarch_dbg> ")
+                command = DebugCommand.parse(user_input)
+                if isinstance(command, Help):
+                    print("monarch_dbg commands:")
+                    print("\tattach <rank> - attach to a debug session")
+                    print("\tlist - list all debug sessions")
+                    print("\tquit - exit the debugger, leaving all sessions in place")
+                    print(
+                        "\tcast {<r0,r1,...> | *} <command> - send a command to a comma-separated list of ranks, or all ranks"
+                    )
+                    print(
+                        "\tcontinue - tell all ranks to continue execution, then exit the debugger"
+                    )
+                    print("\thelp - print this help message")
+                elif isinstance(command, Attach):
+                    if command.rank not in self.sessions:
+                        print(f"No debug session for rank {command.rank}")
+                    else:
+                        await self.sessions[command.rank].attach()
+                elif isinstance(command, ListCommand):
+                    await getattr(self, "list")._method(self)  # noqa
+                elif isinstance(command, Continue):
+                    # Make sure all ranks have exited their debug sessions.
+                    # If we sent "quit", it would raise BdbQuit, crashing
+                    # the process, which probably isn't what we want.
+                    while len(self.sessions) > 0:
+                        tasks = []
+                        for rank in self.sessions:
+                            tasks.append(
+                                self.sessions[rank].attach("c", suppress_output=True)
+                            )
+                        await asyncio.gather(*tasks)
+                    return
+                elif isinstance(command, Quit):
+                    return
+                elif isinstance(command, Cast):
+                    if command.ranks is None:
+                        ranks = self.sessions.keys()
+                    else:
+                        ranks = command.ranks
+                    tasks = []
+                    for rank in ranks:
+                        if rank in self.sessions:
+                            tasks.append(
+                                self.sessions[rank].attach(
+                                    command.command,
+                                    suppress_output=True,
+                                )
+                            )
+                        else:
+                            print(f"No debug session for rank {rank}")
+                    await asyncio.gather(*tasks)
+            except Exception as e:
+                print(f"Error processing command: {e}")
+
+    ##########################################################################
+    # Debugger APIs
+    #
+    # These endpoints are called by the remote debuggers to establish sessions
+    # and communicate with them.
+    @endpoint
+    async def debugger_session_start(
+        self, rank: int, coords: Dict[str, int], hostname: str, actor_id: ActorId
+    ) -> None:
+        # Create a session if it doesn't exist
+        if rank not in self.sessions:
+            self.sessions[rank] = DebugSession(rank, coords, hostname, actor_id)
+
+    @endpoint
+    async def debugger_session_end(self, rank: int) -> None:
+        """Detach from the current debug session."""
+        session = self.sessions.pop(rank)
+        await session.detach()
+
+    @endpoint
+    async def debugger_read(self, rank: int, size: int) -> DebuggerWrite | str:
+        """Read from the debug session for the given rank."""
+        session = self.sessions[rank]
+
+        return await session.debugger_read(size)
+
+    @endpoint
+    async def debugger_write(self, rank: int, write: DebuggerWrite) -> None:
+        """Write to the debug session for the given rank."""
+        session = self.sessions[rank]
+        await session.debugger_write(write)
+
+
+async def init_debugging(actor_mesh: Actor) -> DebugClient:
+    debugger_proc_mesh = await local_proc_mesh(gpus=1, hosts=1)
+    debug_client_mesh = await debugger_proc_mesh.spawn("debug_client", DebugClient)
+    await actor_mesh._set_debug_client.call(debug_client_mesh)
+    return debug_client_mesh
monarch/mesh_controller.py
CHANGED

@@ -4,7 +4,10 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.

+import atexit
 import logging
+import os
+import time
 import traceback
 from collections import deque
 from logging import Logger
@@ -22,6 +25,8 @@ from monarch._rust_bindings.monarch_hyperactor.proc import (  # @manual=//monarc
     ActorId,
 )
 from monarch._rust_bindings.monarch_hyperactor.proc_mesh import ProcMesh as HyProcMesh
+from monarch._rust_bindings.monarch_hyperactor.shape import Point
+
 from monarch._rust_bindings.monarch_messages.debugger import DebuggerAction
 from monarch.common.client import Client
 from monarch.common.controller_api import LogMessage, MessageResult
@@ -29,6 +34,7 @@ from monarch.common.device_mesh import DeviceMesh, no_mesh
 from monarch.common.invocation import DeviceException, RemoteException
 from monarch.controller.debugger import read as debugger_read, write as debugger_write
 from monarch.proc_mesh import ProcMesh
+from monarch.rust_local_mesh import _get_worker_exec_info
 from pyre_extensions import none_throws

 logger: Logger = logging.getLogger(__name__)
@@ -72,18 +78,8 @@ class Controller(_Controller):
     def drain_and_stop(
         self,
     ) -> List[LogMessage | MessageResult | client.DebuggerMessage]:
-
-
-        for msg in self._drain_and_stop():
-            if isinstance(msg, client.WorkerResponse):
-                results.append(_worker_response_to_result(msg))
-            elif isinstance(msg, client.LogMessage):
-                results.append(LogMessage(msg.level, msg.message))
-            elif isinstance(msg, client.DebuggerMessage):
-                results.append(msg)
-            else:
-                raise RuntimeError(f"Unexpected message type {type(msg)}")
-        return results
+        self._drain_and_stop()
+        return []

     def _run_debugger_loop(self, message: client.DebuggerMessage) -> None:
         if not isinstance(message.action, DebuggerAction.Paused):
@@ -192,13 +188,75 @@ def _worker_response_to_result(result: client.WorkerResponse) -> MessageResult:
     raise RuntimeError(f"Unknown exception type: {type(exc)}")


+def _initialize_env(worker_point: Point, proc_id: str) -> None:
+    worker_rank = worker_point.rank
+    try:
+        _, worker_env = _get_worker_exec_info()
+        local_rank = worker_point["gpus"]
+        gpus_per_host = worker_point.size("gpus")
+        num_worker_procs = len(worker_point.shape)
+        process_env = {
+            **worker_env,
+            "HYPERACTOR_MANAGED_SUBPROCESS": "1",
+            "CUDA_VISIBLE_DEVICES": str(local_rank),
+            "NCCL_HOSTID": f"{proc_id}_host_{worker_rank // gpus_per_host}",
+            # This is needed to avoid a hard failure in ncclx when we do not
+            # have backend topology info (eg. on RE).
+            "NCCL_IGNORE_TOPO_LOAD_FAILURE": "true",
+            "LOCAL_RANK": str(local_rank),
+            "RANK": str(worker_rank),
+            "WORLD_SIZE": str(num_worker_procs),
+            "LOCAL_WORLD_SIZE": str(gpus_per_host),
+        }
+        os.environ.update(process_env)
+    except Exception:
+        traceback.print_exc()
+        raise
+
+
+class MeshClient(Client):
+    def shutdown(
+        self,
+        destroy_pg: bool = True,
+        error_reason: Optional[RemoteException | DeviceException | Exception] = None,
+    ):
+        # return
+        if self.has_shutdown:
+            return
+        logger.info("shutting down the client gracefully")
+
+        atexit.unregister(self._atexit)
+        self._shutdown = True
+
+        # ensure all pending work is finished.
+        # all errors must be messaged back at this point
+        self.new_node_nocoalesce([], [], None, [])
+        self._request_status()
+
+        ttl = 60
+        start_time = time.time()
+        end_time = start_time + ttl
+        while ttl > 0 and self.last_assigned_seq > self.last_processed_seq:
+            ttl = end_time - time.time()
+            self.handle_next_message(ttl)
+            if self._pending_shutdown_error:
+                raise self._pending_shutdown_error
+
+        if ttl <= 0:
+            raise RuntimeError("shutdown timed out")
+
+        # we are not expecting anything more now, because we already
+        # waited for the responses
+        self.inner.drain_and_stop()
+
+
 def spawn_tensor_engine(proc_mesh: ProcMesh) -> DeviceMesh:
     # This argument to Controller
     # is currently only used for debug printing. It should be fixed to
     # report the proc ID instead of the rank it currently does.
     gpus = proc_mesh.sizes.get("gpus", 1)
     backend_ctrl = Controller(proc_mesh._proc_mesh)
-    client =
+    client = MeshClient(backend_ctrl, proc_mesh.size(), gpus)
     dm = DeviceMesh(
         client,
         NDSlice.new_row_major(list(proc_mesh.sizes.values())),
monarch/monarch_controller
CHANGED
Binary file
monarch/pdb_wrapper.py
ADDED

@@ -0,0 +1,135 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import bdb
+import inspect
+import io
+import pdb  # noqa
+import socket
+import sys
+from dataclasses import dataclass
+
+from typing import Dict, TYPE_CHECKING
+
+from monarch._rust_bindings.monarch_hyperactor.proc import ActorId
+
+if TYPE_CHECKING:
+    from monarch.debugger import DebugClient
+
+
+@dataclass
+class DebuggerWrite:
+    payload: bytes
+    function: str | None
+    lineno: int | None
+
+
+class PdbWrapper(pdb.Pdb):
+    def __init__(
+        self,
+        rank: int,
+        coords: Dict[str, int],
+        actor_id: ActorId,
+        client_ref: "DebugClient",
+        header: str | None = None,
+    ):
+        self.rank = rank
+        self.coords = coords
+        self.header = header
+        self.actor_id = actor_id
+        self.client_ref = client_ref
+        # pyre-ignore
+        super().__init__(stdout=WriteWrapper(self), stdin=ReadWrapper.create(self))
+        self._first = True
+
+    def setup(self, *args, **kwargs):
+        r = super().setup(*args, **kwargs)
+        if self._first:
+            self._first = False
+            # when we enter the debugger, we want to present the user's stack frame
+            # not the nested one inside session.run. This means that the local
+            # variables are what gets printed, etc. To do this
+            # we first execute up 2 to get to that frame.
+            self.do_up(2)
+        return r
+
+    def set_continue(self) -> None:
+        r = super().set_continue()
+        if not self.breaks:
+            # no more breakpoints so this debugger will not
+            # be used again, and we detach from the controller io.
+            self.client_ref.debugger_session_end.call_one(self.rank).get()
+            # break cycle with itself before we exit
+            self.stdin = sys.stdin
+            self.stdout = sys.stdout
+        return r
+
+    def set_trace(self):
+        self.client_ref.debugger_session_start.call_one(
+            self.rank, self.coords, socket.getfqdn(socket.gethostname()), self.actor_id
+        ).get()
+        if self.header:
+            self.message(self.header)
+        super().set_trace()
+
+
+class ReadWrapper(io.RawIOBase):
+    def __init__(self, session: "PdbWrapper"):
+        self.session = session
+
+    def readinto(self, b):
+        response = self.session.client_ref.debugger_read.call_one(
+            self.session.rank, len(b)
+        ).get()
+        if response == "detach":
+            # this gets injected by the worker event loop to
+            # get the worker thread to exit on an Exit command.
+            raise bdb.BdbQuit
+        assert isinstance(response, DebuggerWrite) and len(response.payload) <= len(b)
+        b[: len(response.payload)] = response.payload
+        return len(response.payload)
+
+    def readable(self) -> bool:
+        return True
+
+    @classmethod
+    def create(cls, session: "PdbWrapper"):
+        return io.TextIOWrapper(io.BufferedReader(cls(session)))
+
+
+class WriteWrapper:
+    def __init__(self, session: "PdbWrapper"):
+        self.session = session
+
+    def writable(self) -> bool:
+        return True
+
+    def write(self, s: str):
+        function = None
+        lineno = None
+        if self.session.curframe is not None:
+            # pyre-ignore
+            function = f"{inspect.getmodulename(self.session.curframe.f_code.co_filename)}.{self.session.curframe.f_code.co_name}"
+            # pyre-ignore
+            lineno = self.session.curframe.f_lineno
+        self.session.client_ref.debugger_write.call_one(
+            self.session.rank,
+            DebuggerWrite(
+                s.encode(),
+                function,
+                lineno,
+            ),
+        ).get()
+
+    def flush(self):
+        pass
+
+
+def remote_breakpointhook(
+    rank: int, coords: Dict[str, int], actor_id: ActorId, client_ref: "DebugClient"
+):
+    ds = PdbWrapper(rank, coords, actor_id, client_ref)
+    ds.set_trace()
monarch/telemetry.py
ADDED

@@ -0,0 +1,19 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
+
+
+import logging
+
+from monarch._rust_bindings.hyperactor_extension.telemetry import (  # @manual=//monarch/monarch_extension:monarch_extension
+    forward_to_tracing,
+)
+
+
+class TracingForwarder(logging.Handler):
+    def emit(self, record: logging.LogRecord) -> None:
+        forward_to_tracing(record)
tests/test_coalescing.py
CHANGED
tests/test_controller.py
CHANGED

@@ -96,7 +96,7 @@ remote_sleep = remote("time.sleep", propagate="inspect")
     torch.cuda.device_count() < 2,
     reason="Not enough GPUs, this test requires at least 2 GPUs",
 )
-@pytest.mark.parametrize("backend_type", [BackendType.PY, BackendType.RS])
+@pytest.mark.parametrize("backend_type", [BackendType.PY, BackendType.RS, "mesh"])
 # Set global timeout--sandcastle's timeout is 600s. A test that sandcastle times
 # out is not counted as a failure, so we set a more restrictive timeout to
 # ensure we see a hard failure in CI.
@@ -114,7 +114,7 @@ class TestController:
             N,
             gpu_per_host,
             activate,
-
+            backend=str(backend_type),
         )

     def test_errors(self, backend_type):
@@ -512,6 +512,7 @@ class TestController:
         monarch.random.make_deterministic()
         for device in ("cpu", "cuda"):
             a = monarch.random.get_state()
+            monarch.inspect(a)
            first = torch.rand(1, device=device)
             monarch.random.set_state(a)
             second = torch.rand(1, device=device)
@@ -601,6 +602,15 @@ class TestController:
         assert torch.equal(moved_tensor_a, torch.tensor([1.0]))
         assert torch.equal(moved_tensor_b, torch.tensor([2.0]))

+    def test_hanging_error(self, backend_type):
+        if backend_type != "mesh":
+            pytest.skip("only relevant for mesh backend")
+        with self.local_device_mesh(2, 2, backend_type) as device_mesh:
+            remote(lambda: torch.rand(3) + torch.rand(4), propagate=lambda: None)()
+
+            with pytest.raises(Exception, match="The size of tensor"):
+                device_mesh.client.shutdown()
+
     def test_slice_mesh_pytree(self, backend_type):
         with self.local_device_mesh(2, 2, backend_type) as device_mesh:
             a = device_mesh.rank(("host")) + torch.zeros((1,), device="cuda")
tests/test_python_actors.py
CHANGED

@@ -4,8 +4,12 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.

+import asyncio
 import operator
+import os
+import re
 from types import ModuleType
+from unittest.mock import AsyncMock, patch

 import monarch

@@ -20,7 +24,9 @@ from monarch.actor_mesh import (
     current_rank,
     current_size,
     endpoint,
+    MonarchContext,
 )
+from monarch.debugger import init_debugging

 from monarch.mesh_controller import spawn_tensor_engine

@@ -384,6 +390,10 @@ def test_rust_binding_modules_correct() -> None:
     check(bindings, "monarch._rust_bindings")


+@pytest.mark.skipif(
+    torch.cuda.device_count() < 2,
+    reason="Not enough GPUs, this test requires at least 2 GPUs",
+)
 def test_tensor_engine() -> None:
     pm = proc_mesh(gpus=2).get()

@@ -399,3 +409,143 @@ def test_tensor_engine() -> None:
     assert torch.allclose(torch.zeros(3, 4), f)

     dm.exit()
+
+
+def _debugee_actor_internal(rank):
+    if rank == 0:
+        breakpoint()  # noqa
+        rank += 1
+        return rank
+    elif rank == 1:
+        breakpoint()  # noqa
+        rank += 2
+        return rank
+    elif rank == 2:
+        breakpoint()  # noqa
+        rank += 3
+        raise ValueError("bad rank")
+    elif rank == 3:
+        breakpoint()  # noqa
+        rank += 4
+        return rank
+
+
+class DebugeeActor(Actor):
+    @endpoint
+    async def to_debug(self):
+        rank = MonarchContext.get().point.rank
+        return _debugee_actor_internal(rank)
+
+
+async def test_debug() -> None:
+    input_mock = AsyncMock()
+    input_mock.side_effect = [
+        "attach 1",
+        "n",
+        "n",
+        "n",
+        "n",
+        "detach",
+        "attach 1",
+        "detach",
+        "quit",
+        "cast 0,3 n",
+        "cast 0,3 n",
+        # Attaching to 0 and 3 ensures that when we call "list"
+        # the next time, their function/lineno info will be
+        # up-to-date.
+        "attach 0",
+        "detach",
+        "attach 3",
+        "detach",
+        "quit",
+        "attach 2",
+        "c",
+        "quit",
+        "continue",
+    ]
+
+    outputs = []
+
+    def _patch_output(msg):
+        nonlocal outputs
+        outputs.append(msg)
+
+    with patch("monarch.debugger._debugger_input", side_effect=input_mock), patch(
+        "monarch.debugger._debugger_output", new=_patch_output
+    ):
+        proc = await proc_mesh(hosts=2, gpus=2)
+        debugee = await proc.spawn("debugee", DebugeeActor)
+        debug_client = await init_debugging(debugee)
+
+        fut = debugee.to_debug.call()
+        await debug_client.wait_pending_session.call_one()
+        breakpoints = []
+        for i in range(10):
+            breakpoints = await debug_client.list.call_one()
+            if len(breakpoints) == 4:
+                break
+            await asyncio.sleep(1)
+            if i == 9:
+                raise RuntimeError("timed out waiting for breakpoints")
+
+        initial_linenos = {}
+        for i in range(len(breakpoints)):
+            rank, coords, _, _, function, lineno = breakpoints[i]
+            initial_linenos[rank] = lineno
+            assert rank == i
+            assert coords == {"hosts": rank % 2, "gpus": rank // 2}
+            assert function == "test_python_actors._debugee_actor_internal"
+            assert lineno == breakpoints[0][5] + 4 * rank
+
+        await debug_client.enter.call_one()
+
+        # Check that when detaching and re-attaching to a session, the last portion of the output is repeated
+        expected_last_output = [
+            r"--Return--",
+            r"\n",
+            r"> (/.*/)+test_python_actors.py\(\d+\)to_debug\(\)->3\n-> return _debugee_actor_internal\(rank\)",
+            r"\n",
+            r"\(Pdb\) ",
+        ]
+        output_len = len(expected_last_output)
+        assert outputs[-2 * output_len : -output_len] == outputs[-output_len:]
+        for real_output, expected_output in zip(
+            outputs[-output_len:], expected_last_output
+        ):
+            assert re.match(expected_output, real_output) is not None
+
+        breakpoints = await debug_client.list.call_one()
+        for i in range(len(breakpoints)):
+            if i == 1:
+                assert breakpoints[i][4] == "test_python_actors.to_debug"
+            else:
+                assert breakpoints[i][4] == "test_python_actors._debugee_actor_internal"
+                assert breakpoints[i][5] == initial_linenos[i]
+
+        await debug_client.enter.call_one()
+
+        breakpoints = await debug_client.list.call_one()
+        for i in range(len(breakpoints)):
+            if i == 1:
+                assert breakpoints[i][4] == "test_python_actors.to_debug"
+            elif i in (0, 3):
+                assert breakpoints[i][4] == "test_python_actors._debugee_actor_internal"
+                assert breakpoints[i][5] == initial_linenos[i] + 2
+            else:
+                assert breakpoints[i][4] == "test_python_actors._debugee_actor_internal"
+                assert breakpoints[i][5] == initial_linenos[i]
+
+        await debug_client.enter.call_one()
+
+        breakpoints = await debug_client.list.call_one()
+        assert len(breakpoints) == 3
+        for i, rank in enumerate((0, 1, 3)):
+            assert breakpoints[i][0] == rank
+
+        await debug_client.enter.call_one()
+        breakpoints = await debug_client.list.call_one()
+        assert len(breakpoints) == 0
+
+        with pytest.raises(monarch.actor_mesh.ActorError, match="ValueError: bad rank"):
+            await fut
tests/test_remote_functions.py
CHANGED
{torchmonarch_nightly-2025.6.12.dist-info → torchmonarch_nightly-2025.6.13.dist-info}/RECORD
RENAMED

@@ -1,19 +1,21 @@
 monarch/__init__.py,sha256=iUvWHc0-7Q2tovRoRxOIiA3TsefMXCbWl-jEfQ2djew,6897
-monarch/_rust_bindings.so,sha256=
-monarch/_testing.py,sha256=
-monarch/actor_mesh.py,sha256=
+monarch/_rust_bindings.so,sha256=FJb4gGPNDWqT1nPkxEYSX4hEsIbjb_v8Oa0RDwMcH5A,40302936
+monarch/_testing.py,sha256=jOIOG6jcZBzvEvG_DwSnwCkaMVXvSun6sJAG6nXemww,7859
+monarch/actor_mesh.py,sha256=8Ih3CIArLTyZmWSHppXm5N2WlAjmGXpaQhkkFtjJFxc,25351
 monarch/allocator.py,sha256=ylvYTf31o-PT385cYJPhi17uNbC4yl_RAraqD0fVe4g,4112
-monarch/bootstrap_main.py,sha256=
+monarch/bootstrap_main.py,sha256=RCUQhJk07yMFiKp6HzQuqZFUpkgsT9kVEyimiwjn6_E,1827
 monarch/cached_remote_function.py,sha256=kYdB6r4OHx_T_uX4q3tCNcp1t2DJwF8tPTIahUiT2pU,8785
+monarch/debugger.py,sha256=AdlvOG3X-9Pw9c1DLQYEy4vjEfh0ZtwtsNJEFLFzN8o,13312
 monarch/fetch.py,sha256=61jxo7sx4QNUTkc0_rF5NaJROen4tKbAaiIjrXWLOvg,1705
 monarch/future.py,sha256=lcdFEe7m1shYPPuvZ1RkS6JUIChEKGBWe3v7x_nu4Hg,731
 monarch/gradient_generator.py,sha256=Rl3dmXGceTdCc1mYBg2JciR88ywGPnW7TVkL86KwqEA,6366
 monarch/memory.py,sha256=ol86dBhFAJqg78iF25-BuK0wuwj1onR8FIioZ_B0gjw,1377
-monarch/mesh_controller.py,sha256=
-monarch/monarch_controller,sha256=
+monarch/mesh_controller.py,sha256=Xft2edk7rz8_PPe-iIUZ09P-j4JDPGADBGHBiuiZ7YY,10363
+monarch/monarch_controller,sha256=mE9pvcBDKwW_4zOZlO17PJDk7W6z5skzIX5rxHQfKOs,20238936
 monarch/notebook.py,sha256=zu9MKDFKf1-rCM2TqFSRJjMBeiWuKcJSyUFLvoZRQzs,25949
 monarch/opaque_module.py,sha256=oajOu_WD1hD4hxE8HDdO-tvWY7KDHWd7VaAhJEa5L2I,10446
 monarch/opaque_object.py,sha256=IVpll4pyuKZMo_EnPh4s0qnx8RlAcJrJ1yoLX6E75wQ,2782
+monarch/pdb_wrapper.py,sha256=gm46AZnfR4amH1vYFWnWivEv5MaU3Nb6KIWjSM8KjWM,4052
 monarch/proc_mesh.py,sha256=xoaReM9Ab9TWkesxedWSyyk4TMD0HLV88dQ8CQcbqTI,6892
 monarch/profiler.py,sha256=TQ9fnVM8H7smBWtYdB_6Irtzz8DBOmcp7U1T3wlUmco,4911
 monarch/python_local_mesh.py,sha256=YsureIzR9uGlNVrKd4vRghxOXBeYabkt9lICRErfRAI,3536
@@ -23,6 +25,7 @@ monarch/remote_class.py,sha256=-OAowzU1aDP6i4ik_SjXntVUC9h4dqAzgqwohkQ6Grc,4167
 monarch/rust_backend_mesh.py,sha256=1htC62of4MgFtkezWGlsxSFtKJdc0CIeqeSuOx7yu3M,9944
 monarch/rust_local_mesh.py,sha256=7ASptybn3wy4J7eoBc7LhGW4j4AA6bigl5Kuhyflw8s,47405
 monarch/sim_mesh.py,sha256=9wkS99L0EpG2Gldi-nzA-3ww7z__DQ7Qp2uReMfn188,12183
+monarch/telemetry.py,sha256=7JUZWaoD2Yn5Ae_7kNhkLFRBLYaSGfH071_m_qfVehI,525
 monarch/tensor_worker_main.py,sha256=Nbarl2sJKIddLeaRFsaUnqOerLHjzggUr9SqCr2_GYI,8300
 monarch/tensorboard.py,sha256=MnLgH5lbqeUJauEuirEgR6L_qYl2NGdtwZOWIAuOZao,2587
 monarch/world_mesh.py,sha256=GqZpFoVNJPxYa70rLYgv0vu8Vg1nXqx_GYERRb1E9Pc,975
@@ -34,7 +37,7 @@ monarch/_monarch/worker/debugger.py,sha256=JJZwRPTgQO2emz-hrMelkOSxJFIR3dV4ZA6e7
 monarch/_monarch/worker/logging.py,sha256=nJUkIuKhPqRZaNDOT7MVbFFjcITZQf_CiFRLFKJJqsw,3591
 monarch/builtins/__init__.py,sha256=QcfnHZGbc2qktBg7DyZt2ruE6VahnIt4S8lEZLHdJqU,443
 monarch/builtins/log.py,sha256=H1QkuVzwxyi36Zyv-XR0VN0QsNimBWwxE1__fjs0_2o,554
-monarch/builtins/random.py,sha256=
+monarch/builtins/random.py,sha256=wPbvscg7u53EXpMFo885fO2XOlsyjrNAJ4rBxLzfxdg,1839
 monarch/common/_C.pyi,sha256=kHY2G3ksMAjQJ6IcPb4F1bBh5knzw5RVVNhhBlEmwFU,314
 monarch/common/_C.so,sha256=gVDCDUQSKiPHwLPIpyxcRgiv8uF_quH1LpgI5Lhle9Y,715600
 monarch/common/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -43,7 +46,7 @@ monarch/common/_device_utils.py,sha256=gBpl23wMjppVAEzzj8U9HyX-B7Bs2_3ftiMAkzUS4
 monarch/common/_tensor_to_table.py,sha256=yRjCNwvtl188Z1Dwkx3ZU-Bh2mwYnQ0Lnue2RAztwvc,5753
 monarch/common/base_tensor.py,sha256=ujRzR6lWaeCdPv2JX0vCR-VsCWn-3SHaJIkZH1Sw9FQ,1159
 monarch/common/borrows.py,sha256=7KR62xoUat1T6FyADsdHsxVAVIJDvfJWUnPO-xx277U,5307
-monarch/common/client.py,sha256=
+monarch/common/client.py,sha256=axo37s_z17nYQGOZG5fi_0zUEJ_8qw7INjs-Kw2vaVo,24937
 monarch/common/constants.py,sha256=ohvsVYMpfeWopv3KXDAeHWDFLukwc-OY37VRxpKNBE8,300
 monarch/common/context_manager.py,sha256=GOeyaFbyCqvQmkJ0oI7q6IxRd8_0mVyYKZRccI8iaug,1067
 monarch/common/controller_api.py,sha256=djGkK5aSd-V6pBkr3uBCXbfJv3OKf2o2VbBXJgFF2WI,3202
@@ -133,16 +136,16 @@ tests/sleep_binary.py,sha256=XfLYaAfwm9xgzM-svs8fhAeFhwYIg6SyVEnx4e6wbUw,1009
 tests/test_actor_error.py,sha256=z3Sf4lteUggTryPLOhRKJ55v0MwVK3a7QN7-U2U9iJg,7484
 tests/test_alloc.py,sha256=D6DdQbtOZEvvnnc7LV-WyWFMk0Xb77eblH6Oz90zJTA,745
 tests/test_allocator.py,sha256=P11sQ95ADjzC_-CfPs3CEP80nP8sn7wW8vVPsmpSVoM,8164
-tests/test_coalescing.py,sha256
-tests/test_controller.py,sha256=
+tests/test_coalescing.py,sha256=JZ4YgQNlWWs7N-Z8KCCXQPANcuyyXEKjeHIXYbPnQhk,15606
+tests/test_controller.py,sha256=Rp_kW20zYT8ocsK5LX0Ha3LB9azS2LSKpp8n_dBlzVU,31384
 tests/test_device_mesh.py,sha256=DrbezYOM0thfP9MgLXb5-F0VoLOmSz5GR0GwjR_3bE4,5290
 tests/test_fault_tolerance.py,sha256=u4wmG1z5MZ6PY6us5zUZHJh2pUC3L7i0wsUfRDNHmxA,14144
 tests/test_future.py,sha256=cXzaNi2YDwVyjR541ScXmgktX1YFsKzbl8wep0DMVbk,3032
 tests/test_grad_generator.py,sha256=p4Pm4kMEeGldt2jUVAkGKCB0mLccKI28pltH6OTGbQA,3412
 tests/test_mock_cuda.py,sha256=5hisElxeLJ5MHw3KM9gwxBiXiMaG-Rm382u3AsQcDOI,3068
 tests/test_pdb_actor.py,sha256=5KJhuhcZDPWMdjC6eAtDdwnz1W7jNFXvIrMSFaCWaPw,3858
-tests/test_python_actors.py,sha256=
-tests/test_remote_functions.py,sha256=
+tests/test_python_actors.py,sha256=MPdXtnj4ZeyAaecDFJMXdz29KvimF9iB3bASgoo6-iM,16201
+tests/test_remote_functions.py,sha256=5nxYB8dfA9NT9f9Od9O3htgQtPbiRNiXZ1Kgtn75sOQ,50056
 tests/test_rust_backend.py,sha256=94S3R995ZkyIhEiBsM5flcjf5X7bscEAHBtInbTRFe8,7776
 tests/test_signal_safe_block_on.py,sha256=bmal0XgzJowZXJV6T1Blow5a-vZluYWusCThLMGxyTE,3336
 tests/test_sim_backend.py,sha256=RckCkHO3DxKsAGdZMcIzRnd6YJXwDim1D5-xbBbgKio,1473
@@ -151,9 +154,9 @@ tests/simulator/test_profiling.py,sha256=TGYCfzTLdkpIwnOuO6KApprmrgPIRQe60KRX3wk
 tests/simulator/test_simulator.py,sha256=LO8lA0ssY-OGEBL5ipEu74f97Y765TEwfUOv-DtIptM,14568
 tests/simulator/test_task.py,sha256=ipqBDuDAysuo1xOB9S5psaFvwe6VATD43IovCTSs0t4,2327
 tests/simulator/test_worker.py,sha256=QrWWIJ3HDgDLkBPRc2mwYPlOQoXQcj1qRfc0WUfKkFY,3507
-torchmonarch_nightly-2025.6.
-torchmonarch_nightly-2025.6.
-torchmonarch_nightly-2025.6.
-torchmonarch_nightly-2025.6.
-torchmonarch_nightly-2025.6.
-torchmonarch_nightly-2025.6.
+torchmonarch_nightly-2025.6.13.dist-info/licenses/LICENSE,sha256=e0Eotbf_rHOYPuEUlppIbvwy4SN98CZnl_hqwvbDA4Q,1530
+torchmonarch_nightly-2025.6.13.dist-info/METADATA,sha256=WhintlKk3a9WRrjo-QLNntfi87q98I4gcZW_0f42q48,2772
+torchmonarch_nightly-2025.6.13.dist-info/WHEEL,sha256=_wZSFk0d90K9wOBp8Q-UGxshyiJ987JoPiyUBNC6VLk,104
+torchmonarch_nightly-2025.6.13.dist-info/entry_points.txt,sha256=sqfQ16oZqjEvttUI-uj9BBXIIE6jt05bYFSmy-2hyXI,106
+torchmonarch_nightly-2025.6.13.dist-info/top_level.txt,sha256=E-ZssZzyM17glpVrh-S9--qJ-w9p2EjuYOuNw9tQ4Eg,33
+torchmonarch_nightly-2025.6.13.dist-info/RECORD,,
{torchmonarch_nightly-2025.6.12.dist-info → torchmonarch_nightly-2025.6.13.dist-info}/WHEEL
RENAMED
File without changes

{torchmonarch_nightly-2025.6.12.dist-info → torchmonarch_nightly-2025.6.13.dist-info}/entry_points.txt
RENAMED
File without changes

{torchmonarch_nightly-2025.6.12.dist-info → torchmonarch_nightly-2025.6.13.dist-info}/licenses/LICENSE
RENAMED
File without changes

{torchmonarch_nightly-2025.6.12.dist-info → torchmonarch_nightly-2025.6.13.dist-info}/top_level.txt
RENAMED
File without changes