torchmonarch-nightly 2025.9.10__cp311-cp311-manylinux2014_x86_64.whl → 2025.9.11__cp311-cp311-manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- monarch/_rust_bindings.so +0 -0
- monarch/_src/actor/actor_mesh.py +1 -1
- monarch/_src/actor/bootstrap_main.py +1 -1
- monarch/_src/actor/debugger/breakpoint.py +30 -0
- monarch/_src/actor/debugger/debug_command.py +183 -0
- monarch/_src/actor/debugger/debug_controller.py +246 -0
- monarch/_src/actor/debugger/debug_io.py +68 -0
- monarch/_src/actor/debugger/debug_session.py +249 -0
- monarch/_src/actor/debugger/pdb_wrapper.py +1 -1
- monarch/_src/actor/host_mesh.py +10 -2
- monarch/_src/actor/proc_mesh.py +80 -19
- monarch/actor/__init__.py +1 -1
- monarch/gradient/_gradient_generator.so +0 -0
- monarch/monarch_controller +0 -0
- monarch/tools/cli.py +26 -0
- monarch/tools/commands.py +15 -0
- monarch/tools/debug_env.py +34 -0
- monarch/tools/mesh_spec.py +2 -0
- tests/test_allocator.py +18 -9
- tests/test_debugger.py +29 -25
- tests/test_mock_cuda.py +11 -3
- torchmonarch_nightly-2025.9.11.data/scripts/process_allocator +0 -0
- {torchmonarch_nightly-2025.9.10.dist-info → torchmonarch_nightly-2025.9.11.dist-info}/METADATA +1 -1
- {torchmonarch_nightly-2025.9.10.dist-info → torchmonarch_nightly-2025.9.11.dist-info}/RECORD +28 -26
- monarch/_src/actor/debugger/debugger.py +0 -737
- monarch/_src/debug_cli/__init__.py +0 -7
- monarch/_src/debug_cli/debug_cli.py +0 -43
- monarch/debug_cli/__init__.py +0 -7
- monarch/debug_cli/__main__.py +0 -12
- {torchmonarch_nightly-2025.9.10.dist-info → torchmonarch_nightly-2025.9.11.dist-info}/WHEEL +0 -0
- {torchmonarch_nightly-2025.9.10.dist-info → torchmonarch_nightly-2025.9.11.dist-info}/entry_points.txt +0 -0
- {torchmonarch_nightly-2025.9.10.dist-info → torchmonarch_nightly-2025.9.11.dist-info}/licenses/LICENSE +0 -0
- {torchmonarch_nightly-2025.9.10.dist-info → torchmonarch_nightly-2025.9.11.dist-info}/top_level.txt +0 -0
monarch/_src/actor/debugger/debug_session.py
ADDED
@@ -0,0 +1,249 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-unsafe
+import asyncio
+from dataclasses import dataclass
+from typing import Dict, Generator, List, Optional, Tuple
+
+from monarch._src.actor.debugger.debug_command import RanksType
+from monarch._src.actor.debugger.debug_io import DebugIO, DebugIOError
+
+from monarch._src.actor.debugger.pdb_wrapper import DebuggerWrite
+
+
+@dataclass
+class DebugSessionInfo:
+    actor_name: str
+    rank: int
+    coords: Dict[str, int]
+    hostname: str
+    function: str | None
+    lineno: int | None
+
+    def __lt__(self, other):
+        if self.actor_name < other.actor_name:
+            return True
+        elif self.actor_name == other.actor_name:
+            return self.rank < other.rank
+        else:
+            return False
+
+
+class DebugSession:
+    """Represents a single session with a remote debugger."""
+
+    def __init__(
+        self, rank: int, coords: Dict[str, int], hostname: str, actor_name: str
+    ):
+        self.rank = rank
+        self.coords = coords
+        self.hostname = hostname
+        self.actor_name = actor_name
+        self._active = False
+        self._message_queue = asyncio.Queue()
+        self._task = None
+        self._pending_send_to_actor = asyncio.Queue()
+        self._outputs_since_last_input = []
+        self._function_lineno = None
+        self._need_read = False
+
+    async def _event_loop(self, debug_io: DebugIO, line=None, suppress_output=False):
+        if not suppress_output:
+            # If the user had previously attached to this debug session,
+            # then it would have printed various messages from the
+            # message queue. When the user re-attaches, we want to
+            # print out all of the output that was printed since the
+            # last command sent to this session.
+            if len(self._outputs_since_last_input) > 0:
+                await debug_io.output(
+                    f"<last pdb output for {self.actor_name} {self.rank} follows>\n"
+                )
+                for output in self._outputs_since_last_input:
+                    await debug_io.output(output.payload.decode())
+
+        while True:
+            # When the user inputs "detach", it uses up a "read" message
+            # without actually responding to the actor being debugged. We
+            # can't manually reinsert the "read" message into the message queue,
+            # so instead the self._need_read flag indicates there's an additional
+            # "read" that we need to respond to.
+            if self._need_read:
+                self._need_read = False
+                message = "read"
+            else:
+                message = await self._message_queue.get()
+            if message == "detach":
+                # Return to the main outer debug loop.
+                break
+            elif message == "read":
+                try:
+                    break_after = False
+                    if line is not None:
+                        break_after = True
+                    else:
+                        line = await debug_io.input()
+                    if line == "detach":
+                        self._need_read = True
+                        break
+                    else:
+                        await self._pending_send_to_actor.put((line + "\n").encode())
+                        # Cancel safety: don't clear the previous outputs until we know
+                        # the actor will receive the input.
+                        self._outputs_since_last_input = []
+                        line = None
+                        if break_after:
+                            break
+                except (DebugIOError, asyncio.CancelledError):
+                    # See earlier comment about this flag. If either of the awaits inside
+                    # the try block is cancelled, we need to redo the read without actually
+                    # reinserting "read" into the message queue.
+                    self._need_read = True
+                    raise
+            elif message[0] == "write":
+                output = message[1]
+                # If the user sees this output but then detaches from the session,
+                # its useful to store all outputs since the last input so that
+                # they can be printed again when the user re-attaches.
+                self._outputs_since_last_input.append(output)
+                if not suppress_output:
+                    await debug_io.output(output.payload.decode())
+
+        if not suppress_output:
+            await debug_io.output(
+                f"Detaching from debug session for {self.actor_name} {self.rank} ({self.hostname})\n"
+            )
+
+    def get_info(self):
+        function = lineno = None
+        if self._function_lineno is not None:
+            function, lineno = self._function_lineno
+        return DebugSessionInfo(
+            self.actor_name, self.rank, self.coords, self.hostname, function, lineno
+        )
+
+    async def attach(self, debug_io: DebugIO, line=None, suppress_output=False):
+        self._active = True
+        if not suppress_output:
+            await debug_io.output(
+                f"Attached to debug session for {self.actor_name} {self.rank} ({self.hostname})\n"
+            )
+        self._task = asyncio.create_task(
+            self._event_loop(debug_io, line, suppress_output)
+        )
+        await self._task
+        if not suppress_output:
+            await debug_io.output(
+                f"Detached from debug session for {self.actor_name} {self.rank} ({self.hostname})\n"
+            )
+        self._active = False
+
+    async def detach(self):
+        if self._active:
+            await self._message_queue.put("detach")
+
+    async def debugger_read(self, size: int) -> DebuggerWrite:
+        await self._message_queue.put("read")
+        input_data = await self._pending_send_to_actor.get()
+        if len(input_data) > size:
+            input_data = input_data[:size]
+        return DebuggerWrite(input_data, None, None)
+
+    async def debugger_write(self, write: DebuggerWrite) -> None:
+        if write.function is not None and write.lineno is not None:
+            self._function_lineno = (write.function, write.lineno)
+        await self._message_queue.put(("write", write))
+
+
+class DebugSessions:
+    def __init__(self):
+        self._sessions: Dict[str, Dict[int, DebugSession]] = {}
+
+    def insert(self, session: DebugSession) -> None:
+        if session.actor_name not in self._sessions:
+            self._sessions[session.actor_name] = {session.rank: session}
+        elif session.rank not in self._sessions[session.actor_name]:
+            self._sessions[session.actor_name][session.rank] = session
+        else:
+            raise ValueError(
+                f"Debug session for rank {session.rank} already exists for actor {session.actor_name}"
+            )
+
+    def remove(self, actor_name: str, rank: int) -> DebugSession:
+        if actor_name not in self._sessions:
+            raise ValueError(f"No debug sessions for actor {actor_name}")
+        elif rank not in self._sessions[actor_name]:
+            raise ValueError(f"No debug session for rank {rank} for actor {actor_name}")
+        session = self._sessions[actor_name].pop(rank)
+        if len(self._sessions[actor_name]) == 0:
+            del self._sessions[actor_name]
+        return session
+
+    def get(self, actor_name: str, rank: int) -> DebugSession:
+        if actor_name not in self._sessions:
+            raise ValueError(f"No debug sessions for actor {actor_name}")
+        elif rank not in self._sessions[actor_name]:
+            raise ValueError(f"No debug session for rank {rank} for actor {actor_name}")
+        return self._sessions[actor_name][rank]
+
+    def iter(
+        self, selection: Optional[Tuple[str, Optional[RanksType]]]
+    ) -> Generator[DebugSession, None, None]:
+        if selection is None:
+            for sessions in self._sessions.values():
+                for session in sessions.values():
+                    yield session
+            return
+        actor_name, ranks = selection
+        if actor_name not in self._sessions:
+            return
+        sessions = self._sessions[actor_name]
+        if ranks is None:
+            for session in sessions.values():
+                yield session
+        elif isinstance(ranks, int):
+            if ranks in sessions:
+                yield sessions[ranks]
+        elif isinstance(ranks, list):
+            for rank in ranks:
+                if rank in sessions:
+                    yield sessions[rank]
+        elif isinstance(ranks, dict):
+            dims = ranks
+            for session in sessions.values():
+                include_rank = True
+                for dim, ranks in dims.items():
+                    if dim not in session.coords:
+                        include_rank = False
+                        break
+                    elif (
+                        isinstance(ranks, range) or isinstance(ranks, list)
+                    ) and session.coords[dim] not in ranks:
+                        include_rank = False
+                        break
+                    elif isinstance(ranks, int) and session.coords[dim] != ranks:
+                        include_rank = False
+                        break
+                if include_rank:
+                    yield session
+        elif isinstance(ranks, range):
+            for rank, session in sessions.items():
+                if rank in ranks:
+                    yield session
+
+    def info(self) -> List[DebugSessionInfo]:
+        session_info = []
+        for sessions in self._sessions.values():
+            for session in sessions.values():
+                session_info.append(session.get_info())
+        return session_info
+
+    def __len__(self) -> int:
+        return sum(len(sessions) for sessions in self._sessions.values())
+
+    def __contains__(self, item: Tuple[str, int]) -> bool:
+        actor_name, rank = item
+        return actor_name in self._sessions and rank in self._sessions[actor_name]
@@ -22,7 +22,7 @@ from monarch._rust_bindings.monarch_hyperactor.proc import ActorId
 from monarch._src.actor.sync_state import fake_sync_state
 
 if TYPE_CHECKING:
-    from monarch._src.actor.debugger.
+    from monarch._src.actor.debugger.debug_controller import DebugController
 
 
 @dataclass
monarch/_src/actor/host_mesh.py
CHANGED
@@ -52,13 +52,21 @@ class HostMesh(MeshTrait):
     interfaces with the underlying resource allocator of your choice.
     """
 
-    def __init__(
+    def __init__(
+        self,
+        shape: Shape,
+        allocator: AllocateMixin,
+        alloc_constraints: Optional[AllocConstraints] = None,
+    ):
         self._allocator = allocator
+        self._alloc_constraints = alloc_constraints
         self._shape = shape
         self._spawned = 0
 
     def _alloc(self, hosts: int, gpus: int) -> "AllocHandle":
-        spec: AllocSpec = AllocSpec(
+        spec: AllocSpec = AllocSpec(
+            self._alloc_constraints or AllocConstraints(), hosts=hosts, gpus=gpus
+        )
         return self._allocator.allocate(spec)
 
     def spawn_procs(
monarch/_src/actor/proc_mesh.py
CHANGED
@@ -7,6 +7,8 @@
 # pyre-strict
 
 import asyncio
+import importlib.metadata
+import json
 import logging
 import os
 import sys
@@ -31,6 +33,7 @@ from typing import (
     TYPE_CHECKING,
     TypeVar,
 )
+from urllib.parse import urlparse
 from weakref import WeakValueDictionary
 
 from monarch._rust_bindings.monarch_hyperactor.alloc import (  # @manual=//monarch/monarch_extension:monarch_extension
@@ -468,20 +471,19 @@ class ProcMesh(MeshTrait, DeprecatedNotAFuture):
         # The workspace shape (i.e. only perform one rsync per host).
         assert set(self._shape.labels).issubset({"gpus", "hosts"})
 
-        workspaces =
+        workspaces = {}
         for src_dir, dst_dir in workspace.dirs.items():
-
-
-
-
-
-
-
-            ),
-            shape=WorkspaceShape.shared("gpus"),
+            local = Path(src_dir)
+            workspaces[local] = WorkspaceConfig(
+                local=local,
+                remote=RemoteWorkspace(
+                    location=WorkspaceLocation.FromEnvVar(
+                        env="WORKSPACE_DIR",
+                        relpath=dst_dir,
                     ),
-
+                    shape=WorkspaceShape.shared("gpus"),
                 ),
+                method=CodeSyncMethod.Rsync(),
             )
 
         # If `conda` is set, also sync the currently activated conda env.
@@ -496,23 +498,82 @@ class ProcMesh(MeshTrait, DeprecatedNotAFuture):
             while conda_prefix.is_symlink():
                 conda_prefix = conda_prefix.parent / conda_prefix.readlink()
 
-
-
-
-
+            # Build a list of additional paths prefixes to fixup when syncing
+            # the conda env.
+            conda_prefix_replacements = {}
+
+            # Auto-detect editable installs and implicitly add workspaces for
+            # them.
+            # NOTE(agallagher): There's sometimes a `python3.1` symlink to
+            # `python3.10`, so avoid it.
+            (lib_python,) = [
+                dirpath
+                for dirpath in conda_prefix.glob("lib/python*")
+                if not os.path.islink(dirpath)
+            ]
+            for direct_url in lib_python.glob(
+                "site-packages/*.dist-info/direct_url.json"
+            ):
+                # Parse the direct_url.json to see if it's an editable install
+                # (https://packaging.python.org/en/latest/specifications/direct-url/#example-pip-commands-and-their-effect-on-direct-url-json).
+                with open(direct_url) as f:
+                    info = json.load(f)
+                if not info.get("dir_info", {}).get("editable", False):
+                    continue
+
+                # Extract the workspace path from the URL (e.g. `file///my/workspace/`).
+                url = urlparse(info["url"])
+                assert url.scheme == "file", f"expected file:// URL, got {url.scheme}"
+
+                # Get the project name, so we can use it below to create a unique-ish
+                # remote directory.
+                dist = importlib.metadata.PathDistribution(direct_url.parent)
+                name = dist.metadata["Name"]
+
+                local = Path(url.path)
+
+                # Check if we've already defined a workspace for this local path.
+                existing = workspaces.get(local)
+                if existing is not None:
+                    assert existing.method == CodeSyncMethod.Rsync()
+                    remote = existing.remote
+                else:
+                    # Otherwise, add the workspace to the list.
+                    remote = RemoteWorkspace(
                         location=WorkspaceLocation.FromEnvVar(
-                            env="
-                            relpath="",
+                            env="WORKSPACE_DIR",
+                            relpath=f"__editable__.{name}",
                         ),
                         shape=WorkspaceShape.shared("gpus"),
+                    )
+                    workspaces[local] = WorkspaceConfig(
+                        local=local,
+                        remote=remote,
+                        method=CodeSyncMethod.Rsync(),
+                    )
+
+                logging.info(
+                    f"Syncing editable install of {name} from {local} (to {remote.location})"
+                )
+
+                # Make sure we fixup path prefixes to the editable install.
+                conda_prefix_replacements[local] = remote.location
+
+            workspaces[conda_prefix] = WorkspaceConfig(
+                local=conda_prefix,
+                remote=RemoteWorkspace(
+                    location=WorkspaceLocation.FromEnvVar(
+                        env="CONDA_PREFIX",
+                        relpath="",
                     ),
-
+                    shape=WorkspaceShape.shared("gpus"),
                 ),
+                method=CodeSyncMethod.CondaSync(conda_prefix_replacements),
             )
 
         assert self._code_sync_client is not None
         await self._code_sync_client.sync_workspaces(
-            workspaces=workspaces,
+            workspaces=list(workspaces.values()),
             auto_reload=auto_reload,
         )
 
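The bulk of the proc_mesh.py change above auto-detects editable (`pip install -e`) installs in the active conda environment by reading each distribution's direct_url.json and then registers an extra rsync workspace per project. A standalone sketch of just the detection step, scanning the current interpreter's site-packages instead of a conda prefix (what it finds will differ per machine):

    # Sketch of the editable-install detection used by sync_workspace(), applied
    # to this interpreter's site-packages rather than a conda prefix.
    import json
    import sysconfig
    from pathlib import Path

    site_packages = Path(sysconfig.get_paths()["purelib"])
    for direct_url in site_packages.glob("*.dist-info/direct_url.json"):
        info = json.loads(direct_url.read_text())
        if info.get("dir_info", {}).get("editable", False):
            # direct_url.json records the original file:// URL of the checkout.
            print(f"editable install: {direct_url.parent.name} -> {info['url']}")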
monarch/actor/__init__.py
CHANGED
@@ -27,7 +27,7 @@ from monarch._src.actor.actor_mesh import (
     send,
     ValueMesh,
 )
-from monarch._src.actor.debugger.
+from monarch._src.actor.debugger.debug_controller import debug_controller
 from monarch._src.actor.endpoint import endpoint
 from monarch._src.actor.future import Future
 
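The public import path in monarch.actor is unchanged; only the internal module it re-exports from moved, so existing user code like the following keeps working (a trivial sketch, assuming the wheel is installed):

    from monarch.actor import debug_controller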
monarch/monarch_controller
CHANGED
Binary file
monarch/tools/cli.py
CHANGED
@@ -13,6 +13,7 @@ from monarch.tools.commands import (
     bounce,
     component_args_from_cli,
     create,
+    debug,
     info,
     kill,
     stop,
@@ -22,6 +23,8 @@ from monarch.tools.config import ( # @manual=//monarch/python/monarch/tools/con
     Config,
     defaults,
 )
+
+from monarch.tools.debug_env import _get_debug_server_host, _get_debug_server_port
 from torchx.specs.finder import get_component
 
 
@@ -141,6 +144,25 @@ class StopCmd:
         stop(args.server_handle)
 
 
+class DebugCmd:
+    def add_arguments(self, subparser: argparse.ArgumentParser) -> None:
+        subparser.add_argument(
+            "--host",
+            type=str,
+            default=_get_debug_server_host(),
+            help="Hostname where the debug server is running",
+        )
+        subparser.add_argument(
+            "--port",
+            type=int,
+            default=_get_debug_server_port(),
+            help="Port that the debug server is listening on",
+        )
+
+    def run(self, args: argparse.Namespace) -> None:
+        debug(args.host, args.port)
+
+
 def get_parser() -> argparse.ArgumentParser:
     parser = argparse.ArgumentParser(description="Monarch CLI")
     subparser = parser.add_subparsers(title="COMMANDS")
@@ -149,6 +171,7 @@ def get_parser() -> argparse.ArgumentParser:
         "create": CreateCmd(),
         "info": InfoCmd(),
         "kill": KillCmd(),
+        "debug": DebugCmd(),
         # --- placeholder subcommands (not yet implemented) ---
         "bounce": BounceCmd(),
         "stop": StopCmd(),
@@ -162,6 +185,9 @@ def get_parser() -> argparse.ArgumentParser:
 def main(argv: list[str] = sys.argv[1:]) -> None:
     parser = get_parser()
     args = parser.parse_args(argv)
+    if not hasattr(args, "func"):
+        parser.print_help()
+        sys.exit(1)
     args.func(args)
 
 
monarch/tools/commands.py
CHANGED
@@ -11,6 +11,7 @@ import asyncio
 import inspect
 import logging
 import os
+import subprocess
 import tempfile
 from datetime import datetime, timedelta
 from pathlib import Path
@@ -366,3 +367,17 @@ def bounce(server_handle: str) -> None:
 def stop(server_handle: str) -> None:
     """Stops the server's unix processes without tearing down the server's job."""
     raise NotImplementedError("`stop` is not yet implemented")
+
+
+def debug(host: str, port: int) -> None:
+    """Connect to the debug server running on the provided host and port."""
+    for cmd in ["ncat", "nc", "netcat"]:
+        try:
+            subprocess.run([cmd, f"{host}", f"{port}"], check=True)
+            return
+        except FileNotFoundError:
+            pass
+
+    logging.error(
+        "Could not find a suitable netcat binary. Please install one and try again."
+    )
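Together, the cli.py and commands.py changes wire up a new `debug` subcommand that shells out to the first available netcat binary (ncat, nc, or netcat). A minimal sketch of driving it programmatically, equivalent to running `monarch debug --host localhost --port 27000` from a shell; the host and port shown are just the defaults from debug_env.py:

    # Sketch: invoke the new "debug" subcommand through the CLI entry point.
    # Requires a netcat binary on PATH; otherwise debug() logs an error and returns.
    from monarch.tools.cli import main

    main(["debug", "--host", "localhost", "--port", "27000"])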
monarch/tools/debug_env.py
ADDED
@@ -0,0 +1,34 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-unsafe
+import os
+
+
+_MONARCH_DEBUG_SERVER_HOST_ENV_VAR = "MONARCH_DEBUG_SERVER_HOST"
+_MONARCH_DEBUG_SERVER_HOST_DEFAULT = "localhost"
+_MONARCH_DEBUG_SERVER_PORT_ENV_VAR = "MONARCH_DEBUG_SERVER_PORT"
+_MONARCH_DEBUG_SERVER_PORT_DEFAULT = "27000"
+_MONARCH_DEBUG_SERVER_PROTOCOL_ENV_VAR = "MONARCH_DEBUG_SERVER_PROTOCOL"
+_MONARCH_DEBUG_SERVER_PROTOCOL_DEFAULT = "tcp"
+
+
+def _get_debug_server_host():
+    return os.environ.get(
+        _MONARCH_DEBUG_SERVER_HOST_ENV_VAR, _MONARCH_DEBUG_SERVER_HOST_DEFAULT
+    )
+
+
+def _get_debug_server_port():
+    return os.environ.get(
+        _MONARCH_DEBUG_SERVER_PORT_ENV_VAR, _MONARCH_DEBUG_SERVER_PORT_DEFAULT
+    )
+
+
+def _get_debug_server_protocol():
+    return os.environ.get(
+        _MONARCH_DEBUG_SERVER_PROTOCOL_ENV_VAR, _MONARCH_DEBUG_SERVER_PROTOCOL_DEFAULT
+    )
monarch/tools/mesh_spec.py
CHANGED
@@ -40,6 +40,7 @@ class MeshSpec:
     port: int = DEFAULT_REMOTE_ALLOCATOR_PORT
     hostnames: list[str] = field(default_factory=list)
     state: specs.AppState = specs.AppState.UNSUBMITTED
+    image: str = _UNSET_STR
 
     def server_addrs(
         self, transport: Optional[str] = None, port: Optional[int] = None
@@ -81,6 +82,7 @@ def mesh_spec_from_metadata(appdef: specs.AppDef, mesh_name: str) -> Optional[Me
         if role.name == mesh_name:
             return MeshSpec(
                 name=mesh_name,
+                image=role.image,
                 num_hosts=role.num_replicas,
                 host_type=appdef.metadata.get(
                     _tag(mesh_name, _TAG_HOST_TYPE), _UNSET_STR
tests/test_allocator.py
CHANGED
@@ -30,6 +30,7 @@ from monarch._rust_bindings.monarch_hyperactor.channel import (
     ChannelAddr,
     ChannelTransport,
 )
+from monarch._src.actor.actor_mesh import IN_PAR
 
 from monarch._src.actor.allocator import (
     ALLOC_LABEL_PROC_MESH_NAME,
@@ -57,6 +58,10 @@ SERVER_READY = "monarch.tools.commands.server_ready"
 UNUSED = "__UNUSED__"
 
 
+def _get_hostname() -> str:
+    return "0.0.0.0" if not IN_PAR else "localhost"
+
+
 class EnvCheckActor(Actor):
     """Actor that checks for the presence of an environment variable"""
 
@@ -321,7 +326,7 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
         actor = await proc_mesh.spawn("test_actor", TestActor)
 
         values = await actor.compute_world_size.call(
-            master_addr=
+            master_addr=_get_hostname(),
            master_port=get_free_port(),
         )
 
@@ -547,10 +552,10 @@
         actor_b = await proc_mesh_b.spawn("actor_b", TestActor)
 
         results_a = await actor_a.compute_world_size.call(
-            master_addr=
+            master_addr=_get_hostname(), master_port=get_free_port()
         )
         results_b = await actor_b.compute_world_size.call(
-            master_addr=
+            master_addr=_get_hostname(), master_port=get_free_port()
         )
 
         self.assert_computed_world_size(results_a, 2)  # a is a 1x2 mesh
@@ -604,12 +609,14 @@
                     name="x",
                     num_hosts=1,
                     transport="tcp",
-                    hostnames=[
+                    hostnames=[_get_hostname()],
                 )
             ],
         )
         port = get_free_port()
-        with remote_process_allocator(
+        with remote_process_allocator(
+            addr=f"tcp!{get_sockaddr(_get_hostname(), port)}"
+        ):
             with mock.patch(SERVER_READY, return_value=server):
                 initializer = TorchXRemoteAllocInitializer("local:///test", port=port)
                 allocator = RemoteAllocator(
@@ -620,7 +627,7 @@
                 proc_mesh = ProcMesh.from_alloc(alloc)
                 actor = await proc_mesh.spawn("test_actor", TestActor)
                 results = await actor.compute_world_size.call(
-                    master_addr=
+                    master_addr=_get_hostname(), master_port=get_free_port()
                 )
                 self.assert_computed_world_size(results, 4)  # 1x4 mesh
 
@@ -634,12 +641,14 @@
                     name="x",
                     num_hosts=1,
                     transport="tcp",
-                    hostnames=[
+                    hostnames=[_get_hostname()],
                 )
             ],
         )
         port = get_free_port()
-        with remote_process_allocator(
+        with remote_process_allocator(
+            addr=f"tcp!{get_sockaddr(_get_hostname(), port)}"
+        ):
             with mock.patch(SERVER_READY, return_value=server):
                 initializer = TorchXRemoteAllocInitializer("local:///test", port=port)
                 allocator = RemoteAllocator(
@@ -658,7 +667,7 @@
                 proc_mesh = ProcMesh.from_alloc(alloc)
                 actor = await proc_mesh.spawn("test_actor", TestActor)
                 results = await actor.compute_world_size.call(
-                    master_addr=
+                    master_addr=_get_hostname(), master_port=get_free_port()
                )
                 self.assert_computed_world_size(results, 3)  # 1x3 mesh
 