torchmonarch-nightly 2025.9.9__cp313-cp313-manylinux2014_x86_64.whl → 2025.9.11__cp313-cp313-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. monarch/__init__.py +7 -0
  2. monarch/_rust_bindings.so +0 -0
  3. monarch/_src/actor/actor_mesh.py +1 -1
  4. monarch/_src/actor/bootstrap_main.py +7 -2
  5. monarch/_src/actor/debugger/breakpoint.py +30 -0
  6. monarch/_src/actor/debugger/debug_command.py +183 -0
  7. monarch/_src/actor/debugger/debug_controller.py +246 -0
  8. monarch/_src/actor/debugger/debug_io.py +68 -0
  9. monarch/_src/actor/debugger/debug_session.py +249 -0
  10. monarch/_src/actor/debugger/pdb_wrapper.py +1 -1
  11. monarch/_src/actor/host_mesh.py +10 -2
  12. monarch/_src/actor/pickle.py +4 -10
  13. monarch/_src/actor/proc_mesh.py +80 -19
  14. monarch/_src/tensor_engine/rdma.py +2 -0
  15. monarch/actor/__init__.py +1 -1
  16. monarch/gradient/_gradient_generator.so +0 -0
  17. monarch/monarch_controller +0 -0
  18. monarch/tools/cli.py +26 -0
  19. monarch/tools/commands.py +15 -0
  20. monarch/tools/debug_env.py +34 -0
  21. monarch/tools/mesh_spec.py +2 -0
  22. tests/test_allocator.py +18 -9
  23. tests/test_debugger.py +29 -25
  24. tests/test_mock_cuda.py +11 -3
  25. torchmonarch_nightly-2025.9.11.data/scripts/process_allocator +0 -0
  26. {torchmonarch_nightly-2025.9.9.dist-info → torchmonarch_nightly-2025.9.11.dist-info}/METADATA +1 -1
  27. {torchmonarch_nightly-2025.9.9.dist-info → torchmonarch_nightly-2025.9.11.dist-info}/RECORD +31 -29
  28. monarch/_src/actor/debugger/debugger.py +0 -737
  29. monarch/_src/debug_cli/__init__.py +0 -7
  30. monarch/_src/debug_cli/debug_cli.py +0 -43
  31. monarch/debug_cli/__init__.py +0 -7
  32. monarch/debug_cli/__main__.py +0 -12
  33. {torchmonarch_nightly-2025.9.9.dist-info → torchmonarch_nightly-2025.9.11.dist-info}/WHEEL +0 -0
  34. {torchmonarch_nightly-2025.9.9.dist-info → torchmonarch_nightly-2025.9.11.dist-info}/entry_points.txt +0 -0
  35. {torchmonarch_nightly-2025.9.9.dist-info → torchmonarch_nightly-2025.9.11.dist-info}/licenses/LICENSE +0 -0
  36. {torchmonarch_nightly-2025.9.9.dist-info → torchmonarch_nightly-2025.9.11.dist-info}/top_level.txt +0 -0
monarch/_src/actor/debugger/debug_session.py ADDED
@@ -0,0 +1,249 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-unsafe
+ import asyncio
+ from dataclasses import dataclass
+ from typing import Dict, Generator, List, Optional, Tuple
+
+ from monarch._src.actor.debugger.debug_command import RanksType
+ from monarch._src.actor.debugger.debug_io import DebugIO, DebugIOError
+
+ from monarch._src.actor.debugger.pdb_wrapper import DebuggerWrite
+
+
+ @dataclass
+ class DebugSessionInfo:
+     actor_name: str
+     rank: int
+     coords: Dict[str, int]
+     hostname: str
+     function: str | None
+     lineno: int | None
+
+     def __lt__(self, other):
+         if self.actor_name < other.actor_name:
+             return True
+         elif self.actor_name == other.actor_name:
+             return self.rank < other.rank
+         else:
+             return False
+
+
+ class DebugSession:
+     """Represents a single session with a remote debugger."""
+
+     def __init__(
+         self, rank: int, coords: Dict[str, int], hostname: str, actor_name: str
+     ):
+         self.rank = rank
+         self.coords = coords
+         self.hostname = hostname
+         self.actor_name = actor_name
+         self._active = False
+         self._message_queue = asyncio.Queue()
+         self._task = None
+         self._pending_send_to_actor = asyncio.Queue()
+         self._outputs_since_last_input = []
+         self._function_lineno = None
+         self._need_read = False
+
+     async def _event_loop(self, debug_io: DebugIO, line=None, suppress_output=False):
+         if not suppress_output:
+             # If the user had previously attached to this debug session,
+             # then it would have printed various messages from the
+             # message queue. When the user re-attaches, we want to
+             # print out all of the output that was printed since the
+             # last command sent to this session.
+             if len(self._outputs_since_last_input) > 0:
+                 await debug_io.output(
+                     f"<last pdb output for {self.actor_name} {self.rank} follows>\n"
+                 )
+                 for output in self._outputs_since_last_input:
+                     await debug_io.output(output.payload.decode())
+
+         while True:
+             # When the user inputs "detach", it uses up a "read" message
+             # without actually responding to the actor being debugged. We
+             # can't manually reinsert the "read" message into the message queue,
+             # so instead the self._need_read flag indicates there's an additional
+             # "read" that we need to respond to.
+             if self._need_read:
+                 self._need_read = False
+                 message = "read"
+             else:
+                 message = await self._message_queue.get()
+             if message == "detach":
+                 # Return to the main outer debug loop.
+                 break
+             elif message == "read":
+                 try:
+                     break_after = False
+                     if line is not None:
+                         break_after = True
+                     else:
+                         line = await debug_io.input()
+                     if line == "detach":
+                         self._need_read = True
+                         break
+                     else:
+                         await self._pending_send_to_actor.put((line + "\n").encode())
+                         # Cancel safety: don't clear the previous outputs until we know
+                         # the actor will receive the input.
+                         self._outputs_since_last_input = []
+                         line = None
+                         if break_after:
+                             break
+                 except (DebugIOError, asyncio.CancelledError):
+                     # See earlier comment about this flag. If either of the awaits inside
+                     # the try block is cancelled, we need to redo the read without actually
+                     # reinserting "read" into the message queue.
+                     self._need_read = True
+                     raise
+             elif message[0] == "write":
+                 output = message[1]
+                 # If the user sees this output but then detaches from the session,
+                 # it's useful to store all outputs since the last input so that
+                 # they can be printed again when the user re-attaches.
+                 self._outputs_since_last_input.append(output)
+                 if not suppress_output:
+                     await debug_io.output(output.payload.decode())
+
+         if not suppress_output:
+             await debug_io.output(
+                 f"Detaching from debug session for {self.actor_name} {self.rank} ({self.hostname})\n"
+             )
+
+     def get_info(self):
+         function = lineno = None
+         if self._function_lineno is not None:
+             function, lineno = self._function_lineno
+         return DebugSessionInfo(
+             self.actor_name, self.rank, self.coords, self.hostname, function, lineno
+         )
+
+     async def attach(self, debug_io: DebugIO, line=None, suppress_output=False):
+         self._active = True
+         if not suppress_output:
+             await debug_io.output(
+                 f"Attached to debug session for {self.actor_name} {self.rank} ({self.hostname})\n"
+             )
+         self._task = asyncio.create_task(
+             self._event_loop(debug_io, line, suppress_output)
+         )
+         await self._task
+         if not suppress_output:
+             await debug_io.output(
+                 f"Detached from debug session for {self.actor_name} {self.rank} ({self.hostname})\n"
+             )
+         self._active = False
+
+     async def detach(self):
+         if self._active:
+             await self._message_queue.put("detach")
+
+     async def debugger_read(self, size: int) -> DebuggerWrite:
+         await self._message_queue.put("read")
+         input_data = await self._pending_send_to_actor.get()
+         if len(input_data) > size:
+             input_data = input_data[:size]
+         return DebuggerWrite(input_data, None, None)
+
+     async def debugger_write(self, write: DebuggerWrite) -> None:
+         if write.function is not None and write.lineno is not None:
+             self._function_lineno = (write.function, write.lineno)
+         await self._message_queue.put(("write", write))
+
+
+ class DebugSessions:
+     def __init__(self):
+         self._sessions: Dict[str, Dict[int, DebugSession]] = {}
+
+     def insert(self, session: DebugSession) -> None:
+         if session.actor_name not in self._sessions:
+             self._sessions[session.actor_name] = {session.rank: session}
+         elif session.rank not in self._sessions[session.actor_name]:
+             self._sessions[session.actor_name][session.rank] = session
+         else:
+             raise ValueError(
+                 f"Debug session for rank {session.rank} already exists for actor {session.actor_name}"
+             )
+
+     def remove(self, actor_name: str, rank: int) -> DebugSession:
+         if actor_name not in self._sessions:
+             raise ValueError(f"No debug sessions for actor {actor_name}")
+         elif rank not in self._sessions[actor_name]:
+             raise ValueError(f"No debug session for rank {rank} for actor {actor_name}")
+         session = self._sessions[actor_name].pop(rank)
+         if len(self._sessions[actor_name]) == 0:
+             del self._sessions[actor_name]
+         return session
+
+     def get(self, actor_name: str, rank: int) -> DebugSession:
+         if actor_name not in self._sessions:
+             raise ValueError(f"No debug sessions for actor {actor_name}")
+         elif rank not in self._sessions[actor_name]:
+             raise ValueError(f"No debug session for rank {rank} for actor {actor_name}")
+         return self._sessions[actor_name][rank]
+
+     def iter(
+         self, selection: Optional[Tuple[str, Optional[RanksType]]]
+     ) -> Generator[DebugSession, None, None]:
+         if selection is None:
+             for sessions in self._sessions.values():
+                 for session in sessions.values():
+                     yield session
+             return
+         actor_name, ranks = selection
+         if actor_name not in self._sessions:
+             return
+         sessions = self._sessions[actor_name]
+         if ranks is None:
+             for session in sessions.values():
+                 yield session
+         elif isinstance(ranks, int):
+             if ranks in sessions:
+                 yield sessions[ranks]
+         elif isinstance(ranks, list):
+             for rank in ranks:
+                 if rank in sessions:
+                     yield sessions[rank]
+         elif isinstance(ranks, dict):
+             dims = ranks
+             for session in sessions.values():
+                 include_rank = True
+                 for dim, ranks in dims.items():
+                     if dim not in session.coords:
+                         include_rank = False
+                         break
+                     elif (
+                         isinstance(ranks, range) or isinstance(ranks, list)
+                     ) and session.coords[dim] not in ranks:
+                         include_rank = False
+                         break
+                     elif isinstance(ranks, int) and session.coords[dim] != ranks:
+                         include_rank = False
+                         break
+                 if include_rank:
+                     yield session
+         elif isinstance(ranks, range):
+             for rank, session in sessions.items():
+                 if rank in ranks:
+                     yield session
+
+     def info(self) -> List[DebugSessionInfo]:
+         session_info = []
+         for sessions in self._sessions.values():
+             for session in sessions.values():
+                 session_info.append(session.get_info())
+         return session_info
+
+     def __len__(self) -> int:
+         return sum(len(sessions) for sessions in self._sessions.values())
+
+     def __contains__(self, item: Tuple[str, int]) -> bool:
+         actor_name, rank = item
+         return actor_name in self._sessions and rank in self._sessions[actor_name]
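For orientation, a minimal usage sketch of the new container; the actor name, ranks, coordinates, and hostname below are invented for illustration, and only constructors and methods that appear in the file above are used:

    sessions = DebugSessions()
    sessions.insert(
        DebugSession(rank=0, coords={"hosts": 0, "gpus": 0}, hostname="h0", actor_name="trainer")
    )
    sessions.insert(
        DebugSession(rank=1, coords={"hosts": 0, "gpus": 1}, hostname="h0", actor_name="trainer")
    )
    assert ("trainer", 0) in sessions and len(sessions) == 2

    # Select by actor name plus per-dimension rank filters (dict form of RanksType):
    for s in sessions.iter(("trainer", {"gpus": range(0, 2)})):
        print(s.get_info())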
monarch/_src/actor/actor_mesh.py CHANGED
@@ -22,7 +22,7 @@ from monarch._rust_bindings.monarch_hyperactor.proc import ActorId
  from monarch._src.actor.sync_state import fake_sync_state

  if TYPE_CHECKING:
-     from monarch._src.actor.debugger.debugger import DebugController
+     from monarch._src.actor.debugger.debug_controller import DebugController


  @dataclass
monarch/_src/actor/host_mesh.py CHANGED
@@ -52,13 +52,21 @@ class HostMesh(MeshTrait):
      interfaces with the underlying resource allocator of your choice.
      """

-     def __init__(self, shape: Shape, allocator: AllocateMixin):
+     def __init__(
+         self,
+         shape: Shape,
+         allocator: AllocateMixin,
+         alloc_constraints: Optional[AllocConstraints] = None,
+     ):
          self._allocator = allocator
+         self._alloc_constraints = alloc_constraints
          self._shape = shape
          self._spawned = 0

      def _alloc(self, hosts: int, gpus: int) -> "AllocHandle":
-         spec: AllocSpec = AllocSpec(AllocConstraints(), hosts=hosts, gpus=gpus)
+         spec: AllocSpec = AllocSpec(
+             self._alloc_constraints or AllocConstraints(), hosts=hosts, gpus=gpus
+         )
          return self._allocator.allocate(spec)

      def spawn_procs(
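The net effect is that constraints supplied at mesh construction now flow into every allocation the mesh makes. A minimal sketch, assuming `shape`, `allocator`, and `AllocConstraints` are in scope as they are in host_mesh.py:

    # Before (still works; the default preserves the 2025.9.9 behavior):
    mesh = HostMesh(shape, allocator)

    # New in 2025.9.11: constraints set here are applied to every AllocSpec
    # that _alloc() builds.
    mesh = HostMesh(shape, allocator, alloc_constraints=AllocConstraints())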
monarch/_src/actor/pickle.py CHANGED
@@ -8,18 +8,15 @@

  import io
  import pickle
- import sys
  from contextlib import contextmanager, ExitStack
  from typing import Any, Callable, Iterable, List, Tuple

  import cloudpickle

-
- def maybe_torch():
-     """
-     We have to do some special pickling if torch is loaded but not if it isn't loaded?
-     """
-     return sys.modules.get("torch")
+ try:
+     import torch  # @manual
+ except ImportError:
+     torch = None


  _orig_function_getstate = cloudpickle.cloudpickle._function_getstate
@@ -79,7 +76,6 @@ def flatten(obj: Any, filter: Callable[[Any], bool]) -> Tuple[List[Any], bytes]:

  def unflatten(data: bytes, values: Iterable[Any]) -> Any:
      with ExitStack() as stack:
-         torch = maybe_torch()
          if torch is not None:
              stack.enter_context(load_tensors_on_cpu())
              stack.enter_context(torch.utils._python_dispatch._disable_current_modes())
@@ -91,8 +87,6 @@ def unflatten(data: bytes, values: Iterable[Any]) -> Any:
  def load_tensors_on_cpu():
      # Ensure that any tensors load from CPU via monkeypatching how Storages are
      # loaded.
-     import torch
-
      old = torch.storage._load_from_bytes
      try:
          torch.storage._load_from_bytes = lambda b: torch.load(
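Note the semantic shift in pickle.py: the removed maybe_torch() only saw torch if some other module had already imported it, whereas the new guarded import uses torch whenever it is installed. A minimal sketch of the guarded-import pattern in isolation (the summarize() helper is ours, for illustration):

    try:
        import torch  # optional dependency
    except ImportError:
        torch = None

    def summarize(obj):
        # Take the tensor-aware path only when torch is available.
        if torch is not None and isinstance(obj, torch.Tensor):
            return f"tensor{tuple(obj.shape)}"
        return type(obj).__name__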
monarch/_src/actor/proc_mesh.py CHANGED
@@ -7,6 +7,8 @@
  # pyre-strict

  import asyncio
+ import importlib.metadata
+ import json
  import logging
  import os
  import sys
@@ -31,6 +33,7 @@ from typing import (
      TYPE_CHECKING,
      TypeVar,
  )
+ from urllib.parse import urlparse
  from weakref import WeakValueDictionary

  from monarch._rust_bindings.monarch_hyperactor.alloc import (  # @manual=//monarch/monarch_extension:monarch_extension
@@ -468,20 +471,19 @@ class ProcMesh(MeshTrait, DeprecatedNotAFuture):
          # The workspace shape (i.e. only perform one rsync per host).
          assert set(self._shape.labels).issubset({"gpus", "hosts"})

-         workspaces = []
+         workspaces = {}
          for src_dir, dst_dir in workspace.dirs.items():
-             workspaces.append(
-                 WorkspaceConfig(
-                     local=Path(src_dir),
-                     remote=RemoteWorkspace(
-                         location=WorkspaceLocation.FromEnvVar(
-                             env="WORKSPACE_DIR",
-                             relpath=dst_dir,
-                         ),
-                         shape=WorkspaceShape.shared("gpus"),
-                     ),
-                     method=CodeSyncMethod.Rsync,
-                 ),
-             )
+             local = Path(src_dir)
+             workspaces[local] = WorkspaceConfig(
+                 local=local,
+                 remote=RemoteWorkspace(
+                     location=WorkspaceLocation.FromEnvVar(
+                         env="WORKSPACE_DIR",
+                         relpath=dst_dir,
+                     ),
+                     shape=WorkspaceShape.shared("gpus"),
+                 ),
+                 method=CodeSyncMethod.Rsync(),
+             )

          # If `conda` is set, also sync the currently activated conda env.
@@ -496,23 +498,82 @@ class ProcMesh(MeshTrait, DeprecatedNotAFuture):
          while conda_prefix.is_symlink():
              conda_prefix = conda_prefix.parent / conda_prefix.readlink()

-         workspaces.append(
-             WorkspaceConfig(
-                 local=conda_prefix,
-                 remote=RemoteWorkspace(
-                     location=WorkspaceLocation.FromEnvVar(
-                         env="CONDA_PREFIX",
-                         relpath="",
-                     ),
-                     shape=WorkspaceShape.shared("gpus"),
-                 ),
-                 method=CodeSyncMethod.CondaSync,
-             ),
-         )
+         # Build a map of additional path prefixes to fix up when syncing
+         # the conda env.
+         conda_prefix_replacements = {}
+
+         # Auto-detect editable installs and implicitly add workspaces for
+         # them.
+         # NOTE(agallagher): There's sometimes a `python3.1` symlink to
+         # `python3.10`, so avoid it.
+         (lib_python,) = [
+             dirpath
+             for dirpath in conda_prefix.glob("lib/python*")
+             if not os.path.islink(dirpath)
+         ]
+         for direct_url in lib_python.glob(
+             "site-packages/*.dist-info/direct_url.json"
+         ):
+             # Parse the direct_url.json to see if it's an editable install
+             # (https://packaging.python.org/en/latest/specifications/direct-url/#example-pip-commands-and-their-effect-on-direct-url-json).
+             with open(direct_url) as f:
+                 info = json.load(f)
+             if not info.get("dir_info", {}).get("editable", False):
+                 continue
+
+             # Extract the workspace path from the URL (e.g. `file:///my/workspace/`).
+             url = urlparse(info["url"])
+             assert url.scheme == "file", f"expected file:// URL, got {url.scheme}"
+
+             # Get the project name, so we can use it below to create a unique-ish
+             # remote directory.
+             dist = importlib.metadata.PathDistribution(direct_url.parent)
+             name = dist.metadata["Name"]
+
+             local = Path(url.path)
+
+             # Check if we've already defined a workspace for this local path.
+             existing = workspaces.get(local)
+             if existing is not None:
+                 assert existing.method == CodeSyncMethod.Rsync()
+                 remote = existing.remote
+             else:
+                 # Otherwise, add a workspace for it.
+                 remote = RemoteWorkspace(
+                     location=WorkspaceLocation.FromEnvVar(
+                         env="WORKSPACE_DIR",
+                         relpath=f"__editable__.{name}",
+                     ),
+                     shape=WorkspaceShape.shared("gpus"),
+                 )
+                 workspaces[local] = WorkspaceConfig(
+                     local=local,
+                     remote=remote,
+                     method=CodeSyncMethod.Rsync(),
+                 )
+
+             logging.info(
+                 f"Syncing editable install of {name} from {local} (to {remote.location})"
+             )
+
+             # Make sure we fix up path prefixes to the editable install.
+             conda_prefix_replacements[local] = remote.location
+
+         workspaces[conda_prefix] = WorkspaceConfig(
+             local=conda_prefix,
+             remote=RemoteWorkspace(
+                 location=WorkspaceLocation.FromEnvVar(
+                     env="CONDA_PREFIX",
+                     relpath="",
+                 ),
+                 shape=WorkspaceShape.shared("gpus"),
+             ),
+             method=CodeSyncMethod.CondaSync(conda_prefix_replacements),
+         )

          assert self._code_sync_client is not None
          await self._code_sync_client.sync_workspaces(
-             workspaces=workspaces,
+             workspaces=list(workspaces.values()),
              auto_reload=auto_reload,
          )
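The auto-detection leans on the standard direct_url.json metadata (PEP 610) that pip writes for `pip install -e` installs; the spec is linked in the hunk above. A standalone sketch of the same probe, independent of Monarch (the editable_projects() helper is ours):

    import importlib.metadata
    import json
    from pathlib import Path
    from urllib.parse import urlparse

    def editable_projects(site_packages: Path):
        """Yield (project_name, local_source_path) for editable installs."""
        for direct_url in site_packages.glob("*.dist-info/direct_url.json"):
            with open(direct_url) as f:
                info = json.load(f)
            if not info.get("dir_info", {}).get("editable", False):
                continue
            url = urlparse(info["url"])
            if url.scheme != "file":
                continue
            dist = importlib.metadata.PathDistribution(direct_url.parent)
            yield dist.metadata["Name"], Path(url.path)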
monarch/_src/tensor_engine/rdma.py CHANGED
@@ -127,6 +127,8 @@ class RDMABuffer:
          storage = data.untyped_storage()
          addr: int = storage.data_ptr()
          size = storage.element_size() * data.numel()
+         if size == 0:
+             raise ValueError("Cannot create RDMABuffer with size 0.")
          ctx = context()
          self._buffer: _RdmaBuffer = _RdmaBuffer.create_rdma_buffer_blocking(
              addr=addr,
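The guard closes an edge case: a zero-element tensor has a zero-byte storage, which previously would have been registered as an empty RDMA region. A sketch of the rejected input, assuming the buffer is constructed from a tensor as the hunk above suggests (torch required):

    import torch

    empty = torch.empty(0)  # storage.element_size() * data.numel() == 0
    # Constructing an RDMABuffer over `empty` now raises:
    # ValueError: Cannot create RDMABuffer with size 0.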
monarch/actor/__init__.py CHANGED
@@ -27,7 +27,7 @@ from monarch._src.actor.actor_mesh import (
      send,
      ValueMesh,
  )
- from monarch._src.actor.debugger.debugger import debug_controller
+ from monarch._src.actor.debugger.debug_controller import debug_controller
  from monarch._src.actor.endpoint import endpoint
  from monarch._src.actor.future import Future

monarch/gradient/_gradient_generator.so CHANGED (binary file)
monarch/monarch_controller CHANGED (binary file)
monarch/tools/cli.py CHANGED
@@ -13,6 +13,7 @@ from monarch.tools.commands import (
      bounce,
      component_args_from_cli,
      create,
+     debug,
      info,
      kill,
      stop,
@@ -22,6 +23,8 @@ from monarch.tools.config import ( # @manual=//monarch/python/monarch/tools/con
      Config,
      defaults,
  )
+
+ from monarch.tools.debug_env import _get_debug_server_host, _get_debug_server_port
  from torchx.specs.finder import get_component


@@ -141,6 +144,25 @@ class StopCmd:
          stop(args.server_handle)


+ class DebugCmd:
+     def add_arguments(self, subparser: argparse.ArgumentParser) -> None:
+         subparser.add_argument(
+             "--host",
+             type=str,
+             default=_get_debug_server_host(),
+             help="Hostname where the debug server is running",
+         )
+         subparser.add_argument(
+             "--port",
+             type=int,
+             default=_get_debug_server_port(),
+             help="Port that the debug server is listening on",
+         )
+
+     def run(self, args: argparse.Namespace) -> None:
+         debug(args.host, args.port)
+
+
  def get_parser() -> argparse.ArgumentParser:
      parser = argparse.ArgumentParser(description="Monarch CLI")
      subparser = parser.add_subparsers(title="COMMANDS")
@@ -149,6 +171,7 @@ def get_parser() -> argparse.ArgumentParser:
          "create": CreateCmd(),
          "info": InfoCmd(),
          "kill": KillCmd(),
+         "debug": DebugCmd(),
          # --- placeholder subcommands (not yet implemented) ---
          "bounce": BounceCmd(),
          "stop": StopCmd(),
@@ -162,6 +185,9 @@ def get_parser() -> argparse.ArgumentParser:
  def main(argv: list[str] = sys.argv[1:]) -> None:
      parser = get_parser()
      args = parser.parse_args(argv)
+     if not hasattr(args, "func"):
+         parser.print_help()
+         sys.exit(1)
      args.func(args)

monarch/tools/commands.py CHANGED
@@ -11,6 +11,7 @@ import asyncio
  import inspect
  import logging
  import os
+ import subprocess
  import tempfile
  from datetime import datetime, timedelta
  from pathlib import Path
@@ -366,3 +367,17 @@ def bounce(server_handle: str) -> None:
  def stop(server_handle: str) -> None:
      """Stops the server's unix processes without tearing down the server's job."""
      raise NotImplementedError("`stop` is not yet implemented")
+
+
+ def debug(host: str, port: int) -> None:
+     """Connect to the debug server running on the provided host and port."""
+     for cmd in ["ncat", "nc", "netcat"]:
+         try:
+             subprocess.run([cmd, f"{host}", f"{port}"], check=True)
+             return
+         except FileNotFoundError:
+             pass
+
+     logging.error(
+         "Could not find a suitable netcat binary. Please install one and try again."
+     )
monarch/tools/debug_env.py ADDED
@@ -0,0 +1,34 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-unsafe
+ import os
+
+
+ _MONARCH_DEBUG_SERVER_HOST_ENV_VAR = "MONARCH_DEBUG_SERVER_HOST"
+ _MONARCH_DEBUG_SERVER_HOST_DEFAULT = "localhost"
+ _MONARCH_DEBUG_SERVER_PORT_ENV_VAR = "MONARCH_DEBUG_SERVER_PORT"
+ _MONARCH_DEBUG_SERVER_PORT_DEFAULT = "27000"
+ _MONARCH_DEBUG_SERVER_PROTOCOL_ENV_VAR = "MONARCH_DEBUG_SERVER_PROTOCOL"
+ _MONARCH_DEBUG_SERVER_PROTOCOL_DEFAULT = "tcp"
+
+
+ def _get_debug_server_host():
+     return os.environ.get(
+         _MONARCH_DEBUG_SERVER_HOST_ENV_VAR, _MONARCH_DEBUG_SERVER_HOST_DEFAULT
+     )
+
+
+ def _get_debug_server_port():
+     return os.environ.get(
+         _MONARCH_DEBUG_SERVER_PORT_ENV_VAR, _MONARCH_DEBUG_SERVER_PORT_DEFAULT
+     )
+
+
+ def _get_debug_server_protocol():
+     return os.environ.get(
+         _MONARCH_DEBUG_SERVER_PROTOCOL_ENV_VAR, _MONARCH_DEBUG_SERVER_PROTOCOL_DEFAULT
+     )
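These helpers are read when DebugCmd builds its argument defaults, so the CLI can be steered from the environment. A small sketch (values are illustrative, and the variables must be set before the parser is constructed):

    import os

    os.environ["MONARCH_DEBUG_SERVER_HOST"] = "trainer-host-0"
    os.environ["MONARCH_DEBUG_SERVER_PORT"] = "28000"
    # The debug subcommand (DebugCmd) now defaults --host/--port to these values.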
monarch/tools/mesh_spec.py CHANGED
@@ -40,6 +40,7 @@ class MeshSpec:
      port: int = DEFAULT_REMOTE_ALLOCATOR_PORT
      hostnames: list[str] = field(default_factory=list)
      state: specs.AppState = specs.AppState.UNSUBMITTED
+     image: str = _UNSET_STR

      def server_addrs(
          self, transport: Optional[str] = None, port: Optional[int] = None
@@ -81,6 +82,7 @@ def mesh_spec_from_metadata(appdef: specs.AppDef, mesh_name: str) -> Optional[MeshSpec]:
          if role.name == mesh_name:
              return MeshSpec(
                  name=mesh_name,
+                 image=role.image,
                  num_hosts=role.num_replicas,
                  host_type=appdef.metadata.get(
                      _tag(mesh_name, _TAG_HOST_TYPE), _UNSET_STR