torchmonarch-nightly 2025.6.12__cp310-cp310-manylinux2014_x86_64.whl → 2025.6.13__cp310-cp310-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
monarch/_rust_bindings.so CHANGED
Binary file
monarch/_testing.py CHANGED
@@ -10,7 +10,7 @@ import logging
10
10
  import tempfile
11
11
  import time
12
12
  from contextlib import contextmanager, ExitStack
13
- from typing import Callable, Generator, Optional
13
+ from typing import Any, Callable, Dict, Generator, Literal, Optional
14
14
 
15
15
  import monarch_supervisor
16
16
  from monarch.common.client import Client
@@ -18,6 +18,8 @@ from monarch.common.device_mesh import DeviceMesh
18
18
  from monarch.common.invocation import DeviceException, RemoteException
19
19
  from monarch.common.shape import NDSlice
20
20
  from monarch.controller.backend import ProcessBackend
21
+ from monarch.mesh_controller import spawn_tensor_engine
22
+ from monarch.proc_mesh import proc_mesh, ProcMesh
21
23
  from monarch.python_local_mesh import PythonLocalContext
22
24
  from monarch.rust_local_mesh import (
23
25
  local_mesh,
@@ -50,6 +52,7 @@ class TestingContext:
50
52
  self.cleanup = ExitStack()
51
53
  self._py_process_cache = {}
52
54
  self._rust_process_cache = None
55
+ self._proc_mesh_cache: Dict[Any, ProcMesh] = {}
53
56
 
54
57
  @contextmanager
55
58
  def _get_context(self, num_hosts, gpu_per_host):
@@ -75,16 +78,14 @@ class TestingContext:
75
78
 
76
79
  @contextmanager
77
80
  def local_py_device_mesh(
78
- self, num_hosts, gpu_per_host, activate=True
81
+ self,
82
+ num_hosts,
83
+ gpu_per_host,
79
84
  ) -> Generator[DeviceMesh, None, None]:
80
85
  ctx, hosts, processes = self._processes(num_hosts, gpu_per_host)
81
86
  dm = world_mesh(ctx, hosts, gpu_per_host, _processes=processes)
82
87
  try:
83
- if activate:
84
- with dm.activate():
85
- yield dm
86
- else:
87
- yield dm
88
+ yield dm
88
89
  dm.client.shutdown(destroy_pg=False)
89
90
  except Exception:
90
91
  # abnormal exit, so we just make sure we do not try to communicate in destructors,
@@ -97,7 +98,6 @@ class TestingContext:
97
98
  self,
98
99
  num_hosts,
99
100
  gpu_per_host,
100
- activate: bool = True,
101
101
  controller_params=None,
102
102
  ) -> Generator[DeviceMesh, None, None]:
103
103
  # Create a new system and mesh for test.
@@ -115,11 +115,7 @@ class TestingContext:
115
115
  controller_params=controller_params,
116
116
  ) as dm:
117
117
  try:
118
- if activate:
119
- with dm.activate():
120
- yield dm
121
- else:
122
- yield dm
118
+ yield dm
123
119
  dm.exit()
124
120
  except Exception:
125
121
  dm.client._shutdown = True
@@ -129,21 +125,57 @@ class TestingContext:
129
125
  # pyre-ignore: Undefined attribute
130
126
  dm.client.inner._actor.stop()
131
127
 
128
+ @contextmanager
129
+ def local_engine_on_proc_mesh(
130
+ self,
131
+ num_hosts,
132
+ gpu_per_host,
133
+ ) -> Generator[DeviceMesh, None, None]:
134
+ key = (num_hosts, gpu_per_host)
135
+ if key not in self._proc_mesh_cache:
136
+ self._proc_mesh_cache[key] = proc_mesh(
137
+ hosts=num_hosts, gpus=gpu_per_host
138
+ ).get()
139
+
140
+ dm = spawn_tensor_engine(self._proc_mesh_cache[key])
141
+ dm = dm.rename(hosts="host", gpus="gpu")
142
+ try:
143
+ yield dm
144
+ dm.exit()
145
+ except Exception as e:
146
+ # abnormal exit, so we just make sure we do not try to communicate in destructors,
147
+ # but we do not wait for workers to exit since we do not know what state they are in.
148
+ dm.client._shutdown = True
149
+ raise
150
+
132
151
  @contextmanager
133
152
  def local_device_mesh(
134
- self, num_hosts, gpu_per_host, activate=True, rust=False, controller_params=None
153
+ self,
154
+ num_hosts,
155
+ gpu_per_host,
156
+ activate=True,
157
+ backend: Literal["py", "rs", "mesh"] = "py",
158
+ controller_params=None,
135
159
  ) -> Generator[DeviceMesh, None, None]:
136
160
  start = time.time()
137
- if rust:
161
+ if backend == "rs":
138
162
  generator = self.local_rust_device_mesh(
139
- num_hosts, gpu_per_host, activate, controller_params=controller_params
163
+ num_hosts, gpu_per_host, controller_params=controller_params
140
164
  )
165
+ elif backend == "py":
166
+ generator = self.local_py_device_mesh(num_hosts, gpu_per_host)
167
+ elif backend == "mesh":
168
+ generator = self.local_engine_on_proc_mesh(num_hosts, gpu_per_host)
141
169
  else:
142
- generator = self.local_py_device_mesh(num_hosts, gpu_per_host, activate)
170
+ raise ValueError(f"invalid backend: {backend}")
143
171
  with generator as dm:
144
172
  end = time.time()
145
173
  logging.info("initialized mesh in {:.2f}s".format(end - start))
146
- yield dm
174
+ if activate:
175
+ with dm.activate():
176
+ yield dm
177
+ else:
178
+ yield dm
147
179
  start = time.time()
148
180
  end = time.time()
149
181
  logging.info("shutdown mesh in {:.2f}s".format(end - start))
monarch/actor_mesh.py CHANGED
@@ -15,6 +15,7 @@ import inspect
15
15
  import itertools
16
16
  import logging
17
17
  import random
18
+ import sys
18
19
  import traceback
19
20
 
20
21
  from dataclasses import dataclass
@@ -37,6 +38,7 @@ from typing import (
37
38
  ParamSpec,
38
39
  Tuple,
39
40
  Type,
41
+ TYPE_CHECKING,
40
42
  TypeVar,
41
43
  )
42
44
 
@@ -57,6 +59,10 @@ from monarch._rust_bindings.monarch_hyperactor.shape import Point as HyPoint, Sh
57
59
 
58
60
  from monarch.common.pickle_flatten import flatten, unflatten
59
61
  from monarch.common.shape import MeshTrait, NDSlice
62
+ from monarch.pdb_wrapper import remote_breakpointhook
63
+
64
+ if TYPE_CHECKING:
65
+ from monarch.debugger import DebugClient
60
66
 
61
67
  logger: logging.Logger = logging.getLogger(__name__)
62
68
 
@@ -519,7 +525,14 @@ class _Actor:
519
525
  enter_span(
520
526
  the_method.__module__, message.method, str(ctx.mailbox.actor_id)
521
527
  )
522
- result = await the_method(self.instance, *args, **kwargs)
528
+ try:
529
+ result = await the_method(self.instance, *args, **kwargs)
530
+ except Exception as e:
531
+ logging.critical(
532
+ "Unahndled exception in actor endpoint",
533
+ exc_info=e,
534
+ )
535
+ raise e
523
536
  exit_span()
524
537
  return result
525
538
 
@@ -624,6 +637,19 @@ class Actor(MeshTrait):
624
637
  "actor implementations are not meshes, but we can't convince the typechecker of it..."
625
638
  )
626
639
 
640
+ @endpoint
641
+ async def _set_debug_client(self, client: "DebugClient") -> None:
642
+ point = MonarchContext.get().point
643
+ # For some reason, using a lambda instead of functools.partial
644
+ # confuses the pdb wrapper implementation.
645
+ sys.breakpointhook = functools.partial( # pyre-ignore
646
+ remote_breakpointhook,
647
+ point.rank,
648
+ point.shape.coordinates(point.rank),
649
+ MonarchContext.get().mailbox.actor_id,
650
+ client,
651
+ )
652
+
627
653
 
628
654
  class ActorMeshRef(MeshTrait):
629
655
  def __init__(
monarch/bootstrap_main.py CHANGED
@@ -30,28 +30,9 @@ def invoke_main():
30
30
  # behavior of std out as if it were a terminal.
31
31
  sys.stdout.reconfigure(line_buffering=True)
32
32
  global bootstrap_main
33
- from monarch._rust_bindings.hyperactor_extension.telemetry import ( # @manual=//monarch/monarch_extension:monarch_extension # @manual=//monarch/monarch_extension:monarch_extension
34
- forward_to_tracing,
35
- )
36
33
 
37
34
  # TODO: figure out what from worker_main.py we should reproduce here.
38
-
39
- class TracingForwarder(logging.Handler):
40
- def emit(self, record: logging.LogRecord) -> None:
41
- try:
42
- forward_to_tracing(
43
- record.getMessage(),
44
- record.filename or "",
45
- record.lineno or 0,
46
- record.levelno,
47
- )
48
- except AttributeError:
49
- forward_to_tracing(
50
- record.__str__(),
51
- record.filename or "",
52
- record.lineno or 0,
53
- record.levelno,
54
- )
35
+ from monarch.telemetry import TracingForwarder
55
36
 
56
37
  if os.environ.get("MONARCH_ERROR_DURING_BOOTSTRAP_FOR_TESTING") == "1":
57
38
  raise RuntimeError("Error during bootstrap for testing")
@@ -16,11 +16,6 @@ def set_manual_seed_remote(seed: int, process_idx: int = 0) -> None:
16
16
  torch.manual_seed(seed ^ process_idx)
17
17
 
18
18
 
19
- @remote(propagate=lambda: 0)
20
- def initial_seed_remote() -> int:
21
- return torch.initial_seed()
22
-
23
-
24
19
  @remote(propagate=lambda: torch.zeros(1))
25
20
  def get_rng_state_remote() -> torch.Tensor:
26
21
  return torch.get_rng_state()
@@ -67,3 +62,7 @@ def get_rng_state_all_cuda_remote() -> list[torch.Tensor]:
67
62
  @remote(propagate="inspect")
68
63
  def set_rng_state_all_cuda_remote(states: list[torch.Tensor]) -> None:
69
64
  torch.cuda.set_rng_state_all(states)
65
+
66
+
67
+ # initial_seed may sometimes return a uint64 which currently can't be unwrapped by the framework
68
+ # def initial_seed_remote() -> int: ...
monarch/common/client.py CHANGED
@@ -103,6 +103,13 @@ class Client:
103
103
  # workers.
104
104
  self.last_processed_seq = -1
105
105
 
106
+ # an error that we have received but know for certain has not
107
+ # been propagated to a future. This will be reported on shutdown
108
+ # to avoid hiding the error. This is best effort: we only keep
109
+ # the error until the point that a future is dependent on
110
+ # _any_ error, not particularly the tracked one.
111
+ self._pending_shutdown_error = None
112
+
106
113
  self.recorder = Recorder()
107
114
 
108
115
  self.pending_results: Dict[
@@ -174,6 +181,8 @@ class Client:
174
181
  destroy_pg: bool = True,
175
182
  error_reason: Optional[RemoteException | DeviceException | Exception] = None,
176
183
  ) -> None:
184
+ if self.has_shutdown:
185
+ return
177
186
  logger.info("shutting down the client gracefully")
178
187
 
179
188
  atexit.unregister(self._atexit)
@@ -303,6 +312,7 @@ class Client:
303
312
 
304
313
  if error is not None:
305
314
  logging.info("Received error for seq %s: %s", seq, error)
315
+ self._pending_shutdown_error = error
306
316
  # We should not have set result if we have an error.
307
317
  assert result is None
308
318
  if not isinstance(error, RemoteException):
@@ -326,7 +336,11 @@ class Client:
326
336
 
327
337
  fut, _ = self.pending_results[seq]
328
338
  if fut is not None:
329
- fut._set_result(result if error is None else error)
339
+ if error is None:
340
+ fut._set_result(result)
341
+ else:
342
+ fut._set_result(error)
343
+ self._pending_shutdown_error = None
330
344
  elif result is not None:
331
345
  logger.debug(f"{seq}: unused result {result}")
332
346
  elif error is not None:
monarch/debugger.py ADDED
@@ -0,0 +1,377 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import asyncio
8
+ import logging
9
+ import sys
10
+ from dataclasses import dataclass
11
+ from typing import Dict, List, Tuple, Union
12
+
13
+ from monarch._rust_bindings.monarch_hyperactor.proc import ActorId
14
+ from monarch.actor_mesh import Actor, endpoint
15
+
16
+ from monarch.pdb_wrapper import DebuggerWrite
17
+
18
+ from monarch.proc_mesh import local_proc_mesh
19
+ from tabulate import tabulate
20
+
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
+ CANCEL_TOKEN = object()
26
+
27
+
28
+ async def _debugger_input(prompt=""):
29
+ return await asyncio.to_thread(input, prompt)
30
+
31
+
32
+ def _debugger_output(msg):
33
+ sys.stdout.write(msg)
34
+ sys.stdout.flush()
35
+
36
+
37
+ @dataclass
38
+ class DebugSessionInfo:
39
+ rank: int
40
+ coords: Dict[str, int]
41
+ hostname: str
42
+ actor_id: ActorId
43
+ function: str | None
44
+ lineno: int | None
45
+
46
+
47
+ class DebugSession:
48
+ """Represents a single session with a remote debugger."""
49
+
50
+ def __init__(
51
+ self, rank: int, coords: Dict[str, int], hostname: str, actor_id: ActorId
52
+ ):
53
+ self.rank = rank
54
+ self.coords = coords
55
+ self.hostname = hostname
56
+ self.actor_id = actor_id
57
+ self._active = False
58
+ self._message_queue = asyncio.Queue()
59
+ self._task = None
60
+ self._pending_send_to_actor = asyncio.Queue()
61
+ self._outputs_since_last_input = []
62
+ self._function_lineno = None
63
+ self._need_read = False
64
+
65
+ async def _event_loop(self, line=None, suppress_output=False):
66
+ if not suppress_output:
67
+ # If the user had previously attached to this debug session,
68
+ # then it would have printed various messages from the
69
+ # message queue. When the user re-attaches, we want to
70
+ # print out all of the output that was printed since the
71
+ # last command sent to this session.
72
+ for output in self._outputs_since_last_input:
73
+ _debugger_output(output.payload.decode())
74
+
75
+ while True:
76
+ # When the user inputs "detach", it uses up a "read" message
77
+ # without actually responding to the actor being debugged. We
78
+ # can't manually reinsert the "read" message into the message queue,
79
+ # so instead the self._need_read flag indicates there's an additional
80
+ # "read" that we need to respond to.
81
+ if self._need_read:
82
+ self._need_read = False
83
+ message = "read"
84
+ else:
85
+ message = await self._message_queue.get()
86
+ if message == "detach":
87
+ # Return to the main outer debug loop.
88
+ break
89
+ elif message == "read":
90
+ break_after = False
91
+ if line is not None:
92
+ break_after = True
93
+ else:
94
+ line = await _debugger_input()
95
+ if line.strip("\n") == "detach":
96
+ self._need_read = True
97
+ break
98
+ else:
99
+ self._outputs_since_last_input = []
100
+ await self._pending_send_to_actor.put((line + "\n").encode())
101
+ line = None
102
+ if break_after:
103
+ break
104
+ elif message[0] == "write":
105
+ output = message[1]
106
+ # If the user sees this output but then detaches from the session,
107
+ # it's useful to store all outputs since the last input so that
108
+ # they can be printed again when the user re-attaches.
109
+ self._outputs_since_last_input.append(output)
110
+ if not suppress_output:
111
+ _debugger_output(output.payload.decode())
112
+
113
+ if not suppress_output:
114
+ print(
115
+ f"Detaching from debug session for rank {self.rank} ({self.hostname})"
116
+ )
117
+
118
+ def get_info(self):
119
+ function = lineno = None
120
+ if self._function_lineno is not None:
121
+ function, lineno = self._function_lineno
122
+ return DebugSessionInfo(
123
+ self.rank, self.coords, self.hostname, self.actor_id, function, lineno
124
+ )
125
+
126
+ async def attach(self, line=None, suppress_output=False):
127
+ self._active = True
128
+ if not suppress_output:
129
+ print(f"Attached to debug session for rank {self.rank} ({self.hostname})")
130
+ self._task = asyncio.create_task(self._event_loop(line, suppress_output))
131
+ await self._task
132
+ if not suppress_output:
133
+ print(f"Detached from debug session for rank {self.rank} ({self.hostname})")
134
+ self._active = False
135
+
136
+ async def detach(self):
137
+ if self._active:
138
+ await self._message_queue.put("detach")
139
+
140
+ async def debugger_read(self, size: int) -> DebuggerWrite:
141
+ await self._message_queue.put("read")
142
+ input_data = await self._pending_send_to_actor.get()
143
+ if len(input_data) > size:
144
+ input_data = input_data[:size]
145
+ return DebuggerWrite(input_data, None, None)
146
+
147
+ async def debugger_write(self, write: DebuggerWrite) -> None:
148
+ if write.function is not None and write.lineno is not None:
149
+ self._function_lineno = (write.function, write.lineno)
150
+ await self._message_queue.put(("write", write))
151
+
152
+
153
+ class DebugCommand:
154
+ @staticmethod
155
+ def parse(line: str) -> Union["DebugCommand", None]:
156
+ parts = line.strip("\n").split(" ")
157
+ if len(parts) == 0:
158
+ return None
159
+ command = parts[0]
160
+ match command:
161
+ case "attach":
162
+ return Attach._parse(parts)
163
+ case "list":
164
+ return ListCommand()
165
+ case "quit":
166
+ return Quit()
167
+ case "cast":
168
+ return Cast._parse(parts)
169
+ case "help":
170
+ return Help()
171
+ case "continue":
172
+ return Continue()
173
+ case _:
174
+ print(
175
+ f"Unknown command {command}. Expected: attach | list | quit | cast | continue | help"
176
+ )
177
+ return None
178
+
179
+
180
+ @dataclass
181
+ class Attach(DebugCommand):
182
+ rank: int
183
+
184
+ @classmethod
185
+ def _parse(cls, parts: List[str]) -> "Attach":
186
+ if len(parts) != 2:
187
+ raise ValueError("Invalid attach command. Expected: attach <rank>")
188
+ try:
189
+ rank = int(parts[1])
190
+ except ValueError:
191
+ raise ValueError(f"Invalid rank {parts[1]}. Expected: int")
192
+ return cls(rank)
193
+
194
+
195
+ class ListCommand(DebugCommand):
196
+ pass
197
+
198
+
199
+ class Quit(DebugCommand):
200
+ pass
201
+
202
+
203
+ class Help(DebugCommand):
204
+ pass
205
+
206
+
207
+ class Continue(DebugCommand):
208
+ pass
209
+
210
+
211
+ @dataclass
212
+ class Cast(DebugCommand):
213
+ ranks: List[int] | None
214
+ command: str
215
+
216
+ @classmethod
217
+ def _parse(cls, parts: List[str]) -> "Cast":
218
+ if len(parts) < 3:
219
+ raise ValueError(
220
+ "Invalid cast command. Expected: cast {<r0,r1,...> | *} <command>"
221
+ )
222
+ str_ranks = parts[1]
223
+ command = " ".join(parts[2:])
224
+ if str_ranks == "*":
225
+ return cls(None, command)
226
+ else:
227
+ str_ranks = str_ranks.split(",")
228
+ if len(str_ranks) == 0:
229
+ raise ValueError(
230
+ "Invalid rank list for cast. Expected at least one rank."
231
+ )
232
+ ranks = []
233
+ for rank in str_ranks:
234
+ try:
235
+ ranks.append(int(rank))
236
+ except ValueError:
237
+ raise ValueError(f"Invalid rank {rank}. Expected: int")
238
+ return cls(ranks, command)
239
+
240
+
241
+ class DebugClient(Actor):
242
+ """
243
+ Single actor for both remote debuggers and users to talk to.
244
+
245
+ Handles multiple sessions simultaneously
246
+ """
247
+
248
+ def __init__(self) -> None:
249
+ self.sessions = {} # rank -> DebugSession
250
+
251
+ @endpoint
252
+ async def wait_pending_session(self):
253
+ while len(self.sessions) == 0:
254
+ await asyncio.sleep(1)
255
+
256
+ @endpoint
257
+ async def list(self) -> List[Tuple[int, Dict[str, int], str, ActorId, str, int]]:
258
+ table_data = []
259
+ for _, session in self.sessions.items():
260
+ info = session.get_info()
261
+ table_data.append(
262
+ (
263
+ info.rank,
264
+ info.coords,
265
+ info.hostname,
266
+ info.actor_id,
267
+ info.function,
268
+ info.lineno,
269
+ )
270
+ )
271
+ table_data = sorted(table_data, key=lambda r: r[0])
272
+
273
+ headers = ["Rank", "Coords", "Hostname", "Actor ID", "Function", "Line No."]
274
+ print(tabulate(table_data, headers=headers, tablefmt="grid"))
275
+
276
+ return table_data
277
+
278
+ @endpoint
279
+ async def enter(self) -> None:
280
+ # pyre-ignore
281
+ await getattr(self, "list")._method(self) # noqa
282
+
283
+ while True:
284
+ try:
285
+ user_input = await _debugger_input("monarch_dbg> ")
286
+ command = DebugCommand.parse(user_input)
287
+ if isinstance(command, Help):
288
+ print("monarch_dbg commands:")
289
+ print("\tattach <rank> - attach to a debug session")
290
+ print("\tlist - list all debug sessions")
291
+ print("\tquit - exit the debugger, leaving all sessions in place")
292
+ print(
293
+ "\tcast {<r0,r1,...> | *} <command> - send a command to a comma-separated list of ranks, or all ranks"
294
+ )
295
+ print(
296
+ "\tcontinue - tell all ranks to continue execution, then exit the debugger"
297
+ )
298
+ print("\thelp - print this help message")
299
+ elif isinstance(command, Attach):
300
+ if command.rank not in self.sessions:
301
+ print(f"No debug session for rank {command.rank}")
302
+ else:
303
+ await self.sessions[command.rank].attach()
304
+ elif isinstance(command, ListCommand):
305
+ await getattr(self, "list")._method(self) # noqa
306
+ elif isinstance(command, Continue):
307
+ # Make sure all ranks have exited their debug sessions.
308
+ # If we sent "quit", it would raise BdbQuit, crashing
309
+ # the process, which probably isn't what we want.
310
+ while len(self.sessions) > 0:
311
+ tasks = []
312
+ for rank in self.sessions:
313
+ tasks.append(
314
+ self.sessions[rank].attach("c", suppress_output=True)
315
+ )
316
+ await asyncio.gather(*tasks)
317
+ return
318
+ elif isinstance(command, Quit):
319
+ return
320
+ elif isinstance(command, Cast):
321
+ if command.ranks is None:
322
+ ranks = self.sessions.keys()
323
+ else:
324
+ ranks = command.ranks
325
+ tasks = []
326
+ for rank in ranks:
327
+ if rank in self.sessions:
328
+ tasks.append(
329
+ self.sessions[rank].attach(
330
+ command.command,
331
+ suppress_output=True,
332
+ )
333
+ )
334
+ else:
335
+ print(f"No debug session for rank {rank}")
336
+ await asyncio.gather(*tasks)
337
+ except Exception as e:
338
+ print(f"Error processing command: {e}")
339
+
340
+ ##########################################################################
341
+ # Debugger APIs
342
+ #
343
+ # These endpoints are called by the remote debuggers to establish sessions
344
+ # and communicate with them.
345
+ @endpoint
346
+ async def debugger_session_start(
347
+ self, rank: int, coords: Dict[str, int], hostname: str, actor_id: ActorId
348
+ ) -> None:
349
+ # Create a session if it doesn't exist
350
+ if rank not in self.sessions:
351
+ self.sessions[rank] = DebugSession(rank, coords, hostname, actor_id)
352
+
353
+ @endpoint
354
+ async def debugger_session_end(self, rank: int) -> None:
355
+ """Detach from the current debug session."""
356
+ session = self.sessions.pop(rank)
357
+ await session.detach()
358
+
359
+ @endpoint
360
+ async def debugger_read(self, rank: int, size: int) -> DebuggerWrite | str:
361
+ """Read from the debug session for the given rank."""
362
+ session = self.sessions[rank]
363
+
364
+ return await session.debugger_read(size)
365
+
366
+ @endpoint
367
+ async def debugger_write(self, rank: int, write: DebuggerWrite) -> None:
368
+ """Write to the debug session for the given rank."""
369
+ session = self.sessions[rank]
370
+ await session.debugger_write(write)
371
+
372
+
373
+ async def init_debugging(actor_mesh: Actor) -> DebugClient:
374
+ debugger_proc_mesh = await local_proc_mesh(gpus=1, hosts=1)
375
+ debug_client_mesh = await debugger_proc_mesh.spawn("debug_client", DebugClient)
376
+ await actor_mesh._set_debug_client.call(debug_client_mesh)
377
+ return debug_client_mesh
@@ -4,7 +4,10 @@
4
4
  # This source code is licensed under the BSD-style license found in the
5
5
  # LICENSE file in the root directory of this source tree.
6
6
 
7
+ import atexit
7
8
  import logging
9
+ import os
10
+ import time
8
11
  import traceback
9
12
  from collections import deque
10
13
  from logging import Logger
@@ -22,6 +25,8 @@ from monarch._rust_bindings.monarch_hyperactor.proc import ( # @manual=//monarc
22
25
  ActorId,
23
26
  )
24
27
  from monarch._rust_bindings.monarch_hyperactor.proc_mesh import ProcMesh as HyProcMesh
28
+ from monarch._rust_bindings.monarch_hyperactor.shape import Point
29
+
25
30
  from monarch._rust_bindings.monarch_messages.debugger import DebuggerAction
26
31
  from monarch.common.client import Client
27
32
  from monarch.common.controller_api import LogMessage, MessageResult
@@ -29,6 +34,7 @@ from monarch.common.device_mesh import DeviceMesh, no_mesh
29
34
  from monarch.common.invocation import DeviceException, RemoteException
30
35
  from monarch.controller.debugger import read as debugger_read, write as debugger_write
31
36
  from monarch.proc_mesh import ProcMesh
37
+ from monarch.rust_local_mesh import _get_worker_exec_info
32
38
  from pyre_extensions import none_throws
33
39
 
34
40
  logger: Logger = logging.getLogger(__name__)
@@ -72,18 +78,8 @@ class Controller(_Controller):
72
78
  def drain_and_stop(
73
79
  self,
74
80
  ) -> List[LogMessage | MessageResult | client.DebuggerMessage]:
75
- logger.info("rust controller shutting down")
76
- results = []
77
- for msg in self._drain_and_stop():
78
- if isinstance(msg, client.WorkerResponse):
79
- results.append(_worker_response_to_result(msg))
80
- elif isinstance(msg, client.LogMessage):
81
- results.append(LogMessage(msg.level, msg.message))
82
- elif isinstance(msg, client.DebuggerMessage):
83
- results.append(msg)
84
- else:
85
- raise RuntimeError(f"Unexpected message type {type(msg)}")
86
- return results
81
+ self._drain_and_stop()
82
+ return []
87
83
 
88
84
  def _run_debugger_loop(self, message: client.DebuggerMessage) -> None:
89
85
  if not isinstance(message.action, DebuggerAction.Paused):
@@ -192,13 +188,75 @@ def _worker_response_to_result(result: client.WorkerResponse) -> MessageResult:
192
188
  raise RuntimeError(f"Unknown exception type: {type(exc)}")
193
189
 
194
190
 
191
+ def _initialize_env(worker_point: Point, proc_id: str) -> None:
192
+ worker_rank = worker_point.rank
193
+ try:
194
+ _, worker_env = _get_worker_exec_info()
195
+ local_rank = worker_point["gpus"]
196
+ gpus_per_host = worker_point.size("gpus")
197
+ num_worker_procs = len(worker_point.shape)
198
+ process_env = {
199
+ **worker_env,
200
+ "HYPERACTOR_MANAGED_SUBPROCESS": "1",
201
+ "CUDA_VISIBLE_DEVICES": str(local_rank),
202
+ "NCCL_HOSTID": f"{proc_id}_host_{worker_rank // gpus_per_host}",
203
+ # This is needed to avoid a hard failure in ncclx when we do not
204
+ # have backend topology info (eg. on RE).
205
+ "NCCL_IGNORE_TOPO_LOAD_FAILURE": "true",
206
+ "LOCAL_RANK": str(local_rank),
207
+ "RANK": str(worker_rank),
208
+ "WORLD_SIZE": str(num_worker_procs),
209
+ "LOCAL_WORLD_SIZE": str(gpus_per_host),
210
+ }
211
+ os.environ.update(process_env)
212
+ except Exception:
213
+ traceback.print_exc()
214
+ raise
215
+
216
+
217
+ class MeshClient(Client):
218
+ def shutdown(
219
+ self,
220
+ destroy_pg: bool = True,
221
+ error_reason: Optional[RemoteException | DeviceException | Exception] = None,
222
+ ):
223
+ # return
224
+ if self.has_shutdown:
225
+ return
226
+ logger.info("shutting down the client gracefully")
227
+
228
+ atexit.unregister(self._atexit)
229
+ self._shutdown = True
230
+
231
+ # ensure all pending work is finished.
232
+ # all errors must be messaged back at this point
233
+ self.new_node_nocoalesce([], [], None, [])
234
+ self._request_status()
235
+
236
+ ttl = 60
237
+ start_time = time.time()
238
+ end_time = start_time + ttl
239
+ while ttl > 0 and self.last_assigned_seq > self.last_processed_seq:
240
+ ttl = end_time - time.time()
241
+ self.handle_next_message(ttl)
242
+ if self._pending_shutdown_error:
243
+ raise self._pending_shutdown_error
244
+
245
+ if ttl <= 0:
246
+ raise RuntimeError("shutdown timed out")
247
+
248
+ # we are not expecting anything more now, because we already
249
+ # waited for the responses
250
+ self.inner.drain_and_stop()
251
+
252
+
195
253
  def spawn_tensor_engine(proc_mesh: ProcMesh) -> DeviceMesh:
196
254
  # This argument to Controller
197
255
  # is currently only used for debug printing. It should be fixed to
198
256
  # report the proc ID instead of the rank it currently does.
199
257
  gpus = proc_mesh.sizes.get("gpus", 1)
200
258
  backend_ctrl = Controller(proc_mesh._proc_mesh)
201
- client = Client(backend_ctrl, proc_mesh.size(), gpus)
259
+ client = MeshClient(backend_ctrl, proc_mesh.size(), gpus)
202
260
  dm = DeviceMesh(
203
261
  client,
204
262
  NDSlice.new_row_major(list(proc_mesh.sizes.values())),
Binary file
monarch/pdb_wrapper.py ADDED
@@ -0,0 +1,135 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import bdb
8
+ import inspect
9
+ import io
10
+ import pdb # noqa
11
+ import socket
12
+ import sys
13
+ from dataclasses import dataclass
14
+
15
+ from typing import Dict, TYPE_CHECKING
16
+
17
+ from monarch._rust_bindings.monarch_hyperactor.proc import ActorId
18
+
19
+ if TYPE_CHECKING:
20
+ from monarch.debugger import DebugClient
21
+
22
+
23
+ @dataclass
24
+ class DebuggerWrite:
25
+ payload: bytes
26
+ function: str | None
27
+ lineno: int | None
28
+
29
+
30
+ class PdbWrapper(pdb.Pdb):
31
+ def __init__(
32
+ self,
33
+ rank: int,
34
+ coords: Dict[str, int],
35
+ actor_id: ActorId,
36
+ client_ref: "DebugClient",
37
+ header: str | None = None,
38
+ ):
39
+ self.rank = rank
40
+ self.coords = coords
41
+ self.header = header
42
+ self.actor_id = actor_id
43
+ self.client_ref = client_ref
44
+ # pyre-ignore
45
+ super().__init__(stdout=WriteWrapper(self), stdin=ReadWrapper.create(self))
46
+ self._first = True
47
+
48
+ def setup(self, *args, **kwargs):
49
+ r = super().setup(*args, **kwargs)
50
+ if self._first:
51
+ self._first = False
52
+ # when we enter the debugger, we want to present the user's stack frame
53
+ # not the nested one inside session.run. This means that the local
54
+ # variables are what gets printed, etc. To do this
55
+ # we first execute up 2 to get to that frame.
56
+ self.do_up(2)
57
+ return r
58
+
59
+ def set_continue(self) -> None:
60
+ r = super().set_continue()
61
+ if not self.breaks:
62
+ # no more breakpoints so this debugger will not
63
+ # be used again, and we detach from the controller io.
64
+ self.client_ref.debugger_session_end.call_one(self.rank).get()
65
+ # break cycle with itself before we exit
66
+ self.stdin = sys.stdin
67
+ self.stdout = sys.stdout
68
+ return r
69
+
70
+ def set_trace(self):
71
+ self.client_ref.debugger_session_start.call_one(
72
+ self.rank, self.coords, socket.getfqdn(socket.gethostname()), self.actor_id
73
+ ).get()
74
+ if self.header:
75
+ self.message(self.header)
76
+ super().set_trace()
77
+
78
+
79
+ class ReadWrapper(io.RawIOBase):
80
+ def __init__(self, session: "PdbWrapper"):
81
+ self.session = session
82
+
83
+ def readinto(self, b):
84
+ response = self.session.client_ref.debugger_read.call_one(
85
+ self.session.rank, len(b)
86
+ ).get()
87
+ if response == "detach":
88
+ # this gets injected by the worker event loop to
89
+ # get the worker thread to exit on an Exit command.
90
+ raise bdb.BdbQuit
91
+ assert isinstance(response, DebuggerWrite) and len(response.payload) <= len(b)
92
+ b[: len(response.payload)] = response.payload
93
+ return len(response.payload)
94
+
95
+ def readable(self) -> bool:
96
+ return True
97
+
98
+ @classmethod
99
+ def create(cls, session: "PdbWrapper"):
100
+ return io.TextIOWrapper(io.BufferedReader(cls(session)))
101
+
102
+
103
+ class WriteWrapper:
104
+ def __init__(self, session: "PdbWrapper"):
105
+ self.session = session
106
+
107
+ def writable(self) -> bool:
108
+ return True
109
+
110
+ def write(self, s: str):
111
+ function = None
112
+ lineno = None
113
+ if self.session.curframe is not None:
114
+ # pyre-ignore
115
+ function = f"{inspect.getmodulename(self.session.curframe.f_code.co_filename)}.{self.session.curframe.f_code.co_name}"
116
+ # pyre-ignore
117
+ lineno = self.session.curframe.f_lineno
118
+ self.session.client_ref.debugger_write.call_one(
119
+ self.session.rank,
120
+ DebuggerWrite(
121
+ s.encode(),
122
+ function,
123
+ lineno,
124
+ ),
125
+ ).get()
126
+
127
+ def flush(self):
128
+ pass
129
+
130
+
131
+ def remote_breakpointhook(
132
+ rank: int, coords: Dict[str, int], actor_id: ActorId, client_ref: "DebugClient"
133
+ ):
134
+ ds = PdbWrapper(rank, coords, actor_id, client_ref)
135
+ ds.set_trace()
monarch/telemetry.py ADDED
@@ -0,0 +1,19 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # pyre-strict
8
+
9
+
10
+ import logging
11
+
12
+ from monarch._rust_bindings.hyperactor_extension.telemetry import ( # @manual=//monarch/monarch_extension:monarch_extension
13
+ forward_to_tracing,
14
+ )
15
+
16
+
17
+ class TracingForwarder(logging.Handler):
18
+ def emit(self, record: logging.LogRecord) -> None:
19
+ forward_to_tracing(record)
tests/test_coalescing.py CHANGED
@@ -78,7 +78,7 @@ class TestCoalescing:
78
78
  num_hosts,
79
79
  gpu_per_host,
80
80
  activate,
81
- rust=backend_type == BackendType.RS,
81
+ backend=str(backend_type),
82
82
  )
83
83
 
84
84
  @property
tests/test_controller.py CHANGED
@@ -96,7 +96,7 @@ remote_sleep = remote("time.sleep", propagate="inspect")
96
96
  torch.cuda.device_count() < 2,
97
97
  reason="Not enough GPUs, this test requires at least 2 GPUs",
98
98
  )
99
- @pytest.mark.parametrize("backend_type", [BackendType.PY, BackendType.RS])
99
+ @pytest.mark.parametrize("backend_type", [BackendType.PY, BackendType.RS, "mesh"])
100
100
  # Set global timeout--sandcastle's timeout is 600s. A test that sandcastle times
101
101
  # out is not counted as a failure, so we set a more restrictive timeout to
102
102
  # ensure we see a hard failure in CI.
@@ -114,7 +114,7 @@ class TestController:
114
114
  N,
115
115
  gpu_per_host,
116
116
  activate,
117
- rust=backend_type == BackendType.RS,
117
+ backend=str(backend_type),
118
118
  )
119
119
 
120
120
  def test_errors(self, backend_type):
@@ -512,6 +512,7 @@ class TestController:
512
512
  monarch.random.make_deterministic()
513
513
  for device in ("cpu", "cuda"):
514
514
  a = monarch.random.get_state()
515
+ monarch.inspect(a)
515
516
  first = torch.rand(1, device=device)
516
517
  monarch.random.set_state(a)
517
518
  second = torch.rand(1, device=device)
@@ -601,6 +602,15 @@ class TestController:
601
602
  assert torch.equal(moved_tensor_a, torch.tensor([1.0]))
602
603
  assert torch.equal(moved_tensor_b, torch.tensor([2.0]))
603
604
 
605
+ def test_hanging_error(self, backend_type):
606
+ if backend_type != "mesh":
607
+ pytest.skip("only relevant for mesh backend")
608
+ with self.local_device_mesh(2, 2, backend_type) as device_mesh:
609
+ remote(lambda: torch.rand(3) + torch.rand(4), propagate=lambda: None)()
610
+
611
+ with pytest.raises(Exception, match="The size of tensor"):
612
+ device_mesh.client.shutdown()
613
+
604
614
  def test_slice_mesh_pytree(self, backend_type):
605
615
  with self.local_device_mesh(2, 2, backend_type) as device_mesh:
606
616
  a = device_mesh.rank(("host")) + torch.zeros((1,), device="cuda")
@@ -4,8 +4,12 @@
4
4
  # This source code is licensed under the BSD-style license found in the
5
5
  # LICENSE file in the root directory of this source tree.
6
6
 
7
+ import asyncio
7
8
  import operator
9
+ import os
10
+ import re
8
11
  from types import ModuleType
12
+ from unittest.mock import AsyncMock, patch
9
13
 
10
14
  import monarch
11
15
 
@@ -20,7 +24,9 @@ from monarch.actor_mesh import (
20
24
  current_rank,
21
25
  current_size,
22
26
  endpoint,
27
+ MonarchContext,
23
28
  )
29
+ from monarch.debugger import init_debugging
24
30
 
25
31
  from monarch.mesh_controller import spawn_tensor_engine
26
32
 
@@ -384,6 +390,10 @@ def test_rust_binding_modules_correct() -> None:
384
390
  check(bindings, "monarch._rust_bindings")
385
391
 
386
392
 
393
+ @pytest.mark.skipif(
394
+ torch.cuda.device_count() < 2,
395
+ reason="Not enough GPUs, this test requires at least 2 GPUs",
396
+ )
387
397
  def test_tensor_engine() -> None:
388
398
  pm = proc_mesh(gpus=2).get()
389
399
 
@@ -399,3 +409,143 @@ def test_tensor_engine() -> None:
399
409
  assert torch.allclose(torch.zeros(3, 4), f)
400
410
 
401
411
  dm.exit()
412
+
413
+
414
+ def _debugee_actor_internal(rank):
415
+ if rank == 0:
416
+ breakpoint() # noqa
417
+ rank += 1
418
+ return rank
419
+ elif rank == 1:
420
+ breakpoint() # noqa
421
+ rank += 2
422
+ return rank
423
+ elif rank == 2:
424
+ breakpoint() # noqa
425
+ rank += 3
426
+ raise ValueError("bad rank")
427
+ elif rank == 3:
428
+ breakpoint() # noqa
429
+ rank += 4
430
+ return rank
431
+
432
+
433
+ class DebugeeActor(Actor):
434
+ @endpoint
435
+ async def to_debug(self):
436
+ rank = MonarchContext.get().point.rank
437
+ return _debugee_actor_internal(rank)
438
+
439
+
440
+ async def test_debug() -> None:
441
+ input_mock = AsyncMock()
442
+ input_mock.side_effect = [
443
+ "attach 1",
444
+ "n",
445
+ "n",
446
+ "n",
447
+ "n",
448
+ "detach",
449
+ "attach 1",
450
+ "detach",
451
+ "quit",
452
+ "cast 0,3 n",
453
+ "cast 0,3 n",
454
+ # Attaching to 0 and 3 ensures that when we call "list"
455
+ # the next time, their function/lineno info will be
456
+ # up-to-date.
457
+ "attach 0",
458
+ "detach",
459
+ "attach 3",
460
+ "detach",
461
+ "quit",
462
+ "attach 2",
463
+ "c",
464
+ "quit",
465
+ "continue",
466
+ ]
467
+
468
+ outputs = []
469
+
470
+ def _patch_output(msg):
471
+ nonlocal outputs
472
+ outputs.append(msg)
473
+
474
+ with patch("monarch.debugger._debugger_input", side_effect=input_mock), patch(
475
+ "monarch.debugger._debugger_output", new=_patch_output
476
+ ):
477
+ proc = await proc_mesh(hosts=2, gpus=2)
478
+ debugee = await proc.spawn("debugee", DebugeeActor)
479
+ debug_client = await init_debugging(debugee)
480
+
481
+ fut = debugee.to_debug.call()
482
+ await debug_client.wait_pending_session.call_one()
483
+ breakpoints = []
484
+ for i in range(10):
485
+ breakpoints = await debug_client.list.call_one()
486
+ if len(breakpoints) == 4:
487
+ break
488
+ await asyncio.sleep(1)
489
+ if i == 9:
490
+ raise RuntimeError("timed out waiting for breakpoints")
491
+
492
+ initial_linenos = {}
493
+ for i in range(len(breakpoints)):
494
+ rank, coords, _, _, function, lineno = breakpoints[i]
495
+ initial_linenos[rank] = lineno
496
+ assert rank == i
497
+ assert coords == {"hosts": rank % 2, "gpus": rank // 2}
498
+ assert function == "test_python_actors._debugee_actor_internal"
499
+ assert lineno == breakpoints[0][5] + 4 * rank
500
+
501
+ await debug_client.enter.call_one()
502
+
503
+ # Check that when detaching and re-attaching to a session, the last portion of the output is repeated
504
+ expected_last_output = [
505
+ r"--Return--",
506
+ r"\n",
507
+ r"> (/.*/)+test_python_actors.py\(\d+\)to_debug\(\)->3\n-> return _debugee_actor_internal\(rank\)",
508
+ r"\n",
509
+ r"\(Pdb\) ",
510
+ ]
511
+ output_len = len(expected_last_output)
512
+ assert outputs[-2 * output_len : -output_len] == outputs[-output_len:]
513
+ for real_output, expected_output in zip(
514
+ outputs[-output_len:], expected_last_output
515
+ ):
516
+ assert re.match(expected_output, real_output) is not None
517
+
518
+ breakpoints = await debug_client.list.call_one()
519
+ for i in range(len(breakpoints)):
520
+ if i == 1:
521
+ assert breakpoints[i][4] == "test_python_actors.to_debug"
522
+ else:
523
+ assert breakpoints[i][4] == "test_python_actors._debugee_actor_internal"
524
+ assert breakpoints[i][5] == initial_linenos[i]
525
+
526
+ await debug_client.enter.call_one()
527
+
528
+ breakpoints = await debug_client.list.call_one()
529
+ for i in range(len(breakpoints)):
530
+ if i == 1:
531
+ assert breakpoints[i][4] == "test_python_actors.to_debug"
532
+ elif i in (0, 3):
533
+ assert breakpoints[i][4] == "test_python_actors._debugee_actor_internal"
534
+ assert breakpoints[i][5] == initial_linenos[i] + 2
535
+ else:
536
+ assert breakpoints[i][4] == "test_python_actors._debugee_actor_internal"
537
+ assert breakpoints[i][5] == initial_linenos[i]
538
+
539
+ await debug_client.enter.call_one()
540
+
541
+ breakpoints = await debug_client.list.call_one()
542
+ assert len(breakpoints) == 3
543
+ for i, rank in enumerate((0, 1, 3)):
544
+ assert breakpoints[i][0] == rank
545
+
546
+ await debug_client.enter.call_one()
547
+ breakpoints = await debug_client.list.call_one()
548
+ assert len(breakpoints) == 0
549
+
550
+ with pytest.raises(monarch.actor_mesh.ActorError, match="ValueError: bad rank"):
551
+ await fut
@@ -169,7 +169,7 @@ class RemoteFunctionsTestBase:
169
169
  num_hosts,
170
170
  gpu_per_host,
171
171
  activate,
172
- rust=backend_type == BackendType.RS,
172
+ backend=str(backend_type),
173
173
  )
174
174
 
175
175
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: torchmonarch-nightly
3
- Version: 2025.6.12
3
+ Version: 2025.6.13
4
4
  Summary: Monarch: Single controller library
5
5
  Author: Meta
6
6
  Author-email: oncall+monarch@xmail.facebook.com
@@ -1,19 +1,21 @@
1
1
  monarch/__init__.py,sha256=iUvWHc0-7Q2tovRoRxOIiA3TsefMXCbWl-jEfQ2djew,6897
2
- monarch/_rust_bindings.so,sha256=VPU8MhCnz10umRwSqv99QvwFkr2q0N0DiOTpZ37Ecl0,40645344
3
- monarch/_testing.py,sha256=MN8DK1e-wzV0-R_nFW1b_7-O5oKfWvZ12BMGD4Z7PQk,6755
4
- monarch/actor_mesh.py,sha256=ovi5RBxobGEcg7zKkzhRc83n82KOD6ermhuloHKbuFs,24420
2
+ monarch/_rust_bindings.so,sha256=FJb4gGPNDWqT1nPkxEYSX4hEsIbjb_v8Oa0RDwMcH5A,40302936
3
+ monarch/_testing.py,sha256=jOIOG6jcZBzvEvG_DwSnwCkaMVXvSun6sJAG6nXemww,7859
4
+ monarch/actor_mesh.py,sha256=8Ih3CIArLTyZmWSHppXm5N2WlAjmGXpaQhkkFtjJFxc,25351
5
5
  monarch/allocator.py,sha256=ylvYTf31o-PT385cYJPhi17uNbC4yl_RAraqD0fVe4g,4112
6
- monarch/bootstrap_main.py,sha256=EYaTMA1lxy2213L_04drTKlJvZQjzNdD3jeUHiqSBJc,2578
6
+ monarch/bootstrap_main.py,sha256=RCUQhJk07yMFiKp6HzQuqZFUpkgsT9kVEyimiwjn6_E,1827
7
7
  monarch/cached_remote_function.py,sha256=kYdB6r4OHx_T_uX4q3tCNcp1t2DJwF8tPTIahUiT2pU,8785
8
+ monarch/debugger.py,sha256=AdlvOG3X-9Pw9c1DLQYEy4vjEfh0ZtwtsNJEFLFzN8o,13312
8
9
  monarch/fetch.py,sha256=61jxo7sx4QNUTkc0_rF5NaJROen4tKbAaiIjrXWLOvg,1705
9
10
  monarch/future.py,sha256=lcdFEe7m1shYPPuvZ1RkS6JUIChEKGBWe3v7x_nu4Hg,731
10
11
  monarch/gradient_generator.py,sha256=Rl3dmXGceTdCc1mYBg2JciR88ywGPnW7TVkL86KwqEA,6366
11
12
  monarch/memory.py,sha256=ol86dBhFAJqg78iF25-BuK0wuwj1onR8FIioZ_B0gjw,1377
12
- monarch/mesh_controller.py,sha256=Rr4VNUNN0pJdThbPmbCoaPWid4QpTNHya9xYpmjTkW0,8575
13
- monarch/monarch_controller,sha256=MECcriPRnSdI_NpAG6y-GiK2-DqnDsLBfyOHVdqewRU,20397992
13
+ monarch/mesh_controller.py,sha256=Xft2edk7rz8_PPe-iIUZ09P-j4JDPGADBGHBiuiZ7YY,10363
14
+ monarch/monarch_controller,sha256=mE9pvcBDKwW_4zOZlO17PJDk7W6z5skzIX5rxHQfKOs,20238936
14
15
  monarch/notebook.py,sha256=zu9MKDFKf1-rCM2TqFSRJjMBeiWuKcJSyUFLvoZRQzs,25949
15
16
  monarch/opaque_module.py,sha256=oajOu_WD1hD4hxE8HDdO-tvWY7KDHWd7VaAhJEa5L2I,10446
16
17
  monarch/opaque_object.py,sha256=IVpll4pyuKZMo_EnPh4s0qnx8RlAcJrJ1yoLX6E75wQ,2782
18
+ monarch/pdb_wrapper.py,sha256=gm46AZnfR4amH1vYFWnWivEv5MaU3Nb6KIWjSM8KjWM,4052
17
19
  monarch/proc_mesh.py,sha256=xoaReM9Ab9TWkesxedWSyyk4TMD0HLV88dQ8CQcbqTI,6892
18
20
  monarch/profiler.py,sha256=TQ9fnVM8H7smBWtYdB_6Irtzz8DBOmcp7U1T3wlUmco,4911
19
21
  monarch/python_local_mesh.py,sha256=YsureIzR9uGlNVrKd4vRghxOXBeYabkt9lICRErfRAI,3536
@@ -23,6 +25,7 @@ monarch/remote_class.py,sha256=-OAowzU1aDP6i4ik_SjXntVUC9h4dqAzgqwohkQ6Grc,4167
23
25
  monarch/rust_backend_mesh.py,sha256=1htC62of4MgFtkezWGlsxSFtKJdc0CIeqeSuOx7yu3M,9944
24
26
  monarch/rust_local_mesh.py,sha256=7ASptybn3wy4J7eoBc7LhGW4j4AA6bigl5Kuhyflw8s,47405
25
27
  monarch/sim_mesh.py,sha256=9wkS99L0EpG2Gldi-nzA-3ww7z__DQ7Qp2uReMfn188,12183
28
+ monarch/telemetry.py,sha256=7JUZWaoD2Yn5Ae_7kNhkLFRBLYaSGfH071_m_qfVehI,525
26
29
  monarch/tensor_worker_main.py,sha256=Nbarl2sJKIddLeaRFsaUnqOerLHjzggUr9SqCr2_GYI,8300
27
30
  monarch/tensorboard.py,sha256=MnLgH5lbqeUJauEuirEgR6L_qYl2NGdtwZOWIAuOZao,2587
28
31
  monarch/world_mesh.py,sha256=GqZpFoVNJPxYa70rLYgv0vu8Vg1nXqx_GYERRb1E9Pc,975
@@ -34,7 +37,7 @@ monarch/_monarch/worker/debugger.py,sha256=JJZwRPTgQO2emz-hrMelkOSxJFIR3dV4ZA6e7
34
37
  monarch/_monarch/worker/logging.py,sha256=nJUkIuKhPqRZaNDOT7MVbFFjcITZQf_CiFRLFKJJqsw,3591
35
38
  monarch/builtins/__init__.py,sha256=QcfnHZGbc2qktBg7DyZt2ruE6VahnIt4S8lEZLHdJqU,443
36
39
  monarch/builtins/log.py,sha256=H1QkuVzwxyi36Zyv-XR0VN0QsNimBWwxE1__fjs0_2o,554
37
- monarch/builtins/random.py,sha256=xVt0cJBRBhCOH1Eioy8O511rp7HKFSCVXRwjBy02K5I,1798
40
+ monarch/builtins/random.py,sha256=wPbvscg7u53EXpMFo885fO2XOlsyjrNAJ4rBxLzfxdg,1839
38
41
  monarch/common/_C.pyi,sha256=kHY2G3ksMAjQJ6IcPb4F1bBh5knzw5RVVNhhBlEmwFU,314
39
42
  monarch/common/_C.so,sha256=gVDCDUQSKiPHwLPIpyxcRgiv8uF_quH1LpgI5Lhle9Y,715600
40
43
  monarch/common/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -43,7 +46,7 @@ monarch/common/_device_utils.py,sha256=gBpl23wMjppVAEzzj8U9HyX-B7Bs2_3ftiMAkzUS4
43
46
  monarch/common/_tensor_to_table.py,sha256=yRjCNwvtl188Z1Dwkx3ZU-Bh2mwYnQ0Lnue2RAztwvc,5753
44
47
  monarch/common/base_tensor.py,sha256=ujRzR6lWaeCdPv2JX0vCR-VsCWn-3SHaJIkZH1Sw9FQ,1159
45
48
  monarch/common/borrows.py,sha256=7KR62xoUat1T6FyADsdHsxVAVIJDvfJWUnPO-xx277U,5307
46
- monarch/common/client.py,sha256=BaBhOzQaNsqTa-BGy7_IknQxpnpK0j4C5QsNyFHZHW4,24343
49
+ monarch/common/client.py,sha256=axo37s_z17nYQGOZG5fi_0zUEJ_8qw7INjs-Kw2vaVo,24937
47
50
  monarch/common/constants.py,sha256=ohvsVYMpfeWopv3KXDAeHWDFLukwc-OY37VRxpKNBE8,300
48
51
  monarch/common/context_manager.py,sha256=GOeyaFbyCqvQmkJ0oI7q6IxRd8_0mVyYKZRccI8iaug,1067
49
52
  monarch/common/controller_api.py,sha256=djGkK5aSd-V6pBkr3uBCXbfJv3OKf2o2VbBXJgFF2WI,3202
@@ -133,16 +136,16 @@ tests/sleep_binary.py,sha256=XfLYaAfwm9xgzM-svs8fhAeFhwYIg6SyVEnx4e6wbUw,1009
133
136
  tests/test_actor_error.py,sha256=z3Sf4lteUggTryPLOhRKJ55v0MwVK3a7QN7-U2U9iJg,7484
134
137
  tests/test_alloc.py,sha256=D6DdQbtOZEvvnnc7LV-WyWFMk0Xb77eblH6Oz90zJTA,745
135
138
  tests/test_allocator.py,sha256=P11sQ95ADjzC_-CfPs3CEP80nP8sn7wW8vVPsmpSVoM,8164
136
- tests/test_coalescing.py,sha256=-KtAWzTaeXbyzltplfojavx0iFeeZnvej-tFTlu2p5k,15616
137
- tests/test_controller.py,sha256=yxuVp2DG3TDKJlwuE3cFm9dbWMlbrYtG1uHfvVWRYbw,30935
139
+ tests/test_coalescing.py,sha256=JZ4YgQNlWWs7N-Z8KCCXQPANcuyyXEKjeHIXYbPnQhk,15606
140
+ tests/test_controller.py,sha256=Rp_kW20zYT8ocsK5LX0Ha3LB9azS2LSKpp8n_dBlzVU,31384
138
141
  tests/test_device_mesh.py,sha256=DrbezYOM0thfP9MgLXb5-F0VoLOmSz5GR0GwjR_3bE4,5290
139
142
  tests/test_fault_tolerance.py,sha256=u4wmG1z5MZ6PY6us5zUZHJh2pUC3L7i0wsUfRDNHmxA,14144
140
143
  tests/test_future.py,sha256=cXzaNi2YDwVyjR541ScXmgktX1YFsKzbl8wep0DMVbk,3032
141
144
  tests/test_grad_generator.py,sha256=p4Pm4kMEeGldt2jUVAkGKCB0mLccKI28pltH6OTGbQA,3412
142
145
  tests/test_mock_cuda.py,sha256=5hisElxeLJ5MHw3KM9gwxBiXiMaG-Rm382u3AsQcDOI,3068
143
146
  tests/test_pdb_actor.py,sha256=5KJhuhcZDPWMdjC6eAtDdwnz1W7jNFXvIrMSFaCWaPw,3858
144
- tests/test_python_actors.py,sha256=gP6MDN2BL282qInUGP9untlpsqqB2uy1Iq5gUXnXcUo,11387
145
- tests/test_remote_functions.py,sha256=ExqYlRQWRabpGBuKvNIOa8Hwj-iXuP87Jfb9i5RhaGs,50066
147
+ tests/test_python_actors.py,sha256=MPdXtnj4ZeyAaecDFJMXdz29KvimF9iB3bASgoo6-iM,16201
148
+ tests/test_remote_functions.py,sha256=5nxYB8dfA9NT9f9Od9O3htgQtPbiRNiXZ1Kgtn75sOQ,50056
146
149
  tests/test_rust_backend.py,sha256=94S3R995ZkyIhEiBsM5flcjf5X7bscEAHBtInbTRFe8,7776
147
150
  tests/test_signal_safe_block_on.py,sha256=bmal0XgzJowZXJV6T1Blow5a-vZluYWusCThLMGxyTE,3336
148
151
  tests/test_sim_backend.py,sha256=RckCkHO3DxKsAGdZMcIzRnd6YJXwDim1D5-xbBbgKio,1473
@@ -151,9 +154,9 @@ tests/simulator/test_profiling.py,sha256=TGYCfzTLdkpIwnOuO6KApprmrgPIRQe60KRX3wk
151
154
  tests/simulator/test_simulator.py,sha256=LO8lA0ssY-OGEBL5ipEu74f97Y765TEwfUOv-DtIptM,14568
152
155
  tests/simulator/test_task.py,sha256=ipqBDuDAysuo1xOB9S5psaFvwe6VATD43IovCTSs0t4,2327
153
156
  tests/simulator/test_worker.py,sha256=QrWWIJ3HDgDLkBPRc2mwYPlOQoXQcj1qRfc0WUfKkFY,3507
154
- torchmonarch_nightly-2025.6.12.dist-info/licenses/LICENSE,sha256=e0Eotbf_rHOYPuEUlppIbvwy4SN98CZnl_hqwvbDA4Q,1530
155
- torchmonarch_nightly-2025.6.12.dist-info/METADATA,sha256=mBsDu66W3vkM2SdaxX7hw8_B6kl_XgQZT7nQKZhVkMk,2772
156
- torchmonarch_nightly-2025.6.12.dist-info/WHEEL,sha256=_wZSFk0d90K9wOBp8Q-UGxshyiJ987JoPiyUBNC6VLk,104
157
- torchmonarch_nightly-2025.6.12.dist-info/entry_points.txt,sha256=sqfQ16oZqjEvttUI-uj9BBXIIE6jt05bYFSmy-2hyXI,106
158
- torchmonarch_nightly-2025.6.12.dist-info/top_level.txt,sha256=E-ZssZzyM17glpVrh-S9--qJ-w9p2EjuYOuNw9tQ4Eg,33
159
- torchmonarch_nightly-2025.6.12.dist-info/RECORD,,
157
+ torchmonarch_nightly-2025.6.13.dist-info/licenses/LICENSE,sha256=e0Eotbf_rHOYPuEUlppIbvwy4SN98CZnl_hqwvbDA4Q,1530
158
+ torchmonarch_nightly-2025.6.13.dist-info/METADATA,sha256=WhintlKk3a9WRrjo-QLNntfi87q98I4gcZW_0f42q48,2772
159
+ torchmonarch_nightly-2025.6.13.dist-info/WHEEL,sha256=_wZSFk0d90K9wOBp8Q-UGxshyiJ987JoPiyUBNC6VLk,104
160
+ torchmonarch_nightly-2025.6.13.dist-info/entry_points.txt,sha256=sqfQ16oZqjEvttUI-uj9BBXIIE6jt05bYFSmy-2hyXI,106
161
+ torchmonarch_nightly-2025.6.13.dist-info/top_level.txt,sha256=E-ZssZzyM17glpVrh-S9--qJ-w9p2EjuYOuNw9tQ4Eg,33
162
+ torchmonarch_nightly-2025.6.13.dist-info/RECORD,,