torchmonarch-nightly 2025.6.8__cp310-cp310-manylinux2014_x86_64.whl → 2025.6.10__cp310-cp310-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
monarch/_rust_bindings.so CHANGED
Binary file
monarch/actor_mesh.py CHANGED
@@ -83,7 +83,7 @@ class MonarchContext:
83
83
 
84
84
 
85
85
  _context: contextvars.ContextVar[MonarchContext] = contextvars.ContextVar(
86
- "monarch.service._context"
86
+ "monarch.actor_mesh._context"
87
87
  )
88
88
 
89
89
 
@@ -677,7 +677,7 @@ class ActorError(Exception):
677
677
  def __init__(
678
678
  self,
679
679
  exception: Exception,
680
- message: str = "A remote service call has failed asynchronously.",
680
+ message: str = "A remote actor call has failed asynchronously.",
681
681
  ) -> None:
682
682
  self.exception = exception
683
683
  self.actor_mesh_ref_frames: StackSummary = extract_tb(exception.__traceback__)
@@ -688,7 +688,7 @@ class ActorError(Exception):
688
688
  actor_mesh_ref_tb = "".join(traceback.format_list(self.actor_mesh_ref_frames))
689
689
  return (
690
690
  f"{self.message}\n"
691
- f"Traceback of where the service call failed (most recent call last):\n{actor_mesh_ref_tb}{type(self.exception).__name__}: {exe}"
691
+ f"Traceback of where the remote call failed (most recent call last):\n{actor_mesh_ref_tb}{type(self.exception).__name__}: {exe}"
692
692
  )
693
693
 
694
694
 
@@ -0,0 +1,209 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import logging
8
+ import traceback
9
+ from collections import deque
10
+ from logging import Logger
11
+ from typing import List, NamedTuple, Optional, Union
12
+
13
+ import torch.utils._python_dispatch
14
+
15
+ from monarch import NDSlice
16
+ from monarch._rust_bindings.monarch_extension import client, debugger
17
+ from monarch._rust_bindings.monarch_extension.client import ( # @manual=//monarch/monarch_extension:monarch_extension
18
+ WorldState,
19
+ )
20
+ from monarch._rust_bindings.monarch_extension.mesh_controller import _Controller
21
+ from monarch._rust_bindings.monarch_hyperactor.proc import ( # @manual=//monarch/monarch_extension:monarch_extension
22
+ ActorId,
23
+ )
24
+ from monarch._rust_bindings.monarch_hyperactor.proc_mesh import ProcMesh as HyProcMesh
25
+ from monarch._rust_bindings.monarch_messages.debugger import DebuggerAction
26
+ from monarch.common.client import Client
27
+ from monarch.common.controller_api import LogMessage, MessageResult
28
+ from monarch.common.device_mesh import DeviceMesh, no_mesh
29
+ from monarch.common.invocation import DeviceException, RemoteException
30
+ from monarch.controller.debugger import read as debugger_read, write as debugger_write
31
+ from monarch.proc_mesh import ProcMesh
32
+ from pyre_extensions import none_throws
33
+
34
+ logger: Logger = logging.getLogger(__name__)
35
+
36
+
37
+ class Controller(_Controller):
38
+ def __init__(self, workers: HyProcMesh) -> None:
39
+ super().__init__()
40
+ # Buffer for messages unrelated to debugging that are received while a
41
+ # debugger session is active.
42
+ self._non_debugger_pending_messages: deque[
43
+ Optional[client.LogMessage | client.WorkerResponse]
44
+ ] = deque()
45
+ self._pending_debugger_sessions: deque[ActorId] = deque()
46
+
47
+ def next_message(
48
+ self, timeout: Optional[float]
49
+ ) -> Optional[LogMessage | MessageResult]:
50
+ if self._non_debugger_pending_messages:
51
+ msg = self._non_debugger_pending_messages.popleft()
52
+ else:
53
+ msg = self._get_next_message(timeout_msec=int((timeout or 0.0) * 1000.0))
54
+ if msg is None:
55
+ return None
56
+
57
+ if isinstance(msg, client.WorkerResponse):
58
+ return _worker_response_to_result(msg)
59
+ elif isinstance(msg, client.LogMessage):
60
+ return LogMessage(msg.level, msg.message)
61
+ elif isinstance(msg, client.DebuggerMessage):
62
+ self._run_debugger_loop(msg)
63
+
64
+ def send(
65
+ self,
66
+ ranks: Union[NDSlice, List[NDSlice]],
67
+ msg: NamedTuple,
68
+ ) -> None:
69
+ with torch.utils._python_dispatch._disable_current_modes():
70
+ return super().send(ranks, msg)
71
+
72
+ def drain_and_stop(
73
+ self,
74
+ ) -> List[LogMessage | MessageResult | client.DebuggerMessage]:
75
+ logger.info("rust controller shutting down")
76
+ results = []
77
+ for msg in self._drain_and_stop():
78
+ if isinstance(msg, client.WorkerResponse):
79
+ results.append(_worker_response_to_result(msg))
80
+ elif isinstance(msg, client.LogMessage):
81
+ results.append(LogMessage(msg.level, msg.message))
82
+ elif isinstance(msg, client.DebuggerMessage):
83
+ results.append(msg)
84
+ else:
85
+ raise RuntimeError(f"Unexpected message type {type(msg)}")
86
+ return results
87
+
88
+ def _run_debugger_loop(self, message: client.DebuggerMessage) -> None:
89
+ if not isinstance(message.action, DebuggerAction.Paused):
90
+ raise RuntimeError(
91
+ f"Unexpected debugger message {message} when no debugger session is running"
92
+ )
93
+
94
+ self._pending_debugger_sessions.append(message.debugger_actor_id)
95
+ while self._pending_debugger_sessions:
96
+ debugger_actor_id = self._pending_debugger_sessions.popleft()
97
+ rank = debugger_actor_id.rank
98
+ proc_id = debugger_actor_id.proc_id
99
+ debugger_write(
100
+ f"pdb attached to proc {proc_id} with rank {rank}, debugger actor {debugger_actor_id} \n"
101
+ )
102
+
103
+ self._debugger_attach(debugger_actor_id)
104
+ while True:
105
+ # TODO: Add appropriate timeout.
106
+ msg = self._get_next_message(timeout_msec=None)
107
+
108
+ if not isinstance(msg, client.DebuggerMessage):
109
+ self._non_debugger_pending_messages.append(msg)
110
+ continue
111
+
112
+ if msg.debugger_actor_id != debugger_actor_id:
113
+ if isinstance(msg.action, DebuggerAction.Paused):
114
+ self._pending_debugger_sessions.append(msg.debugger_actor_id)
115
+ continue
116
+ else:
117
+ raise RuntimeError(
118
+ f"unexpected debugger message {msg} from rank {msg.debugger_actor_id.rank} "
119
+ f"when debugging rank {debugger_actor_id.rank}"
120
+ )
121
+
122
+ action = msg.action
123
+ if isinstance(action, DebuggerAction.Detach):
124
+ break
125
+ elif isinstance(action, DebuggerAction.Read):
126
+ self._debugger_write(
127
+ debugger_actor_id, debugger_read(action.requested_size)
128
+ )
129
+ elif isinstance(action, DebuggerAction.Write):
130
+ debugger_write(
131
+ debugger.get_bytes_from_write_action(action).decode()
132
+ )
133
+ else:
134
+ raise RuntimeError(
135
+ f"unexpected debugger message {msg} when debugging rank {debugger_actor_id.rank}"
136
+ )
137
+
138
+ def worker_world_state(self) -> WorldState:
139
+ raise NotImplementedError("worker world state")
140
+
141
+ def stop_mesh(self):
142
+ # I think this is a noop?
143
+
144
+ pass
145
+
146
+
147
+ # TODO: Handling conversion of the response can move to a separate module over time
148
+ # especially as we have structured error messages.
149
+ def _worker_response_to_result(result: client.WorkerResponse) -> MessageResult:
150
+ if not result.is_exception():
151
+ # The result of the message needs to be unwrapped on a real device.
152
+ # Staying as a fake tensor will fail the tensor deserialization.
153
+ with no_mesh.activate():
154
+ return MessageResult(result.seq, result.result(), None)
155
+ exc = none_throws(result.exception())
156
+ if isinstance(exc, client.Error):
157
+ worker_frames = [
158
+ traceback.FrameSummary("<unknown>", None, frame)
159
+ for frame in exc.backtrace.split("\\n")
160
+ ]
161
+ logger.error(f"Worker {exc.actor_id} failed")
162
+ return MessageResult(
163
+ seq=result.seq,
164
+ result=None,
165
+ error=RemoteException(
166
+ seq=exc.caused_by_seq,
167
+ exception=RuntimeError(exc.backtrace),
168
+ controller_frame_index=0, # TODO: T225205291 fix this once we have recording support in rust
169
+ controller_frames=None,
170
+ worker_frames=worker_frames,
171
+ source_actor_id=exc.actor_id,
172
+ message=f"Worker {exc.actor_id} failed",
173
+ ),
174
+ )
175
+ elif isinstance(exc, client.Failure):
176
+ frames = [
177
+ traceback.FrameSummary("<unknown>", None, frame)
178
+ for frame in exc.backtrace.split("\n")
179
+ ]
180
+ reason = f"Actor {exc.actor_id} crashed on {exc.address}, check the host log for details"
181
+ logger.error(reason)
182
+ return MessageResult(
183
+ seq=0, # seq is not consumed for DeviceException; it will be directly thrown by the client
184
+ result=None,
185
+ error=DeviceException(
186
+ exception=RuntimeError(reason),
187
+ frames=frames,
188
+ source_actor_id=exc.actor_id,
189
+ message=reason,
190
+ ),
191
+ )
192
+ else:
193
+ raise RuntimeError(f"Unknown exception type: {type(exc)}")
194
+
195
+
196
+ def spawn_tensor_engine(proc_mesh: ProcMesh) -> DeviceMesh:
197
+ # This argument to Controller
198
+ # is currently only used for debug printing. It should be fixed to
199
+ # report the proc ID instead of the rank it currently does.
200
+ gpus = proc_mesh.sizes.get("gpus", 1)
201
+ backend_ctrl = Controller(proc_mesh._proc_mesh)
202
+ client = Client(backend_ctrl, proc_mesh.size(), gpus)
203
+ dm = DeviceMesh(
204
+ client,
205
+ NDSlice.new_row_major(list(proc_mesh.sizes.values())),
206
+ tuple(proc_mesh.sizes.keys()),
207
+ )
208
+ dm.exit = lambda: client.shutdown()
209
+ return dm
Binary file
@@ -7,7 +7,12 @@
7
7
  import operator
8
8
  from types import ModuleType
9
9
 
10
+ import monarch
11
+
12
+ import pytest
13
+
10
14
  import torch
15
+
11
16
  from monarch.actor_mesh import (
12
17
  Accumulator,
13
18
  Actor,
@@ -17,6 +22,8 @@ from monarch.actor_mesh import (
17
22
  endpoint,
18
23
  )
19
24
 
25
+ from monarch.mesh_controller import spawn_tensor_engine
26
+
20
27
  from monarch.proc_mesh import local_proc_mesh, proc_mesh
21
28
  from monarch.rdma import RDMABuffer
22
29
 
@@ -375,3 +382,20 @@ def test_rust_binding_modules_correct() -> None:
375
382
  assert value.__module__ == path
376
383
 
377
384
  check(bindings, "monarch._rust_bindings")
385
+
386
+
387
+ def test_tensor_engine() -> None:
388
+ pm = proc_mesh(gpus=2).get()
389
+
390
+ dm = spawn_tensor_engine(pm)
391
+ with dm.activate():
392
+ r = monarch.inspect(2 * torch.zeros(3, 4))
393
+
394
+ fm = dm.flatten("all")
395
+ with fm.activate():
396
+ f = monarch.inspect(2 * torch.zeros(3, 4), all=1)
397
+
398
+ assert torch.allclose(torch.zeros(3, 4), r)
399
+ assert torch.allclose(torch.zeros(3, 4), f)
400
+
401
+ dm.exit()
@@ -14,6 +14,7 @@ import monarch
14
14
 
15
15
  import pytest
16
16
  import torch
17
+ import torch.utils._python_dispatch
17
18
  from monarch import fetch_shard, no_mesh, remote, Stream
18
19
  from monarch.common.device_mesh import DeviceMesh
19
20
  from monarch.rust_local_mesh import local_meshes, LoggingLocation, SocketType
@@ -180,3 +181,37 @@ class TestRustBackend(TestCase):
180
181
 
181
182
  self.assertIsNotNone(mesh_info.mesh_labels)
182
183
  self.assertEqual(len(mesh_info.devices_labels), 2)
184
+
185
+ def test_ivalue_problems(self) -> None:
186
+ with local_mesh(hosts=1, gpu_per_host=1):
187
+ from typing import cast
188
+
189
+ from monarch.common.messages import CallFunction, CommandGroup
190
+
191
+ a = cast(monarch.Tensor, torch.rand(3, 4))
192
+ result = monarch.Tensor(a._fake, a.mesh, a.stream)
193
+ msg = CallFunction(
194
+ 0,
195
+ result,
196
+ (),
197
+ monarch.common.function.ResolvableFunctionFromPath(
198
+ "torch.ops.aten.mul.Tensor"
199
+ ),
200
+ (2, a),
201
+ {},
202
+ a.stream._to_ref(a.mesh.client),
203
+ a.mesh,
204
+ [],
205
+ )
206
+ # Internally, this will call CallFunction(...).to_rust_message().
207
+ # The 2 arg will be converted to an IValue tensor via rust + C++.
208
+ # Then when the CommandGroup message gets converted to rust, it
209
+ # will attempt to clone the rust CallFunction message, which will
210
+ # attempt to clone the IValue tensor, which will cause a crash.
211
+ # Upon attempting to clone the IValue tensor, our custom __torch_dispatch__
212
+ # intercepts the following two calls:
213
+ # aten._to_copy.default () (2,) {'dtype': torch.float64, 'device': device(type='cpu')}
214
+ # aten.clone.default () (2,) {}
215
+
216
+ with torch.utils._python_dispatch._disable_current_modes():
217
+ CommandGroup([msg]).to_rust_message()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: torchmonarch-nightly
3
- Version: 2025.6.8
3
+ Version: 2025.6.10
4
4
  Summary: Monarch: Single controller library
5
5
  Author: Meta
6
6
  Author-email: oncall+monarch@xmail.facebook.com
@@ -1,7 +1,7 @@
1
1
  monarch/__init__.py,sha256=iUvWHc0-7Q2tovRoRxOIiA3TsefMXCbWl-jEfQ2djew,6897
2
- monarch/_rust_bindings.so,sha256=HiisXwHtZrYKATL6RdJxw2u_y7Wjgjtwt52V1LIR6ss,39151608
2
+ monarch/_rust_bindings.so,sha256=0-svsKnUJboaOBd5i-LOfpHiRRAgVLX_1Hq_YYREQi8,39756680
3
3
  monarch/_testing.py,sha256=MN8DK1e-wzV0-R_nFW1b_7-O5oKfWvZ12BMGD4Z7PQk,6755
4
- monarch/actor_mesh.py,sha256=5DbU9OrmNk5I9yasmE-rkTgHyO07oiLlAG0jbJBOXgI,23000
4
+ monarch/actor_mesh.py,sha256=AKdjPg3FM6Yt35uFPBnP7fNVEu6busu5BXVWLwjU2A4,23000
5
5
  monarch/allocator.py,sha256=_2DKFP9pSD33zDgH7xZJC8Tq7BQrCeQEUmMB7_xCT0Y,1784
6
6
  monarch/bootstrap_main.py,sha256=SYTOz-pTXiJNk78PPD5HAOJDSb8t2JfitRWdmWB3ogo,2559
7
7
  monarch/cached_remote_function.py,sha256=kYdB6r4OHx_T_uX4q3tCNcp1t2DJwF8tPTIahUiT2pU,8785
@@ -9,7 +9,8 @@ monarch/fetch.py,sha256=61jxo7sx4QNUTkc0_rF5NaJROen4tKbAaiIjrXWLOvg,1705
9
9
  monarch/future.py,sha256=lcdFEe7m1shYPPuvZ1RkS6JUIChEKGBWe3v7x_nu4Hg,731
10
10
  monarch/gradient_generator.py,sha256=Rl3dmXGceTdCc1mYBg2JciR88ywGPnW7TVkL86KwqEA,6366
11
11
  monarch/memory.py,sha256=ol86dBhFAJqg78iF25-BuK0wuwj1onR8FIioZ_B0gjw,1377
12
- monarch/monarch_controller,sha256=5TKjcz7U7K8OttrwYv-w7yYtPUm2aMOQV4gt0u_Vj5c,20385960
12
+ monarch/mesh_controller.py,sha256=A3G8Z5S0w3mjCVI2r6YGM6K3BUs3ZHU8PFo6kCaYTU4,8615
13
+ monarch/monarch_controller,sha256=Q1eR_EVJqDQLrJZ_6p1ldxVDAU1OmN5lSSuctDcaAFY,20396832
13
14
  monarch/notebook.py,sha256=zu9MKDFKf1-rCM2TqFSRJjMBeiWuKcJSyUFLvoZRQzs,25949
14
15
  monarch/opaque_module.py,sha256=oajOu_WD1hD4hxE8HDdO-tvWY7KDHWd7VaAhJEa5L2I,10446
15
16
  monarch/opaque_object.py,sha256=IVpll4pyuKZMo_EnPh4s0qnx8RlAcJrJ1yoLX6E75wQ,2782
@@ -139,9 +140,9 @@ tests/test_future.py,sha256=cXzaNi2YDwVyjR541ScXmgktX1YFsKzbl8wep0DMVbk,3032
139
140
  tests/test_grad_generator.py,sha256=p4Pm4kMEeGldt2jUVAkGKCB0mLccKI28pltH6OTGbQA,3412
140
141
  tests/test_mock_cuda.py,sha256=5hisElxeLJ5MHw3KM9gwxBiXiMaG-Rm382u3AsQcDOI,3068
141
142
  tests/test_pdb_actor.py,sha256=5KJhuhcZDPWMdjC6eAtDdwnz1W7jNFXvIrMSFaCWaPw,3858
142
- tests/test_python_actors.py,sha256=fDvHUIWNZeL3CWnTJMbdh98i1tnH1-LJEG1pIFkGYF8,10898
143
+ tests/test_python_actors.py,sha256=gP6MDN2BL282qInUGP9untlpsqqB2uy1Iq5gUXnXcUo,11387
143
144
  tests/test_remote_functions.py,sha256=ExqYlRQWRabpGBuKvNIOa8Hwj-iXuP87Jfb9i5RhaGs,50066
144
- tests/test_rust_backend.py,sha256=nXSa0ZQ0NniZm4PzvKhrWvVLD-RKvIWYkPXm1BEBXq8,6235
145
+ tests/test_rust_backend.py,sha256=94S3R995ZkyIhEiBsM5flcjf5X7bscEAHBtInbTRFe8,7776
145
146
  tests/test_signal_safe_block_on.py,sha256=bmal0XgzJowZXJV6T1Blow5a-vZluYWusCThLMGxyTE,3336
146
147
  tests/test_sim_backend.py,sha256=RckCkHO3DxKsAGdZMcIzRnd6YJXwDim1D5-xbBbgKio,1473
147
148
  tests/simulator/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -149,9 +150,9 @@ tests/simulator/test_profiling.py,sha256=TGYCfzTLdkpIwnOuO6KApprmrgPIRQe60KRX3wk
149
150
  tests/simulator/test_simulator.py,sha256=LO8lA0ssY-OGEBL5ipEu74f97Y765TEwfUOv-DtIptM,14568
150
151
  tests/simulator/test_task.py,sha256=ipqBDuDAysuo1xOB9S5psaFvwe6VATD43IovCTSs0t4,2327
151
152
  tests/simulator/test_worker.py,sha256=QrWWIJ3HDgDLkBPRc2mwYPlOQoXQcj1qRfc0WUfKkFY,3507
152
- torchmonarch_nightly-2025.6.8.dist-info/licenses/LICENSE,sha256=e0Eotbf_rHOYPuEUlppIbvwy4SN98CZnl_hqwvbDA4Q,1530
153
- torchmonarch_nightly-2025.6.8.dist-info/METADATA,sha256=AfGuuk6TyhejOLotJWjRt3Hsl80lkEWS4iOaZ61YHj4,2771
154
- torchmonarch_nightly-2025.6.8.dist-info/WHEEL,sha256=_wZSFk0d90K9wOBp8Q-UGxshyiJ987JoPiyUBNC6VLk,104
155
- torchmonarch_nightly-2025.6.8.dist-info/entry_points.txt,sha256=sqfQ16oZqjEvttUI-uj9BBXIIE6jt05bYFSmy-2hyXI,106
156
- torchmonarch_nightly-2025.6.8.dist-info/top_level.txt,sha256=E-ZssZzyM17glpVrh-S9--qJ-w9p2EjuYOuNw9tQ4Eg,33
157
- torchmonarch_nightly-2025.6.8.dist-info/RECORD,,
153
+ torchmonarch_nightly-2025.6.10.dist-info/licenses/LICENSE,sha256=e0Eotbf_rHOYPuEUlppIbvwy4SN98CZnl_hqwvbDA4Q,1530
154
+ torchmonarch_nightly-2025.6.10.dist-info/METADATA,sha256=DR1GtSFqtqsjhKWi38uGcvhw2p3ycHYSOwDmsErwLj0,2772
155
+ torchmonarch_nightly-2025.6.10.dist-info/WHEEL,sha256=_wZSFk0d90K9wOBp8Q-UGxshyiJ987JoPiyUBNC6VLk,104
156
+ torchmonarch_nightly-2025.6.10.dist-info/entry_points.txt,sha256=sqfQ16oZqjEvttUI-uj9BBXIIE6jt05bYFSmy-2hyXI,106
157
+ torchmonarch_nightly-2025.6.10.dist-info/top_level.txt,sha256=E-ZssZzyM17glpVrh-S9--qJ-w9p2EjuYOuNw9tQ4Eg,33
158
+ torchmonarch_nightly-2025.6.10.dist-info/RECORD,,