torchmonarch-nightly 2025.7.1__cp311-cp311-manylinux2014_x86_64.whl → 2025.7.25__cp311-cp311-manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- monarch/__init__.py +13 -9
- monarch/_rust_bindings.so +0 -0
- monarch/{_monarch/selection → _src/actor}/__init__.py +3 -7
- monarch/_src/actor/actor_mesh.py +874 -0
- monarch/{allocator.py → _src/actor/allocator.py} +26 -17
- monarch/_src/actor/bootstrap_main.py +73 -0
- monarch/{code_sync.py → _src/actor/code_sync/__init__.py} +3 -1
- monarch/_src/actor/code_sync/auto_reload.py +223 -0
- monarch/_src/actor/debugger.py +565 -0
- monarch/_src/actor/endpoint.py +270 -0
- monarch/_src/actor/event_loop.py +97 -0
- monarch/_src/actor/future.py +100 -0
- monarch/{pdb_wrapper.py → _src/actor/pdb_wrapper.py} +47 -46
- monarch/{common/pickle_flatten.py → _src/actor/pickle.py} +26 -2
- monarch/_src/actor/proc_mesh.py +500 -0
- monarch/_src/actor/sync_state.py +18 -0
- monarch/{telemetry.py → _src/actor/telemetry/__init__.py} +1 -1
- monarch/_src/actor/telemetry/rust_span_tracing.py +159 -0
- monarch/_src/actor/tensor_engine_shim.py +56 -0
- monarch/_src/tensor_engine/rdma.py +180 -0
- monarch/_testing.py +3 -2
- monarch/actor/__init__.py +51 -0
- monarch/actor_mesh.py +6 -765
- monarch/bootstrap_main.py +8 -47
- monarch/common/client.py +1 -1
- monarch/common/controller_api.py +2 -1
- monarch/common/device_mesh.py +12 -2
- monarch/common/messages.py +12 -1
- monarch/common/recording.py +4 -3
- monarch/common/remote.py +135 -52
- monarch/common/tensor.py +2 -1
- monarch/controller/backend.py +2 -2
- monarch/controller/controller.py +2 -1
- monarch/controller/rust_backend/controller.py +2 -1
- monarch/fetch.py +3 -5
- monarch/mesh_controller.py +201 -139
- monarch/monarch_controller +0 -0
- monarch/opaque_module.py +4 -6
- monarch/opaque_object.py +3 -3
- monarch/proc_mesh.py +6 -309
- monarch/python_local_mesh.py +1 -1
- monarch/rust_backend_mesh.py +2 -1
- monarch/rust_local_mesh.py +4 -2
- monarch/sim_mesh.py +10 -19
- monarch/simulator/command_history.py +1 -1
- monarch/simulator/interface.py +2 -1
- monarch/simulator/mock_controller.py +1 -1
- monarch/simulator/simulator.py +1 -1
- monarch/tensor_engine/__init__.py +23 -0
- monarch/tensor_worker_main.py +3 -1
- monarch/tools/cli.py +3 -1
- monarch/tools/commands.py +95 -35
- monarch/tools/mesh_spec.py +55 -0
- monarch/tools/utils.py +38 -0
- monarch/worker/worker.py +1 -1
- monarch/world_mesh.py +2 -1
- monarch_supervisor/python_executable.py +6 -3
- tests/error_test_binary.py +48 -10
- tests/test_actor_error.py +370 -21
- tests/test_alloc.py +1 -1
- tests/test_allocator.py +373 -17
- tests/test_controller.py +2 -0
- tests/test_debugger.py +416 -0
- tests/test_env_before_cuda.py +162 -0
- tests/test_python_actors.py +184 -333
- tests/test_rdma.py +198 -0
- tests/test_remote_functions.py +40 -12
- tests/test_rust_backend.py +7 -5
- tests/test_sim_backend.py +1 -4
- tests/test_tensor_engine.py +55 -1
- {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/METADATA +6 -1
- {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/RECORD +80 -68
- torchmonarch_nightly-2025.7.25.dist-info/entry_points.txt +3 -0
- monarch/_monarch/hyperactor/__init__.py +0 -58
- monarch/_monarch/worker/debugger.py +0 -117
- monarch/_monarch/worker/logging.py +0 -107
- monarch/debugger.py +0 -379
- monarch/future.py +0 -76
- monarch/rdma.py +0 -162
- torchmonarch_nightly-2025.7.1.dist-info/entry_points.txt +0 -3
- /monarch/{_monarch/worker → _src}/__init__.py +0 -0
- /monarch/{common/_device_utils.py → _src/actor/device_utils.py} +0 -0
- /monarch/{common → _src/actor}/shape.py +0 -0
- /monarch/{_monarch → _src/tensor_engine}/__init__.py +0 -0
- {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/WHEEL +0 -0
- {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/licenses/LICENSE +0 -0
- {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/top_level.txt +0 -0
monarch/debugger.py
DELETED
@@ -1,379 +0,0 @@
|
|
1
|
-
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
2
|
-
# All rights reserved.
|
3
|
-
#
|
4
|
-
# This source code is licensed under the BSD-style license found in the
|
5
|
-
# LICENSE file in the root directory of this source tree.
|
6
|
-
|
7
|
-
import asyncio
|
8
|
-
import logging
|
9
|
-
import sys
|
10
|
-
from dataclasses import dataclass
|
11
|
-
from typing import Dict, List, Tuple, Union
|
12
|
-
|
13
|
-
from monarch._rust_bindings.monarch_hyperactor.proc import ActorId
|
14
|
-
from monarch.actor_mesh import Actor, ActorMeshRef, endpoint
|
15
|
-
|
16
|
-
from monarch.pdb_wrapper import DebuggerWrite
|
17
|
-
|
18
|
-
from monarch.proc_mesh import local_proc_mesh
|
19
|
-
from tabulate import tabulate
|
20
|
-
|
21
|
-
|
22
|
-
logger = logging.getLogger(__name__)
|
23
|
-
|
24
|
-
|
25
|
-
CANCEL_TOKEN = object()
|
26
|
-
|
27
|
-
|
28
|
-
async def _debugger_input(prompt=""):
    """Read one line of user input without blocking the event loop.

    The blocking builtin ``input`` is executed in a worker thread through
    ``asyncio.to_thread``.

    Args:
        prompt: Text passed to ``input`` as the prompt.

    Returns:
        The line returned by ``input``.
    """
    typed_line = await asyncio.to_thread(input, prompt)
    return typed_line
|
30
|
-
|
31
|
-
|
32
|
-
def _debugger_output(msg):
    """Write *msg* to ``sys.stdout`` verbatim and flush right away.

    No newline is appended; the text is emitted exactly as given.
    """
    stream = sys.stdout
    stream.write(msg)
    stream.flush()
|
35
|
-
|
36
|
-
|
37
|
-
@dataclass
class DebugSessionInfo:
    """Plain-data summary of one debug session (see ``DebugSession.get_info``)."""

    # Rank of the debugged actor; also the key used in DebugClient.sessions.
    rank: int
    # Coordinates supplied when the session was started.
    coords: Dict[str, int]
    # Hostname supplied when the session was started.
    hostname: str
    # Id of the actor being debugged.
    actor_id: ActorId
    # Function / line number taken from the most recent debugger write that
    # carried both; None when no such write has been recorded yet.
    function: str | None
    lineno: int | None
|
45
|
-
|
46
|
-
|
47
|
-
class DebugSession:
    """Represents a single session with a remote debugger.

    The debugged actor talks to the session through ``debugger_read`` and
    ``debugger_write``, which enqueue messages on ``_message_queue``. While
    a user (or a scripted command) is attached, ``_event_loop`` consumes
    those messages, prints debugger output and forwards input lines to the
    actor through ``_pending_send_to_actor``.
    """

    def __init__(
        self, rank: int, coords: Dict[str, int], hostname: str, actor_id: ActorId
    ):
        self.rank = rank
        self.coords = coords
        self.hostname = hostname
        self.actor_id = actor_id
        # True only while attach() is running its event loop.
        self._active = False
        # Holds "read", "detach" and ("write", DebuggerWrite) messages.
        self._message_queue = asyncio.Queue()
        self._task = None
        # Encoded input lines waiting to be returned by debugger_read().
        self._pending_send_to_actor = asyncio.Queue()
        # Writes received since the last input was sent; replayed on re-attach.
        self._outputs_since_last_input = []
        # (function, lineno) from the latest write that carried both.
        self._function_lineno = None
        # Set when the user detaches after a "read" was consumed but not
        # answered; the next event loop answers it first.
        self._need_read = False

    async def _event_loop(self, line=None, suppress_output=False):
        """Process queued debugger messages until the session detaches.

        Args:
            line: If given, it is sent in response to the first "read"
                message and the loop exits right after; otherwise input is
                requested from the user for every "read".
            suppress_output: When true, nothing is printed (no replay of
                earlier output, no debugger output, no detach notice).
        """
        if not suppress_output:
            # If the user had previously attached to this debug session,
            # then it would have printed various messages from the
            # message queue. When the user re-attaches, we want to
            # print out all of the output that was printed since the
            # last command sent to this session.
            for output in self._outputs_since_last_input:
                _debugger_output(output.payload.decode())

        while True:
            # When the user inputs "detach", it uses up a "read" message
            # without actually responding to the actor being debugged. We
            # can't manually reinsert the "read" message into the message queue,
            # so instead the self._need_read flag indicates there's an additional
            # "read" that we need to respond to.
            if self._need_read:
                self._need_read = False
                message = "read"
            else:
                message = await self._message_queue.get()
            if message == "detach":
                # Return to the main outer debug loop.
                break
            elif message == "read":
                break_after = False
                if line is not None:
                    # A scripted line answers exactly one read.
                    break_after = True
                else:
                    line = await _debugger_input()
                if line.strip("\n") == "detach":
                    self._need_read = True
                    break
                else:
                    self._outputs_since_last_input = []
                    await self._pending_send_to_actor.put((line + "\n").encode())
                    line = None
                    if break_after:
                        break
            elif message[0] == "write":
                output = message[1]
                # If the user sees this output but then detaches from the session,
                # it's useful to store all outputs since the last input so that
                # they can be printed again when the user re-attaches.
                self._outputs_since_last_input.append(output)
                if not suppress_output:
                    _debugger_output(output.payload.decode())

        if not suppress_output:
            print(
                f"Detaching from debug session for rank {self.rank} ({self.hostname})"
            )

    def get_info(self):
        """Return a ``DebugSessionInfo`` describing this session.

        ``function`` and ``lineno`` are None until a write carrying both
        has been received by ``debugger_write``.
        """
        function = lineno = None
        if self._function_lineno is not None:
            function, lineno = self._function_lineno
        return DebugSessionInfo(
            self.rank, self.coords, self.hostname, self.actor_id, function, lineno
        )

    async def attach(self, line=None, suppress_output=False):
        """Run the session's event loop until it detaches.

        Args:
            line: Optional single line to send instead of prompting the user.
            suppress_output: When true, no attach/detach notices or debugger
                output are printed.

        Any exception raised by the event loop propagates to the caller.
        """
        self._active = True
        try:
            if not suppress_output:
                print(
                    f"Attached to debug session for rank {self.rank} ({self.hostname})"
                )
            self._task = asyncio.create_task(self._event_loop(line, suppress_output))
            await self._task
            if not suppress_output:
                print(
                    f"Detached from debug session for rank {self.rank} ({self.hostname})"
                )
        finally:
            # Always clear the flag, even when the event loop raised or was
            # cancelled. Otherwise detach() would later enqueue a "detach"
            # that nothing consumes, and the next attach would exit at once.
            self._active = False

    async def detach(self):
        """Ask a running event loop to stop; a no-op when not attached."""
        if self._active:
            await self._message_queue.put("detach")

    async def debugger_read(self, size: int) -> DebuggerWrite:
        """Request input for the remote debugger and wait for it.

        Enqueues a "read" message, waits for the next pending input line,
        truncates it to at most *size* bytes and returns it wrapped in a
        ``DebuggerWrite``.
        """
        await self._message_queue.put("read")
        input_data = await self._pending_send_to_actor.get()
        if len(input_data) > size:
            input_data = input_data[:size]
        return DebuggerWrite(input_data, None, None)

    async def debugger_write(self, write: DebuggerWrite) -> None:
        """Record output coming from the remote debugger.

        Remembers the write's function/line number when both are present
        and enqueues the write for the event loop.
        """
        if write.function is not None and write.lineno is not None:
            self._function_lineno = (write.function, write.lineno)
        await self._message_queue.put(("write", write))
|
151
|
-
|
152
|
-
|
153
|
-
class DebugCommand:
|
154
|
-
@staticmethod
|
155
|
-
def parse(line: str) -> Union["DebugCommand", None]:
|
156
|
-
parts = line.strip("\n").split(" ")
|
157
|
-
if len(parts) == 0:
|
158
|
-
return None
|
159
|
-
command = parts[0]
|
160
|
-
match command:
|
161
|
-
case "attach":
|
162
|
-
return Attach._parse(parts)
|
163
|
-
case "list":
|
164
|
-
return ListCommand()
|
165
|
-
case "quit":
|
166
|
-
return Quit()
|
167
|
-
case "cast":
|
168
|
-
return Cast._parse(parts)
|
169
|
-
case "help":
|
170
|
-
return Help()
|
171
|
-
case "continue":
|
172
|
-
return Continue()
|
173
|
-
case _:
|
174
|
-
print(
|
175
|
-
f"Unknown command {command}. Expected: attach | list | quit | cast | continue | help"
|
176
|
-
)
|
177
|
-
return None
|
178
|
-
|
179
|
-
|
180
|
-
@dataclass
class Attach(DebugCommand):
    """``attach <rank>``: attach to the debug session of a single rank."""

    # Rank whose session should be attached to.
    rank: int

    @classmethod
    def _parse(cls, parts: List[str]) -> "Attach":
        """Build an ``Attach`` from the space-split command line.

        Args:
            parts: ``["attach", "<rank>"]``.

        Raises:
            ValueError: If there is not exactly one argument or the rank is
                not an integer.
        """
        if len(parts) != 2:
            raise ValueError("Invalid attach command. Expected: attach <rank>")
        try:
            rank = int(parts[1])
        except ValueError as err:
            # Chain explicitly so the traceback shows the int() failure as
            # the cause and not as an error during handling.
            raise ValueError(f"Invalid rank {parts[1]}. Expected: int") from err
        return cls(rank)
|
193
|
-
|
194
|
-
|
195
|
-
class ListCommand(DebugCommand):
    """Marker for the ``list`` command (list all debug sessions)."""
|
197
|
-
|
198
|
-
|
199
|
-
class Quit(DebugCommand):
    """Marker for the ``quit`` command (exit the debugger, sessions stay in place)."""
|
201
|
-
|
202
|
-
|
203
|
-
class Help(DebugCommand):
    """Marker for the ``help`` command (print the command summary)."""
|
205
|
-
|
206
|
-
|
207
|
-
class Continue(DebugCommand):
    """Marker for the ``continue`` command (continue all ranks, then exit)."""
|
209
|
-
|
210
|
-
|
211
|
-
@dataclass
class Cast(DebugCommand):
    """``cast {<r0,r1,...> | *} <command>``: send a command to several ranks."""

    # Target ranks, or None meaning "all ranks" (the ``*`` form).
    ranks: List[int] | None
    # Debugger command to send; the words after the rank list joined by " ".
    command: str

    @classmethod
    def _parse(cls, parts: List[str]) -> "Cast":
        """Build a ``Cast`` from the space-split command line.

        Args:
            parts: ``["cast", "<ranks>", "<command word>", ...]``.

        Raises:
            ValueError: If the command is too short, the rank list is
                empty, or a rank is not an integer.
        """
        if len(parts) < 3:
            raise ValueError(
                "Invalid cast command. Expected: cast {<r0,r1,...> | *} <command>"
            )
        str_ranks = parts[1]
        command = " ".join(parts[2:])
        if str_ranks == "*":
            return cls(None, command)
        if not str_ranks:
            # Must be checked before splitting: "".split(",") yields [""],
            # so a length check on the split result can never fire.
            raise ValueError(
                "Invalid rank list for cast. Expected at least one rank."
            )
        ranks = []
        for rank in str_ranks.split(","):
            try:
                ranks.append(int(rank))
            except ValueError as err:
                raise ValueError(f"Invalid rank {rank}. Expected: int") from err
        return cls(ranks, command)
|
239
|
-
|
240
|
-
|
241
|
-
class DebugClient(Actor):
|
242
|
-
"""
|
243
|
-
Single actor for both remote debuggers and users to talk to.
|
244
|
-
|
245
|
-
Handles multiple sessions simultanesouly
|
246
|
-
"""
|
247
|
-
|
248
|
-
def __init__(self) -> None:
|
249
|
-
self.sessions = {} # rank -> DebugSession
|
250
|
-
|
251
|
-
@endpoint
|
252
|
-
async def wait_pending_session(self):
|
253
|
-
while len(self.sessions) == 0:
|
254
|
-
await asyncio.sleep(1)
|
255
|
-
|
256
|
-
@endpoint
|
257
|
-
async def list(self) -> List[Tuple[int, Dict[str, int], str, ActorId, str, int]]:
|
258
|
-
table_data = []
|
259
|
-
for _, session in self.sessions.items():
|
260
|
-
info = session.get_info()
|
261
|
-
table_data.append(
|
262
|
-
(
|
263
|
-
info.rank,
|
264
|
-
info.coords,
|
265
|
-
info.hostname,
|
266
|
-
info.actor_id,
|
267
|
-
info.function,
|
268
|
-
info.lineno,
|
269
|
-
)
|
270
|
-
)
|
271
|
-
table_data = sorted(table_data, key=lambda r: r[0])
|
272
|
-
|
273
|
-
headers = ["Rank", "Coords", "Hostname", "Actor ID", "Function", "Line No."]
|
274
|
-
print(tabulate(table_data, headers=headers, tablefmt="grid"))
|
275
|
-
|
276
|
-
return table_data
|
277
|
-
|
278
|
-
@endpoint
|
279
|
-
async def enter(self) -> None:
|
280
|
-
# pyre-ignore
|
281
|
-
await getattr(self, "list")._method(self) # noqa
|
282
|
-
|
283
|
-
while True:
|
284
|
-
try:
|
285
|
-
user_input = await _debugger_input("monarch_dbg> ")
|
286
|
-
command = DebugCommand.parse(user_input)
|
287
|
-
if isinstance(command, Help):
|
288
|
-
print("monarch_dbg commands:")
|
289
|
-
print("\tattach <rank> - attach to a debug session")
|
290
|
-
print("\tlist - list all debug sessions")
|
291
|
-
print("\tquit - exit the debugger, leaving all sessions in place")
|
292
|
-
print(
|
293
|
-
"\tcast {<r0,r1,...> | *} <command> - send a command to a comma-separated list of ranks, or all ranks"
|
294
|
-
)
|
295
|
-
print(
|
296
|
-
"\tcontinue - tell all ranks to continue execution, then exit the debugger"
|
297
|
-
)
|
298
|
-
print("\thelp - print this help message")
|
299
|
-
elif isinstance(command, Attach):
|
300
|
-
if command.rank not in self.sessions:
|
301
|
-
print(f"No debug session for rank {command.rank}")
|
302
|
-
else:
|
303
|
-
await self.sessions[command.rank].attach()
|
304
|
-
elif isinstance(command, ListCommand):
|
305
|
-
await getattr(self, "list")._method(self) # noqa
|
306
|
-
elif isinstance(command, Continue):
|
307
|
-
# Make sure all ranks have exited their debug sessions.
|
308
|
-
# If we sent "quit", it would raise BdbQuit, crashing
|
309
|
-
# the process, which probably isn't what we want.
|
310
|
-
while len(self.sessions) > 0:
|
311
|
-
tasks = []
|
312
|
-
for rank in self.sessions:
|
313
|
-
tasks.append(
|
314
|
-
self.sessions[rank].attach("c", suppress_output=True)
|
315
|
-
)
|
316
|
-
await asyncio.gather(*tasks)
|
317
|
-
return
|
318
|
-
elif isinstance(command, Quit):
|
319
|
-
return
|
320
|
-
elif isinstance(command, Cast):
|
321
|
-
if command.ranks is None:
|
322
|
-
ranks = self.sessions.keys()
|
323
|
-
else:
|
324
|
-
ranks = command.ranks
|
325
|
-
tasks = []
|
326
|
-
for rank in ranks:
|
327
|
-
if rank in self.sessions:
|
328
|
-
tasks.append(
|
329
|
-
self.sessions[rank].attach(
|
330
|
-
command.command,
|
331
|
-
suppress_output=True,
|
332
|
-
)
|
333
|
-
)
|
334
|
-
else:
|
335
|
-
print(f"No debug session for rank {rank}")
|
336
|
-
await asyncio.gather(*tasks)
|
337
|
-
except Exception as e:
|
338
|
-
print(f"Error processing command: {e}")
|
339
|
-
|
340
|
-
##########################################################################
|
341
|
-
# Debugger APIs
|
342
|
-
#
|
343
|
-
# These endpoints are called by the remote debuggers to establish sessions
|
344
|
-
# and communicate with them.
|
345
|
-
@endpoint
|
346
|
-
async def debugger_session_start(
|
347
|
-
self, rank: int, coords: Dict[str, int], hostname: str, actor_id: ActorId
|
348
|
-
) -> None:
|
349
|
-
# Create a session if it doesn't exist
|
350
|
-
if rank not in self.sessions:
|
351
|
-
self.sessions[rank] = DebugSession(rank, coords, hostname, actor_id)
|
352
|
-
|
353
|
-
@endpoint
|
354
|
-
async def debugger_session_end(self, rank: int) -> None:
|
355
|
-
"""Detach from the current debug session."""
|
356
|
-
session = self.sessions.pop(rank)
|
357
|
-
await session.detach()
|
358
|
-
|
359
|
-
@endpoint
|
360
|
-
async def debugger_read(self, rank: int, size: int) -> DebuggerWrite | str:
|
361
|
-
"""Read from the debug session for the given rank."""
|
362
|
-
session = self.sessions[rank]
|
363
|
-
|
364
|
-
return await session.debugger_read(size)
|
365
|
-
|
366
|
-
@endpoint
|
367
|
-
async def debugger_write(self, rank: int, write: DebuggerWrite) -> None:
|
368
|
-
"""Write to the debug session for the given rank."""
|
369
|
-
session = self.sessions[rank]
|
370
|
-
await session.debugger_write(write)
|
371
|
-
|
372
|
-
|
373
|
-
async def init_debugging(
    actor_mesh: ActorMeshRef,
) -> ActorMeshRef[DebugClient]:
    """Spawn a ``DebugClient`` and register it with *actor_mesh*.

    Creates a local proc mesh (``gpus=1, hosts=1``), spawns a
    ``DebugClient`` named "debug_client" on it, hands the resulting mesh to
    ``actor_mesh._set_debug_client`` and returns it.
    """
    proc_mesh = await local_proc_mesh(gpus=1, hosts=1)
    client_mesh = await proc_mesh.spawn("debug_client", DebugClient)
    await actor_mesh._set_debug_client.call(client_mesh)
    return client_mesh
|
monarch/future.py
DELETED
@@ -1,76 +0,0 @@
|
|
1
|
-
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
2
|
-
# All rights reserved.
|
3
|
-
#
|
4
|
-
# This source code is licensed under the BSD-style license found in the
|
5
|
-
# LICENSE file in the root directory of this source tree.
|
6
|
-
|
7
|
-
import asyncio
|
8
|
-
from functools import partial
|
9
|
-
from typing import Generator, Generic, Optional, TypeVar
|
10
|
-
|
11
|
-
R = TypeVar("R")
|
12
|
-
|
13
|
-
|
14
|
-
def _incomplete(impl, self):
    """Resolve *self* by calling the blocking *impl* once.

    The value returned by ``impl()`` is stored through ``self._set_result``
    and returned. If an ``Exception`` is raised along the way it is stored
    through ``self._set_exception`` and then re-raised.
    """
    try:
        value = impl()
        return self._set_result(value)
    except Exception as err:
        self._set_exception(err)
        raise
|
20
|
-
|
21
|
-
|
22
|
-
async def _aincomplete(impl, self):
    """Resolve *self* by awaiting ``impl()`` once.

    The awaited value is stored through ``self._set_result`` and returned.
    If an ``Exception`` is raised along the way it is stored through
    ``self._set_exception`` and then re-raised.
    """
    try:
        value = await impl()
        return self._set_result(value)
    except Exception as err:
        self._set_exception(err)
        raise
|
28
|
-
|
29
|
-
|
30
|
-
# TODO: consolidate with monarch.common.future
|
31
|
-
class ActorFuture(Generic[R]):
    """A result that can be obtained either by blocking or by awaiting.

    Args:
        impl: Zero-argument callable returning an awaitable that produces
            the result.
        blocking_impl: Optional zero-argument callable producing the result
            synchronously. When omitted, ``impl()`` is run with
            ``asyncio.run``.

    Once the result (or an exception) has been produced through either
    path, it is cached: later ``get``/``await`` calls return the same value
    or raise the same exception without running the implementation again.
    """

    def __init__(self, impl, blocking_impl=None):
        if blocking_impl is None:
            # Create the coroutine lazily, at the time get() is called.
            # Building it here (e.g. partial(asyncio.run, impl())) would
            # leave a never-awaited coroutine behind whenever the future is
            # only awaited or read with a timeout.
            def blocking_impl():
                return asyncio.run(impl())

        self._get = partial(_incomplete, blocking_impl)
        self._aget = partial(_aincomplete, impl)

    def get(self, timeout: Optional[float] = None) -> R:
        """Block until the result is available and return it.

        With a *timeout*, the async implementation is run under
        ``asyncio.wait_for``; otherwise the blocking implementation is used.
        """
        if timeout is not None:
            return asyncio.run(asyncio.wait_for(self._aget(self), timeout))
        return self._get(self)

    def __await__(self) -> Generator[R, None, R]:
        return self._aget(self).__await__()

    def _set_result(self, result):
        """Cache *result* so later reads return it directly; returns it."""

        def f(self):
            return result

        async def af(self):
            return result

        self._get, self._aget = f, af
        return result

    def _set_exception(self, e):
        """Cache *e* so later reads raise it directly."""

        def f(self):
            raise e

        async def af(self):
            raise e

        self._get, self._aget = f, af

    # compatibility with old tensor engine Future objects
    # hopefully we do not need done(), add_callback because
    # they are harder to implement right.
    def result(self, timeout: Optional[float] = None) -> R:
        """Alias for ``get``."""
        return self.get(timeout)

    def exception(self, timeout: Optional[float] = None):
        """Return the exception raised while resolving, or None on success."""
        try:
            self.get(timeout)
            return None
        except Exception as e:
            return e
|
monarch/rdma.py
DELETED
@@ -1,162 +0,0 @@
|
|
1
|
-
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
2
|
-
# All rights reserved.
|
3
|
-
#
|
4
|
-
# This source code is licensed under the BSD-style license found in the
|
5
|
-
# LICENSE file in the root directory of this source tree.
|
6
|
-
|
7
|
-
import ctypes
|
8
|
-
|
9
|
-
from dataclasses import dataclass
|
10
|
-
from typing import cast, Dict, Optional, Tuple
|
11
|
-
|
12
|
-
import torch
|
13
|
-
|
14
|
-
from monarch._rust_bindings.monarch_hyperactor.proc import ActorId
|
15
|
-
|
16
|
-
from monarch.actor_mesh import (
|
17
|
-
_ActorMeshRefImpl,
|
18
|
-
Actor,
|
19
|
-
ActorMeshRef,
|
20
|
-
endpoint,
|
21
|
-
MonarchContext,
|
22
|
-
)
|
23
|
-
|
24
|
-
|
25
|
-
@dataclass
class LocalRDMARecord:
    """Entry stored in ``_local_buffers`` for a locally registered buffer."""

    # The tensor backing the buffer.
    data: torch.Tensor
|
28
|
-
|
29
|
-
|
30
|
-
_local_buffers: Dict[int, "LocalRDMARecord"] = {}
|
31
|
-
|
32
|
-
|
33
|
-
def _get_bytes(storage: torch.Tensor, offset: int, size: int) -> bytearray:
    """Extracts a bytearray from a 1D, 1byte per item tensor.

    Args:
        storage: Source tensor; assumed to be 1D, contiguous and one byte
            per element (callers validate this).
        offset: Index of the first element to copy, relative to the start
            of ``storage``.
        size: Number of bytes to copy.

    Returns:
        A new ``bytearray`` holding ``storage[offset : offset + size]``.

    Raises:
        ValueError: If ``offset`` or ``size`` is negative or the requested
            range extends past the end of the tensor.
    """
    if offset < 0 or size < 0:
        raise ValueError(f"Read out of range: offset={offset}, size={size}")
    if offset + size > storage.numel():
        raise ValueError(f"Read out of range: {offset + size} > {storage.size()}")
    if size == 0:
        # torch.frombuffer rejects empty buffers; nothing to copy anyway.
        return bytearray()
    if storage.device.type != "cpu":
        result = bytearray(size)
        result_tensor = torch.frombuffer(
            result,
            dtype=torch.uint8,
        )
        # Slice to exactly `size` elements so the copy shapes match.
        result_tensor.copy_(storage[offset : offset + size])
    else:
        # data_ptr() addresses element 0 of the tensor; elements are one
        # byte wide, so the element offset is also the byte offset.
        addr = storage.data_ptr() + offset
        ctypes_array = (ctypes.c_byte * size).from_address(addr)
        result = bytearray(ctypes_array)
    return result
|
50
|
-
|
51
|
-
|
52
|
-
class RDMAManager(Actor):
    """Actor serving reads and writes on buffers registered in ``_local_buffers``."""

    @staticmethod
    def on_proc(proc_id: str) -> "RDMAManager":
        """Return a reference to the ``rdma_manager[0]`` actor on *proc_id*.

        The reference is built from the current ``MonarchContext`` mailbox.
        """
        mailbox = MonarchContext.get().mailbox
        manager_id = ActorId.from_string(f"{proc_id}.rdma_manager[0]")
        mesh_ref = ActorMeshRef(
            RDMAManager,
            _ActorMeshRefImpl.from_actor_id(mailbox, manager_id),
            mailbox,
        )
        return cast(RDMAManager, mesh_ref)

    @endpoint
    async def drop(self, addr: int) -> None:
        """Forget the buffer registered under *addr*, if there is one."""
        _local_buffers.pop(addr, None)

    @endpoint
    async def fetch(self, addr: int, offset: int, nbytes: int) -> bytearray:
        """Return *nbytes* bytes of buffer *addr* starting at *offset*.

        Raises:
            ValueError: If no buffer is registered under *addr*.
        """
        record = _local_buffers.get(addr)
        if record is None:
            raise ValueError(f"Unknown buffer {addr}")
        return _get_bytes(record.data, offset, nbytes)

    @endpoint
    async def put(self, addr: int, offset: int, bytes: bytearray) -> None:
        """Copy *bytes* into buffer *addr* starting at *offset*.

        Raises:
            ValueError: If no buffer is registered under *addr*.
        """
        record = _local_buffers.get(addr)
        if record is None:
            raise ValueError(f"Unknown buffer {addr}")
        target = record.data
        incoming = torch.frombuffer(bytes, dtype=target.dtype)
        target[offset : offset + len(bytes)] = incoming
|
88
|
-
|
89
|
-
|
90
|
-
def _assert_tensor_is_1d_contiguous_uint8(t: torch.Tensor) -> None:
    """Validate that *t* is a 1D, uint8, contiguous tensor.

    The checks run in that order and the first failure is reported.

    Raises:
        ValueError: Describing the first property that does not hold.
    """
    problem = None
    if t.ndim != 1:
        problem = f"Tensor must be 1D, got {t.ndim}D"
    elif t.dtype != torch.uint8:
        problem = f"Tensor must be uint8, got {t.dtype}"
    elif not t.is_contiguous():
        problem = "Tensor must be contiguous"
    if problem is not None:
        raise ValueError(problem)
|
97
|
-
|
98
|
-
|
99
|
-
class RDMABuffer:
    """Handle to a byte buffer that can be read and written from other procs.

    The handle pickles to ``(addr, begin, end, proc_id)``; reads and writes
    are forwarded to the ``RDMAManager`` actor on the owning proc.
    """

    def __init__(self, data: torch.Tensor) -> None:
        """
        RDMABuffer only supports 1D contiguous tensors that are 1 byte per item.

        To create a 1 byte, 1D view, use t.view(torch.uint8).flatten()

        TODO: Create TensorBuffer, which will be main user API supporting non-contiguous, multi-byte-per-element tensors
        """
        _assert_tensor_is_1d_contiguous_uint8(data)
        # The buffer is keyed by the storage address, so the tensor must
        # start at the beginning of its storage.
        assert data.storage_offset() == 0, "Tensor must start at storage offset 0"
        storage = data.untyped_storage()
        self.addr: int = storage.data_ptr()
        self.begin = 0
        self.end: int = storage.size()
        self.proc_id: str = MonarchContext.get().proc_id
        self.local_data: object = None
        _local_buffers[self.addr] = LocalRDMARecord(data)

    def drop(self) -> None:
        """Release the buffer registration.

        Without a ``proc_id`` the local registry entry is removed directly;
        otherwise a ``drop`` message is cast to the owning proc's manager.
        """
        if self.proc_id is None:
            # pop() keeps a repeated drop harmless, matching the behavior
            # of RDMAManager.drop.
            _local_buffers.pop(self.addr, None)
            return
        rdma_actor = RDMAManager.on_proc(self.proc_id)
        # pyre-ignore[16]: Undefined attribute [16]: `Endpoint` has no attribute `cast`.
        rdma_actor.drop.cast(self.addr)

    def __getstate__(self) -> Tuple[int, int, int, Optional[str]]:
        proc_id = self.proc_id
        # locally created RDMABuffer being sent remotely,
        # record its proc_id so we know how to establish connections to it
        if proc_id is None:
            proc_id = MonarchContext.get().proc_id
        return (self.addr, self.begin, self.end, proc_id)

    def __setstate__(self, state: Tuple[int, int, int, str]) -> None:
        self.local_data = None
        self.addr, self.begin, self.end, self.proc_id = state

    async def read_into(self, dst: torch.Tensor, offset: int = 0) -> None:
        """
        Read data from the RDMABuffer into a destination tensor.

        The destination tensor must be contiguous and 1 byte per item.
        ``dst.numel()`` bytes are fetched starting at ``offset`` in the buffer.
        """
        _assert_tensor_is_1d_contiguous_uint8(dst)
        payload = await RDMAManager.on_proc(self.proc_id).fetch.call_one(
            self.addr, offset, dst.numel()
        )
        dst.copy_(torch.frombuffer(payload, dtype=torch.uint8))

    async def write(self, src: torch.Tensor, offset: int = 0) -> None:
        """
        Write data from a source tensor into the RDMABuffer.

        The source tensor must be contiguous and 1 byte per item.
        All of ``src`` is written starting at ``offset`` in the buffer.
        """
        _assert_tensor_is_1d_contiguous_uint8(src)
        # _get_bytes takes an offset relative to the tensor it is given, and
        # src.data_ptr() already accounts for the storage offset. Passing
        # src.storage_offset() here made every view with a nonzero storage
        # offset fail the range check.
        payload = _get_bytes(src, 0, src.numel())
        await RDMAManager.on_proc(self.proc_id).put.call_one(self.addr, offset, payload)
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/top_level.txt
RENAMED
File without changes
|