torchmonarch-nightly 2025.6.9__cp310-cp310-manylinux2014_x86_64.whl → 2025.6.11__cp310-cp310-manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- monarch/_rust_bindings.so +0 -0
- monarch/actor_mesh.py +35 -6
- monarch/allocator.py +67 -0
- monarch/bootstrap_main.py +1 -1
- monarch/mesh_controller.py +209 -0
- monarch/monarch_controller +0 -0
- tests/test_allocator.py +216 -0
- tests/test_python_actors.py +24 -0
- tests/test_rust_backend.py +35 -0
- {torchmonarch_nightly-2025.6.9.dist-info → torchmonarch_nightly-2025.6.11.dist-info}/METADATA +1 -1
- {torchmonarch_nightly-2025.6.9.dist-info → torchmonarch_nightly-2025.6.11.dist-info}/RECORD +15 -13
- {torchmonarch_nightly-2025.6.9.dist-info → torchmonarch_nightly-2025.6.11.dist-info}/WHEEL +0 -0
- {torchmonarch_nightly-2025.6.9.dist-info → torchmonarch_nightly-2025.6.11.dist-info}/entry_points.txt +0 -0
- {torchmonarch_nightly-2025.6.9.dist-info → torchmonarch_nightly-2025.6.11.dist-info}/licenses/LICENSE +0 -0
- {torchmonarch_nightly-2025.6.9.dist-info → torchmonarch_nightly-2025.6.11.dist-info}/top_level.txt +0 -0
monarch/_rust_bindings.so
CHANGED
Binary file
|
monarch/actor_mesh.py
CHANGED
@@ -7,6 +7,7 @@
|
|
7
7
|
import asyncio
|
8
8
|
import collections
|
9
9
|
import contextvars
|
10
|
+
import functools
|
10
11
|
import inspect
|
11
12
|
|
12
13
|
import itertools
|
@@ -38,6 +39,7 @@ from typing import (
|
|
38
39
|
|
39
40
|
import monarch
|
40
41
|
from monarch import ActorFuture as Future
|
42
|
+
from monarch._rust_bindings.hyperactor_extension.telemetry import enter_span, exit_span
|
41
43
|
|
42
44
|
from monarch._rust_bindings.monarch_hyperactor.actor import PanicFlag, PythonMessage
|
43
45
|
from monarch._rust_bindings.monarch_hyperactor.actor_mesh import PythonActorMesh
|
@@ -49,6 +51,7 @@ from monarch._rust_bindings.monarch_hyperactor.mailbox import (
|
|
49
51
|
)
|
50
52
|
from monarch._rust_bindings.monarch_hyperactor.proc import ActorId
|
51
53
|
from monarch._rust_bindings.monarch_hyperactor.shape import Point as HyPoint, Shape
|
54
|
+
|
52
55
|
from monarch.common.pickle_flatten import flatten, unflatten
|
53
56
|
from monarch.common.shape import MeshTrait, NDSlice
|
54
57
|
|
@@ -83,7 +86,7 @@ class MonarchContext:
|
|
83
86
|
|
84
87
|
|
85
88
|
_context: contextvars.ContextVar[MonarchContext] = contextvars.ContextVar(
|
86
|
-
"monarch.
|
89
|
+
"monarch.actor_mesh._context"
|
87
90
|
)
|
88
91
|
|
89
92
|
|
@@ -492,13 +495,29 @@ class _Actor:
|
|
492
495
|
return None
|
493
496
|
else:
|
494
497
|
the_method = getattr(self.instance, message.method)._method
|
495
|
-
|
498
|
+
|
496
499
|
if not inspect.iscoroutinefunction(the_method):
|
500
|
+
enter_span(
|
501
|
+
the_method.__module__, message.method, str(ctx.mailbox.actor_id)
|
502
|
+
)
|
503
|
+
result = the_method(self.instance, *args, **kwargs)
|
504
|
+
exit_span()
|
497
505
|
if port is not None:
|
498
506
|
port.send("result", result)
|
499
507
|
return None
|
500
508
|
|
501
|
-
|
509
|
+
async def instrumented():
|
510
|
+
enter_span(
|
511
|
+
the_method.__module__, message.method, str(ctx.mailbox.actor_id)
|
512
|
+
)
|
513
|
+
result = await the_method(self.instance, *args, **kwargs)
|
514
|
+
exit_span()
|
515
|
+
return result
|
516
|
+
|
517
|
+
return self.run_async(
|
518
|
+
ctx,
|
519
|
+
self.run_task(port, instrumented(), panic_flag),
|
520
|
+
)
|
502
521
|
except Exception as e:
|
503
522
|
traceback.print_exc()
|
504
523
|
s = ActorError(e)
|
@@ -510,7 +529,11 @@ class _Actor:
|
|
510
529
|
else:
|
511
530
|
raise s from None
|
512
531
|
|
513
|
-
async def run_async(
|
532
|
+
async def run_async(
|
533
|
+
self,
|
534
|
+
ctx: MonarchContext,
|
535
|
+
coroutine: Coroutine[Any, None, Any],
|
536
|
+
) -> None:
|
514
537
|
_context.set(ctx)
|
515
538
|
if self.complete_task is None:
|
516
539
|
self.complete_task = asyncio.create_task(self._complete())
|
@@ -564,6 +587,12 @@ def _unpickle(data: bytes, mailbox: Mailbox) -> Any:
|
|
564
587
|
|
565
588
|
|
566
589
|
class Actor(MeshTrait):
|
590
|
+
@functools.cached_property
|
591
|
+
def logger(cls) -> logging.Logger:
|
592
|
+
lgr = logging.getLogger(cls.__class__.__name__)
|
593
|
+
lgr.setLevel(logging.DEBUG)
|
594
|
+
return lgr
|
595
|
+
|
567
596
|
@property
|
568
597
|
def _ndslice(self) -> NDSlice:
|
569
598
|
raise NotImplementedError(
|
@@ -677,7 +706,7 @@ class ActorError(Exception):
|
|
677
706
|
def __init__(
|
678
707
|
self,
|
679
708
|
exception: Exception,
|
680
|
-
message: str = "A remote
|
709
|
+
message: str = "A remote actor call has failed asynchronously.",
|
681
710
|
) -> None:
|
682
711
|
self.exception = exception
|
683
712
|
self.actor_mesh_ref_frames: StackSummary = extract_tb(exception.__traceback__)
|
@@ -688,7 +717,7 @@ class ActorError(Exception):
|
|
688
717
|
actor_mesh_ref_tb = "".join(traceback.format_list(self.actor_mesh_ref_frames))
|
689
718
|
return (
|
690
719
|
f"{self.message}\n"
|
691
|
-
f"Traceback of where the
|
720
|
+
f"Traceback of where the remote call failed (most recent call last):\n{actor_mesh_ref_tb}{type(self.exception).__name__}: {exe}"
|
692
721
|
)
|
693
722
|
|
694
723
|
|
monarch/allocator.py
CHANGED
@@ -4,6 +4,9 @@
|
|
4
4
|
# This source code is licensed under the BSD-style license found in the
|
5
5
|
# LICENSE file in the root directory of this source tree.
|
6
6
|
|
7
|
+
# pyre-strict
|
8
|
+
|
9
|
+
import abc
|
7
10
|
from typing import final
|
8
11
|
|
9
12
|
from monarch import ActorFuture as Future
|
@@ -15,6 +18,7 @@ from monarch._rust_bindings.hyperactor_extension.alloc import ( # @manual=//mon
|
|
15
18
|
from monarch._rust_bindings.monarch_hyperactor.alloc import ( # @manual=//monarch/monarch_extension:monarch_extension
|
16
19
|
LocalAllocatorBase,
|
17
20
|
ProcessAllocatorBase,
|
21
|
+
RemoteAllocatorBase,
|
18
22
|
)
|
19
23
|
|
20
24
|
|
@@ -60,3 +64,66 @@ class LocalAllocator(LocalAllocatorBase):
|
|
60
64
|
lambda: self.allocate_nonblocking(spec),
|
61
65
|
lambda: self.allocate_blocking(spec),
|
62
66
|
)
|
67
|
+
|
68
|
+
|
69
|
+
class RemoteAllocInitializer(abc.ABC):
|
70
|
+
"""Subclass-able Python interface for `hyperactor_mesh::alloc::remoteprocess:RemoteProcessAllocInitializer`.
|
71
|
+
|
72
|
+
NOTE: changes to method signatures of this class must be made to the call-site at
|
73
|
+
`PyRemoteProcessAllocInitializer.py_initialize_alloc()` in `monarch/monarch_hyperactor/src/alloc.rs`
|
74
|
+
"""
|
75
|
+
|
76
|
+
@abc.abstractmethod
|
77
|
+
async def initialize_alloc(self) -> list[str]:
|
78
|
+
"""
|
79
|
+
Return the addresses of the servers that should be used to allocate processes
|
80
|
+
for the proc mesh. The addresses should be running hyperactor's RemoteProcessAllocator.
|
81
|
+
|
82
|
+
Each address is of the form `{transport}!{addr}(:{port})`.
|
83
|
+
This is the string form of `hyperactor::channel::ChannelAddr` (Rust).
|
84
|
+
For example, `tcp!127.0.0.1:1234`.
|
85
|
+
|
86
|
+
NOTE: Currently, all the addresses must have the same transport type and port
|
87
|
+
NOTE: Although this method is currently called once at the initialization of the Allocator,
|
88
|
+
in the future this method can be called multiple times and should return the current set of
|
89
|
+
addresses that are eligible to handle allocation requests.
|
90
|
+
|
91
|
+
"""
|
92
|
+
...
|
93
|
+
|
94
|
+
|
95
|
+
class StaticRemoteAllocInitializer(RemoteAllocInitializer):
|
96
|
+
"""
|
97
|
+
Returns the static list of server addresses that this initializer
|
98
|
+
was constructed with on each `initialize_alloc()` call.
|
99
|
+
"""
|
100
|
+
|
101
|
+
def __init__(self, *addrs: str) -> None:
|
102
|
+
super().__init__()
|
103
|
+
self.addrs: list[str] = list(addrs)
|
104
|
+
|
105
|
+
async def initialize_alloc(self) -> list[str]:
|
106
|
+
return list(self.addrs)
|
107
|
+
|
108
|
+
|
109
|
+
@final
|
110
|
+
class RemoteAllocator(RemoteAllocatorBase):
|
111
|
+
"""
|
112
|
+
An allocator that allocates by spawning actors on a remote host.
|
113
|
+
The remote host must be running hyperactor's remote-process-allocator.
|
114
|
+
"""
|
115
|
+
|
116
|
+
def allocate(self, spec: AllocSpec) -> Future[Alloc]:
|
117
|
+
"""
|
118
|
+
Allocate a process according to the provided spec.
|
119
|
+
|
120
|
+
Arguments:
|
121
|
+
- `spec`: The spec to allocate according to.
|
122
|
+
|
123
|
+
Returns:
|
124
|
+
- A future that will be fulfilled when the requested allocation is fulfilled.
|
125
|
+
"""
|
126
|
+
return Future(
|
127
|
+
lambda: self.allocate_nonblocking(spec),
|
128
|
+
lambda: self.allocate_blocking(spec),
|
129
|
+
)
|
monarch/bootstrap_main.py
CHANGED
@@ -58,7 +58,7 @@ def invoke_main():
|
|
58
58
|
|
59
59
|
# forward logs to rust tracing. Defaults to on.
|
60
60
|
if os.environ.get("MONARCH_PYTHON_LOG_TRACING", "1") == "1":
|
61
|
-
logging.root.addHandler(TracingForwarder())
|
61
|
+
logging.root.addHandler(TracingForwarder(level=logging.DEBUG))
|
62
62
|
|
63
63
|
try:
|
64
64
|
with (
|
monarch/mesh_controller.py
ADDED
@@ -0,0 +1,209 @@
|
|
1
|
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
2
|
+
# All rights reserved.
|
3
|
+
#
|
4
|
+
# This source code is licensed under the BSD-style license found in the
|
5
|
+
# LICENSE file in the root directory of this source tree.
|
6
|
+
|
7
|
+
import logging
|
8
|
+
import traceback
|
9
|
+
from collections import deque
|
10
|
+
from logging import Logger
|
11
|
+
from typing import List, NamedTuple, Optional, Union
|
12
|
+
|
13
|
+
import torch.utils._python_dispatch
|
14
|
+
|
15
|
+
from monarch import NDSlice
|
16
|
+
from monarch._rust_bindings.monarch_extension import client, debugger
|
17
|
+
from monarch._rust_bindings.monarch_extension.client import ( # @manual=//monarch/monarch_extension:monarch_extension
|
18
|
+
WorldState,
|
19
|
+
)
|
20
|
+
from monarch._rust_bindings.monarch_extension.mesh_controller import _Controller
|
21
|
+
from monarch._rust_bindings.monarch_hyperactor.proc import ( # @manual=//monarch/monarch_extension:monarch_extension
|
22
|
+
ActorId,
|
23
|
+
)
|
24
|
+
from monarch._rust_bindings.monarch_hyperactor.proc_mesh import ProcMesh as HyProcMesh
|
25
|
+
from monarch._rust_bindings.monarch_messages.debugger import DebuggerAction
|
26
|
+
from monarch.common.client import Client
|
27
|
+
from monarch.common.controller_api import LogMessage, MessageResult
|
28
|
+
from monarch.common.device_mesh import DeviceMesh, no_mesh
|
29
|
+
from monarch.common.invocation import DeviceException, RemoteException
|
30
|
+
from monarch.controller.debugger import read as debugger_read, write as debugger_write
|
31
|
+
from monarch.proc_mesh import ProcMesh
|
32
|
+
from pyre_extensions import none_throws
|
33
|
+
|
34
|
+
logger: Logger = logging.getLogger(__name__)
|
35
|
+
|
36
|
+
|
37
|
+
class Controller(_Controller):
|
38
|
+
def __init__(self, workers: HyProcMesh) -> None:
|
39
|
+
super().__init__()
|
40
|
+
# Buffer for messages unrelated to debugging that are received while a
|
41
|
+
# debugger session is active.
|
42
|
+
self._non_debugger_pending_messages: deque[
|
43
|
+
Optional[client.LogMessage | client.WorkerResponse]
|
44
|
+
] = deque()
|
45
|
+
self._pending_debugger_sessions: deque[ActorId] = deque()
|
46
|
+
|
47
|
+
def next_message(
|
48
|
+
self, timeout: Optional[float]
|
49
|
+
) -> Optional[LogMessage | MessageResult]:
|
50
|
+
if self._non_debugger_pending_messages:
|
51
|
+
msg = self._non_debugger_pending_messages.popleft()
|
52
|
+
else:
|
53
|
+
msg = self._get_next_message(timeout_msec=int((timeout or 0.0) * 1000.0))
|
54
|
+
if msg is None:
|
55
|
+
return None
|
56
|
+
|
57
|
+
if isinstance(msg, client.WorkerResponse):
|
58
|
+
return _worker_response_to_result(msg)
|
59
|
+
elif isinstance(msg, client.LogMessage):
|
60
|
+
return LogMessage(msg.level, msg.message)
|
61
|
+
elif isinstance(msg, client.DebuggerMessage):
|
62
|
+
self._run_debugger_loop(msg)
|
63
|
+
|
64
|
+
def send(
|
65
|
+
self,
|
66
|
+
ranks: Union[NDSlice, List[NDSlice]],
|
67
|
+
msg: NamedTuple,
|
68
|
+
) -> None:
|
69
|
+
with torch.utils._python_dispatch._disable_current_modes():
|
70
|
+
return super().send(ranks, msg)
|
71
|
+
|
72
|
+
def drain_and_stop(
|
73
|
+
self,
|
74
|
+
) -> List[LogMessage | MessageResult | client.DebuggerMessage]:
|
75
|
+
logger.info("rust controller shutting down")
|
76
|
+
results = []
|
77
|
+
for msg in self._drain_and_stop():
|
78
|
+
if isinstance(msg, client.WorkerResponse):
|
79
|
+
results.append(_worker_response_to_result(msg))
|
80
|
+
elif isinstance(msg, client.LogMessage):
|
81
|
+
results.append(LogMessage(msg.level, msg.message))
|
82
|
+
elif isinstance(msg, client.DebuggerMessage):
|
83
|
+
results.append(msg)
|
84
|
+
else:
|
85
|
+
raise RuntimeError(f"Unexpected message type {type(msg)}")
|
86
|
+
return results
|
87
|
+
|
88
|
+
def _run_debugger_loop(self, message: client.DebuggerMessage) -> None:
|
89
|
+
if not isinstance(message.action, DebuggerAction.Paused):
|
90
|
+
raise RuntimeError(
|
91
|
+
f"Unexpected debugger message {message} when no debugger session is running"
|
92
|
+
)
|
93
|
+
|
94
|
+
self._pending_debugger_sessions.append(message.debugger_actor_id)
|
95
|
+
while self._pending_debugger_sessions:
|
96
|
+
debugger_actor_id = self._pending_debugger_sessions.popleft()
|
97
|
+
rank = debugger_actor_id.rank
|
98
|
+
proc_id = debugger_actor_id.proc_id
|
99
|
+
debugger_write(
|
100
|
+
f"pdb attached to proc {proc_id} with rank {rank}, debugger actor {debugger_actor_id} \n"
|
101
|
+
)
|
102
|
+
|
103
|
+
self._debugger_attach(debugger_actor_id)
|
104
|
+
while True:
|
105
|
+
# TODO: Add appropriate timeout.
|
106
|
+
msg = self._get_next_message(timeout_msec=None)
|
107
|
+
|
108
|
+
if not isinstance(msg, client.DebuggerMessage):
|
109
|
+
self._non_debugger_pending_messages.append(msg)
|
110
|
+
continue
|
111
|
+
|
112
|
+
if msg.debugger_actor_id != debugger_actor_id:
|
113
|
+
if isinstance(msg.action, DebuggerAction.Paused):
|
114
|
+
self._pending_debugger_sessions.append(msg.debugger_actor_id)
|
115
|
+
continue
|
116
|
+
else:
|
117
|
+
raise RuntimeError(
|
118
|
+
f"unexpected debugger message {msg} from rank {msg.debugger_actor_id.rank} "
|
119
|
+
f"when debugging rank {debugger_actor_id.rank}"
|
120
|
+
)
|
121
|
+
|
122
|
+
action = msg.action
|
123
|
+
if isinstance(action, DebuggerAction.Detach):
|
124
|
+
break
|
125
|
+
elif isinstance(action, DebuggerAction.Read):
|
126
|
+
self._debugger_write(
|
127
|
+
debugger_actor_id, debugger_read(action.requested_size)
|
128
|
+
)
|
129
|
+
elif isinstance(action, DebuggerAction.Write):
|
130
|
+
debugger_write(
|
131
|
+
debugger.get_bytes_from_write_action(action).decode()
|
132
|
+
)
|
133
|
+
else:
|
134
|
+
raise RuntimeError(
|
135
|
+
f"unexpected debugger message {msg} when debugging rank {debugger_actor_id.rank}"
|
136
|
+
)
|
137
|
+
|
138
|
+
def worker_world_state(self) -> WorldState:
|
139
|
+
raise NotImplementedError("worker world state")
|
140
|
+
|
141
|
+
def stop_mesh(self):
|
142
|
+
# I think this is a noop?
|
143
|
+
|
144
|
+
pass
|
145
|
+
|
146
|
+
|
147
|
+
# TODO: Handling conversion of the response can move to a separate module over time
|
148
|
+
# especially as we have structured error messages.
|
149
|
+
def _worker_response_to_result(result: client.WorkerResponse) -> MessageResult:
|
150
|
+
if not result.is_exception():
|
151
|
+
# The result of the message needs to be unwrapped on a real device.
|
152
|
+
# Staying as a fake tensor will fail the tensor deserialization.
|
153
|
+
with no_mesh.activate():
|
154
|
+
return MessageResult(result.seq, result.result(), None)
|
155
|
+
exc = none_throws(result.exception())
|
156
|
+
if isinstance(exc, client.Error):
|
157
|
+
worker_frames = [
|
158
|
+
traceback.FrameSummary("<unknown>", None, frame)
|
159
|
+
for frame in exc.backtrace.split("\\n")
|
160
|
+
]
|
161
|
+
logger.error(f"Worker {exc.actor_id} failed")
|
162
|
+
return MessageResult(
|
163
|
+
seq=result.seq,
|
164
|
+
result=None,
|
165
|
+
error=RemoteException(
|
166
|
+
seq=exc.caused_by_seq,
|
167
|
+
exception=RuntimeError(exc.backtrace),
|
168
|
+
controller_frame_index=0, # TODO: T225205291 fix this once we have recording support in rust
|
169
|
+
controller_frames=None,
|
170
|
+
worker_frames=worker_frames,
|
171
|
+
source_actor_id=exc.actor_id,
|
172
|
+
message=f"Worker {exc.actor_id} failed",
|
173
|
+
),
|
174
|
+
)
|
175
|
+
elif isinstance(exc, client.Failure):
|
176
|
+
frames = [
|
177
|
+
traceback.FrameSummary("<unknown>", None, frame)
|
178
|
+
for frame in exc.backtrace.split("\n")
|
179
|
+
]
|
180
|
+
reason = f"Actor {exc.actor_id} crashed on {exc.address}, check the host log for details"
|
181
|
+
logger.error(reason)
|
182
|
+
return MessageResult(
|
183
|
+
seq=0, # seq is not consumed for DeviceException; it will be directly thrown by the client
|
184
|
+
result=None,
|
185
|
+
error=DeviceException(
|
186
|
+
exception=RuntimeError(reason),
|
187
|
+
frames=frames,
|
188
|
+
source_actor_id=exc.actor_id,
|
189
|
+
message=reason,
|
190
|
+
),
|
191
|
+
)
|
192
|
+
else:
|
193
|
+
raise RuntimeError(f"Unknown exception type: {type(exc)}")
|
194
|
+
|
195
|
+
|
196
|
+
def spawn_tensor_engine(proc_mesh: ProcMesh) -> DeviceMesh:
|
197
|
+
# This argument to Controller
|
198
|
+
# is currently only used for debug printing. It should be fixed to
|
199
|
+
# report the proc ID instead of the rank it currently does.
|
200
|
+
gpus = proc_mesh.sizes.get("gpus", 1)
|
201
|
+
backend_ctrl = Controller(proc_mesh._proc_mesh)
|
202
|
+
client = Client(backend_ctrl, proc_mesh.size(), gpus)
|
203
|
+
dm = DeviceMesh(
|
204
|
+
client,
|
205
|
+
NDSlice.new_row_major(list(proc_mesh.sizes.values())),
|
206
|
+
tuple(proc_mesh.sizes.keys()),
|
207
|
+
)
|
208
|
+
dm.exit = lambda: client.shutdown()
|
209
|
+
return dm
|
monarch/monarch_controller
CHANGED
Binary file
|
tests/test_allocator.py
ADDED
@@ -0,0 +1,216 @@
|
|
1
|
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
2
|
+
# All rights reserved.
|
3
|
+
#
|
4
|
+
# This source code is licensed under the BSD-style license found in the
|
5
|
+
# LICENSE file in the root directory of this source tree.
|
6
|
+
|
7
|
+
# pyre-strict
|
8
|
+
|
9
|
+
import contextlib
|
10
|
+
import importlib.resources
|
11
|
+
import math
|
12
|
+
import os
|
13
|
+
import subprocess
|
14
|
+
import sys
|
15
|
+
import unittest
|
16
|
+
from datetime import timedelta
|
17
|
+
from typing import Generator
|
18
|
+
|
19
|
+
import cloudpickle
|
20
|
+
|
21
|
+
import torch
|
22
|
+
import torch.distributed as dist
|
23
|
+
import torch.nn.functional as F
|
24
|
+
|
25
|
+
from monarch._rust_bindings.hyperactor_extension.alloc import (
|
26
|
+
AllocConstraints,
|
27
|
+
AllocSpec,
|
28
|
+
)
|
29
|
+
|
30
|
+
from monarch._rust_bindings.monarch_hyperactor.channel import (
|
31
|
+
ChannelAddr,
|
32
|
+
ChannelTransport,
|
33
|
+
)
|
34
|
+
from monarch.actor_mesh import Actor, current_rank, current_size, endpoint, ValueMesh
|
35
|
+
|
36
|
+
from monarch.allocator import RemoteAllocator, StaticRemoteAllocInitializer
|
37
|
+
from monarch.proc_mesh import ProcMesh
|
38
|
+
|
39
|
+
from torch.distributed.elastic.utils.distributed import get_free_port
|
40
|
+
|
41
|
+
_100_MILLISECONDS = timedelta(milliseconds=100)
|
42
|
+
|
43
|
+
|
44
|
+
class TestActor(Actor):
|
45
|
+
"""Silly actor that computes the world size by all-reducing rank-hot tensors"""
|
46
|
+
|
47
|
+
def __init__(self) -> None:
|
48
|
+
self.rank: int = current_rank().rank
|
49
|
+
self.world_size: int = math.prod(current_size().values())
|
50
|
+
|
51
|
+
@endpoint
|
52
|
+
async def compute_world_size(self, master_addr: str, master_port: int) -> int:
|
53
|
+
os.environ["MASTER_ADDR"] = master_addr
|
54
|
+
os.environ["MASTER_PORT"] = str(master_port)
|
55
|
+
dist.init_process_group("gloo", rank=self.rank, world_size=self.world_size)
|
56
|
+
|
57
|
+
try:
|
58
|
+
t = F.one_hot(torch.tensor(self.rank), num_classes=dist.get_world_size())
|
59
|
+
dist.all_reduce(t)
|
60
|
+
return int(torch.sum(t).item())
|
61
|
+
finally:
|
62
|
+
dist.destroy_process_group()
|
63
|
+
|
64
|
+
|
65
|
+
@contextlib.contextmanager
|
66
|
+
def remote_process_allocator() -> Generator[str, None, None]:
|
67
|
+
with importlib.resources.path(__package__, "") as package_path:
|
68
|
+
addr = ChannelAddr.any(ChannelTransport.Unix)
|
69
|
+
|
70
|
+
process_allocator = subprocess.Popen(
|
71
|
+
args=[
|
72
|
+
"process_allocator",
|
73
|
+
f"--addr={addr}",
|
74
|
+
],
|
75
|
+
env={
|
76
|
+
# prefix PATH with this test module's directory to
|
77
|
+
# give 'process_allocator' and 'monarch_bootstrap' binary resources
|
78
|
+
# in this test module's directory precedence over the installed ones
|
79
|
+
# useful in BUCK where these binaries are added as 'resources' of this test target
|
80
|
+
"PATH": f"{package_path}:{os.getenv('PATH', '')}",
|
81
|
+
"RUST_LOG": "debug",
|
82
|
+
},
|
83
|
+
)
|
84
|
+
try:
|
85
|
+
yield addr
|
86
|
+
finally:
|
87
|
+
process_allocator.terminate()
|
88
|
+
try:
|
89
|
+
five_seconds = 5
|
90
|
+
process_allocator.wait(timeout=five_seconds)
|
91
|
+
except subprocess.TimeoutExpired:
|
92
|
+
process_allocator.kill()
|
93
|
+
|
94
|
+
|
95
|
+
class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
|
96
|
+
@classmethod
|
97
|
+
def setUpClass(cls) -> None:
|
98
|
+
cloudpickle.register_pickle_by_value(sys.modules[TestActor.__module__])
|
99
|
+
|
100
|
+
@classmethod
|
101
|
+
def tearDownClass(cls) -> None:
|
102
|
+
cloudpickle.unregister_pickle_by_value(sys.modules[TestActor.__module__])
|
103
|
+
|
104
|
+
def assert_computed_world_size(
|
105
|
+
self, computed: ValueMesh[int], expected_world_size: int
|
106
|
+
) -> None:
|
107
|
+
expected_world_sizes = {
|
108
|
+
rank: expected_world_size for rank in range(0, expected_world_size)
|
109
|
+
}
|
110
|
+
computed_world_sizes = {p.rank: v for p, v in list(computed.flatten("rank"))}
|
111
|
+
self.assertDictEqual(expected_world_sizes, computed_world_sizes)
|
112
|
+
|
113
|
+
async def test_call_allocate_twice(self) -> None:
|
114
|
+
class DeletingAllocInitializer(StaticRemoteAllocInitializer):
|
115
|
+
"""test initializer that removes the last address from the list each time initialize_alloc() is called
|
116
|
+
used to test that the state of the initializer is preserved across calls to allocate()
|
117
|
+
"""
|
118
|
+
|
119
|
+
async def initialize_alloc(self) -> list[str]:
|
120
|
+
alloc = await super().initialize_alloc()
|
121
|
+
self.addrs.pop(-1)
|
122
|
+
return alloc
|
123
|
+
|
124
|
+
with remote_process_allocator() as host1, remote_process_allocator() as host2:
|
125
|
+
initializer = DeletingAllocInitializer(host1, host2)
|
126
|
+
|
127
|
+
allocator = RemoteAllocator(
|
128
|
+
world_id="test_remote_allocator",
|
129
|
+
initializer=initializer,
|
130
|
+
heartbeat_interval=_100_MILLISECONDS,
|
131
|
+
)
|
132
|
+
|
133
|
+
spec = AllocSpec(AllocConstraints(), host=1, gpu=1)
|
134
|
+
|
135
|
+
await allocator.allocate(spec)
|
136
|
+
self.assertEqual([host1], initializer.addrs)
|
137
|
+
|
138
|
+
await allocator.allocate(spec)
|
139
|
+
self.assertEqual([], initializer.addrs)
|
140
|
+
|
141
|
+
async def test_throws_when_initializer_returns_empty_addrs(self) -> None:
|
142
|
+
class EmptyAllocInitializer(StaticRemoteAllocInitializer):
|
143
|
+
"""test initializer that returns an empty list of addresses"""
|
144
|
+
|
145
|
+
async def initialize_alloc(self) -> list[str]:
|
146
|
+
return []
|
147
|
+
|
148
|
+
empty_initializer = EmptyAllocInitializer()
|
149
|
+
with self.assertRaisesRegex(
|
150
|
+
RuntimeError, r"initializer must return non-empty list of addresses"
|
151
|
+
):
|
152
|
+
allocator = RemoteAllocator(
|
153
|
+
world_id="test_remote_allocator",
|
154
|
+
initializer=empty_initializer,
|
155
|
+
heartbeat_interval=_100_MILLISECONDS,
|
156
|
+
)
|
157
|
+
await allocator.allocate(AllocSpec(AllocConstraints(), host=1, gpu=1))
|
158
|
+
|
159
|
+
async def test_allocate_2d_mesh(self) -> None:
|
160
|
+
hosts = 2
|
161
|
+
gpus = 4
|
162
|
+
world_size = hosts * gpus
|
163
|
+
spec = AllocSpec(AllocConstraints(), host=hosts, gpu=gpus)
|
164
|
+
|
165
|
+
# create 2x process-allocators (on their own bind addresses) to simulate 2 hosts
|
166
|
+
with remote_process_allocator() as host1, remote_process_allocator() as host2:
|
167
|
+
allocator = RemoteAllocator(
|
168
|
+
world_id="test_remote_allocator",
|
169
|
+
initializer=StaticRemoteAllocInitializer(host1, host2),
|
170
|
+
heartbeat_interval=_100_MILLISECONDS,
|
171
|
+
)
|
172
|
+
alloc = await allocator.allocate(spec)
|
173
|
+
proc_mesh = await ProcMesh.from_alloc(alloc)
|
174
|
+
actor = await proc_mesh.spawn("test_actor", TestActor)
|
175
|
+
|
176
|
+
values = await actor.compute_world_size.call(
|
177
|
+
master_addr="::",
|
178
|
+
master_port=get_free_port(),
|
179
|
+
)
|
180
|
+
|
181
|
+
self.assert_computed_world_size(values, world_size)
|
182
|
+
|
183
|
+
async def test_stacked_1d_meshes(self) -> None:
|
184
|
+
# create two stacked actor meshes on the same host
|
185
|
+
# each actor mesh running on separate process-allocators
|
186
|
+
|
187
|
+
with remote_process_allocator() as host1_a, remote_process_allocator() as host1_b:
|
188
|
+
allocator_a = RemoteAllocator(
|
189
|
+
world_id="a",
|
190
|
+
initializer=StaticRemoteAllocInitializer(host1_a),
|
191
|
+
heartbeat_interval=_100_MILLISECONDS,
|
192
|
+
)
|
193
|
+
allocator_b = RemoteAllocator(
|
194
|
+
world_id="b",
|
195
|
+
initializer=StaticRemoteAllocInitializer(host1_b),
|
196
|
+
heartbeat_interval=_100_MILLISECONDS,
|
197
|
+
)
|
198
|
+
|
199
|
+
spec_a = AllocSpec(AllocConstraints(), host=1, gpu=2)
|
200
|
+
spec_b = AllocSpec(AllocConstraints(), host=1, gpu=6)
|
201
|
+
|
202
|
+
proc_mesh_a = await ProcMesh.from_alloc(await allocator_a.allocate(spec_a))
|
203
|
+
proc_mesh_b = await ProcMesh.from_alloc(await allocator_b.allocate(spec_b))
|
204
|
+
|
205
|
+
actor_a = await proc_mesh_a.spawn("actor_a", TestActor)
|
206
|
+
actor_b = await proc_mesh_b.spawn("actor_b", TestActor)
|
207
|
+
|
208
|
+
results_a = await actor_a.compute_world_size.call(
|
209
|
+
master_addr="::", master_port=get_free_port()
|
210
|
+
)
|
211
|
+
results_b = await actor_b.compute_world_size.call(
|
212
|
+
master_addr="::", master_port=get_free_port()
|
213
|
+
)
|
214
|
+
|
215
|
+
self.assert_computed_world_size(results_a, 2) # a is a 1x2 mesh
|
216
|
+
self.assert_computed_world_size(results_b, 6) # b is a 1x6 mesh
|
tests/test_python_actors.py
CHANGED
@@ -7,7 +7,12 @@
|
|
7
7
|
import operator
|
8
8
|
from types import ModuleType
|
9
9
|
|
10
|
+
import monarch
|
11
|
+
|
12
|
+
import pytest
|
13
|
+
|
10
14
|
import torch
|
15
|
+
|
11
16
|
from monarch.actor_mesh import (
|
12
17
|
Accumulator,
|
13
18
|
Actor,
|
@@ -17,6 +22,8 @@ from monarch.actor_mesh import (
|
|
17
22
|
endpoint,
|
18
23
|
)
|
19
24
|
|
25
|
+
from monarch.mesh_controller import spawn_tensor_engine
|
26
|
+
|
20
27
|
from monarch.proc_mesh import local_proc_mesh, proc_mesh
|
21
28
|
from monarch.rdma import RDMABuffer
|
22
29
|
|
@@ -375,3 +382,20 @@ def test_rust_binding_modules_correct() -> None:
|
|
375
382
|
assert value.__module__ == path
|
376
383
|
|
377
384
|
check(bindings, "monarch._rust_bindings")
|
385
|
+
|
386
|
+
|
387
|
+
def test_tensor_engine() -> None:
|
388
|
+
pm = proc_mesh(gpus=2).get()
|
389
|
+
|
390
|
+
dm = spawn_tensor_engine(pm)
|
391
|
+
with dm.activate():
|
392
|
+
r = monarch.inspect(2 * torch.zeros(3, 4))
|
393
|
+
|
394
|
+
fm = dm.flatten("all")
|
395
|
+
with fm.activate():
|
396
|
+
f = monarch.inspect(2 * torch.zeros(3, 4), all=1)
|
397
|
+
|
398
|
+
assert torch.allclose(torch.zeros(3, 4), r)
|
399
|
+
assert torch.allclose(torch.zeros(3, 4), f)
|
400
|
+
|
401
|
+
dm.exit()
|
tests/test_rust_backend.py
CHANGED
@@ -14,6 +14,7 @@ import monarch
|
|
14
14
|
|
15
15
|
import pytest
|
16
16
|
import torch
|
17
|
+
import torch.utils._python_dispatch
|
17
18
|
from monarch import fetch_shard, no_mesh, remote, Stream
|
18
19
|
from monarch.common.device_mesh import DeviceMesh
|
19
20
|
from monarch.rust_local_mesh import local_meshes, LoggingLocation, SocketType
|
@@ -180,3 +181,37 @@ class TestRustBackend(TestCase):
|
|
180
181
|
|
181
182
|
self.assertIsNotNone(mesh_info.mesh_labels)
|
182
183
|
self.assertEqual(len(mesh_info.devices_labels), 2)
|
184
|
+
|
185
|
+
def test_ivalue_problems(self) -> None:
|
186
|
+
with local_mesh(hosts=1, gpu_per_host=1):
|
187
|
+
from typing import cast
|
188
|
+
|
189
|
+
from monarch.common.messages import CallFunction, CommandGroup
|
190
|
+
|
191
|
+
a = cast(monarch.Tensor, torch.rand(3, 4))
|
192
|
+
result = monarch.Tensor(a._fake, a.mesh, a.stream)
|
193
|
+
msg = CallFunction(
|
194
|
+
0,
|
195
|
+
result,
|
196
|
+
(),
|
197
|
+
monarch.common.function.ResolvableFunctionFromPath(
|
198
|
+
"torch.ops.aten.mul.Tensor"
|
199
|
+
),
|
200
|
+
(2, a),
|
201
|
+
{},
|
202
|
+
a.stream._to_ref(a.mesh.client),
|
203
|
+
a.mesh,
|
204
|
+
[],
|
205
|
+
)
|
206
|
+
# Internally, this will call CallFunction(...).to_rust_message().
|
207
|
+
# The 2 arg will be converted to an IValue tensor via rust + C++.
|
208
|
+
# Then when the CommandGroup message gets converted to rust, it
|
209
|
+
# will attempt to clone the rust CallFunction message, which will
|
210
|
+
# attempt to clone the IValue tensor, which will cause a crash.
|
211
|
+
# Upon attempting to clone the IValue tensor, our custom __torch_dispatch__
|
212
|
+
# intercepts the following two calls:
|
213
|
+
# aten._to_copy.default () (2,) {'dtype': torch.float64, 'device': device(type='cpu')}
|
214
|
+
# aten.clone.default () (2,) {}
|
215
|
+
|
216
|
+
with torch.utils._python_dispatch._disable_current_modes():
|
217
|
+
CommandGroup([msg]).to_rust_message()
|
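{torchmonarch_nightly-2025.6.9.dist-info → torchmonarch_nightly-2025.6.11.dist-info}/RECORD
RENAMED
@@ -1,15 +1,16 @@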
|
|
1
1
|
monarch/__init__.py,sha256=iUvWHc0-7Q2tovRoRxOIiA3TsefMXCbWl-jEfQ2djew,6897
|
2
|
-
monarch/_rust_bindings.so,sha256=
|
2
|
+
monarch/_rust_bindings.so,sha256=g2tlum6iqfdR4KRkVhp_BwUmlz0tYUSITNVaJjSNitE,40645720
|
3
3
|
monarch/_testing.py,sha256=MN8DK1e-wzV0-R_nFW1b_7-O5oKfWvZ12BMGD4Z7PQk,6755
|
4
|
-
monarch/actor_mesh.py,sha256=
|
5
|
-
monarch/allocator.py,sha256=
|
6
|
-
monarch/bootstrap_main.py,sha256=
|
4
|
+
monarch/actor_mesh.py,sha256=4I8xp_XIM6KZJY_jXVjJ8tPW2l1J4a6ZhrknU7zKbAk,23947
|
5
|
+
monarch/allocator.py,sha256=ylvYTf31o-PT385cYJPhi17uNbC4yl_RAraqD0fVe4g,4112
|
6
|
+
monarch/bootstrap_main.py,sha256=EYaTMA1lxy2213L_04drTKlJvZQjzNdD3jeUHiqSBJc,2578
|
7
7
|
monarch/cached_remote_function.py,sha256=kYdB6r4OHx_T_uX4q3tCNcp1t2DJwF8tPTIahUiT2pU,8785
|
8
8
|
monarch/fetch.py,sha256=61jxo7sx4QNUTkc0_rF5NaJROen4tKbAaiIjrXWLOvg,1705
|
9
9
|
monarch/future.py,sha256=lcdFEe7m1shYPPuvZ1RkS6JUIChEKGBWe3v7x_nu4Hg,731
|
10
10
|
monarch/gradient_generator.py,sha256=Rl3dmXGceTdCc1mYBg2JciR88ywGPnW7TVkL86KwqEA,6366
|
11
11
|
monarch/memory.py,sha256=ol86dBhFAJqg78iF25-BuK0wuwj1onR8FIioZ_B0gjw,1377
|
12
|
-
monarch/
|
12
|
+
monarch/mesh_controller.py,sha256=A3G8Z5S0w3mjCVI2r6YGM6K3BUs3ZHU8PFo6kCaYTU4,8615
|
13
|
+
monarch/monarch_controller,sha256=41B7zLv7M7_CSmChN5bfvVrygi2VeBhMDcNQXlnbVZU,20394376
|
13
14
|
monarch/notebook.py,sha256=zu9MKDFKf1-rCM2TqFSRJjMBeiWuKcJSyUFLvoZRQzs,25949
|
14
15
|
monarch/opaque_module.py,sha256=oajOu_WD1hD4hxE8HDdO-tvWY7KDHWd7VaAhJEa5L2I,10446
|
15
16
|
monarch/opaque_object.py,sha256=IVpll4pyuKZMo_EnPh4s0qnx8RlAcJrJ1yoLX6E75wQ,2782
|
@@ -131,6 +132,7 @@ tests/error_test_binary.py,sha256=64H-ucdkQ2i7GD8sidStl227cOy7gyeqvO4kTm1y7Ic,48
|
|
131
132
|
tests/sleep_binary.py,sha256=XfLYaAfwm9xgzM-svs8fhAeFhwYIg6SyVEnx4e6wbUw,1009
|
132
133
|
tests/test_actor_error.py,sha256=z3Sf4lteUggTryPLOhRKJ55v0MwVK3a7QN7-U2U9iJg,7484
|
133
134
|
tests/test_alloc.py,sha256=D6DdQbtOZEvvnnc7LV-WyWFMk0Xb77eblH6Oz90zJTA,745
|
135
|
+
tests/test_allocator.py,sha256=dqQbQyOjOX3JgnHIPT0iawT0wMeFztbLCYjK2tl8GcI,8149
|
134
136
|
tests/test_coalescing.py,sha256=-KtAWzTaeXbyzltplfojavx0iFeeZnvej-tFTlu2p5k,15616
|
135
137
|
tests/test_controller.py,sha256=yxuVp2DG3TDKJlwuE3cFm9dbWMlbrYtG1uHfvVWRYbw,30935
|
136
138
|
tests/test_device_mesh.py,sha256=DrbezYOM0thfP9MgLXb5-F0VoLOmSz5GR0GwjR_3bE4,5290
|
@@ -139,9 +141,9 @@ tests/test_future.py,sha256=cXzaNi2YDwVyjR541ScXmgktX1YFsKzbl8wep0DMVbk,3032
|
|
139
141
|
tests/test_grad_generator.py,sha256=p4Pm4kMEeGldt2jUVAkGKCB0mLccKI28pltH6OTGbQA,3412
|
140
142
|
tests/test_mock_cuda.py,sha256=5hisElxeLJ5MHw3KM9gwxBiXiMaG-Rm382u3AsQcDOI,3068
|
141
143
|
tests/test_pdb_actor.py,sha256=5KJhuhcZDPWMdjC6eAtDdwnz1W7jNFXvIrMSFaCWaPw,3858
|
142
|
-
tests/test_python_actors.py,sha256=
|
144
|
+
tests/test_python_actors.py,sha256=gP6MDN2BL282qInUGP9untlpsqqB2uy1Iq5gUXnXcUo,11387
|
143
145
|
tests/test_remote_functions.py,sha256=ExqYlRQWRabpGBuKvNIOa8Hwj-iXuP87Jfb9i5RhaGs,50066
|
144
|
-
tests/test_rust_backend.py,sha256=
|
146
|
+
tests/test_rust_backend.py,sha256=94S3R995ZkyIhEiBsM5flcjf5X7bscEAHBtInbTRFe8,7776
|
145
147
|
tests/test_signal_safe_block_on.py,sha256=bmal0XgzJowZXJV6T1Blow5a-vZluYWusCThLMGxyTE,3336
|
146
148
|
tests/test_sim_backend.py,sha256=RckCkHO3DxKsAGdZMcIzRnd6YJXwDim1D5-xbBbgKio,1473
|
147
149
|
tests/simulator/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -149,9 +151,9 @@ tests/simulator/test_profiling.py,sha256=TGYCfzTLdkpIwnOuO6KApprmrgPIRQe60KRX3wk
|
|
149
151
|
tests/simulator/test_simulator.py,sha256=LO8lA0ssY-OGEBL5ipEu74f97Y765TEwfUOv-DtIptM,14568
|
150
152
|
tests/simulator/test_task.py,sha256=ipqBDuDAysuo1xOB9S5psaFvwe6VATD43IovCTSs0t4,2327
|
151
153
|
tests/simulator/test_worker.py,sha256=QrWWIJ3HDgDLkBPRc2mwYPlOQoXQcj1qRfc0WUfKkFY,3507
|
152
|
-
torchmonarch_nightly-2025.6.
|
153
|
-
torchmonarch_nightly-2025.6.
|
154
|
-
torchmonarch_nightly-2025.6.
|
155
|
-
torchmonarch_nightly-2025.6.
|
156
|
-
torchmonarch_nightly-2025.6.
|
157
|
-
torchmonarch_nightly-2025.6.
|
154
|
+
torchmonarch_nightly-2025.6.11.dist-info/licenses/LICENSE,sha256=e0Eotbf_rHOYPuEUlppIbvwy4SN98CZnl_hqwvbDA4Q,1530
|
155
|
+
torchmonarch_nightly-2025.6.11.dist-info/METADATA,sha256=SCdAxETtVZ5ESzbLepOp6mf1L4G-HSYVkjdRFT7D0kg,2772
|
156
|
+
torchmonarch_nightly-2025.6.11.dist-info/WHEEL,sha256=_wZSFk0d90K9wOBp8Q-UGxshyiJ987JoPiyUBNC6VLk,104
|
157
|
+
torchmonarch_nightly-2025.6.11.dist-info/entry_points.txt,sha256=sqfQ16oZqjEvttUI-uj9BBXIIE6jt05bYFSmy-2hyXI,106
|
158
|
+
torchmonarch_nightly-2025.6.11.dist-info/top_level.txt,sha256=E-ZssZzyM17glpVrh-S9--qJ-w9p2EjuYOuNw9tQ4Eg,33
|
159
|
+
torchmonarch_nightly-2025.6.11.dist-info/RECORD,,
|
{torchmonarch_nightly-2025.6.9.dist-info → torchmonarch_nightly-2025.6.11.dist-info}/WHEEL
RENAMED
File without changes
|
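{torchmonarch_nightly-2025.6.9.dist-info → torchmonarch_nightly-2025.6.11.dist-info}/entry_points.txt
RENAMED
File without changes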
|
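{torchmonarch_nightly-2025.6.9.dist-info → torchmonarch_nightly-2025.6.11.dist-info}/licenses/LICENSE
RENAMED
File without changes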
|
{torchmonarch_nightly-2025.6.9.dist-info → torchmonarch_nightly-2025.6.11.dist-info}/top_level.txt
RENAMED
File without changes
|