torchmonarch-nightly 2025.7.1__cp310-cp310-manylinux2014_x86_64.whl → 2025.7.25__cp310-cp310-manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- monarch/__init__.py +13 -9
- monarch/_rust_bindings.so +0 -0
- monarch/{_monarch/selection → _src/actor}/__init__.py +3 -7
- monarch/_src/actor/actor_mesh.py +874 -0
- monarch/{allocator.py → _src/actor/allocator.py} +26 -17
- monarch/_src/actor/bootstrap_main.py +73 -0
- monarch/{code_sync.py → _src/actor/code_sync/__init__.py} +3 -1
- monarch/_src/actor/code_sync/auto_reload.py +223 -0
- monarch/_src/actor/debugger.py +565 -0
- monarch/_src/actor/endpoint.py +270 -0
- monarch/_src/actor/event_loop.py +97 -0
- monarch/_src/actor/future.py +100 -0
- monarch/{pdb_wrapper.py → _src/actor/pdb_wrapper.py} +47 -46
- monarch/{common/pickle_flatten.py → _src/actor/pickle.py} +26 -2
- monarch/_src/actor/proc_mesh.py +500 -0
- monarch/_src/actor/sync_state.py +18 -0
- monarch/{telemetry.py → _src/actor/telemetry/__init__.py} +1 -1
- monarch/_src/actor/telemetry/rust_span_tracing.py +159 -0
- monarch/_src/actor/tensor_engine_shim.py +56 -0
- monarch/_src/tensor_engine/rdma.py +180 -0
- monarch/_testing.py +3 -2
- monarch/actor/__init__.py +51 -0
- monarch/actor_mesh.py +6 -765
- monarch/bootstrap_main.py +8 -47
- monarch/common/client.py +1 -1
- monarch/common/controller_api.py +2 -1
- monarch/common/device_mesh.py +12 -2
- monarch/common/messages.py +12 -1
- monarch/common/recording.py +4 -3
- monarch/common/remote.py +135 -52
- monarch/common/tensor.py +2 -1
- monarch/controller/backend.py +2 -2
- monarch/controller/controller.py +2 -1
- monarch/controller/rust_backend/controller.py +2 -1
- monarch/fetch.py +3 -5
- monarch/mesh_controller.py +201 -139
- monarch/monarch_controller +0 -0
- monarch/opaque_module.py +4 -6
- monarch/opaque_object.py +3 -3
- monarch/proc_mesh.py +6 -309
- monarch/python_local_mesh.py +1 -1
- monarch/rust_backend_mesh.py +2 -1
- monarch/rust_local_mesh.py +4 -2
- monarch/sim_mesh.py +10 -19
- monarch/simulator/command_history.py +1 -1
- monarch/simulator/interface.py +2 -1
- monarch/simulator/mock_controller.py +1 -1
- monarch/simulator/simulator.py +1 -1
- monarch/tensor_engine/__init__.py +23 -0
- monarch/tensor_worker_main.py +3 -1
- monarch/tools/cli.py +3 -1
- monarch/tools/commands.py +95 -35
- monarch/tools/mesh_spec.py +55 -0
- monarch/tools/utils.py +38 -0
- monarch/worker/worker.py +1 -1
- monarch/world_mesh.py +2 -1
- monarch_supervisor/python_executable.py +6 -3
- tests/error_test_binary.py +48 -10
- tests/test_actor_error.py +370 -21
- tests/test_alloc.py +1 -1
- tests/test_allocator.py +373 -17
- tests/test_controller.py +2 -0
- tests/test_debugger.py +416 -0
- tests/test_env_before_cuda.py +162 -0
- tests/test_python_actors.py +184 -333
- tests/test_rdma.py +198 -0
- tests/test_remote_functions.py +40 -12
- tests/test_rust_backend.py +7 -5
- tests/test_sim_backend.py +1 -4
- tests/test_tensor_engine.py +55 -1
- {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/METADATA +6 -1
- {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/RECORD +80 -68
- torchmonarch_nightly-2025.7.25.dist-info/entry_points.txt +3 -0
- monarch/_monarch/hyperactor/__init__.py +0 -58
- monarch/_monarch/worker/debugger.py +0 -117
- monarch/_monarch/worker/logging.py +0 -107
- monarch/debugger.py +0 -379
- monarch/future.py +0 -76
- monarch/rdma.py +0 -162
- torchmonarch_nightly-2025.7.1.dist-info/entry_points.txt +0 -3
- /monarch/{_monarch/worker → _src}/__init__.py +0 -0
- /monarch/{common/_device_utils.py → _src/actor/device_utils.py} +0 -0
- /monarch/{common → _src/actor}/shape.py +0 -0
- /monarch/{_monarch → _src/tensor_engine}/__init__.py +0 -0
- {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/WHEEL +0 -0
- {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/licenses/LICENSE +0 -0
- {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/top_level.txt +0 -0
monarch/proc_mesh.py
CHANGED

@@ -4,315 +4,12 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-
+import warnings
 
-import os
-import sys
-from contextlib import AbstractContextManager
-
-from typing import (
-    Any,
-    cast,
-    Dict,
-    List,
-    Optional,
-    Sequence,
-    Type,
-    TYPE_CHECKING,
-    TypeVar,
-)
-
-if TYPE_CHECKING:
-    import torch
-
-import monarch
-from monarch import ActorFuture as Future
-
-# Conditionally import DeviceMesh and spawn_tensor_engine only if tensor_engine is available
-# pyre-ignore[21]
-from monarch._rust_bindings import has_tensor_engine
-
-from monarch._rust_bindings.hyperactor_extension.alloc import (  # @manual=//monarch/monarch_extension:monarch_extension
-    Alloc,
-    AllocConstraints,
-    AllocSpec,
-)
-from monarch._rust_bindings.monarch_hyperactor.mailbox import Mailbox
-from monarch._rust_bindings.monarch_hyperactor.proc_mesh import (
-    ProcMesh as HyProcMesh,
-    ProcMeshMonitor,
+warnings.warn(
+    "monarch.proc_mesh is deprecated, please import from monarch.actor instead.",
+    DeprecationWarning,
+    stacklevel=2,
 )
-from monarch._rust_bindings.monarch_hyperactor.shape import Shape, Slice
-from monarch.actor_mesh import _Actor, _ActorMeshRefImpl, Actor, ActorMeshRef
-
-from monarch.code_sync import RemoteWorkspace, RsyncMeshClient
-from monarch.common._device_utils import _local_device_count
-from monarch.common.shape import MeshTrait
-from monarch.rdma import RDMAManager
-
-if has_tensor_engine():
-    from monarch.common.device_mesh import DeviceMesh
-    from monarch.mesh_controller import spawn_tensor_engine
-else:
-    DeviceMesh = None
-    spawn_tensor_engine = None
-
-T = TypeVar("T")
-try:
-    from __manifest__ import fbmake  # noqa
-
-    IN_PAR = True
-except ImportError:
-    IN_PAR = False
-
-
-async def _allocate_nonblocking(alloc: Alloc) -> "ProcMesh":
-    return ProcMesh(await HyProcMesh.allocate_nonblocking(alloc))
-
-
-def _allocate_blocking(alloc: Alloc) -> "ProcMesh":
-    return ProcMesh(HyProcMesh.allocate_blocking(alloc))
-
-
-class ProcMesh(MeshTrait):
-    def __init__(
-        self,
-        hy_proc_mesh: HyProcMesh,
-        _mock_shape: Optional[Shape] = None,
-        _device_mesh: Optional[DeviceMesh] = None,
-    ) -> None:
-        self._proc_mesh = hy_proc_mesh
-        self._mock_shape: Optional[Shape] = _mock_shape
-        self._mailbox: Mailbox = self._proc_mesh.client
-        self._rdma_manager: Optional[RDMAManager] = None
-        self._rsync_mesh_client: Optional[RsyncMeshClient] = None
-        self._maybe_device_mesh: Optional[DeviceMesh] = _device_mesh
-        if _mock_shape is None:
-            self._rdma_manager = self._spawn_blocking("rdma_manager", RDMAManager)
-
-    @property
-    def _shape(self) -> Shape:
-        return self._proc_mesh.shape if self._mock_shape is None else self._mock_shape
-
-    @property
-    def _ndslice(self) -> Slice:
-        return self._shape.ndslice
-
-    @property
-    def _labels(self) -> List[str]:
-        return self._shape.labels
-
-    def _new_with_shape(self, shape: Shape) -> "ProcMesh":
-        device_mesh = (
-            None
-            if self._device_mesh is None
-            else self._device_mesh._new_with_shape(shape)
-        )
-        return ProcMesh(self._proc_mesh, _mock_shape=shape, _device_mesh=device_mesh)
-
-    def spawn(
-        self, name: str, Class: Type[T], *args: Any, **kwargs: Any
-    ) -> Future[ActorMeshRef[T]]:
-        if self._mock_shape is not None:
-            raise NotImplementedError("NYI: spawn on slice of a proc mesh.")
-        return Future(
-            lambda: self._spawn_nonblocking(name, Class, *args, **kwargs),
-            lambda: self._spawn_blocking(name, Class, *args, **kwargs),
-        )
-
-    async def monitor(self) -> ProcMeshMonitor:
-        """
-        Get a monitor (async iterator) of the proc mesh, it is used to
-        monitor the status of the proc mesh. This function can be called at most once.
-
-        Note: This API is experimental and subject to change.
-
-        Example:
-
-        async def monitor_loop(monitor):
-            async for event in monitor:
-                await handle_exception_event(event)
-
-        # Kick off in background
-        asyncio.create_task(monitor_loop(monitor))
-        """
-        return await self._proc_mesh.monitor()
-
-    @classmethod
-    def from_alloc(self, alloc: Alloc) -> Future["ProcMesh"]:
-        return Future(
-            lambda: _allocate_nonblocking(alloc),
-            lambda: _allocate_blocking(alloc),
-        )
-
-    def _spawn_blocking(
-        self, name: str, Class: Type[T], *args: Any, **kwargs: Any
-    ) -> T:
-        if not issubclass(Class, Actor):
-            raise ValueError(
-                f"{Class} must subclass monarch.service.Actor to spawn it."
-            )
-
-        actor_mesh = self._proc_mesh.spawn_blocking(name, _Actor)
-        service = ActorMeshRef(
-            Class,
-            _ActorMeshRefImpl.from_hyperactor_mesh(self._mailbox, actor_mesh),
-            self._mailbox,
-        )
-        # useful to have this separate, because eventually we can reconstitute ActorMeshRef objects across pickling by
-        # doing `ActorMeshRef(Class, actor_handle)` but not calling _create.
-        service._create(args, kwargs)
-        return cast(T, service)
-
-    def __repr__(self) -> str:
-        return repr(self._proc_mesh)
-
-    def __str__(self) -> str:
-        return str(self._proc_mesh)
-
-    async def _spawn_nonblocking(
-        self, name: str, Class: Type[T], *args: Any, **kwargs: Any
-    ) -> T:
-        if not issubclass(Class, Actor):
-            raise ValueError(
-                f"{Class} must subclass monarch.service.Actor to spawn it."
-            )
-
-        actor_mesh = await self._proc_mesh.spawn_nonblocking(name, _Actor)
-        service = ActorMeshRef(
-            Class,
-            _ActorMeshRefImpl.from_hyperactor_mesh(self._mailbox, actor_mesh),
-            self._mailbox,
-        )
-        # useful to have this separate, because eventually we can reconstitute ActorMeshRef objects across pickling by
-        # doing `ActorMeshRef(Class, actor_handle)` but not calling _create.
-        service._create(args, kwargs)
-        return cast(T, service)
-
-    @property
-    def _device_mesh(self) -> "DeviceMesh":
-        if spawn_tensor_engine is None:
-            raise RuntimeError(
-                "DeviceMesh is not available because tensor_engine was not compiled (USE_TENSOR_ENGINE=0)"
-            )
-        if self._maybe_device_mesh is None:
-            if self._mock_shape is not None:
-                raise NotImplementedError(
-                    "NYI: activating a proc mesh must first happen on the root proc_mesh until we fix spawning on submeshes."
-                )
-            self._maybe_device_mesh = spawn_tensor_engine(self)
-        return self._maybe_device_mesh
-
-    # pyre-ignore
-    def activate(self) -> AbstractContextManager:
-        return self._device_mesh.activate()
-
-    def rank_tensor(self, dim: str | Sequence[str]) -> "torch.Tensor":
-        return self._device_mesh.rank(dim)
-
-    def rank_tensors(self) -> Dict[str, "torch.Tensor"]:
-        return self._device_mesh.ranks
-
-    async def sync_workspace(self) -> None:
-        if self._rsync_mesh_client is None:
-            # TODO(agallagher): We need some way to configure and pass this
-            # in -- right now we're assuming the `gpu` dimension, which isn't
-            # correct.
-            assert set(self._proc_mesh.shape.labels).issubset({"gpus", "hosts"})
-            # The workspace shape (i.e. only perform one rsync per host).
-            workspace_shape = self.slice(gpus=slice(0, 1, 1))._mock_shape
-            assert workspace_shape is not None
-            # TODO(agallagher): We should probably hide this behind something
-            # like a `Workspace` class and support abstracting/configuring
-            # different sync methods.
-            self._rsync_mesh_client = RsyncMeshClient.spawn_blocking(
-                proc_mesh=self._proc_mesh,
-                shape=workspace_shape,
-                # TODO(agallagher): Is there a better way to infer/set the local
-                # workspace dir, rather than use PWD?
-                local_workspace=os.getcwd(),
-                remote_workspace=RemoteWorkspace.FromEnvVar("WORKSPACE_DIR"),
-            )
-        await self._rsync_mesh_client.sync_workspace()
-
-
-async def local_proc_mesh_nonblocking(
-    *, gpus: Optional[int] = None, hosts: int = 1
-) -> ProcMesh:
-    if gpus is None:
-        gpus = _local_device_count()
-    spec = AllocSpec(AllocConstraints(), gpus=gpus, hosts=hosts)
-    allocator = monarch.LocalAllocator()
-    alloc = await allocator.allocate(spec)
-    return await ProcMesh.from_alloc(alloc)
-
-
-def local_proc_mesh_blocking(*, gpus: Optional[int] = None, hosts: int = 1) -> ProcMesh:
-    if gpus is None:
-        gpus = _local_device_count()
-    spec = AllocSpec(AllocConstraints(), gpus=gpus, hosts=hosts)
-    allocator = monarch.LocalAllocator()
-    alloc = allocator.allocate(spec).get()
-    return ProcMesh.from_alloc(alloc).get()
-
-
-def local_proc_mesh(*, gpus: Optional[int] = None, hosts: int = 1) -> Future[ProcMesh]:
-    return Future(
-        lambda: local_proc_mesh_nonblocking(gpus=gpus, hosts=hosts),
-        lambda: local_proc_mesh_blocking(gpus=gpus, hosts=hosts),
-    )
-
-
-_BOOTSTRAP_MAIN = "monarch.bootstrap_main"
-
-
-def _get_bootstrap_args() -> tuple[str, Optional[list[str]], dict[str, str]]:
-    if IN_PAR:
-        cmd = sys.argv[0]
-        args = None
-        env = {
-            "PAR_MAIN_OVERRIDE": _BOOTSTRAP_MAIN,
-        }
-    else:
-        cmd = sys.executable
-        args = ["-m", _BOOTSTRAP_MAIN]
-        env = {}
-
-    return cmd, args, env
-
-
-async def proc_mesh_nonblocking(
-    *, gpus: Optional[int] = None, hosts: int = 1, env: Optional[dict[str, str]] = None
-) -> ProcMesh:
-    if gpus is None:
-        gpus = _local_device_count()
-    spec = AllocSpec(AllocConstraints(), gpus=gpus, hosts=hosts)
-    env = env or {}
-    cmd, args, base_env = _get_bootstrap_args()
-    env.update(base_env)
-    allocator = monarch.ProcessAllocator(cmd, args, env)
-    alloc = await allocator.allocate(spec)
-    return await ProcMesh.from_alloc(alloc)
-
-
-def proc_mesh_blocking(
-    *, gpus: Optional[int] = None, hosts: int = 1, env: Optional[dict[str, str]] = None
-) -> ProcMesh:
-    if gpus is None:
-        gpus = _local_device_count()
-    spec = AllocSpec(AllocConstraints(), gpus=gpus, hosts=hosts)
-    env = env or {}
-    cmd, args, base_env = _get_bootstrap_args()
-    env.update(base_env)
-    allocator = monarch.ProcessAllocator(cmd, args, env)
-    alloc = allocator.allocate(spec).get()
-    return ProcMesh.from_alloc(alloc).get()
-
 
-def proc_mesh(
-    *, gpus: Optional[int] = None, hosts: int = 1, env: Optional[dict[str, str]] = None
-) -> Future[ProcMesh]:
-    return Future(
-        lambda: proc_mesh_nonblocking(gpus=gpus, hosts=hosts, env=env),
-        lambda: proc_mesh_blocking(gpus=gpus, hosts=hosts, env=env),
-    )
+from monarch._src.actor.proc_mesh import *  # noqa
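The replacement module above turns monarch.proc_mesh into a thin deprecation shim: the module-level warnings.warn fires once, on first import, and the star-import re-exports the real implementation from monarch._src.actor.proc_mesh. A minimal migration sketch, assuming proc_mesh is among the names exported from the new monarch.actor package, as the warning text suggests:

    import warnings

    # Old path: still works in 2025.7.25, but emits a DeprecationWarning
    # the first time the shim module is imported.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        from monarch.proc_mesh import proc_mesh  # noqa: F401
    assert any(issubclass(w.category, DeprecationWarning) for w in caught)

    # New path, per the deprecation message:
    from monarch.actor import proc_mesh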
monarch/python_local_mesh.py
CHANGED

@@ -11,7 +11,7 @@ from time import sleep
 from typing import Optional, TYPE_CHECKING
 
 import monarch_supervisor
-from monarch.common._device_utils import _local_device_count
+from monarch._src.actor.device_utils import _local_device_count
 from monarch.common.fake import fake_call
 from monarch.common.invocation import DeviceException, RemoteException
 from monarch.world_mesh import world_mesh
monarch/rust_backend_mesh.py
CHANGED

@@ -20,11 +20,12 @@ from monarch._rust_bindings.monarch_hyperactor.proc import (  # @manual=//monarc
     init_proc,
     Proc,
 )
+
+from monarch._src.actor.shape import NDSlice
 from monarch.common.client import Client
 from monarch.common.device_mesh import DeviceMesh, DeviceMeshStatus
 from monarch.common.invocation import DeviceException, RemoteException
 from monarch.common.mast import MastJob
-from monarch.common.shape import NDSlice
 from monarch.controller.rust_backend.controller import RustController
 
 TORCHX_MAST_TASK_GROUP_NAME = "script"
monarch/rust_local_mesh.py
CHANGED

@@ -71,7 +71,7 @@ _MONARCH_TENSOR_WORKER_MAIN = "monarch.tensor_worker_main"
 try:
     from __manifest__ import fbmake  # noqa
 
-    IN_PAR = True
+    IN_PAR = bool(fbmake.get("par_style"))
 except ImportError:
     IN_PAR = False
 
@@ -122,7 +122,9 @@ _PROC_ENV: dict[str, str] = {}
 
 def get_controller_main() -> tuple[Path, dict[str, str]]:
     with (
-        importlib.resources.path("monarch", "monarch_controller") as controller_main,
+        importlib.resources.as_file(
+            importlib.resources.files("monarch") / "monarch_controller"
+        ) as controller_main,
     ):
         if not controller_main.exists():
             if IN_PAR:
monarch/sim_mesh.py
CHANGED

@@ -31,7 +31,6 @@ from monarch._rust_bindings.monarch_extension.client import (  # @manual=//monar
 )
 
 from monarch._rust_bindings.monarch_extension.simulator_client import (  # @manual=//monarch/monarch_extension:monarch_extension
-    bootstrap_simulator_backend,
     SimulatorClient,
 )
 
@@ -40,6 +39,8 @@ from monarch._rust_bindings.monarch_hyperactor.proc import (  # @manual=//monarc
     init_proc,
     Proc,
 )
+
+from monarch._src.actor.shape import NDSlice
 from monarch.common.client import Client
 from monarch.common.constants import (
     SIM_MESH_CLIENT_SUPERVISION_UPDATE_INTERVAL,
@@ -50,7 +51,6 @@ from monarch.common.fake import fake_call
 from monarch.common.future import Future, T
 from monarch.common.invocation import DeviceException, RemoteException
 from monarch.common.messages import Dims
-from monarch.common.shape import NDSlice
 from monarch.controller.rust_backend.controller import RustController
 from monarch.rust_backend_mesh import MeshWorld
 
@@ -58,9 +58,7 @@ from monarch.rust_backend_mesh import MeshWorld
 logger: logging.Logger = logging.getLogger(__name__)
 
 
-def sim_mesh(
-    n_meshes: int, hosts: int, gpus_per_host: int, proxy_addr: Optional[str] = None
-) -> List[DeviceMesh]:
+def sim_mesh(n_meshes: int, hosts: int, gpus_per_host: int) -> List[DeviceMesh]:
     """
     Creates a single simulated device mesh with the given number of per host.
 
@@ -75,7 +73,6 @@ def sim_mesh(
     bootstrap: Bootstrap = Bootstrap(
         n_meshes,
         mesh_world_state,
-        proxy_addr=proxy_addr,
         world_size=hosts * gpus_per_host,
     )
 
@@ -180,14 +177,12 @@ class Bootstrap:
         self,
         num_meshes: int,
         mesh_world_state: Dict[MeshWorld, Optional[DeviceMesh]],
-        proxy_addr: Optional[str] = None,
         world_size: int = 1,
     ) -> None:
         """
         Bootstraps a SimMesh.
         Args:
             num_meshes: int - number of meshes to create.
-            proxy_addr: Option[str] - the proxy address of the simulation process
             mesh_world_state: a state of the meshes. Keys are the MeshWorld and values are boolean indicating if this mesh is active.
         """
         # do a fake call to instantiate ThreadPoolExecutor so we don't block GIL later
@@ -198,17 +193,11 @@ class Bootstrap:
 
         self._mesh_world_state: Dict[MeshWorld, Optional[DeviceMesh]] = mesh_world_state
 
-
-        self.bootstrap_addr: str = f"sim!unix!@system,{proxy_addr}"
-
-        client_proxy_addr = f"unix!@{_random_id()}-proxy"
-        self.client_listen_addr: str = f"sim!unix!@client,{client_proxy_addr}"
-        self.client_bootstrap_addr: str = (
-            f"sim!unix!@client,{client_proxy_addr},unix!@system,{proxy_addr}"
-        )
-        bootstrap_simulator_backend(self.bootstrap_addr, proxy_addr, world_size)
+        self.bootstrap_addr: str = "sim!unix!@system"
+        self.client_listen_addr = "sim!unix!@client"
+        self.client_bootstrap_addr = "sim!unix!@client,unix!@system"
 
-        self._simulator_client = SimulatorClient(
+        self._simulator_client = SimulatorClient(self.bootstrap_addr, world_size)
         for i in range(num_meshes):
             mesh_name: str = f"mesh_{i}"
             controller_world: str = f"{mesh_name}_controller"
@@ -234,7 +223,9 @@ class Bootstrap:
             worker_world, controller_id = mesh_world
             controller_world = controller_id.world_name
             self._simulator_client.spawn_mesh(
-                self.bootstrap_addr,
+                self.bootstrap_addr,
+                f"{controller_world}[0].root",
+                worker_world,
             )
 
 
monarch/simulator/command_history.py
CHANGED

@@ -12,9 +12,9 @@ from dataclasses import dataclass
 from typing import List, NamedTuple, Optional, Sequence
 
 import torch
+from monarch._src.actor.shape import NDSlice
 
 from monarch.common import messages
-from monarch.common.shape import NDSlice
 from monarch.simulator.ir import IRGraph
 from monarch.simulator.tensor import DTensorRef
 from monarch.simulator.utils import clean_name, file_path_with_iter
monarch/simulator/interface.py
CHANGED

@@ -6,9 +6,10 @@
 
 from typing import Union
 
+from monarch._src.actor.shape import NDSlice
+
 from monarch.common.client import Client as _Client
 from monarch.common.device_mesh import DeviceMesh
-from monarch.common.shape import NDSlice
 
 from monarch.simulator.ir import IRGraph
 from monarch.simulator.simulator import (
monarch/simulator/mock_controller.py
CHANGED

@@ -25,6 +25,7 @@ from monarch._rust_bindings.monarch_extension.client import (  # @manual=//monar
 from monarch._rust_bindings.monarch_hyperactor.proc import (  # @manual=//monarch/monarch_extension:monarch_extension
     ActorId,
 )
+from monarch._src.actor.shape import iter_ranks, NDSlice, Slices as Ranks
 
 from monarch.common import messages
 
@@ -32,7 +33,6 @@ from monarch.common.controller_api import DebuggerMessage, LogMessage, MessageRe
 from monarch.common.device_mesh import no_mesh
 from monarch.common.invocation import Invocation, RemoteException, Seq
 from monarch.common.reference import Ref
-from monarch.common.shape import iter_ranks, NDSlice, Slices as Ranks
 from monarch.common.tree import flatten
 
 if TYPE_CHECKING:
monarch/simulator/simulator.py
CHANGED

@@ -43,12 +43,12 @@ import torch
 from monarch._rust_bindings.monarch_hyperactor.proc import (  # @manual=//monarch/monarch_extension:monarch_extension
     ActorId,
 )
+from monarch._src.actor.shape import iter_ranks, NDSlice
 from monarch.common import messages
 from monarch.common.controller_api import LogMessage, MessageResult
 from monarch.common.device_mesh import DeviceMesh
 from monarch.common.function import ResolvableFunction, ResolvableFunctionFromPath
 from monarch.common.invocation import DeviceException
-from monarch.common.shape import iter_ranks, NDSlice
 from monarch.simulator.command_history import CommandHistory, DTensorRef
 from monarch.simulator.config import META_VAL
 from monarch.simulator.ir import IRGraph
monarch/tensor_engine/__init__.py
ADDED

@@ -0,0 +1,23 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Monarch Tensor Engine API - Public interface for tensor engine functionality.
+"""
+
+from monarch._src.tensor_engine.rdma import (
+    is_available,
+    RDMABuffer,
+    RDMAReadTransferWarning,
+    RDMAWriteTransferWarning,
+)
+
+__all__ = [
+    "is_available",
+    "RDMABuffer",
+    "RDMAReadTransferWarning",
+    "RDMAWriteTransferWarning",
+]
monarch/tensor_worker_main.py
CHANGED

@@ -249,7 +249,9 @@ if __name__ == "__main__":
     torch.cuda.set_device = check_set_device
 
     with (
-        importlib.resources.path("monarch", "py-spy") as pyspy,
+        importlib.resources.as_file(
+            importlib.resources.files("monarch") / "py-spy"
+        ) as pyspy,
     ):
         if pyspy.exists():
             os.environ["PYSPY_BIN"] = str(pyspy)
monarch/tools/cli.py
CHANGED

@@ -86,7 +86,9 @@ class CreateCmd:
             else defaults.component_fn(config.scheduler)
         )
         component_args = component_args_from_cli(component_fn, args.component_args)
-        handle = create(config, component_fn)(**component_args)
+        appdef = component_fn(**component_args)
+
+        handle = create(config, appdef)
         print(handle)
 
 