torchmonarch-nightly 2025.8.2__cp310-cp310-manylinux2014_x86_64.whl → 2025.9.3__cp310-cp310-manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- monarch/_rust_bindings.so +0 -0
- monarch/_src/actor/actor_mesh.py +414 -216
- monarch/_src/actor/allocator.py +75 -6
- monarch/_src/actor/bootstrap_main.py +7 -4
- monarch/_src/actor/code_sync/__init__.py +2 -0
- monarch/_src/actor/debugger/__init__.py +7 -0
- monarch/_src/actor/{debugger.py → debugger/debugger.py} +246 -135
- monarch/_src/actor/{pdb_wrapper.py → debugger/pdb_wrapper.py} +62 -23
- monarch/_src/actor/endpoint.py +27 -45
- monarch/_src/actor/future.py +86 -24
- monarch/_src/actor/host_mesh.py +125 -0
- monarch/_src/actor/logging.py +94 -0
- monarch/_src/actor/pickle.py +25 -0
- monarch/_src/actor/proc_mesh.py +423 -156
- monarch/_src/actor/python_extension_methods.py +90 -0
- monarch/_src/actor/shape.py +8 -1
- monarch/_src/actor/source_loader.py +45 -0
- monarch/_src/actor/telemetry/__init__.py +172 -0
- monarch/_src/actor/telemetry/rust_span_tracing.py +6 -39
- monarch/_src/debug_cli/__init__.py +7 -0
- monarch/_src/debug_cli/debug_cli.py +43 -0
- monarch/_src/tensor_engine/rdma.py +64 -9
- monarch/_testing.py +1 -3
- monarch/actor/__init__.py +24 -4
- monarch/common/_C.so +0 -0
- monarch/common/device_mesh.py +14 -0
- monarch/common/future.py +10 -0
- monarch/common/remote.py +14 -25
- monarch/common/tensor.py +12 -0
- monarch/debug_cli/__init__.py +7 -0
- monarch/debug_cli/__main__.py +12 -0
- monarch/fetch.py +2 -2
- monarch/gradient/_gradient_generator.so +0 -0
- monarch/gradient_generator.py +4 -2
- monarch/mesh_controller.py +34 -14
- monarch/monarch_controller +0 -0
- monarch/tools/colors.py +25 -0
- monarch/tools/commands.py +42 -7
- monarch/tools/components/hyperactor.py +1 -1
- monarch/tools/config/__init__.py +31 -4
- monarch/tools/config/defaults.py +13 -3
- monarch/tools/config/environment.py +45 -0
- monarch/tools/config/workspace.py +165 -0
- monarch/tools/mesh_spec.py +2 -0
- monarch/utils/__init__.py +9 -0
- monarch/utils/utils.py +78 -0
- tests/error_test_binary.py +5 -3
- tests/python_actor_test_binary.py +52 -0
- tests/test_actor_error.py +142 -14
- tests/test_alloc.py +1 -1
- tests/test_allocator.py +59 -72
- tests/test_debugger.py +639 -45
- tests/test_env_before_cuda.py +4 -4
- tests/test_mesh_trait.py +38 -0
- tests/test_python_actors.py +965 -75
- tests/test_rdma.py +7 -6
- tests/test_tensor_engine.py +6 -6
- {torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/METADATA +82 -4
- {torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/RECORD +63 -47
- {torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/WHEEL +0 -0
- {torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/entry_points.txt +0 -0
- {torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/licenses/LICENSE +0 -0
- {torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/top_level.txt +0 -0
monarch/_src/actor/proc_mesh.py
CHANGED
@@ -6,83 +6,91 @@
|
|
6
6
|
|
7
7
|
# pyre-strict
|
8
8
|
|
9
|
+
import asyncio
|
9
10
|
import logging
|
10
11
|
import os
|
11
12
|
import sys
|
13
|
+
import threading
|
12
14
|
import warnings
|
13
15
|
from contextlib import AbstractContextManager
|
14
16
|
|
17
|
+
from functools import cache
|
18
|
+
from pathlib import Path
|
19
|
+
|
15
20
|
from typing import (
|
16
21
|
Any,
|
17
22
|
Callable,
|
18
23
|
cast,
|
19
24
|
Dict,
|
20
25
|
List,
|
26
|
+
Literal,
|
21
27
|
Optional,
|
22
28
|
Sequence,
|
29
|
+
Tuple,
|
23
30
|
Type,
|
24
31
|
TYPE_CHECKING,
|
25
32
|
TypeVar,
|
26
33
|
)
|
34
|
+
from weakref import WeakValueDictionary
|
27
35
|
|
28
|
-
from monarch._rust_bindings.monarch_extension.logging import LoggingMeshClient
|
29
36
|
from monarch._rust_bindings.monarch_hyperactor.alloc import ( # @manual=//monarch/monarch_extension:monarch_extension
|
30
37
|
Alloc,
|
31
38
|
AllocConstraints,
|
32
39
|
AllocSpec,
|
33
40
|
)
|
34
|
-
|
41
|
+
|
35
42
|
from monarch._rust_bindings.monarch_hyperactor.proc_mesh import (
|
36
43
|
ProcMesh as HyProcMesh,
|
37
44
|
ProcMeshMonitor,
|
38
45
|
)
|
46
|
+
from monarch._rust_bindings.monarch_hyperactor.pytokio import PythonTask, Shared
|
39
47
|
from monarch._rust_bindings.monarch_hyperactor.shape import Shape, Slice
|
40
|
-
from monarch._src.actor.actor_mesh import _Actor,
|
41
|
-
|
48
|
+
from monarch._src.actor.actor_mesh import _Actor, Actor, ActorMesh, context
|
42
49
|
from monarch._src.actor.allocator import (
|
43
50
|
AllocateMixin,
|
51
|
+
AllocHandle,
|
44
52
|
LocalAllocator,
|
45
53
|
ProcessAllocator,
|
46
54
|
SimAllocator,
|
47
55
|
)
|
48
56
|
from monarch._src.actor.code_sync import (
|
49
57
|
CodeSyncMeshClient,
|
58
|
+
CodeSyncMethod,
|
50
59
|
RemoteWorkspace,
|
60
|
+
WorkspaceConfig,
|
51
61
|
WorkspaceLocation,
|
52
62
|
WorkspaceShape,
|
53
63
|
)
|
54
|
-
from monarch._src.actor.debugger import (
|
55
|
-
_DEBUG_MANAGER_ACTOR_NAME,
|
56
|
-
DebugClient,
|
57
|
-
DebugManager,
|
58
|
-
)
|
59
|
-
|
60
64
|
from monarch._src.actor.device_utils import _local_device_count
|
61
65
|
|
62
66
|
from monarch._src.actor.endpoint import endpoint
|
63
|
-
from monarch._src.actor.future import Future
|
67
|
+
from monarch._src.actor.future import DeprecatedNotAFuture, Future
|
68
|
+
from monarch._src.actor.logging import LoggingManager
|
64
69
|
from monarch._src.actor.shape import MeshTrait
|
70
|
+
from monarch.tools.config.environment import CondaEnvironment
|
71
|
+
from monarch.tools.config.workspace import Workspace
|
72
|
+
from monarch.tools.utils import conda as conda_utils
|
65
73
|
|
66
|
-
HAS_TENSOR_ENGINE = False
|
67
|
-
try:
|
68
|
-
# Torch is needed for tensor engine
|
69
|
-
import torch # @manual
|
70
74
|
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
75
|
+
@cache
|
76
|
+
def _has_tensor_engine() -> bool:
|
77
|
+
try:
|
78
|
+
# Torch is needed for tensor engine
|
79
|
+
import torch # @manual
|
76
80
|
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
+
# Confirm that rust bindings were built with tensor engine enabled
|
82
|
+
from monarch._rust_bindings.rdma import _RdmaManager # noqa
|
83
|
+
|
84
|
+
return True
|
85
|
+
except ImportError:
|
86
|
+
logging.warning("Tensor engine is not available on this platform")
|
87
|
+
return False
|
81
88
|
|
82
89
|
|
83
90
|
if TYPE_CHECKING:
|
84
91
|
Tensor = Any
|
85
92
|
DeviceMesh = Any
|
93
|
+
from monarch._src.actor.host_mesh import HostMesh
|
86
94
|
|
87
95
|
|
88
96
|
class SetupActor(Actor):
|
@@ -114,55 +122,108 @@ except ImportError:
|
|
114
122
|
IN_PAR = False
|
115
123
|
|
116
124
|
|
117
|
-
|
125
|
+
# A temporary gate used by the PythonActorMesh/PythonActorMeshRef migration.
|
126
|
+
# We can use this gate to quickly roll back to using _ActorMeshRefImpl, if we
|
127
|
+
# encounter any issues with the migration.
|
128
|
+
#
|
129
|
+
# This should be removed once we confirm PythonActorMesh/PythonActorMeshRef is
|
130
|
+
# working correctly in production.
|
131
|
+
@cache
|
132
|
+
def _use_standin_mesh() -> bool:
|
133
|
+
return os.getenv("USE_STANDIN_ACTOR_MESH", default="0") != "0"
|
134
|
+
|
135
|
+
|
136
|
+
# Ultra-hack to allow actors to identify proc meshes but with no real functionality.
|
137
|
+
class ProcMeshRef:
|
138
|
+
def __init__(self, proc_mesh_id: int) -> None:
|
139
|
+
self._proc_mesh_id = proc_mesh_id
|
140
|
+
self._host_mesh: Optional["HostMesh"] = None
|
141
|
+
|
142
|
+
@classmethod
|
143
|
+
def _fake_proc_mesh(cls, proc_mesh_id: int) -> "ProcMesh":
|
144
|
+
return cast(ProcMesh, cls(proc_mesh_id))
|
145
|
+
|
146
|
+
def __getattr__(self, attr: str) -> Any:
|
147
|
+
# AttributeError instead of NotImplementedError so that any hasattr calls
|
148
|
+
# will properly return False
|
149
|
+
raise AttributeError(
|
150
|
+
f"NYI: attempting to get ProcMesh attribute `{attr}` on object that's actually a ProcMeshRef"
|
151
|
+
)
|
152
|
+
|
153
|
+
def __hash__(self) -> int:
|
154
|
+
return hash(self._proc_mesh_id)
|
155
|
+
|
156
|
+
def __eq__(self, other: object) -> bool:
|
157
|
+
if not isinstance(other, ProcMeshRef):
|
158
|
+
return False
|
159
|
+
return self._proc_mesh_id == other._proc_mesh_id
|
160
|
+
|
161
|
+
@property
|
162
|
+
def _proc_mesh(self) -> Shared["HyProcMesh"]:
|
163
|
+
return _deref_proc_mesh(self)._proc_mesh
|
164
|
+
|
165
|
+
|
166
|
+
_proc_mesh_lock: threading.Lock = threading.Lock()
|
167
|
+
_proc_mesh_key: int = 0
|
168
|
+
_proc_mesh_registry: WeakValueDictionary[ProcMeshRef, "ProcMesh"] = (
|
169
|
+
WeakValueDictionary()
|
170
|
+
)
|
171
|
+
|
172
|
+
|
173
|
+
def _deref_proc_mesh(proc_mesh: ProcMeshRef) -> "ProcMesh":
|
174
|
+
if proc_mesh not in _proc_mesh_registry:
|
175
|
+
raise ValueError(
|
176
|
+
f"ProcMesh with id {proc_mesh._proc_mesh_id} does not exist on host."
|
177
|
+
)
|
178
|
+
return _proc_mesh_registry[proc_mesh]
|
179
|
+
|
180
|
+
|
181
|
+
class ProcMesh(MeshTrait, DeprecatedNotAFuture):
|
118
182
|
def __init__(
|
119
183
|
self,
|
120
|
-
hy_proc_mesh: HyProcMesh,
|
121
|
-
|
184
|
+
hy_proc_mesh: "Shared[HyProcMesh]",
|
185
|
+
shape: Shape,
|
122
186
|
_device_mesh: Optional["DeviceMesh"] = None,
|
123
187
|
) -> None:
|
124
188
|
self._proc_mesh = hy_proc_mesh
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
self.
|
189
|
+
global _proc_mesh_lock, _proc_mesh_key
|
190
|
+
with _proc_mesh_lock:
|
191
|
+
self._proc_mesh_id: int = _proc_mesh_key
|
192
|
+
_proc_mesh_key += 1
|
193
|
+
self._shape = shape
|
194
|
+
# until we have real slicing support keep track
|
195
|
+
# of whether this is a slice of a real proc_mesh
|
196
|
+
self._slice = False
|
130
197
|
self._code_sync_client: Optional[CodeSyncMeshClient] = None
|
131
|
-
self.
|
198
|
+
self._logging_manager: LoggingManager = LoggingManager()
|
132
199
|
self._maybe_device_mesh: Optional["DeviceMesh"] = _device_mesh
|
133
200
|
self._stopped = False
|
201
|
+
self._controller_controller: Optional["_ControllerController"] = None
|
202
|
+
# currently set only for context()'s proc_mesh to be a local host mesh.
|
203
|
+
self._host_mesh: Optional["HostMesh"] = None
|
134
204
|
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
if HAS_TENSOR_ENGINE and _RdmaBuffer.rdma_supported()
|
144
|
-
else None
|
145
|
-
)
|
146
|
-
|
147
|
-
_debug_manager = await self._spawn_nonblocking(
|
148
|
-
_DEBUG_MANAGER_ACTOR_NAME, DebugManager, await _debug_client()
|
149
|
-
)
|
205
|
+
@property
|
206
|
+
def initialized(self) -> Future[Literal[True]]:
|
207
|
+
"""
|
208
|
+
Future completes with 'True' when the ProcMesh has initialized.
|
209
|
+
Because ProcMesh are remote objects, there is no guarantee that the ProcMesh is
|
210
|
+
still usable after this completes, only that at some point in the past it was usable.
|
211
|
+
"""
|
212
|
+
pm: Shared[HyProcMesh] = self._proc_mesh
|
150
213
|
|
151
|
-
|
152
|
-
|
214
|
+
async def task() -> Literal[True]:
|
215
|
+
await pm
|
216
|
+
return True
|
153
217
|
|
154
|
-
|
155
|
-
# If the user has passed the setup lambda, we need to call
|
156
|
-
# it here before any of the other actors are spawned so that
|
157
|
-
# the environment variables are set up before cuda init.
|
158
|
-
setup_actor = await self._spawn_nonblocking("setup", SetupActor, setup)
|
159
|
-
# pyre-ignore
|
160
|
-
await setup_actor.setup.call()._status.coro
|
161
|
-
return self
|
218
|
+
return Future(coro=task())
|
162
219
|
|
163
220
|
@property
|
164
|
-
def
|
165
|
-
|
221
|
+
def host_mesh(self) -> "HostMesh":
|
222
|
+
if self._host_mesh is None:
|
223
|
+
raise NotImplementedError(
|
224
|
+
"NYI complete for release 0.1 (ProcMeshRef knowing its host mesh)"
|
225
|
+
)
|
226
|
+
return self._host_mesh
|
166
227
|
|
167
228
|
@property
|
168
229
|
def _ndslice(self) -> Slice:
|
@@ -173,17 +234,34 @@ class ProcMesh(MeshTrait):
|
|
173
234
|
return self._shape.labels
|
174
235
|
|
175
236
|
def _new_with_shape(self, shape: Shape) -> "ProcMesh":
|
237
|
+
# make sure that if we slice something with unity,
|
238
|
+
# we do not lose the ability to spawn on it.
|
239
|
+
# remove when spawn is implemented.
|
240
|
+
if shape == self._shape:
|
241
|
+
return self
|
176
242
|
device_mesh = (
|
177
243
|
None
|
178
244
|
if self._maybe_device_mesh is None
|
179
245
|
else self._device_mesh._new_with_shape(shape)
|
180
246
|
)
|
181
|
-
|
247
|
+
pm = ProcMesh(self._proc_mesh, shape, _device_mesh=device_mesh)
|
248
|
+
pm._slice = True
|
249
|
+
return pm
|
182
250
|
|
183
|
-
def spawn(self, name: str, Class: Type[T], *args: Any, **kwargs: Any) ->
|
184
|
-
if self.
|
251
|
+
def spawn(self, name: str, Class: Type[T], *args: Any, **kwargs: Any) -> T:
|
252
|
+
if self._slice:
|
185
253
|
raise NotImplementedError("NYI: spawn on slice of a proc mesh.")
|
186
|
-
return
|
254
|
+
return self._spawn_nonblocking(name, Class, *args, **kwargs)
|
255
|
+
|
256
|
+
@property
|
257
|
+
async def _proc_mesh_for_asyncio_fixme(self) -> HyProcMesh:
|
258
|
+
"""
|
259
|
+
Get ProcMesh on the asyncio event stream.
|
260
|
+
We should redo this functionality to work on the tokio stream.
|
261
|
+
This must be called on the asyncio stream.
|
262
|
+
"""
|
263
|
+
assert asyncio.get_running_loop() is not None
|
264
|
+
return await Future(coro=self._proc_mesh.task())
|
187
265
|
|
188
266
|
async def monitor(self) -> ProcMeshMonitor:
|
189
267
|
"""
|
@@ -201,12 +279,17 @@ class ProcMesh(MeshTrait):
|
|
201
279
|
# Kick off in background
|
202
280
|
asyncio.create_task(monitor_loop(monitor))
|
203
281
|
"""
|
204
|
-
|
282
|
+
# todo: move monitor to tokio loop
|
283
|
+
proc_mesh = await Future(coro=self._proc_mesh.task())
|
284
|
+
return await proc_mesh.monitor()
|
205
285
|
|
206
286
|
@classmethod
|
207
287
|
def from_alloc(
|
208
|
-
self,
|
209
|
-
|
288
|
+
self,
|
289
|
+
alloc: AllocHandle,
|
290
|
+
setup: Callable[[], None] | None = None,
|
291
|
+
_attach_controller_controller: bool = True,
|
292
|
+
) -> "ProcMesh":
|
210
293
|
"""
|
211
294
|
Allocate a process mesh according to the provided alloc.
|
212
295
|
Returns when the mesh is fully allocated.
|
@@ -225,37 +308,98 @@ class ProcMesh(MeshTrait):
|
|
225
308
|
os.environ["LOCAL_RANK"] = str(rank["gpus"])
|
226
309
|
```
|
227
310
|
"""
|
228
|
-
|
229
|
-
|
311
|
+
|
312
|
+
async def task() -> HyProcMesh:
|
313
|
+
return await HyProcMesh.allocate_nonblocking(await alloc._hy_alloc)
|
314
|
+
|
315
|
+
shape = Shape(
|
316
|
+
list(alloc._extent.keys()),
|
317
|
+
Slice.new_row_major(list(alloc._extent.values())),
|
230
318
|
)
|
231
319
|
|
320
|
+
hy_proc_mesh = PythonTask.from_coroutine(task()).spawn()
|
321
|
+
|
322
|
+
pm = ProcMesh(hy_proc_mesh, shape)
|
323
|
+
if _attach_controller_controller:
|
324
|
+
instance = context().actor_instance
|
325
|
+
pm._controller_controller = instance._controller_controller
|
326
|
+
instance._add_child(pm)
|
327
|
+
|
328
|
+
async def task(
|
329
|
+
pm: "ProcMesh",
|
330
|
+
hy_proc_mesh_task: "Shared[HyProcMesh]",
|
331
|
+
setup_actor: Optional[SetupActor],
|
332
|
+
stream_log_to_client: bool,
|
333
|
+
) -> HyProcMesh:
|
334
|
+
hy_proc_mesh = await hy_proc_mesh_task
|
335
|
+
|
336
|
+
await pm._logging_manager.init(hy_proc_mesh, stream_log_to_client)
|
337
|
+
|
338
|
+
if setup_actor is not None:
|
339
|
+
await setup_actor.setup.call()
|
340
|
+
|
341
|
+
return hy_proc_mesh
|
342
|
+
|
343
|
+
setup_actor = None
|
344
|
+
if setup is not None:
|
345
|
+
# If the user has passed the setup lambda, we need to call
|
346
|
+
# it here before any of the other actors are spawned so that
|
347
|
+
# the environment variables are set up before cuda init.
|
348
|
+
setup_actor = pm._spawn_nonblocking_on(
|
349
|
+
hy_proc_mesh, "setup", SetupActor, setup
|
350
|
+
)
|
351
|
+
|
352
|
+
pm._proc_mesh = PythonTask.from_coroutine(
|
353
|
+
task(pm, hy_proc_mesh, setup_actor, alloc.stream_logs)
|
354
|
+
).spawn()
|
355
|
+
|
356
|
+
return pm
|
357
|
+
|
232
358
|
def __repr__(self) -> str:
|
233
359
|
return repr(self._proc_mesh)
|
234
360
|
|
235
361
|
def __str__(self) -> str:
|
236
362
|
return str(self._proc_mesh)
|
237
363
|
|
238
|
-
|
364
|
+
def _spawn_nonblocking(
|
239
365
|
self, name: str, Class: Type[T], *args: Any, **kwargs: Any
|
366
|
+
) -> T:
|
367
|
+
return self._spawn_nonblocking_on(self._proc_mesh, name, Class, *args, **kwargs)
|
368
|
+
|
369
|
+
def to_table(self) -> str:
|
370
|
+
return self._device_mesh.to_table()
|
371
|
+
|
372
|
+
def _spawn_nonblocking_on(
|
373
|
+
self,
|
374
|
+
pm: "Shared[HyProcMesh]",
|
375
|
+
name: str,
|
376
|
+
Class: Type[T],
|
377
|
+
*args: Any,
|
378
|
+
**kwargs: Any,
|
240
379
|
) -> T:
|
241
380
|
if not issubclass(Class, Actor):
|
242
381
|
raise ValueError(
|
243
382
|
f"{Class} must subclass monarch.service.Actor to spawn it."
|
244
383
|
)
|
245
|
-
|
246
|
-
|
384
|
+
|
385
|
+
actor_mesh = HyProcMesh.spawn_async(pm, name, _Actor, _use_standin_mesh())
|
386
|
+
instance = context().actor_instance
|
387
|
+
service = ActorMesh._create(
|
247
388
|
Class,
|
248
|
-
|
249
|
-
|
389
|
+
actor_mesh,
|
390
|
+
instance._mailbox,
|
391
|
+
self._shape,
|
392
|
+
self,
|
393
|
+
self._controller_controller,
|
394
|
+
*args,
|
395
|
+
**kwargs,
|
250
396
|
)
|
251
|
-
|
252
|
-
# doing `ActorMeshRef(Class, actor_handle)` but not calling _create.
|
253
|
-
service._create(args, kwargs)
|
397
|
+
instance._add_child(service)
|
254
398
|
return cast(T, service)
|
255
399
|
|
256
400
|
@property
|
257
401
|
def _device_mesh(self) -> "DeviceMesh":
|
258
|
-
if not
|
402
|
+
if not _has_tensor_engine():
|
259
403
|
raise RuntimeError(
|
260
404
|
"DeviceMesh is not available because tensor_engine was not compiled (USE_TENSOR_ENGINE=0)"
|
261
405
|
)
|
@@ -264,7 +408,7 @@ class ProcMesh(MeshTrait):
|
|
264
408
|
from monarch.mesh_controller import spawn_tensor_engine # @manual
|
265
409
|
|
266
410
|
if self._maybe_device_mesh is None:
|
267
|
-
if self.
|
411
|
+
if self._slice:
|
268
412
|
raise NotImplementedError(
|
269
413
|
"NYI: activating a proc mesh must first happen on the root proc_mesh until we fix spawning on submeshes."
|
270
414
|
)
|
@@ -282,52 +426,97 @@ class ProcMesh(MeshTrait):
|
|
282
426
|
def rank_tensors(self) -> Dict[str, "Tensor"]:
|
283
427
|
return self._device_mesh.ranks
|
284
428
|
|
285
|
-
async def sync_workspace(
|
429
|
+
async def sync_workspace(
|
430
|
+
self,
|
431
|
+
workspace: Workspace,
|
432
|
+
conda: bool = False,
|
433
|
+
auto_reload: bool = False,
|
434
|
+
) -> None:
|
286
435
|
if self._code_sync_client is None:
|
287
436
|
self._code_sync_client = CodeSyncMeshClient.spawn_blocking(
|
288
|
-
proc_mesh=self.
|
437
|
+
proc_mesh=await self._proc_mesh_for_asyncio_fixme,
|
289
438
|
)
|
439
|
+
|
290
440
|
# TODO(agallagher): We need some way to configure and pass this
|
291
441
|
# in -- right now we're assuming the `gpu` dimension, which isn't
|
292
442
|
# correct.
|
293
443
|
# The workspace shape (i.e. only perform one rsync per host).
|
294
|
-
assert set(self.
|
444
|
+
assert set(self._shape.labels).issubset({"gpus", "hosts"})
|
445
|
+
|
446
|
+
workspaces = []
|
447
|
+
for src_dir, dst_dir in workspace.dirs.items():
|
448
|
+
workspaces.append(
|
449
|
+
WorkspaceConfig(
|
450
|
+
local=Path(src_dir),
|
451
|
+
remote=RemoteWorkspace(
|
452
|
+
location=WorkspaceLocation.FromEnvVar(
|
453
|
+
env="WORKSPACE_DIR",
|
454
|
+
relpath=dst_dir,
|
455
|
+
),
|
456
|
+
shape=WorkspaceShape.shared("gpus"),
|
457
|
+
),
|
458
|
+
method=CodeSyncMethod.Rsync,
|
459
|
+
),
|
460
|
+
)
|
461
|
+
|
462
|
+
# If `conda` is set, also sync the currently activated conda env.
|
463
|
+
conda_prefix = conda_utils.active_env_dir()
|
464
|
+
if isinstance(workspace.env, CondaEnvironment):
|
465
|
+
conda_prefix = workspace.env._conda_prefix
|
466
|
+
|
467
|
+
if conda and conda_prefix is not None:
|
468
|
+
conda_prefix = Path(conda_prefix)
|
469
|
+
|
470
|
+
# Resolve top-level symlinks for rsync/conda-sync.
|
471
|
+
while conda_prefix.is_symlink():
|
472
|
+
conda_prefix = conda_prefix.parent / conda_prefix.readlink()
|
473
|
+
|
474
|
+
workspaces.append(
|
475
|
+
WorkspaceConfig(
|
476
|
+
local=conda_prefix,
|
477
|
+
remote=RemoteWorkspace(
|
478
|
+
location=WorkspaceLocation.FromEnvVar(
|
479
|
+
env="CONDA_PREFIX",
|
480
|
+
relpath="",
|
481
|
+
),
|
482
|
+
shape=WorkspaceShape.shared("gpus"),
|
483
|
+
),
|
484
|
+
method=CodeSyncMethod.CondaSync,
|
485
|
+
),
|
486
|
+
)
|
487
|
+
|
295
488
|
assert self._code_sync_client is not None
|
296
|
-
await self._code_sync_client.
|
297
|
-
|
298
|
-
# workspace dir, rather than use PWD?
|
299
|
-
local=os.getcwd(),
|
300
|
-
remote=RemoteWorkspace(
|
301
|
-
location=WorkspaceLocation.FromEnvVar("WORKSPACE_DIR"),
|
302
|
-
shape=WorkspaceShape.shared("gpus"),
|
303
|
-
),
|
489
|
+
await self._code_sync_client.sync_workspaces(
|
490
|
+
workspaces=workspaces,
|
304
491
|
auto_reload=auto_reload,
|
305
492
|
)
|
306
493
|
|
307
494
|
async def logging_option(
|
308
495
|
self,
|
309
|
-
stream_to_client: bool =
|
310
|
-
aggregate_window_sec: int | None =
|
496
|
+
stream_to_client: bool = True,
|
497
|
+
aggregate_window_sec: int | None = 3,
|
498
|
+
level: int = logging.INFO,
|
311
499
|
) -> None:
|
312
500
|
"""
|
313
501
|
Set the logging options for the remote processes
|
314
502
|
|
315
503
|
Args:
|
316
504
|
stream_to_client (bool): If True, logs from the remote processes will be streamed to the client.
|
317
|
-
Defaults to
|
505
|
+
Defaults to True.
|
318
506
|
aggregate_window_sec (Optional[int]): If not None, logs from the remote processes will be aggregated
|
319
|
-
and sent to the client every aggregate_window_sec seconds. Defaults to
|
320
|
-
|
507
|
+
and sent to the client every aggregate_window_sec seconds. Defaults to 3 seconds; None means no aggregation.
|
508
|
+
Error will be thrown if aggregate_window_sec is set and stream_to_client is False.
|
509
|
+
level (int): The logging level of the logger. Defaults to logging.INFO.
|
321
510
|
|
322
511
|
Returns:
|
323
512
|
None
|
324
513
|
"""
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
514
|
+
await self.initialized
|
515
|
+
|
516
|
+
await self._logging_manager.logging_option(
|
517
|
+
stream_to_client=stream_to_client,
|
518
|
+
aggregate_window_sec=aggregate_window_sec,
|
519
|
+
level=level,
|
331
520
|
)
|
332
521
|
|
333
522
|
async def __aenter__(self) -> "ProcMesh":
|
@@ -336,8 +525,10 @@ class ProcMesh(MeshTrait):
|
|
336
525
|
return self
|
337
526
|
|
338
527
|
def stop(self) -> Future[None]:
|
528
|
+
self._logging_manager.stop()
|
529
|
+
|
339
530
|
async def _stop_nonblocking() -> None:
|
340
|
-
await self._proc_mesh.stop_nonblocking()
|
531
|
+
await (await self._proc_mesh).stop_nonblocking()
|
341
532
|
self._stopped = True
|
342
533
|
|
343
534
|
return Future(coro=_stop_nonblocking())
|
@@ -353,6 +544,8 @@ class ProcMesh(MeshTrait):
|
|
353
544
|
# Finalizer to check if the proc mesh was closed properly.
|
354
545
|
def __del__(self) -> None:
|
355
546
|
if not self._stopped:
|
547
|
+
self._logging_manager.stop()
|
548
|
+
|
356
549
|
warnings.warn(
|
357
550
|
f"unstopped ProcMesh {self!r}",
|
358
551
|
ResourceWarning,
|
@@ -361,17 +554,59 @@ class ProcMesh(MeshTrait):
|
|
361
554
|
)
|
362
555
|
# Cannot call stop here because it is async.
|
363
556
|
|
557
|
+
def __reduce_ex__(self, protocol: ...) -> Tuple[Any, Tuple[Any, ...]]:
|
558
|
+
# Ultra-hack. Remote python actors can get a reference to this proc mesh that
|
559
|
+
# doesn't have any real functionality, but if they send a request back to the client
|
560
|
+
# where the real proc mesh exists, the client can look it up in the proc mesh registry
|
561
|
+
# and do something with it.
|
562
|
+
global _proc_mesh_registry
|
563
|
+
_proc_mesh_registry[ProcMeshRef(self._proc_mesh_id)] = self
|
564
|
+
return (ProcMeshRef._fake_proc_mesh, (self._proc_mesh_id,))
|
565
|
+
|
566
|
+
@staticmethod
|
567
|
+
def _from_ref(proc_mesh_ref: ProcMeshRef) -> "ProcMesh":
|
568
|
+
maybe_proc_mesh = _proc_mesh_registry.get(proc_mesh_ref, None)
|
569
|
+
if maybe_proc_mesh is None:
|
570
|
+
raise RuntimeError(
|
571
|
+
f"ProcMesh with id {proc_mesh_ref._proc_mesh_id} does not exist"
|
572
|
+
)
|
573
|
+
return maybe_proc_mesh
|
574
|
+
|
575
|
+
|
576
|
+
def local_proc_mesh(*, gpus: Optional[int] = None, hosts: int = 1) -> ProcMesh:
|
577
|
+
warnings.warn(
|
578
|
+
"Use monarch._src.actor.host_mesh.fake_in_process_host().spawn_procs for testing. For launching an actor in the current process use this_proc().spawn_procs()",
|
579
|
+
DeprecationWarning,
|
580
|
+
stacklevel=2,
|
581
|
+
)
|
364
582
|
|
365
|
-
|
366
|
-
|
367
|
-
|
583
|
+
return _proc_mesh_from_allocator(
|
584
|
+
allocator=LocalAllocator(),
|
585
|
+
gpus=gpus,
|
586
|
+
hosts=hosts,
|
368
587
|
)
|
369
588
|
|
370
589
|
|
371
|
-
def sim_proc_mesh(
|
372
|
-
|
373
|
-
|
590
|
+
def sim_proc_mesh(
|
591
|
+
*,
|
592
|
+
gpus: int = 1,
|
593
|
+
hosts: int = 1,
|
594
|
+
racks: int = 1,
|
595
|
+
zones: int = 1,
|
596
|
+
dcs: int = 1,
|
597
|
+
regions: int = 1,
|
598
|
+
) -> ProcMesh:
|
599
|
+
spec: AllocSpec = AllocSpec(
|
600
|
+
AllocConstraints(),
|
601
|
+
hosts=hosts,
|
602
|
+
gpus=gpus,
|
603
|
+
racks=racks,
|
604
|
+
zones=zones,
|
605
|
+
dcs=dcs,
|
606
|
+
regions=regions,
|
374
607
|
)
|
608
|
+
alloc = SimAllocator().allocate(spec)
|
609
|
+
return ProcMesh.from_alloc(alloc, None, True)
|
375
610
|
|
376
611
|
|
377
612
|
_BOOTSTRAP_MAIN = "monarch._src.actor.bootstrap_main"
|
@@ -392,25 +627,19 @@ def _get_bootstrap_args() -> tuple[str, Optional[list[str]], dict[str, str]]:
|
|
392
627
|
return cmd, args, env
|
393
628
|
|
394
629
|
|
395
|
-
async def
|
396
|
-
alloc: Alloc,
|
397
|
-
|
398
|
-
|
399
|
-
) -> ProcMesh:
|
400
|
-
_hy_proc_mesh = await HyProcMesh.allocate_nonblocking(alloc)
|
401
|
-
proc_mesh = ProcMesh(_hy_proc_mesh)
|
402
|
-
if init_manager_actors:
|
403
|
-
await proc_mesh._init_manager_actors(setup)
|
404
|
-
return proc_mesh
|
630
|
+
async def _hy_proc_mesh_from_alloc_coro(
|
631
|
+
alloc: "Shared[Alloc] | PythonTask[Alloc]",
|
632
|
+
) -> HyProcMesh:
|
633
|
+
return await HyProcMesh.allocate_nonblocking(await alloc)
|
405
634
|
|
406
635
|
|
407
|
-
|
636
|
+
def _proc_mesh_from_allocator(
|
408
637
|
*,
|
409
638
|
allocator: AllocateMixin,
|
410
|
-
gpus: Optional[int]
|
411
|
-
hosts: int
|
639
|
+
gpus: Optional[int],
|
640
|
+
hosts: int,
|
412
641
|
setup: Callable[[], None] | None = None,
|
413
|
-
|
642
|
+
_attach_controller_controller: bool = True,
|
414
643
|
) -> ProcMesh:
|
415
644
|
if gpus is None:
|
416
645
|
gpus = _local_device_count()
|
@@ -418,9 +647,8 @@ async def _proc_mesh_coro(
|
|
418
647
|
# test_remote_function_all_gather expects that hosts comes before gpus
|
419
648
|
# in the order of the dimensions.
|
420
649
|
spec: AllocSpec = AllocSpec(AllocConstraints(), hosts=hosts, gpus=gpus)
|
421
|
-
alloc =
|
422
|
-
|
423
|
-
return await _proc_mesh_from_alloc_coro(alloc, setup, init_manager_actors)
|
650
|
+
alloc = allocator.allocate(spec)
|
651
|
+
return ProcMesh.from_alloc(alloc, setup, _attach_controller_controller)
|
424
652
|
|
425
653
|
|
426
654
|
def proc_mesh(
|
@@ -429,53 +657,92 @@ def proc_mesh(
|
|
429
657
|
hosts: int = 1,
|
430
658
|
env: dict[str, str] | None = None,
|
431
659
|
setup: Callable[[], None] | None = None,
|
432
|
-
) ->
|
433
|
-
|
660
|
+
) -> ProcMesh:
|
661
|
+
warnings.warn(
|
662
|
+
"use this_host().spawn_procs(per_host = {'hosts': 2, 'gpus': 3}) instead of monarch.actor.proc_mesh(hosts=2, gpus=3)",
|
663
|
+
DeprecationWarning,
|
664
|
+
stacklevel=2,
|
665
|
+
)
|
434
666
|
|
667
|
+
env = env or {}
|
435
668
|
# Todo: Deprecate the env field from the ProcessAllocator
|
436
669
|
# The PAR_MAIN_OVERRIDE needs to be passed as an env
|
437
670
|
# to the proc mesh construction in rust, so can not be moved to the
|
438
671
|
# SetupActor yet
|
439
672
|
cmd, args, bootstrap_env = _get_bootstrap_args()
|
440
673
|
env.update(bootstrap_env)
|
441
|
-
|
442
|
-
|
674
|
+
return _proc_mesh_from_allocator(
|
675
|
+
allocator=ProcessAllocator(cmd, args, env),
|
443
676
|
hosts=hosts,
|
677
|
+
gpus=gpus,
|
444
678
|
setup=setup,
|
445
|
-
|
446
|
-
init_manager_actors=True,
|
679
|
+
_attach_controller_controller=True,
|
447
680
|
)
|
448
|
-
return Future(coro=task)
|
449
681
|
|
450
682
|
|
451
|
-
|
683
|
+
_ActorType = TypeVar("_ActorType", bound=Actor)
|
452
684
|
|
453
685
|
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
# circular import reasons.
|
458
|
-
async def _get_debug_proc_mesh() -> "ProcMesh":
|
459
|
-
global _debug_proc_mesh
|
460
|
-
if _debug_proc_mesh is None:
|
461
|
-
_debug_proc_mesh = await _proc_mesh_coro(
|
462
|
-
gpus=1, hosts=1, allocator=LocalAllocator(), init_manager_actors=False
|
463
|
-
)
|
464
|
-
return _debug_proc_mesh
|
465
|
-
|
686
|
+
class _ControllerController(Actor):
|
687
|
+
def __init__(self) -> None:
|
688
|
+
self._controllers: Dict[str, Actor] = {}
|
466
689
|
|
467
|
-
|
690
|
+
# pyre-ignore
|
691
|
+
@endpoint
|
692
|
+
def get_or_spawn(
|
693
|
+
self, name: str, Class: Type[_ActorType], *args: Any, **kwargs: Any
|
694
|
+
) -> _ActorType:
|
695
|
+
if name not in self._controllers:
|
696
|
+
proc_mesh = _proc_mesh_from_allocator(
|
697
|
+
gpus=1,
|
698
|
+
hosts=1,
|
699
|
+
allocator=LocalAllocator(),
|
700
|
+
)
|
701
|
+
self._controllers[name] = proc_mesh.spawn(name, Class, *args, **kwargs)
|
702
|
+
return cast(_ActorType, self._controllers[name])
|
703
|
+
|
704
|
+
|
705
|
+
_cc_init = threading.Lock()
|
706
|
+
_cc_proc_mesh: Optional["ProcMesh"] = None
|
707
|
+
_controller_controller: Optional["_ControllerController"] = None
|
708
|
+
|
709
|
+
|
710
|
+
# Lazy init so that the controller_controller and proc do not produce logs when they aren't used.
|
711
|
+
# Checking for the controller (when it does not already exist in the MonarchContext) needs a lock,
|
712
|
+
# otherwise two initializing procs will both try to init resulting in duplicates. The critical
|
713
|
+
# region is not blocking: it spawns a separate task to do the init, assigns the
|
714
|
+
# Shared[_ControllerController] from that task to the global and releases the lock.
|
715
|
+
def _get_controller_controller() -> "Tuple[ProcMesh, _ControllerController]":
|
716
|
+
global _controller_controller, _cc_proc_mesh
|
717
|
+
with _cc_init:
|
718
|
+
if _controller_controller is None:
|
719
|
+
alloc = LocalAllocator().allocate(AllocSpec(AllocConstraints()))
|
720
|
+
_cc_proc_mesh = ProcMesh.from_alloc(
|
721
|
+
alloc, _attach_controller_controller=False
|
722
|
+
)
|
723
|
+
_controller_controller = _cc_proc_mesh.spawn(
|
724
|
+
"controller_controller", _ControllerController
|
725
|
+
)
|
726
|
+
assert _cc_proc_mesh is not None
|
727
|
+
return _cc_proc_mesh, _controller_controller
|
468
728
|
|
469
729
|
|
470
|
-
|
471
|
-
|
472
|
-
|
473
|
-
|
474
|
-
if
|
475
|
-
|
476
|
-
_debug_client_mesh = await mesh._spawn_nonblocking("debug_client", DebugClient)
|
477
|
-
return _debug_client_mesh
|
730
|
+
def get_or_spawn_controller(
|
731
|
+
name: str, Class: Type["_ActorType"], *args: Any, **kwargs: Any
|
732
|
+
) -> Future["_ActorType"]:
|
733
|
+
"""
|
734
|
+
Creates a singleton actor (controller) indexed by name, or if it already exists, returns the
|
735
|
+
existing actor.
|
478
736
|
|
737
|
+
Args:
|
738
|
+
name (str): The unique name of the actor, used as a key for retrieval.
|
739
|
+
Class (Type): The class of the actor to spawn. Must be a subclass of Actor.
|
740
|
+
*args (Any): Positional arguments to pass to the actor constructor.
|
741
|
+
**kwargs (Any): Keyword arguments to pass to the actor constructor.
|
479
742
|
|
480
|
-
|
481
|
-
|
743
|
+
Returns:
|
744
|
+
A Future that resolves to a reference to the actor.
|
745
|
+
"""
|
746
|
+
return context().actor_instance._controller_controller.get_or_spawn.call_one(
|
747
|
+
name, Class, *args, **kwargs
|
748
|
+
)
|