torchmonarch-nightly 2025.6.30__cp310-cp310-manylinux2014_x86_64.whl → 2025.7.25__cp310-cp310-manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- monarch/__init__.py +13 -9
- monarch/_rust_bindings.so +0 -0
- monarch/{_monarch/selection → _src/actor}/__init__.py +3 -7
- monarch/_src/actor/actor_mesh.py +874 -0
- monarch/{allocator.py → _src/actor/allocator.py} +26 -17
- monarch/_src/actor/bootstrap_main.py +73 -0
- monarch/{code_sync.py → _src/actor/code_sync/__init__.py} +3 -1
- monarch/_src/actor/code_sync/auto_reload.py +223 -0
- monarch/_src/actor/debugger.py +565 -0
- monarch/_src/actor/endpoint.py +270 -0
- monarch/_src/actor/event_loop.py +97 -0
- monarch/_src/actor/future.py +100 -0
- monarch/{pdb_wrapper.py → _src/actor/pdb_wrapper.py} +47 -46
- monarch/{common/pickle_flatten.py → _src/actor/pickle.py} +26 -2
- monarch/_src/actor/proc_mesh.py +500 -0
- monarch/_src/actor/sync_state.py +18 -0
- monarch/{telemetry.py → _src/actor/telemetry/__init__.py} +1 -1
- monarch/_src/actor/telemetry/rust_span_tracing.py +159 -0
- monarch/_src/actor/tensor_engine_shim.py +56 -0
- monarch/_src/tensor_engine/rdma.py +180 -0
- monarch/_testing.py +3 -2
- monarch/actor/__init__.py +51 -0
- monarch/actor_mesh.py +6 -752
- monarch/bootstrap_main.py +8 -47
- monarch/common/client.py +1 -1
- monarch/common/controller_api.py +2 -1
- monarch/common/device_mesh.py +12 -2
- monarch/common/messages.py +12 -1
- monarch/common/recording.py +4 -3
- monarch/common/remote.py +135 -52
- monarch/common/tensor.py +2 -1
- monarch/controller/backend.py +2 -2
- monarch/controller/controller.py +2 -1
- monarch/controller/rust_backend/controller.py +2 -1
- monarch/fetch.py +3 -5
- monarch/mesh_controller.py +201 -139
- monarch/monarch_controller +0 -0
- monarch/opaque_module.py +4 -6
- monarch/opaque_object.py +3 -3
- monarch/proc_mesh.py +6 -309
- monarch/python_local_mesh.py +1 -1
- monarch/rust_backend_mesh.py +2 -1
- monarch/rust_local_mesh.py +4 -2
- monarch/sim_mesh.py +10 -19
- monarch/simulator/command_history.py +1 -1
- monarch/simulator/interface.py +2 -1
- monarch/simulator/mock_controller.py +1 -1
- monarch/simulator/simulator.py +1 -1
- monarch/tensor_engine/__init__.py +23 -0
- monarch/tensor_worker_main.py +3 -1
- monarch/tools/cli.py +3 -1
- monarch/tools/commands.py +95 -35
- monarch/tools/mesh_spec.py +55 -0
- monarch/tools/utils.py +38 -0
- monarch/worker/worker.py +1 -1
- monarch/world_mesh.py +2 -1
- monarch_supervisor/python_executable.py +6 -3
- tests/error_test_binary.py +75 -9
- tests/test_actor_error.py +370 -21
- tests/test_alloc.py +1 -1
- tests/test_allocator.py +373 -17
- tests/test_controller.py +2 -0
- tests/test_debugger.py +416 -0
- tests/test_env_before_cuda.py +162 -0
- tests/test_python_actors.py +184 -332
- tests/test_rdma.py +198 -0
- tests/test_remote_functions.py +40 -12
- tests/test_rust_backend.py +7 -5
- tests/test_sim_backend.py +1 -4
- tests/test_tensor_engine.py +55 -1
- {torchmonarch_nightly-2025.6.30.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/METADATA +6 -1
- {torchmonarch_nightly-2025.6.30.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/RECORD +80 -68
- torchmonarch_nightly-2025.7.25.dist-info/entry_points.txt +3 -0
- monarch/_monarch/hyperactor/__init__.py +0 -58
- monarch/_monarch/worker/debugger.py +0 -117
- monarch/_monarch/worker/logging.py +0 -107
- monarch/debugger.py +0 -379
- monarch/future.py +0 -76
- monarch/rdma.py +0 -162
- torchmonarch_nightly-2025.6.30.dist-info/entry_points.txt +0 -3
- /monarch/{_monarch/worker → _src}/__init__.py +0 -0
- /monarch/{common/_device_utils.py → _src/actor/device_utils.py} +0 -0
- /monarch/{common → _src/actor}/shape.py +0 -0
- /monarch/{_monarch → _src/tensor_engine}/__init__.py +0 -0
- {torchmonarch_nightly-2025.6.30.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/WHEEL +0 -0
- {torchmonarch_nightly-2025.6.30.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/licenses/LICENSE +0 -0
- {torchmonarch_nightly-2025.6.30.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ monarch/_src/actor/proc_mesh.py
@@ -0,0 +1,500 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
+
+import logging
+import os
+import sys
+import warnings
+from contextlib import AbstractContextManager
+
+from typing import (
+    Any,
+    Callable,
+    cast,
+    Dict,
+    List,
+    Optional,
+    Sequence,
+    Type,
+    TYPE_CHECKING,
+    TypeVar,
+)
+
+from monarch._rust_bindings.monarch_extension.logging import LoggingMeshClient
+from monarch._rust_bindings.monarch_hyperactor.alloc import (  # @manual=//monarch/monarch_extension:monarch_extension
+    Alloc,
+    AllocConstraints,
+    AllocSpec,
+)
+from monarch._rust_bindings.monarch_hyperactor.mailbox import Mailbox
+from monarch._rust_bindings.monarch_hyperactor.proc_mesh import (
+    ProcMesh as HyProcMesh,
+    ProcMeshMonitor,
+)
+from monarch._rust_bindings.monarch_hyperactor.shape import Shape, Slice
+from monarch._src.actor.actor_mesh import (
+    _Actor,
+    _ActorMeshRefImpl,
+    Actor,
+    ActorMeshRef,
+    fake_sync_state,
+    MonarchContext,
+)
+
+from monarch._src.actor.allocator import LocalAllocator, ProcessAllocator, SimAllocator
+from monarch._src.actor.code_sync import (
+    CodeSyncMeshClient,
+    RemoteWorkspace,
+    WorkspaceLocation,
+    WorkspaceShape,
+)
+from monarch._src.actor.debugger import (
+    _DEBUG_MANAGER_ACTOR_NAME,
+    DebugClient,
+    DebugManager,
+)
+
+from monarch._src.actor.device_utils import _local_device_count
+
+from monarch._src.actor.endpoint import endpoint
+from monarch._src.actor.future import Future
+from monarch._src.actor.shape import MeshTrait
+
+HAS_TENSOR_ENGINE = False
+try:
+    from monarch._rust_bindings.rdma import (  # type: ignore[import]
+        _RdmaBuffer,
+        _RdmaManager,
+    )
+
+    # type: ignore[16]
+    HAS_TENSOR_ENGINE = _RdmaBuffer.rdma_supported()
+except ImportError:
+    logging.warning("RDMA is not available on this platform")
+
+
+if TYPE_CHECKING:
+    Tensor = Any
+    DeviceMesh = Any
+
+
+class SetupActor(Actor):
+    """
+    A helper actor to setup the proc mesh with user defined setup method.
+    Typically used to setup the environment variables.
+    """
+
+    def __init__(self, env: Callable[[MonarchContext], None]) -> None:
+        """
+        Initialize the setup actor with the user defined setup method.
+        """
+        self._setup_method = env
+
+    @endpoint
+    async def setup(self) -> None:
+        """
+        Call the user defined setup method with the monarch context.
+        """
+        ctx = MonarchContext.get()
+        self._setup_method(ctx)
+
+
+T = TypeVar("T")
+try:
+    from __manifest__ import fbmake  # noqa
+
+    IN_PAR = bool(fbmake.get("par_style"))
+except ImportError:
+    IN_PAR = False
+
+
+async def _allocate_nonblocking(
+    alloc: Alloc, setup: Callable[[MonarchContext], None] | None = None
+) -> "ProcMesh":
+    _proc_mesh = await HyProcMesh.allocate_nonblocking(alloc)
+    if setup is None:
+        return ProcMesh(_proc_mesh)
+    # If the user has passed the setup lambda, we need to call
+    # it here before any of the other actors are spawned so that
+    # the environment variables are set up before cuda init.
+    proc_mesh = ProcMesh(_proc_mesh)
+    setup_actor = await proc_mesh.spawn("setup", SetupActor, setup)
+    await setup_actor.setup.call()
+    del setup_actor
+    return proc_mesh
+
+
+class ProcMesh(MeshTrait):
+    def __init__(
+        self,
+        hy_proc_mesh: HyProcMesh,
+        _mock_shape: Optional[Shape] = None,
+        _device_mesh: Optional["DeviceMesh"] = None,
+        _is_initializing_debugger: bool = False,
+    ) -> None:
+        self._proc_mesh = hy_proc_mesh
+        self._mock_shape: Optional[Shape] = _mock_shape
+        # type: ignore[21]
+        self._rdma_manager: Optional["_RdmaManager"] = None
+        self._debug_manager: Optional[DebugManager] = None
+        self._mailbox: Mailbox = self._proc_mesh.client
+        self._code_sync_client: Optional[CodeSyncMeshClient] = None
+        self._logging_mesh_client: Optional[LoggingMeshClient] = None
+        self._maybe_device_mesh: Optional["DeviceMesh"] = _device_mesh
+        self._stopped = False
+
+        # This code is unsafe in async contexts, but we currently do it all over the place
+        # we need to refactor this by moving it to the first time we try to spawn on the mesh.
+        # Right now we simply preserve the previous behavior and disable the check that prevents
+        # end users from doing the same.
+        with fake_sync_state():
+            if _mock_shape is None and HAS_TENSOR_ENGINE:
+                # type: ignore[21]
+                self._rdma_manager = _RdmaManager.create_rdma_manager_blocking(
+                    self._proc_mesh
+                )
+            if not _is_initializing_debugger and _mock_shape is None:
+                self._debug_manager = self.spawn(
+                    _DEBUG_MANAGER_ACTOR_NAME, DebugManager, debug_client()
+                ).get()
+
+    @property
+    def _shape(self) -> Shape:
+        return self._proc_mesh.shape if self._mock_shape is None else self._mock_shape
+
+    @property
+    def _ndslice(self) -> Slice:
+        return self._shape.ndslice
+
+    @property
+    def _labels(self) -> List[str]:
+        return self._shape.labels
+
+    def _new_with_shape(self, shape: Shape) -> "ProcMesh":
+        device_mesh = (
+            None
+            if self._maybe_device_mesh is None
+            else self._device_mesh._new_with_shape(shape)
+        )
+        return ProcMesh(self._proc_mesh, _mock_shape=shape, _device_mesh=device_mesh)
+
+    def spawn(self, name: str, Class: Type[T], *args: Any, **kwargs: Any) -> Future[T]:
+        if self._mock_shape is not None:
+            raise NotImplementedError("NYI: spawn on slice of a proc mesh.")
+        return Future(
+            impl=lambda: self._spawn_nonblocking(name, Class, *args, **kwargs),
+            requires_loop=False,
+        )
+
+    async def monitor(self) -> ProcMeshMonitor:
+        """
+        Get a monitor (async iterator) of the proc mesh, it is used to
+        monitor the status of the proc mesh. This function can be called at most once.
+
+        Note: This API is experimental and subject to change.
+
+        Example:
+
+        async def monitor_loop(monitor):
+            async for event in monitor:
+                await handle_exception_event(event)
+
+        # Kick off in background
+        asyncio.create_task(monitor_loop(monitor))
+        """
+        return await self._proc_mesh.monitor()
+
+    @classmethod
+    def from_alloc(
+        self, alloc: Alloc, setup: Callable[[MonarchContext], None] | None = None
+    ) -> Future["ProcMesh"]:
+        """
+        Allocate a process mesh according to the provided alloc.
+        Returns when the mesh is fully allocated.
+
+        Arguments:
+        - `alloc`: The alloc to allocate according to.
+        - `setup`: A lambda taking MonarchContext as param, can be used to setup env vars on the allocated mesh
+        """
+        return Future(
+            impl=lambda: _allocate_nonblocking(alloc, setup),
+            requires_loop=False,
+        )
+
+    def __repr__(self) -> str:
+        return repr(self._proc_mesh)
+
+    def __str__(self) -> str:
+        return str(self._proc_mesh)
+
+    async def _spawn_nonblocking(
+        self, name: str, Class: Type[T], *args: Any, **kwargs: Any
+    ) -> T:
+        if not issubclass(Class, Actor):
+            raise ValueError(
+                f"{Class} must subclass monarch.service.Actor to spawn it."
+            )
+        actor_mesh = await self._proc_mesh.spawn_nonblocking(name, _Actor)
+        service = ActorMeshRef(
+            Class,
+            _ActorMeshRefImpl.from_hyperactor_mesh(self._mailbox, actor_mesh, self),
+            self._mailbox,
+        )
+        # useful to have this separate, because eventually we can reconstitute ActorMeshRef objects across pickling by
+        # doing `ActorMeshRef(Class, actor_handle)` but not calling _create.
+        service._create(args, kwargs)
+        return cast(T, service)
+
+    @property
+    def _device_mesh(self) -> "DeviceMesh":
+        if not HAS_TENSOR_ENGINE:
+            raise RuntimeError(
+                "DeviceMesh is not available because tensor_engine was not compiled (USE_TENSOR_ENGINE=0)"
+            )
+
+        # type: ignore[21]
+        from monarch.mesh_controller import spawn_tensor_engine  # @manual
+
+        if self._maybe_device_mesh is None:
+            if self._mock_shape is not None:
+                raise NotImplementedError(
+                    "NYI: activating a proc mesh must first happen on the root proc_mesh until we fix spawning on submeshes."
+                )
+            # type: ignore[21]
+            self._maybe_device_mesh = spawn_tensor_engine(self)
+        return self._maybe_device_mesh
+
+    # pyre-ignore
+    def activate(self) -> AbstractContextManager:
+        return self._device_mesh.activate()
+
+    def rank_tensor(self, dim: str | Sequence[str]) -> "Tensor":
+        return self._device_mesh.rank(dim)
+
+    def rank_tensors(self) -> Dict[str, "Tensor"]:
+        return self._device_mesh.ranks
+
+    async def sync_workspace(self, auto_reload: bool = False) -> None:
+        if self._code_sync_client is None:
+            self._code_sync_client = CodeSyncMeshClient.spawn_blocking(
+                proc_mesh=self._proc_mesh,
+            )
+        # TODO(agallagher): We need some way to configure and pass this
+        # in -- right now we're assuming the `gpu` dimension, which isn't
+        # correct.
+        # The workspace shape (i.e. only perform one rsync per host).
+        assert set(self._proc_mesh.shape.labels).issubset({"gpus", "hosts"})
+        assert self._code_sync_client is not None
+        await self._code_sync_client.sync_workspace(
+            # TODO(agallagher): Is there a better way to infer/set the local
+            # workspace dir, rather than use PWD?
+            local=os.getcwd(),
+            remote=RemoteWorkspace(
+                location=WorkspaceLocation.FromEnvVar("WORKSPACE_DIR"),
+                shape=WorkspaceShape.shared("gpus"),
+            ),
+            auto_reload=auto_reload,
+        )
+
+    async def logging_option(
+        self,
+        stream_to_client: bool = False,
+        aggregate_window_sec: int | None = None,
+    ) -> None:
+        """
+        Set the logging options for the remote processes
+
+        Args:
+            stream_to_client (bool): If True, logs from the remote processes will be streamed to the client.
+            Defaults to False.
+            aggregate_window_sec (Optional[int]): If not None, logs from the remote processes will be aggregated
+            and sent to the client every aggregate_window_sec seconds. Defaults to None, meaning no aggregation.
+            aggregate_window_sec will be ignored if stream_to_client is False.
+
+        Returns:
+            None
+        """
+        if self._logging_mesh_client is None:
+            self._logging_mesh_client = await LoggingMeshClient.spawn(
+                proc_mesh=self._proc_mesh
+            )
+        self._logging_mesh_client.set_mode(
+            stream_to_client, aggregate_window_sec=aggregate_window_sec
+        )
+
+    async def __aenter__(self) -> "ProcMesh":
+        if self._stopped:
+            raise RuntimeError("`ProcMesh` has already been stopped")
+        return self
+
+    def stop(self) -> Future[None]:
+        async def _stop_nonblocking() -> None:
+            await self._proc_mesh.stop_nonblocking()
+            self._stopped = True
+
+        return Future(
+            impl=lambda: _stop_nonblocking(),
+            requires_loop=False,
+        )
+
+    async def __aexit__(
+        self, exc_type: object, exc_val: object, exc_tb: object
+    ) -> None:
+        # In case there are multiple nested "async with" statements, we only
+        # want it to close once.
+        if not self._stopped:
+            await self.stop()
+
+    # Finalizer to check if the proc mesh was closed properly.
+    def __del__(self) -> None:
+        if not self._stopped:
+            warnings.warn(
+                f"unstopped ProcMesh {self!r}",
+                ResourceWarning,
+                stacklevel=2,
+                source=self,
+            )
+            # Cannot call stop here because it is async.
+
+
+async def local_proc_mesh_nonblocking(
+    *,
+    gpus: Optional[int] = None,
+    hosts: int = 1,
+    _is_initializing_debugger: bool = False,
+) -> ProcMesh:
+    if gpus is None:
+        gpus = _local_device_count()
+    spec = AllocSpec(AllocConstraints(), gpus=gpus, hosts=hosts)
+    allocator = LocalAllocator()
+    alloc = await allocator.allocate(spec)
+    proc_mesh = HyProcMesh.allocate_nonblocking(alloc)
+    return ProcMesh(
+        await proc_mesh,
+        _is_initializing_debugger=_is_initializing_debugger,
+    )
+
+
+def local_proc_mesh(*, gpus: Optional[int] = None, hosts: int = 1) -> Future[ProcMesh]:
+    return Future(
+        impl=lambda: local_proc_mesh_nonblocking(gpus=gpus, hosts=hosts),
+        requires_loop=False,
+    )
+
+
+async def sim_proc_mesh_nonblocking(
+    *, gpus: Optional[int] = None, hosts: int = 1
+) -> ProcMesh:
+    if gpus is None:
+        gpus = _local_device_count()
+    spec = AllocSpec(AllocConstraints(), gpus=gpus, hosts=hosts)
+    allocator = SimAllocator()
+    alloc = await allocator.allocate(spec)
+    return await ProcMesh.from_alloc(alloc)
+
+
+def sim_proc_mesh(*, gpus: Optional[int] = None, hosts: int = 1) -> Future[ProcMesh]:
+    return Future(
+        impl=lambda: sim_proc_mesh_nonblocking(gpus=gpus, hosts=hosts),
+        requires_loop=False,
+    )
+
+
+_BOOTSTRAP_MAIN = "monarch._src.actor.bootstrap_main"
+
+
+def _get_bootstrap_args() -> tuple[str, Optional[list[str]], dict[str, str]]:
+    if IN_PAR:
+        cmd = sys.argv[0]
+        args = None
+        env = {
+            "PAR_MAIN_OVERRIDE": _BOOTSTRAP_MAIN,
+        }
+    else:
+        cmd = sys.executable
+        args = ["-m", _BOOTSTRAP_MAIN]
+        env = {}
+
+    return cmd, args, env
+
+
+async def proc_mesh_nonblocking(
+    *,
+    gpus: Optional[int] = None,
+    hosts: int = 1,
+    env: dict[str, str] | None = None,
+    setup: Callable[[MonarchContext], None] | None = None,
+) -> ProcMesh:
+    if gpus is None:
+        gpus = _local_device_count()
+    # gpus must come last in this order because
+    # test_remote_function_all_gather expects that hosts comes before gpus
+    # in the order of the dimensions.
+    spec = AllocSpec(AllocConstraints(), hosts=hosts, gpus=gpus)
+    env = env or {}
+    # Todo: Deprecate the env field from the ProcessAllocator
+    # The PAR_MAIN_OVERRIDE needs to be passed as an env
+    # to the proc mesh construction in rust, so can not be moved to the
+    # SetupActor yet
+    cmd, args, bootstrap_env = _get_bootstrap_args()
+    env.update(bootstrap_env)
+    allocator = ProcessAllocator(cmd, args, env)
+    alloc = await allocator.allocate(spec)
+
+    return await ProcMesh.from_alloc(
+        alloc,
+        setup=setup,
+    )
+
+
+def proc_mesh(
+    *,
+    gpus: Optional[int] = None,
+    hosts: int = 1,
+    env: dict[str, str] | None = None,
+    setup: Callable[[MonarchContext], None] | None = None,
+) -> Future[ProcMesh]:
+    return Future(
+        impl=lambda: proc_mesh_nonblocking(
+            gpus=gpus, hosts=hosts, env=env, setup=setup
+        ),
+        requires_loop=False,
+    )
+
+
+_debug_proc_mesh: Optional["ProcMesh"] = None
+
+
+# Lazy init of the debug proc mesh so that importing monarch.proc_mesh
+# doesn't trigger the debug client to spawn, which could cause confusing
+# logs. This is defined in proc_mesh.py instead of debugger.py for
+# circular import reasons.
+def _get_debug_proc_mesh() -> "ProcMesh":
+    global _debug_proc_mesh
+    if _debug_proc_mesh is None:
+        _debug_proc_mesh = Future(
+            impl=lambda: local_proc_mesh_nonblocking(
+                gpus=1, hosts=1, _is_initializing_debugger=True
+            ),
+            requires_loop=False,
+        ).get()
+    return _debug_proc_mesh
+
+
+_debug_client_mesh: Optional[DebugClient] = None
+
+
+# Lazy init for the same reason as above. This is defined in proc_mesh.py
+# instead of debugger.py for circular import reasons.
+def debug_client() -> DebugClient:
+    global _debug_client_mesh
+    if _debug_client_mesh is None:
+        _debug_client_mesh = (
+            _get_debug_proc_mesh().spawn("debug_client", DebugClient).get()
+        )
+    return _debug_client_mesh
--- /dev/null
+++ monarch/_src/actor/sync_state.py
@@ -0,0 +1,18 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import asyncio
+from contextlib import contextmanager
+
+
+@contextmanager
+def fake_sync_state():
+    prev_loop = asyncio.events._get_running_loop()
+    asyncio._set_running_loop(None)
+    try:
+        yield
+    finally:
+        asyncio._set_running_loop(prev_loop)
--- monarch/telemetry.py
+++ monarch/_src/actor/telemetry/__init__.py
@@ -9,7 +9,7 @@
 
 import logging
 
-from monarch._rust_bindings.
+from monarch._rust_bindings.monarch_hyperactor.telemetry import (  # @manual=//monarch/monarch_extension:monarch_extension
     forward_to_tracing,
 )
 
--- /dev/null
+++ monarch/_src/actor/telemetry/rust_span_tracing.py
@@ -0,0 +1,159 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
+
+import logging
+from contextlib import contextmanager
+from typing import Iterator, Mapping, Optional, Union
+
+import opentelemetry.util.types as types  # @manual=fbsource//third-party/pypi/opentelemetry-api:opentelemetry-api
+
+from monarch._rust_bindings.monarch_hyperactor.telemetry import (
+    get_current_span_id,
+    PySpan,
+)
+
+from opentelemetry import (  # @manual=fbsource//third-party/pypi/opentelemetry-api:opentelemetry-api
+    trace,
+)
+from opentelemetry.trace import Tracer
+from opentelemetry.trace.status import Status, StatusCode
+from pyre_extensions import override
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+class SpanWrapper(trace.Span):
+    def __init__(self, name: str) -> None:
+        super().__init__()
+        self._span: PySpan | None = PySpan(name)
+
+    @override
+    def end(self, end_time: Optional[int] = None) -> None:
+        # since PySpan is not sendable, we need to make sure it is deallocated on this thread so it doesn't log warnings.
+        s = self._span
+        assert s is not None
+        s.exit()
+        self._span = None
+        del s
+
+    def record_exception(
+        self,
+        exception: BaseException,
+        attributes: types.Attributes = None,
+        timestamp: Optional[int] = None,
+        escaped: bool = False,
+    ) -> None:
+        pass
+
+    def is_recording(self) -> bool:
+        return False
+
+    def get_span_context(self) -> trace.span.SpanContext:
+        span_id = get_current_span_id()
+        return trace.span.SpanContext(trace_id=0, span_id=span_id, is_remote=False)
+
+    def set_attributes(self, attributes: Mapping[str, types.AttributeValue]) -> None:
+        pass
+
+    def set_attribute(self, key: str, value: types.AttributeValue) -> None:
+        pass
+
+    def add_event(
+        self,
+        name: str,
+        attributes: types.Attributes = None,
+        timestamp: Optional[int] = None,
+    ) -> None:
+        pass
+
+    def update_name(self, name: str) -> None:
+        pass
+
+    def set_status(
+        self,
+        status: Union[Status, StatusCode],
+        description: Optional[str] = None,
+    ) -> None:
+        pass
+
+
+class RustTracer(trace.Tracer):
+    def start_span(
+        self,
+        name: str,
+        context: Optional[trace.Context] = None,
+        kind: trace.SpanKind = trace.SpanKind.INTERNAL,
+        attributes: types.Attributes = None,
+        links: trace._Links = None,
+        start_time: Optional[int] = None,
+        record_exception: bool = True,
+        set_status_on_exception: bool = True,
+    ) -> trace.Span:
+        return SpanWrapper(name)
+
+    @contextmanager
+    # pyre-fixme[15]: `start_as_current_span` overrides method defined in `Tracer`
+    #  inconsistently.
+    def start_as_current_span(
+        self,
+        name: str,
+        context: Optional[trace.Context] = None,
+        kind: trace.SpanKind = trace.SpanKind.INTERNAL,
+        attributes: types.Attributes = None,
+        links: trace._Links = None,
+        start_time: Optional[int] = None,
+        record_exception: bool = True,
+        set_status_on_exception: bool = True,
+        end_on_exit: bool = True,
+    ) -> Iterator[trace.Span]:
+        with SpanWrapper(name) as s:
+            with trace.use_span(s):
+                yield s
+                del s
+
+
+class RustTracerProvider(trace.TracerProvider):
+    @override
+    def get_tracer(
+        self,
+        instrumenting_module_name: str,
+        *args: object,
+        instrumenting_library_version: Optional[str] = None,
+        schema_url: Optional[str] = None,
+        **kwargs: object,
+    ) -> trace.Tracer:
+        return RustTracer()
+
+
+def get_monarch_tracer() -> Tracer:
+    """
+    Creates and returns a Monarch python tracer that logs to the Rust telemetry system.
+
+    Returns:
+        Tracer: A configured OpenTelemetry tracer for Monarch.
+
+    Usage:
+        tracer = get_monarch_tracer()
+        with tracer.start_as_current_span("span_name") as span:
+            # code here
+    """
+    install()
+    return trace.get_tracer("monarch.python.tracer")
+
+
+_INSTALLED = False
+
+
+def install() -> None:
+    global _INSTALLED
+    if _INSTALLED:
+        return
+
+    provider = RustTracerProvider()
+    trace.set_tracer_provider(provider)
+    _INSTALLED = True