torchmonarch-nightly 2025.6.27__cp313-cp313-manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- monarch/__init__.py +189 -0
- monarch/_monarch/__init__.py +5 -0
- monarch/_monarch/hyperactor/__init__.py +58 -0
- monarch/_monarch/selection/__init__.py +13 -0
- monarch/_monarch/worker/__init__.py +0 -0
- monarch/_monarch/worker/debugger.py +117 -0
- monarch/_monarch/worker/logging.py +107 -0
- monarch/_rust_bindings.so +0 -0
- monarch/_testing.py +230 -0
- monarch/actor_mesh.py +761 -0
- monarch/allocator.py +220 -0
- monarch/bootstrap_main.py +59 -0
- monarch/builtins/__init__.py +14 -0
- monarch/builtins/log.py +22 -0
- monarch/builtins/random.py +68 -0
- monarch/cached_remote_function.py +257 -0
- monarch/code_sync.py +10 -0
- monarch/common/_C.pyi +11 -0
- monarch/common/_C.so +0 -0
- monarch/common/__init__.py +0 -0
- monarch/common/_coalescing.py +308 -0
- monarch/common/_device_utils.py +18 -0
- monarch/common/_tensor_to_table.py +172 -0
- monarch/common/base_tensor.py +28 -0
- monarch/common/borrows.py +143 -0
- monarch/common/client.py +690 -0
- monarch/common/constants.py +10 -0
- monarch/common/context_manager.py +40 -0
- monarch/common/controller_api.py +104 -0
- monarch/common/device_mesh.py +417 -0
- monarch/common/fake.py +55 -0
- monarch/common/function.py +160 -0
- monarch/common/function_caching.py +164 -0
- monarch/common/future.py +168 -0
- monarch/common/invocation.py +125 -0
- monarch/common/mast.py +221 -0
- monarch/common/messages.py +573 -0
- monarch/common/mock_cuda.py +41 -0
- monarch/common/opaque_ref.py +98 -0
- monarch/common/pickle_flatten.py +48 -0
- monarch/common/pipe.py +152 -0
- monarch/common/process_group.py +55 -0
- monarch/common/recording.py +127 -0
- monarch/common/reference.py +33 -0
- monarch/common/remote.py +297 -0
- monarch/common/selection.py +9 -0
- monarch/common/shape.py +229 -0
- monarch/common/stream.py +114 -0
- monarch/common/tensor.py +814 -0
- monarch/common/tensor_factory.py +31 -0
- monarch/common/tree.py +73 -0
- monarch/controller/__init__.py +7 -0
- monarch/controller/backend.py +223 -0
- monarch/controller/controller.py +223 -0
- monarch/controller/debugger.py +47 -0
- monarch/controller/history.py +90 -0
- monarch/controller/rust_backend/__init__.py +7 -0
- monarch/controller/rust_backend/controller.py +245 -0
- monarch/debugger.py +379 -0
- monarch/fetch.py +55 -0
- monarch/future.py +76 -0
- monarch/gradient/__init__.py +11 -0
- monarch/gradient/_gradient_generator.pyi +22 -0
- monarch/gradient/_gradient_generator.so +0 -0
- monarch/gradient_generator.py +185 -0
- monarch/memory.py +43 -0
- monarch/mesh_controller.py +271 -0
- monarch/monarch_controller +0 -0
- monarch/notebook.py +761 -0
- monarch/opaque_module.py +235 -0
- monarch/opaque_object.py +88 -0
- monarch/parallel/__init__.py +9 -0
- monarch/parallel/pipelining/__init__.py +7 -0
- monarch/parallel/pipelining/runtime.py +847 -0
- monarch/parallel/pipelining/schedule_ir.py +692 -0
- monarch/parallel/pipelining/scheduler.py +249 -0
- monarch/pdb_wrapper.py +135 -0
- monarch/proc_mesh.py +299 -0
- monarch/profiler.py +160 -0
- monarch/python_local_mesh.py +107 -0
- monarch/random.py +61 -0
- monarch/rdma.py +162 -0
- monarch/remote_class.py +114 -0
- monarch/rust_backend_mesh.py +280 -0
- monarch/rust_local_mesh.py +1402 -0
- monarch/sim_mesh.py +359 -0
- monarch/simulator/__init__.py +7 -0
- monarch/simulator/command_history.py +424 -0
- monarch/simulator/config.py +21 -0
- monarch/simulator/interface.py +59 -0
- monarch/simulator/ir.py +770 -0
- monarch/simulator/mock_controller.py +214 -0
- monarch/simulator/profiling.py +424 -0
- monarch/simulator/simulator.py +1052 -0
- monarch/simulator/task.py +255 -0
- monarch/simulator/tensor.py +373 -0
- monarch/simulator/trace.py +395 -0
- monarch/simulator/utils.py +41 -0
- monarch/simulator/worker.py +389 -0
- monarch/telemetry.py +19 -0
- monarch/tensor_worker_main.py +260 -0
- monarch/tensorboard.py +84 -0
- monarch/timer/__init__.py +21 -0
- monarch/timer/example_monarch.py +78 -0
- monarch/timer/example_spmd.py +55 -0
- monarch/timer/execution_timer.py +199 -0
- monarch/timer/execution_timer_test.py +131 -0
- monarch/tools/__init__.py +7 -0
- monarch/tools/cli.py +167 -0
- monarch/tools/commands.py +251 -0
- monarch/tools/components/__init__.py +7 -0
- monarch/tools/components/hyperactor.py +58 -0
- monarch/tools/config/__init__.py +20 -0
- monarch/tools/config/defaults.py +54 -0
- monarch/tools/mesh_spec.py +165 -0
- monarch/tools/network.py +69 -0
- monarch/worker/__init__.py +7 -0
- monarch/worker/_testing_function.py +481 -0
- monarch/worker/compiled_block.py +270 -0
- monarch/worker/debugger.py +125 -0
- monarch/worker/lines.py +47 -0
- monarch/worker/monitor.py +53 -0
- monarch/worker/worker.py +1191 -0
- monarch/world_mesh.py +34 -0
- monarch_supervisor/__init__.py +1044 -0
- monarch_supervisor/_testing.py +44 -0
- monarch_supervisor/function_call.py +30 -0
- monarch_supervisor/host.py +386 -0
- monarch_supervisor/launchers.py +145 -0
- monarch_supervisor/log_pstree.py +48 -0
- monarch_supervisor/logging.py +103 -0
- monarch_supervisor/python_executable.py +42 -0
- tests/__init__.py +0 -0
- tests/dispatch_bench.py +124 -0
- tests/dispatch_bench_helper.py +25 -0
- tests/error_test_binary.py +180 -0
- tests/simulator/__init__.py +0 -0
- tests/simulator/test_profiling.py +136 -0
- tests/simulator/test_simulator.py +411 -0
- tests/simulator/test_task.py +64 -0
- tests/simulator/test_worker.py +102 -0
- tests/sleep_binary.py +35 -0
- tests/test_actor_error.py +240 -0
- tests/test_alloc.py +25 -0
- tests/test_allocator.py +365 -0
- tests/test_coalescing.py +492 -0
- tests/test_controller.py +845 -0
- tests/test_device_mesh.py +132 -0
- tests/test_fault_tolerance.py +398 -0
- tests/test_future.py +94 -0
- tests/test_grad_generator.py +121 -0
- tests/test_mock_cuda.py +74 -0
- tests/test_pdb_actor.py +110 -0
- tests/test_python_actors.py +736 -0
- tests/test_remote_functions.py +1271 -0
- tests/test_rust_backend.py +217 -0
- tests/test_signal_safe_block_on.py +103 -0
- tests/test_sim_backend.py +54 -0
- tests/test_tensor_engine.py +52 -0
- torchmonarch_nightly-2025.6.27.dist-info/METADATA +94 -0
- torchmonarch_nightly-2025.6.27.dist-info/RECORD +165 -0
- torchmonarch_nightly-2025.6.27.dist-info/WHEEL +5 -0
- torchmonarch_nightly-2025.6.27.dist-info/entry_points.txt +3 -0
- torchmonarch_nightly-2025.6.27.dist-info/licenses/LICENSE +29 -0
- torchmonarch_nightly-2025.6.27.dist-info/top_level.txt +3 -0
monarch/_testing.py
ADDED
@@ -0,0 +1,230 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# pyre-unsafe

import logging
import tempfile
import time
from contextlib import contextmanager, ExitStack
from typing import Any, Callable, Dict, Generator, Literal, Optional

import monarch_supervisor
from monarch.common.client import Client
from monarch.common.device_mesh import DeviceMesh
from monarch.common.invocation import DeviceException, RemoteException
from monarch.common.shape import NDSlice
from monarch.controller.backend import ProcessBackend
from monarch.mesh_controller import spawn_tensor_engine
from monarch.proc_mesh import proc_mesh, ProcMesh
from monarch.python_local_mesh import PythonLocalContext
from monarch.rust_local_mesh import (
    local_mesh,
    LoggingLocation,
    ProcessCache,
    SocketType,
)
from monarch.simulator.mock_controller import MockController
from monarch.world_mesh import world_mesh


class TestingContext:
    """
    Context manager for testing.
    Creates a local device mesh for a given number of hosts and gpus per host.
    Importantly, it also caches the worker processes so that tests can reuse them
    without having to reinitialize torch/NCCL.

    Example::
        with TestingContext() as c:
            local_mesh = c.local_device_mesh(2, 2)
            with local_mesh.activate():
                x = torch.rand(3, 4)
                local_tensor = fetch_shard(x).result()
    """

    __test__ = False

    def __init__(self):
        self.cleanup = ExitStack()
        self._py_process_cache = {}
        self._rust_process_cache = None
        self._proc_mesh_cache: Dict[Any, ProcMesh] = {}

    @contextmanager
    def _get_context(self, num_hosts, gpu_per_host):
        # Since we are local, there isn't a lot of latency involved.
        # Make the host managers exit if they go 0.5 seconds without
        # hearing from the supervisor.
        monarch_supervisor.HEARTBEAT_INTERVAL = 1
        ctx = PythonLocalContext(N=num_hosts)
        store = ProcessBackend._create_store()
        processes = ProcessBackend._create_pg(
            ctx.ctx, ctx.hosts, gpu_per_host, store, _restartable=True
        )
        yield ctx.ctx, ctx.hosts, processes
        ctx.shutdown()

    def _processes(self, num_hosts, gpu_per_host):
        key = (num_hosts, gpu_per_host)
        if key not in self._py_process_cache:
            self._py_process_cache[key] = self.cleanup.enter_context(
                self._get_context(num_hosts, gpu_per_host)
            )
        return self._py_process_cache[key]

    @contextmanager
    def local_py_device_mesh(
        self,
        num_hosts,
        gpu_per_host,
    ) -> Generator[DeviceMesh, None, None]:
        ctx, hosts, processes = self._processes(num_hosts, gpu_per_host)
        dm = world_mesh(ctx, hosts, gpu_per_host, _processes=processes)
        try:
            yield dm
            dm.client.shutdown(destroy_pg=False)
        except Exception:
            # Abnormal exit, so we just make sure we do not try to communicate in destructors,
            # but we do not wait for workers to exit since we do not know what state they are in.
            dm.client._shutdown = True
            raise

    @contextmanager
    def local_rust_device_mesh(
        self,
        num_hosts,
        gpu_per_host,
        controller_params=None,
    ) -> Generator[DeviceMesh, None, None]:
        # Create a new system and mesh for the test.
        with local_mesh(
            hosts=num_hosts,
            gpus_per_host=gpu_per_host,
            socket_type=SocketType.UNIX,
            logging_location=LoggingLocation.DEFAULT,
            system_factory=self._rust_process_cache.get_system_server(),
            controller_factory=self._rust_process_cache.get_controller_server(),
            worker_factory=self._rust_process_cache.get_worker_servers(
                num_worker_procs=num_hosts * gpu_per_host,
                gpus_per_host=gpu_per_host,
            ),
            controller_params=controller_params,
        ) as dm:
            try:
                yield dm
                dm.exit()
            except Exception:
                dm.client._shutdown = True
                raise
            finally:
                # Shut down the system.
                # pyre-ignore: Undefined attribute
                dm.client.inner._actor.stop()

    @contextmanager
    def local_engine_on_proc_mesh(
        self,
        num_hosts,
        gpu_per_host,
    ) -> Generator[DeviceMesh, None, None]:
        key = (num_hosts, gpu_per_host)
        if key not in self._proc_mesh_cache:
            self._proc_mesh_cache[key] = proc_mesh(
                hosts=num_hosts, gpus=gpu_per_host
            ).get()

        dm = spawn_tensor_engine(self._proc_mesh_cache[key])
        dm = dm.rename(hosts="host", gpus="gpu")
        try:
            yield dm
            dm.exit()
        except Exception:
            # Abnormal exit, so we just make sure we do not try to communicate in destructors,
            # but we do not wait for workers to exit since we do not know what state they are in.
            dm.client._shutdown = True
            raise

    @contextmanager
    def local_device_mesh(
        self,
        num_hosts,
        gpu_per_host,
        activate=True,
        backend: Literal["py", "rs", "mesh"] = "py",
        controller_params=None,
    ) -> Generator[DeviceMesh, None, None]:
        start = time.time()
        if backend == "rs":
            generator = self.local_rust_device_mesh(
                num_hosts, gpu_per_host, controller_params=controller_params
            )
        elif backend == "py":
            generator = self.local_py_device_mesh(num_hosts, gpu_per_host)
        elif backend == "mesh":
            generator = self.local_engine_on_proc_mesh(num_hosts, gpu_per_host)
        else:
            raise ValueError(f"invalid backend: {backend}")
        with generator as dm:
            end = time.time()
            logging.info("initialized mesh in {:.2f}s".format(end - start))
            if activate:
                with dm.activate():
                    yield dm
            else:
                yield dm
            start = time.time()
        end = time.time()
        logging.info("shutdown mesh in {:.2f}s".format(end - start))

    def __enter__(self):
        start = time.time()
        self._log_dir = self.cleanup.enter_context(
            tempfile.TemporaryDirectory(prefix="rust_cached_workers.")
        )
        self._rust_process_cache = self.cleanup.enter_context(
            ProcessCache(
                logging_location=LoggingLocation.DEFAULT,
                logging_dir=self._log_dir,
            )
        )
        end = time.time()
        logging.info("started process caches in {:.2f}s".format(end - start))
        return self

    def __exit__(self, *args):
        start = time.time()
        self.cleanup.__exit__(*args)
        end = time.time()
        logging.info("shutdown process caches in {:.2f}s".format(end - start))


def mock_mesh(hosts: int, gpus: int):
    ctrl = MockController(hosts * gpus)
    client = Client(ctrl, ctrl.world_size, ctrl.gpu_per_host)
    dm = DeviceMesh(
        client,
        NDSlice(offset=0, sizes=[hosts, gpus], strides=[gpus, 1]),
        ("host", "gpu"),
    )

    def create_exit(
        client: Client,
    ) -> Callable[[Optional[RemoteException | DeviceException | Exception]], None]:
        def exit(
            error: Optional[RemoteException | DeviceException | Exception] = None,
        ) -> None:
            client.shutdown(True, error)

        return exit

    dm.exit = create_exit(client)
    return dm


class BackendType:
    PY = "py"
    RS = "rs"
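For orientation, the sketch below shows how a test might drive TestingContext with the default cached "py" backend. It is not part of the wheel: the test name is invented, the fetch_shard import path is an assumption based on the monarch/fetch.py module in the listing above, and torch is assumed to be installed alongside the package.

import torch

from monarch._testing import TestingContext
from monarch.fetch import fetch_shard  # assumed import path


def test_remote_add_on_local_mesh():
    with TestingContext() as c:
        # local_device_mesh is a context manager; with activate=True (the default)
        # tensor ops inside the block are issued to the cached worker processes.
        with c.local_device_mesh(num_hosts=2, gpu_per_host=2, backend="py") as mesh:
            x = torch.rand(3, 4)
            y = x + x
            local = fetch_shard(y).result()
            assert local.shape == (3, 4)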