torchmonarch-nightly 2025.6.27__cp312-cp312-manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- monarch/__init__.py +189 -0
- monarch/_monarch/__init__.py +5 -0
- monarch/_monarch/hyperactor/__init__.py +58 -0
- monarch/_monarch/selection/__init__.py +13 -0
- monarch/_monarch/worker/__init__.py +0 -0
- monarch/_monarch/worker/debugger.py +117 -0
- monarch/_monarch/worker/logging.py +107 -0
- monarch/_rust_bindings.so +0 -0
- monarch/_testing.py +230 -0
- monarch/actor_mesh.py +761 -0
- monarch/allocator.py +220 -0
- monarch/bootstrap_main.py +59 -0
- monarch/builtins/__init__.py +14 -0
- monarch/builtins/log.py +22 -0
- monarch/builtins/random.py +68 -0
- monarch/cached_remote_function.py +257 -0
- monarch/code_sync.py +10 -0
- monarch/common/_C.pyi +11 -0
- monarch/common/_C.so +0 -0
- monarch/common/__init__.py +0 -0
- monarch/common/_coalescing.py +308 -0
- monarch/common/_device_utils.py +18 -0
- monarch/common/_tensor_to_table.py +172 -0
- monarch/common/base_tensor.py +28 -0
- monarch/common/borrows.py +143 -0
- monarch/common/client.py +690 -0
- monarch/common/constants.py +10 -0
- monarch/common/context_manager.py +40 -0
- monarch/common/controller_api.py +104 -0
- monarch/common/device_mesh.py +417 -0
- monarch/common/fake.py +55 -0
- monarch/common/function.py +160 -0
- monarch/common/function_caching.py +164 -0
- monarch/common/future.py +168 -0
- monarch/common/invocation.py +125 -0
- monarch/common/mast.py +221 -0
- monarch/common/messages.py +573 -0
- monarch/common/mock_cuda.py +41 -0
- monarch/common/opaque_ref.py +98 -0
- monarch/common/pickle_flatten.py +48 -0
- monarch/common/pipe.py +152 -0
- monarch/common/process_group.py +55 -0
- monarch/common/recording.py +127 -0
- monarch/common/reference.py +33 -0
- monarch/common/remote.py +297 -0
- monarch/common/selection.py +9 -0
- monarch/common/shape.py +229 -0
- monarch/common/stream.py +114 -0
- monarch/common/tensor.py +814 -0
- monarch/common/tensor_factory.py +31 -0
- monarch/common/tree.py +73 -0
- monarch/controller/__init__.py +7 -0
- monarch/controller/backend.py +223 -0
- monarch/controller/controller.py +223 -0
- monarch/controller/debugger.py +47 -0
- monarch/controller/history.py +90 -0
- monarch/controller/rust_backend/__init__.py +7 -0
- monarch/controller/rust_backend/controller.py +245 -0
- monarch/debugger.py +379 -0
- monarch/fetch.py +55 -0
- monarch/future.py +76 -0
- monarch/gradient/__init__.py +11 -0
- monarch/gradient/_gradient_generator.pyi +22 -0
- monarch/gradient/_gradient_generator.so +0 -0
- monarch/gradient_generator.py +185 -0
- monarch/memory.py +43 -0
- monarch/mesh_controller.py +271 -0
- monarch/monarch_controller +0 -0
- monarch/notebook.py +761 -0
- monarch/opaque_module.py +235 -0
- monarch/opaque_object.py +88 -0
- monarch/parallel/__init__.py +9 -0
- monarch/parallel/pipelining/__init__.py +7 -0
- monarch/parallel/pipelining/runtime.py +847 -0
- monarch/parallel/pipelining/schedule_ir.py +692 -0
- monarch/parallel/pipelining/scheduler.py +249 -0
- monarch/pdb_wrapper.py +135 -0
- monarch/proc_mesh.py +299 -0
- monarch/profiler.py +160 -0
- monarch/python_local_mesh.py +107 -0
- monarch/random.py +61 -0
- monarch/rdma.py +162 -0
- monarch/remote_class.py +114 -0
- monarch/rust_backend_mesh.py +280 -0
- monarch/rust_local_mesh.py +1402 -0
- monarch/sim_mesh.py +359 -0
- monarch/simulator/__init__.py +7 -0
- monarch/simulator/command_history.py +424 -0
- monarch/simulator/config.py +21 -0
- monarch/simulator/interface.py +59 -0
- monarch/simulator/ir.py +770 -0
- monarch/simulator/mock_controller.py +214 -0
- monarch/simulator/profiling.py +424 -0
- monarch/simulator/simulator.py +1052 -0
- monarch/simulator/task.py +255 -0
- monarch/simulator/tensor.py +373 -0
- monarch/simulator/trace.py +395 -0
- monarch/simulator/utils.py +41 -0
- monarch/simulator/worker.py +389 -0
- monarch/telemetry.py +19 -0
- monarch/tensor_worker_main.py +260 -0
- monarch/tensorboard.py +84 -0
- monarch/timer/__init__.py +21 -0
- monarch/timer/example_monarch.py +78 -0
- monarch/timer/example_spmd.py +55 -0
- monarch/timer/execution_timer.py +199 -0
- monarch/timer/execution_timer_test.py +131 -0
- monarch/tools/__init__.py +7 -0
- monarch/tools/cli.py +167 -0
- monarch/tools/commands.py +251 -0
- monarch/tools/components/__init__.py +7 -0
- monarch/tools/components/hyperactor.py +58 -0
- monarch/tools/config/__init__.py +20 -0
- monarch/tools/config/defaults.py +54 -0
- monarch/tools/mesh_spec.py +165 -0
- monarch/tools/network.py +69 -0
- monarch/worker/__init__.py +7 -0
- monarch/worker/_testing_function.py +481 -0
- monarch/worker/compiled_block.py +270 -0
- monarch/worker/debugger.py +125 -0
- monarch/worker/lines.py +47 -0
- monarch/worker/monitor.py +53 -0
- monarch/worker/worker.py +1191 -0
- monarch/world_mesh.py +34 -0
- monarch_supervisor/__init__.py +1044 -0
- monarch_supervisor/_testing.py +44 -0
- monarch_supervisor/function_call.py +30 -0
- monarch_supervisor/host.py +386 -0
- monarch_supervisor/launchers.py +145 -0
- monarch_supervisor/log_pstree.py +48 -0
- monarch_supervisor/logging.py +103 -0
- monarch_supervisor/python_executable.py +42 -0
- tests/__init__.py +0 -0
- tests/dispatch_bench.py +124 -0
- tests/dispatch_bench_helper.py +25 -0
- tests/error_test_binary.py +180 -0
- tests/simulator/__init__.py +0 -0
- tests/simulator/test_profiling.py +136 -0
- tests/simulator/test_simulator.py +411 -0
- tests/simulator/test_task.py +64 -0
- tests/simulator/test_worker.py +102 -0
- tests/sleep_binary.py +35 -0
- tests/test_actor_error.py +240 -0
- tests/test_alloc.py +25 -0
- tests/test_allocator.py +365 -0
- tests/test_coalescing.py +492 -0
- tests/test_controller.py +845 -0
- tests/test_device_mesh.py +132 -0
- tests/test_fault_tolerance.py +398 -0
- tests/test_future.py +94 -0
- tests/test_grad_generator.py +121 -0
- tests/test_mock_cuda.py +74 -0
- tests/test_pdb_actor.py +110 -0
- tests/test_python_actors.py +736 -0
- tests/test_remote_functions.py +1271 -0
- tests/test_rust_backend.py +217 -0
- tests/test_signal_safe_block_on.py +103 -0
- tests/test_sim_backend.py +54 -0
- tests/test_tensor_engine.py +52 -0
- torchmonarch_nightly-2025.6.27.dist-info/METADATA +94 -0
- torchmonarch_nightly-2025.6.27.dist-info/RECORD +165 -0
- torchmonarch_nightly-2025.6.27.dist-info/WHEEL +5 -0
- torchmonarch_nightly-2025.6.27.dist-info/entry_points.txt +3 -0
- torchmonarch_nightly-2025.6.27.dist-info/licenses/LICENSE +29 -0
- torchmonarch_nightly-2025.6.27.dist-info/top_level.txt +3 -0
@@ -0,0 +1,217 @@
|
|
1
|
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
2
|
+
# All rights reserved.
|
3
|
+
#
|
4
|
+
# This source code is licensed under the BSD-style license found in the
|
5
|
+
# LICENSE file in the root directory of this source tree.
|
6
|
+
|
7
|
+
# pyre-unsafe
|
8
|
+
|
9
|
+
from contextlib import contextmanager
|
10
|
+
from typing import Generator
|
11
|
+
from unittest import TestCase
|
12
|
+
|
13
|
+
import monarch
|
14
|
+
|
15
|
+
import pytest
|
16
|
+
import torch
|
17
|
+
import torch.utils._python_dispatch
|
18
|
+
from monarch import fetch_shard, no_mesh, remote, Stream
|
19
|
+
from monarch.common.device_mesh import DeviceMesh
|
20
|
+
from monarch.rust_local_mesh import local_meshes, LoggingLocation, SocketType
|
21
|
+
from torch.nn.attention import sdpa_kernel, SDPBackend
|
22
|
+
from torch.nn.functional import scaled_dot_product_attention
|
23
|
+
|
24
|
+
|
25
|
+
def simple_all_reduce(*args, **kwargs):
    """Return a tensor of ones with the same shape as the first positional argument.

    Any further positional or keyword arguments are accepted and ignored.
    """
    leading = args[0]
    return torch.ones(leading.shape)
|
27
|
+
|
28
|
+
|
29
|
+
# Rebind the module-level name to the object returned by `remote(...)` for the
# function at the given import path; the plain Python function defined just
# above is handed over as the `propagate` argument.
simple_all_reduce = remote(
    "monarch.worker._testing_function.simple_all_reduce_local",
    propagate=simple_all_reduce,
)
|
33
|
+
|
34
|
+
|
35
|
+
@contextmanager
def local_mesh(
    hosts: int = 1, gpu_per_host: int = 2, activate: bool = True
) -> Generator[DeviceMesh, None, None]:
    """Yield a device mesh created by ``monarch.rust_local_mesh.local_mesh``.

    The mesh is created with UNIX sockets and the default logging location.

    Args:
        hosts: forwarded as ``hosts``.
        gpu_per_host: forwarded as ``gpus_per_host``.
        activate: when true, the mesh is activated for as long as the caller's
            ``with`` body runs; otherwise it is yielded without activation.

    On normal completion ``exit()`` is called on the mesh. If an ``Exception``
    propagates, ``exit()`` is not called; the client is flagged as shut down
    (``client._shutdown = True``) and the exception is re-raised.
    """
    mesh_context = monarch.rust_local_mesh.local_mesh(
        hosts=hosts,
        gpus_per_host=gpu_per_host,
        socket_type=SocketType.UNIX,
        logging_location=LoggingLocation.DEFAULT,
    )
    with mesh_context as device_mesh:
        try:
            if not activate:
                yield device_mesh
            else:
                with device_mesh.activate():
                    yield device_mesh
            device_mesh.exit()
        except Exception:
            device_mesh.client._shutdown = True
            raise
|
55
|
+
|
56
|
+
|
57
|
+
# Set global timeout--sandcastle's timeout is 600s. A test that sandcastle times
|
58
|
+
# out is not counted as a failure, so we set a more restrictive timeout to
|
59
|
+
# ensure we see a hard failure in CI.
|
60
|
+
@pytest.mark.timeout(120)
@pytest.mark.skipif(
    torch.cuda.device_count() < 2,
    reason="Not enough GPUs, this test requires at least 2 GPUs",
)
class TestRustBackend(TestCase):
    """Tests that drive local meshes created through ``monarch.rust_local_mesh``."""

    def test_local_mesh_setup(self):
        """An in-place add on a mesh tensor is visible in the fetched shard."""
        with local_mesh():
            zeros = torch.zeros(3, 4)
            zeros.add_(1)
            pending = fetch_shard(zeros)

            with no_mesh.activate():
                fetched = pending.result()
        assert torch.equal(fetched, torch.ones(3, 4))

    def test_result_in_mesh(self):
        """``result()`` may be called while the mesh is still active."""
        with local_mesh():
            ones = torch.ones(3, 4)
            ones.add_(-1)
            # Assert calling result() is fine within an active mesh.
            fetched = fetch_shard(ones).result()
        assert torch.equal(fetched, torch.zeros(3, 4))

    def test_errors(self):
        """Mixing tensors across local/mesh/stream boundaries raises TypeError."""
        outside_tensor = torch.rand(3, 4)
        with local_mesh(2, 2) as dm:
            mesh_tensor = torch.rand(3, 4)

            # Tensor created before the mesh was active, combined with a mesh tensor.
            with pytest.raises(TypeError, match="LOCAL_TENSOR"):
                outside_tensor.add(mesh_tensor)

            # Tensor created on a sub-mesh, combined with a full-mesh tensor.
            with pytest.raises(TypeError, match="WRONG_MESH"):
                sub_mesh = dm(host=0)
                with sub_mesh.activate():
                    sub_tensor = torch.rand(3, 4)
                    sub_tensor.add(mesh_tensor)

            # Tensor used while a different stream is active.
            other = Stream("other")
            gpu_tensor = torch.rand(10).cuda()
            with pytest.raises(TypeError, match="WRONG_STREAM"):
                with other.activate():
                    gpu_tensor = gpu_tensor.reduce("host", "sum")

    def test_multi_hosts(self):
        """Fetching one shard twice matches; a different shard does not."""
        with local_mesh(hosts=2, gpu_per_host=2):
            source = torch.rand(3, 4).cuda()
            first = fetch_shard(source, {"host": 1, "gpu": 0}).result()
            repeat = fetch_shard(source, {"host": 1, "gpu": 0}).result()
            elsewhere = fetch_shard(source, {"host": 0, "gpu": 1}).result()
        assert torch.equal(first, repeat)
        assert not torch.equal(first, elsewhere)

    def test_fetch_preprocess(self):
        """``call_on_shard_and_fetch`` returns the remote function's result."""
        with local_mesh():
            processor = remote("monarch.worker._testing_function.do_some_processing")
            outcome = processor.call_on_shard_and_fetch("an argument").result()
            assert "an argument processed" == outcome

    def test_brutal_shutdown(self):
        """Calling ``exit()`` then ``deactivate()`` inside the context completes."""
        mesh_context = monarch.rust_local_mesh.local_mesh(
            hosts=1, gpus_per_host=1, socket_type=SocketType.UNIX
        )
        with mesh_context as dm:
            dm.exit()
            dm.deactivate()

    def test_results_filtering(self):
        """A fetch still resolves after an op that produces extra results."""
        with local_mesh(gpu_per_host=1):
            query = torch.rand(1, 1, 1, 1, dtype=torch.float16, device="cuda")
            key = torch.rand(1, 1, 1, 1, dtype=torch.float16, device="cuda")
            value = torch.rand(1, 1, 1, 1, dtype=torch.float16, device="cuda")
            with sdpa_kernel(backends=[SDPBackend.FLASH_ATTENTION]):
                # This function will send 9 results. Only 5 of them will be set.
                attention = scaled_dot_product_attention(query, key, value)
            pending = fetch_shard(attention)
            local_tensor = pending.result()
        assert len(local_tensor) == 1

    def test_live_function(self):
        """A function decorated with ``@remote`` in the test body can be called."""
        with local_mesh():

            @remote
            def has_nan(t):
                return torch.isnan(t).any().item()

            sample = torch.rand(3, 4)
            pending = has_nan.call_on_shard_and_fetch(
                sample, shard={"host": 0, "gpu": 0}
            )
            found_nan = pending.result()

        self.assertFalse(found_nan)

    def test_multiple_global_meshes(self):
        """
        This test is to validate we can have a single client process
        connecting to multiple global meshes. The global meshes are distinct
        from each other to provide native failure domain isolation.
        """
        replicas = 4
        with local_meshes(
            meshes=replicas,
            hosts_per_mesh=1,
            gpus_per_host=1,
            socket_type=SocketType.UNIX,
            logging_location=LoggingLocation.DEFAULT,
        ) as groups:
            fetched = []
            # Mesh number `index` produces a ones tensor of length `index + 1`.
            for index, group in enumerate(groups):
                with group.activate():
                    ones = torch.ones(index + 1)
                    fetched.append(fetch_shard(ones).result())
            for index in range(replicas):
                assert torch.equal(fetched[index], torch.ones(index + 1))

            for group in groups:
                group.exit()
                group.deactivate()

    def test_get_world_status(self) -> None:
        """``get_info()`` reports mesh labels and two device labels."""
        with local_mesh(gpu_per_host=2) as mesh:
            info = mesh.get_info()

            self.assertIsNotNone(info.mesh_labels)
            self.assertEqual(len(info.devices_labels), 2)

    def test_ivalue_problems(self) -> None:
        """Converting a CommandGroup that wraps a CallFunction must not crash."""
        with local_mesh(hosts=1, gpu_per_host=1):
            from typing import cast

            from monarch.common.messages import CallFunction, CommandGroup

            a = cast(monarch.Tensor, torch.rand(3, 4))
            result = monarch.Tensor(a._fake, a.mesh, a.stream)
            function = monarch.common.function.ResolvableFunctionFromPath(
                "torch.ops.aten.mul.Tensor"
            )
            stream_ref = a.stream._to_ref(a.mesh.client)
            msg = CallFunction(
                0,
                result,
                (),
                function,
                (2, a),
                {},
                stream_ref,
                a.mesh,
                [],
            )
            # Internally, this will call CallFunction(...).to_rust_message().
            # The 2 arg will be converted to an IValue tensor via rust + C++.
            # Then when the CommandGroup message gets converted to rust, it
            # will attempt to clone the rust CallFunction message, which will
            # attempt to clone the IValue tensor, which will cause a crash.
            # Upon attempting to clone the IValue tensor, our custom __torch_dispatch__
            # intercepts the following two calls:
            #   aten._to_copy.default () (2,) {'dtype': torch.float64, 'device': device(type='cpu')}
            #   aten.clone.default () (2,) {}

            with torch.utils._python_dispatch._disable_current_modes():
                CommandGroup([msg]).to_rust_message()
|
@@ -0,0 +1,103 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
3
|
+
# All rights reserved.
|
4
|
+
#
|
5
|
+
# This source code is licensed under the BSD-style license found in the
|
6
|
+
# LICENSE file in the root directory of this source tree.
|
7
|
+
|
8
|
+
# pyre-strict
|
9
|
+
|
10
|
+
"""
|
11
|
+
Integration test for signal_safe_block_on.
|
12
|
+
|
13
|
+
This test spawns a Python binary that calls a Rust function which sleeps indefinitely.
|
14
|
+
The test then sends SIGINT to the process and confirms that it exits properly,
|
15
|
+
verifying that signal_safe_block_on correctly handles signals.
|
16
|
+
"""
|
17
|
+
|
18
|
+
import importlib.resources
|
19
|
+
import os
|
20
|
+
import signal
|
21
|
+
import subprocess
|
22
|
+
import time
|
23
|
+
import unittest
|
24
|
+
|
25
|
+
import pytest
|
26
|
+
|
27
|
+
|
28
|
+
# oss_skip: importlib not pulling resource correctly in git CI, needs to be revisited
|
29
|
+
class TestSignalSafeBlockOn(unittest.TestCase):
    """Checks that a process blocked in signal_safe_block_on exits on SIGINT."""

    # pyre-ignore[56]
    @pytest.mark.oss_skip
    def test_sigint_handling(self) -> None:
        """
        Test that a process using signal_safe_block_on can be interrupted with SIGINT.

        This test:
        1. Spawns a subprocess running sleep_binary.py
        2. Waits for it to start
        3. Sends SIGINT to the process
        4. Verifies that the process exits within a reasonable timeout

        To validate that it will behave in the same way as a Ctrl-C in the shell,
        we launch the process in its own process group and send the signal to the
        process group instead of the process itself.
        """
        test_bin = importlib.resources.files("monarch.python.tests").joinpath(
            "test_bin"
        )
        # Start the subprocess in a new session so that it leads its own
        # process group.
        process = subprocess.Popen(
            [str(test_bin)],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            start_new_session=True,
        )

        try:
            # Looked up inside the try block so the child is still cleaned up
            # if the lookup fails.
            pgid = os.getpgid(process.pid)

            # Wait for the process to start and print its startup message.
            # A monotonic clock keeps the timeout immune to wall-clock jumps.
            # NOTE(review): readline() blocks until the child writes a line or
            # closes stdout, so the timeout is only re-checked between lines.
            startup_timeout = 10  # seconds
            startup_deadline = time.monotonic() + startup_timeout

            while time.monotonic() < startup_deadline:
                if process.stdout and "Starting sleep_binary" in (
                    process.stdout.readline() or ""
                ):
                    break
                time.sleep(0.1)
            else:
                self.fail("Subprocess did not start properly within timeout")

            # Give the process a moment to enter the sleep_indefinitely_for_unit_tests function
            time.sleep(1)

            # Send SIGINT to the whole process group, as a terminal would.
            os.killpg(pgid, signal.SIGINT)

            # Wait for the process to exit with a timeout
            exit_timeout = 5  # seconds
            try:
                process.wait(timeout=exit_timeout)
            except subprocess.TimeoutExpired:
                self.fail("Process did not exit after receiving SIGINT")

            # Check that the process exited with code 0 (clean exit)
            self.assertEqual(process.returncode, 0, "Process did not exit cleanly")

        finally:
            # Clean up in case the test fails
            if process.poll() is None:
                process.kill()
                process.wait()
            # Close the pipes explicitly so their file descriptors are not
            # leaked (and no ResourceWarning is emitted).
            for stream in (process.stdout, process.stderr):
                if stream is not None:
                    stream.close()
|
100
|
+
|
101
|
+
|
102
|
+
# Run the tests with unittest's own runner when this file is executed directly.
if __name__ == "__main__":
    unittest.main()
|
@@ -0,0 +1,54 @@
|
|
1
|
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
2
|
+
# All rights reserved.
|
3
|
+
#
|
4
|
+
# This source code is licensed under the BSD-style license found in the
|
5
|
+
# LICENSE file in the root directory of this source tree.
|
6
|
+
|
7
|
+
# pyre-unsafe
|
8
|
+
|
9
|
+
from contextlib import contextmanager
|
10
|
+
from typing import Generator, Optional
|
11
|
+
from unittest import TestCase
|
12
|
+
|
13
|
+
import pytest
|
14
|
+
|
15
|
+
import torch
|
16
|
+
from monarch import fetch_shard
|
17
|
+
from monarch.common.device_mesh import DeviceMesh
|
18
|
+
from monarch.sim_mesh import sim_mesh
|
19
|
+
|
20
|
+
|
21
|
+
@contextmanager
def local_sim_mesh(
    hosts: int = 1,
    # TODO: support multiple gpus in a mesh.
    gpu_per_host: int = 1,
    activate: bool = True,
    proxy_addr: Optional[str] = None,
) -> Generator[DeviceMesh, None, None]:
    """Yield the first mesh returned by ``sim_mesh`` when asked for one mesh.

    Args:
        hosts: forwarded as ``hosts``.
        gpu_per_host: forwarded as ``gpus_per_host``.
        activate: when true, the mesh is activated for as long as the caller's
            ``with`` body runs; otherwise it is yielded without activation.
        proxy_addr: forwarded unchanged as ``proxy_addr``.

    On normal completion ``exit()`` is called on the mesh. If an ``Exception``
    propagates, ``exit()`` is not called; the client is flagged as shut down
    (``client._shutdown = True``) and the exception is re-raised.
    """
    meshes = sim_mesh(
        n_meshes=1, hosts=hosts, gpus_per_host=gpu_per_host, proxy_addr=proxy_addr
    )
    sim_device_mesh = meshes[0]
    try:
        if not activate:
            yield sim_device_mesh
        else:
            with sim_device_mesh.activate():
                yield sim_device_mesh
        sim_device_mesh.exit()
    except Exception:
        sim_device_mesh.client._shutdown = True
        raise
|
43
|
+
|
44
|
+
|
45
|
+
# oss_skip: importlib not pulling resource correctly in git CI, needs to be revisited
|
46
|
+
@pytest.mark.oss_skip
class TestSimBackend(TestCase):
    """Smoke test for a mesh created through ``local_sim_mesh``."""

    def test_local_mesh_setup(self):
        """Fetching a shard from the simulated mesh yields some value."""
        with local_sim_mesh():
            zeros = torch.zeros(3, 4)
            zeros.add_(1)
            fetched = fetch_shard(zeros).result()
        # TODO: consider supporting a specified return value in the mock
        # worker; until then only the presence of a result is checked.
        assert fetched is not None
|
@@ -0,0 +1,52 @@
|
|
1
|
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
2
|
+
# All rights reserved.
|
3
|
+
#
|
4
|
+
# This source code is licensed under the BSD-style license found in the
|
5
|
+
# LICENSE file in the root directory of this source tree.
|
6
|
+
|
7
|
+
import monarch
|
8
|
+
import pytest
|
9
|
+
import torch
|
10
|
+
from monarch.mesh_controller import spawn_tensor_engine
|
11
|
+
from monarch.proc_mesh import proc_mesh
|
12
|
+
|
13
|
+
|
14
|
+
# Reusable marker: skip the decorated test when fewer than two CUDA devices
# are visible to torch.
two_gpu = pytest.mark.skipif(
    torch.cuda.device_count() < 2,
    reason="Not enough GPUs, this test requires at least 2 GPUs",
)
|
18
|
+
|
19
|
+
|
20
|
+
@two_gpu
def test_tensor_engine() -> None:
    """Inspect a doubled zeros tensor on the spawned mesh and on its flattened view."""
    procs = proc_mesh(gpus=2).get()
    device_mesh = spawn_tensor_engine(procs)

    with device_mesh.activate():
        from_mesh = monarch.inspect(2 * torch.zeros(3, 4))

    flat_mesh = device_mesh.flatten("all")
    with flat_mesh.activate():
        from_flat = monarch.inspect(2 * torch.zeros(3, 4), all=1)

    # Compared with no mesh active.
    assert torch.allclose(torch.zeros(3, 4), from_mesh)
    assert torch.allclose(torch.zeros(3, 4), from_flat)

    device_mesh.exit()
|
36
|
+
|
37
|
+
|
38
|
+
@two_gpu
def test_proc_mesh_tensor_engine() -> None:
    """Inspect a scaled rank tensor per GPU, then move one slice to a sub-mesh."""
    procs = proc_mesh(gpus=2).get()
    with procs.activate():
        scaled_rank = 10 * procs.rank_tensor("gpus").cuda()
        on_gpu0 = monarch.inspect(scaled_rank, hosts=0, gpus=0)
        on_gpu1 = monarch.inspect(scaled_rank, hosts=0, gpus=1)

    gpu1_mesh = procs.slice(gpus=1)
    with gpu1_mesh.activate():
        moved = monarch.slice_mesh(scaled_rank, gpus=1).to_mesh(gpu1_mesh)
        rescaled = monarch.inspect(moved * 10)

    assert on_gpu0 == 0
    assert on_gpu1 == 10
    assert rescaled == 100
|
@@ -0,0 +1,94 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: torchmonarch-nightly
|
3
|
+
Version: 2025.6.27
|
4
|
+
Summary: Monarch: Single controller library
|
5
|
+
Author: Meta
|
6
|
+
Author-email: oncall+monarch@xmail.facebook.com
|
7
|
+
License: BSD-3-Clause
|
8
|
+
Requires-Python: >= 3.10
|
9
|
+
Description-Content-Type: text/markdown
|
10
|
+
License-File: LICENSE
|
11
|
+
Requires-Dist: torch
|
12
|
+
Requires-Dist: pyzmq
|
13
|
+
Requires-Dist: requests
|
14
|
+
Requires-Dist: numpy
|
15
|
+
Requires-Dist: pyre-extensions
|
16
|
+
Requires-Dist: cloudpickle
|
17
|
+
Requires-Dist: torchx-nightly
|
18
|
+
Dynamic: author
|
19
|
+
Dynamic: author-email
|
20
|
+
Dynamic: description
|
21
|
+
Dynamic: description-content-type
|
22
|
+
Dynamic: license
|
23
|
+
Dynamic: license-file
|
24
|
+
Dynamic: requires-dist
|
25
|
+
Dynamic: requires-python
|
26
|
+
Dynamic: summary
|
27
|
+
|
28
|
+
# Monarch 🦋
|
29
|
+
|
30
|
+
**Monarch** is a distributed execution engine for PyTorch. Our overall goal is
|
31
|
+
to deliver the high-quality user experience that people get from single-GPU
|
32
|
+
PyTorch, but at cluster scale.
|
33
|
+
|
34
|
+
> ⚠️ **Early Development Warning** Monarch is currently in an experimental
|
35
|
+
> stage. You should expect bugs, incomplete features, and APIs that may change
|
36
|
+
> in future versions. The project welcomes bugfixes, but to make sure things are
|
37
|
+
> well coordinated you should discuss any significant change before starting the
|
38
|
+
> work. It's recommended that you signal your intention to contribute in the
|
39
|
+
> issue tracker, either by filing a new issue or by claiming an existing one.
|
40
|
+
|
41
|
+
Note: Monarch is currently only supported on Linux systems.
|
42
|
+
|
43
|
+
## Installation
|
44
|
+
|
45
|
+
`pip install torchmonarch-nightly`
|
46
|
+
|
47
|
+
or manually
|
48
|
+
|
49
|
+
```sh
|
50
|
+
|
51
|
+
# Create and activate the conda environment
|
52
|
+
conda create -n monarchenv python=3.10 -y
|
53
|
+
conda activate monarchenv
|
54
|
+
|
55
|
+
# Install nightly rust toolchain
|
56
|
+
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
|
57
|
+
rustup toolchain install nightly
|
58
|
+
rustup default nightly
|
59
|
+
|
60
|
+
# Install non-python dependencies
|
61
|
+
conda install libunwind -y
|
62
|
+
|
63
|
+
# Install the correct cuda and cuda-toolkit versions for your machine
|
64
|
+
sudo dnf install cuda-toolkit-12-0 cuda-12-0
|
65
|
+
|
66
|
+
# Install clang-dev and nccl-dev
|
67
|
+
sudo dnf install clang-devel libnccl-devel
|
68
|
+
# Or, in some environments, the following may be necessary instead
|
69
|
+
conda install -c conda-forge clangdev nccl
|
70
|
+
conda update -n monarchenv --all -c conda-forge -y
|
71
|
+
|
72
|
+
# Install build dependencies
|
73
|
+
pip install -r build-requirements.txt
|
74
|
+
# Install test dependencies
|
75
|
+
pip install -r python/tests/requirements.txt
|
76
|
+
|
77
|
+
# Build and install Monarch
|
78
|
+
pip install --no-build-isolation .
|
79
|
+
# or setup for development
|
80
|
+
pip install --no-build-isolation -e .
|
81
|
+
|
82
|
+
# Run unit tests. consider -s for more verbose output
|
83
|
+
pytest python/tests/ -v -m "not oss_skip"
|
84
|
+
```
|
85
|
+
|
86
|
+
## Running examples
|
87
|
+
|
88
|
+
Check out the `examples/` directory for demonstrations of how to use Monarch's APIs.
|
89
|
+
|
90
|
+
We'll be adding more examples as we stabilize and polish functionality!
|
91
|
+
|
92
|
+
## License
|
93
|
+
|
94
|
+
Monarch is BSD-3 licensed, as found in the [LICENSE](LICENSE) file.
|