torchmonarch-nightly 2025.6.27__cp311-cp311-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (165) hide show
  1. monarch/__init__.py +189 -0
  2. monarch/_monarch/__init__.py +5 -0
  3. monarch/_monarch/hyperactor/__init__.py +58 -0
  4. monarch/_monarch/selection/__init__.py +13 -0
  5. monarch/_monarch/worker/__init__.py +0 -0
  6. monarch/_monarch/worker/debugger.py +117 -0
  7. monarch/_monarch/worker/logging.py +107 -0
  8. monarch/_rust_bindings.so +0 -0
  9. monarch/_testing.py +230 -0
  10. monarch/actor_mesh.py +761 -0
  11. monarch/allocator.py +220 -0
  12. monarch/bootstrap_main.py +59 -0
  13. monarch/builtins/__init__.py +14 -0
  14. monarch/builtins/log.py +22 -0
  15. monarch/builtins/random.py +68 -0
  16. monarch/cached_remote_function.py +257 -0
  17. monarch/code_sync.py +10 -0
  18. monarch/common/_C.pyi +11 -0
  19. monarch/common/_C.so +0 -0
  20. monarch/common/__init__.py +0 -0
  21. monarch/common/_coalescing.py +308 -0
  22. monarch/common/_device_utils.py +18 -0
  23. monarch/common/_tensor_to_table.py +172 -0
  24. monarch/common/base_tensor.py +28 -0
  25. monarch/common/borrows.py +143 -0
  26. monarch/common/client.py +690 -0
  27. monarch/common/constants.py +10 -0
  28. monarch/common/context_manager.py +40 -0
  29. monarch/common/controller_api.py +104 -0
  30. monarch/common/device_mesh.py +417 -0
  31. monarch/common/fake.py +55 -0
  32. monarch/common/function.py +160 -0
  33. monarch/common/function_caching.py +164 -0
  34. monarch/common/future.py +168 -0
  35. monarch/common/invocation.py +125 -0
  36. monarch/common/mast.py +221 -0
  37. monarch/common/messages.py +573 -0
  38. monarch/common/mock_cuda.py +41 -0
  39. monarch/common/opaque_ref.py +98 -0
  40. monarch/common/pickle_flatten.py +48 -0
  41. monarch/common/pipe.py +152 -0
  42. monarch/common/process_group.py +55 -0
  43. monarch/common/recording.py +127 -0
  44. monarch/common/reference.py +33 -0
  45. monarch/common/remote.py +297 -0
  46. monarch/common/selection.py +9 -0
  47. monarch/common/shape.py +229 -0
  48. monarch/common/stream.py +114 -0
  49. monarch/common/tensor.py +814 -0
  50. monarch/common/tensor_factory.py +31 -0
  51. monarch/common/tree.py +73 -0
  52. monarch/controller/__init__.py +7 -0
  53. monarch/controller/backend.py +223 -0
  54. monarch/controller/controller.py +223 -0
  55. monarch/controller/debugger.py +47 -0
  56. monarch/controller/history.py +90 -0
  57. monarch/controller/rust_backend/__init__.py +7 -0
  58. monarch/controller/rust_backend/controller.py +245 -0
  59. monarch/debugger.py +379 -0
  60. monarch/fetch.py +55 -0
  61. monarch/future.py +76 -0
  62. monarch/gradient/__init__.py +11 -0
  63. monarch/gradient/_gradient_generator.pyi +22 -0
  64. monarch/gradient/_gradient_generator.so +0 -0
  65. monarch/gradient_generator.py +185 -0
  66. monarch/memory.py +43 -0
  67. monarch/mesh_controller.py +271 -0
  68. monarch/monarch_controller +0 -0
  69. monarch/notebook.py +761 -0
  70. monarch/opaque_module.py +235 -0
  71. monarch/opaque_object.py +88 -0
  72. monarch/parallel/__init__.py +9 -0
  73. monarch/parallel/pipelining/__init__.py +7 -0
  74. monarch/parallel/pipelining/runtime.py +847 -0
  75. monarch/parallel/pipelining/schedule_ir.py +692 -0
  76. monarch/parallel/pipelining/scheduler.py +249 -0
  77. monarch/pdb_wrapper.py +135 -0
  78. monarch/proc_mesh.py +299 -0
  79. monarch/profiler.py +160 -0
  80. monarch/python_local_mesh.py +107 -0
  81. monarch/random.py +61 -0
  82. monarch/rdma.py +162 -0
  83. monarch/remote_class.py +114 -0
  84. monarch/rust_backend_mesh.py +280 -0
  85. monarch/rust_local_mesh.py +1402 -0
  86. monarch/sim_mesh.py +359 -0
  87. monarch/simulator/__init__.py +7 -0
  88. monarch/simulator/command_history.py +424 -0
  89. monarch/simulator/config.py +21 -0
  90. monarch/simulator/interface.py +59 -0
  91. monarch/simulator/ir.py +770 -0
  92. monarch/simulator/mock_controller.py +214 -0
  93. monarch/simulator/profiling.py +424 -0
  94. monarch/simulator/simulator.py +1052 -0
  95. monarch/simulator/task.py +255 -0
  96. monarch/simulator/tensor.py +373 -0
  97. monarch/simulator/trace.py +395 -0
  98. monarch/simulator/utils.py +41 -0
  99. monarch/simulator/worker.py +389 -0
  100. monarch/telemetry.py +19 -0
  101. monarch/tensor_worker_main.py +260 -0
  102. monarch/tensorboard.py +84 -0
  103. monarch/timer/__init__.py +21 -0
  104. monarch/timer/example_monarch.py +78 -0
  105. monarch/timer/example_spmd.py +55 -0
  106. monarch/timer/execution_timer.py +199 -0
  107. monarch/timer/execution_timer_test.py +131 -0
  108. monarch/tools/__init__.py +7 -0
  109. monarch/tools/cli.py +167 -0
  110. monarch/tools/commands.py +251 -0
  111. monarch/tools/components/__init__.py +7 -0
  112. monarch/tools/components/hyperactor.py +58 -0
  113. monarch/tools/config/__init__.py +20 -0
  114. monarch/tools/config/defaults.py +54 -0
  115. monarch/tools/mesh_spec.py +165 -0
  116. monarch/tools/network.py +69 -0
  117. monarch/worker/__init__.py +7 -0
  118. monarch/worker/_testing_function.py +481 -0
  119. monarch/worker/compiled_block.py +270 -0
  120. monarch/worker/debugger.py +125 -0
  121. monarch/worker/lines.py +47 -0
  122. monarch/worker/monitor.py +53 -0
  123. monarch/worker/worker.py +1191 -0
  124. monarch/world_mesh.py +34 -0
  125. monarch_supervisor/__init__.py +1044 -0
  126. monarch_supervisor/_testing.py +44 -0
  127. monarch_supervisor/function_call.py +30 -0
  128. monarch_supervisor/host.py +386 -0
  129. monarch_supervisor/launchers.py +145 -0
  130. monarch_supervisor/log_pstree.py +48 -0
  131. monarch_supervisor/logging.py +103 -0
  132. monarch_supervisor/python_executable.py +42 -0
  133. tests/__init__.py +0 -0
  134. tests/dispatch_bench.py +124 -0
  135. tests/dispatch_bench_helper.py +25 -0
  136. tests/error_test_binary.py +180 -0
  137. tests/simulator/__init__.py +0 -0
  138. tests/simulator/test_profiling.py +136 -0
  139. tests/simulator/test_simulator.py +411 -0
  140. tests/simulator/test_task.py +64 -0
  141. tests/simulator/test_worker.py +102 -0
  142. tests/sleep_binary.py +35 -0
  143. tests/test_actor_error.py +240 -0
  144. tests/test_alloc.py +25 -0
  145. tests/test_allocator.py +365 -0
  146. tests/test_coalescing.py +492 -0
  147. tests/test_controller.py +845 -0
  148. tests/test_device_mesh.py +132 -0
  149. tests/test_fault_tolerance.py +398 -0
  150. tests/test_future.py +94 -0
  151. tests/test_grad_generator.py +121 -0
  152. tests/test_mock_cuda.py +74 -0
  153. tests/test_pdb_actor.py +110 -0
  154. tests/test_python_actors.py +736 -0
  155. tests/test_remote_functions.py +1271 -0
  156. tests/test_rust_backend.py +217 -0
  157. tests/test_signal_safe_block_on.py +103 -0
  158. tests/test_sim_backend.py +54 -0
  159. tests/test_tensor_engine.py +52 -0
  160. torchmonarch_nightly-2025.6.27.dist-info/METADATA +94 -0
  161. torchmonarch_nightly-2025.6.27.dist-info/RECORD +165 -0
  162. torchmonarch_nightly-2025.6.27.dist-info/WHEEL +5 -0
  163. torchmonarch_nightly-2025.6.27.dist-info/entry_points.txt +3 -0
  164. torchmonarch_nightly-2025.6.27.dist-info/licenses/LICENSE +29 -0
  165. torchmonarch_nightly-2025.6.27.dist-info/top_level.txt +3 -0
monarch/_testing.py ADDED
@@ -0,0 +1,230 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # pyre-unsafe
8
+
9
+ import logging
10
+ import tempfile
11
+ import time
12
+ from contextlib import contextmanager, ExitStack
13
+ from typing import Any, Callable, Dict, Generator, Literal, Optional
14
+
15
+ import monarch_supervisor
16
+ from monarch.common.client import Client
17
+ from monarch.common.device_mesh import DeviceMesh
18
+ from monarch.common.invocation import DeviceException, RemoteException
19
+ from monarch.common.shape import NDSlice
20
+ from monarch.controller.backend import ProcessBackend
21
+ from monarch.mesh_controller import spawn_tensor_engine
22
+ from monarch.proc_mesh import proc_mesh, ProcMesh
23
+ from monarch.python_local_mesh import PythonLocalContext
24
+ from monarch.rust_local_mesh import (
25
+ local_mesh,
26
+ LoggingLocation,
27
+ ProcessCache,
28
+ SocketType,
29
+ )
30
+ from monarch.simulator.mock_controller import MockController
31
+ from monarch.world_mesh import world_mesh
32
+
33
+
34
class TestingContext:
    """
    Context manager for testing.
    Creates a local device mesh for a given number of hosts and gpus per host.
    Importantly, it also caches the worker processes so that tests can reuse them
    without having to reinitialize torch/NCCL.

    ``local_device_mesh`` is itself a context manager and activates the mesh
    by default, so a typical test looks like this.

    Example::
        with TestingContext() as c:
            with c.local_device_mesh(2, 2) as local_mesh:
                x = torch.rand(3, 4)
                local_tensor = fetch_shard(x).result()
    """

    # Keep pytest from collecting this class as a test case (name starts with "Test").
    __test__ = False

    def __init__(self):
        # Owns every long-lived resource; unwound in __exit__.
        self.cleanup = ExitStack()
        # (num_hosts, gpu_per_host) -> (ctx, hosts, processes) for the "py" backend.
        self._py_process_cache = {}
        # Populated by __enter__; required by the "rs" backend.
        self._rust_process_cache = None
        # (num_hosts, gpu_per_host) -> ProcMesh for the "mesh" backend.
        self._proc_mesh_cache: Dict[Any, ProcMesh] = {}

    @contextmanager
    def _get_context(self, num_hosts, gpu_per_host):
        """Create a python-local context and its worker processes.

        Yields a ``(ctx, hosts, processes)`` tuple and shuts the context down
        when the generator is resumed normally.
        """
        # Since we are local, there isn't a lot of latency involved, so use a
        # short heartbeat interval so that host managers exit quickly once
        # they stop hearing from the supervisor. This mutates a module-level
        # setting of monarch_supervisor.
        # NOTE(review): the previous comment said "0.5 seconds" while the value
        # assigned is 1 -- confirm the intended timeout/units.
        monarch_supervisor.HEARTBEAT_INTERVAL = 1
        ctx = PythonLocalContext(N=num_hosts)
        store = ProcessBackend._create_store()
        processes = ProcessBackend._create_pg(
            ctx.ctx, ctx.hosts, gpu_per_host, store, _restartable=True
        )
        yield ctx.ctx, ctx.hosts, processes
        # NOTE(review): this only runs on a clean exit; if an exception is
        # thrown into the generator the shutdown is skipped. Presumably the
        # short heartbeat above is what reaps the hosts in that case -- confirm
        # before wrapping this in try/finally.
        ctx.shutdown()

    def _processes(self, num_hosts, gpu_per_host):
        """Return the cached ``(ctx, hosts, processes)`` for this topology.

        The context is created on first use and registered on ``self.cleanup``
        so it lives until the TestingContext exits.
        """
        key = (num_hosts, gpu_per_host)
        if key not in self._py_process_cache:
            self._py_process_cache[key] = self.cleanup.enter_context(
                self._get_context(num_hosts, gpu_per_host)
            )
        return self._py_process_cache[key]

    @contextmanager
    def local_py_device_mesh(
        self,
        num_hosts,
        gpu_per_host,
    ) -> Generator[DeviceMesh, None, None]:
        """Yield a device mesh built on the cached python-backend processes."""
        ctx, hosts, processes = self._processes(num_hosts, gpu_per_host)
        dm = world_mesh(ctx, hosts, gpu_per_host, _processes=processes)
        try:
            yield dm
            # Keep the process group alive so the cached workers can be reused.
            dm.client.shutdown(destroy_pg=False)
        except Exception:
            # abnormal exit, so we just make sure we do not try to communicate in destructors,
            # but we do not wait for workers to exit since we do not know what state they are in.
            dm.client._shutdown = True
            raise

    @contextmanager
    def local_rust_device_mesh(
        self,
        num_hosts,
        gpu_per_host,
        controller_params=None,
    ) -> Generator[DeviceMesh, None, None]:
        """Yield a device mesh on a fresh rust system using cached processes.

        Raises:
            RuntimeError: if the TestingContext has not been entered, since
                the rust process cache is only created in ``__enter__``.
        """
        cache = self._rust_process_cache
        if cache is None:
            raise RuntimeError(
                "TestingContext must be entered (`with TestingContext() as c:`) "
                "before creating a rust device mesh"
            )
        # Create a new system and mesh for test.
        with local_mesh(
            hosts=num_hosts,
            gpus_per_host=gpu_per_host,
            socket_type=SocketType.UNIX,
            logging_location=LoggingLocation.DEFAULT,
            system_factory=cache.get_system_server(),
            controller_factory=cache.get_controller_server(),
            worker_factory=cache.get_worker_servers(
                num_worker_procs=num_hosts * gpu_per_host,
                gpus_per_host=gpu_per_host,
            ),
            controller_params=controller_params,
        ) as dm:
            try:
                yield dm
                dm.exit()
            except Exception:
                # abnormal exit: avoid communicating with workers in destructors.
                dm.client._shutdown = True
                raise
            finally:
                # Shutdown the system.
                # pyre-ignore: Undefined attribute
                dm.client.inner._actor.stop()

    @contextmanager
    def local_engine_on_proc_mesh(
        self,
        num_hosts,
        gpu_per_host,
    ) -> Generator[DeviceMesh, None, None]:
        """Yield a tensor-engine device mesh spawned on a cached proc mesh."""
        key = (num_hosts, gpu_per_host)
        if key not in self._proc_mesh_cache:
            self._proc_mesh_cache[key] = proc_mesh(
                hosts=num_hosts, gpus=gpu_per_host
            ).get()

        dm = spawn_tensor_engine(self._proc_mesh_cache[key])
        # Use the same dimension names as the other backends.
        dm = dm.rename(hosts="host", gpus="gpu")
        try:
            yield dm
            dm.exit()
        except Exception:
            # abnormal exit, so we just make sure we do not try to communicate in destructors,
            # but we do not wait for workers to exit since we do not know what state they are in.
            dm.client._shutdown = True
            raise

    @contextmanager
    def local_device_mesh(
        self,
        num_hosts,
        gpu_per_host,
        activate=True,
        backend: Literal["py", "rs", "mesh"] = "py",
        controller_params=None,
    ) -> Generator[DeviceMesh, None, None]:
        """Yield a local device mesh for the requested backend.

        Args:
            num_hosts: number of hosts in the mesh.
            gpu_per_host: number of gpus per host.
            activate: when true, the mesh is activated for the duration of
                the ``with`` body.
            backend: ``"py"``, ``"rs"`` or ``"mesh"``.
            controller_params: forwarded to the ``"rs"`` backend only.

        Raises:
            ValueError: if ``backend`` is not one of the supported names.
        """
        start = time.time()
        if backend == "rs":
            generator = self.local_rust_device_mesh(
                num_hosts, gpu_per_host, controller_params=controller_params
            )
        elif backend == "py":
            generator = self.local_py_device_mesh(num_hosts, gpu_per_host)
        elif backend == "mesh":
            generator = self.local_engine_on_proc_mesh(num_hosts, gpu_per_host)
        else:
            raise ValueError(f"invalid backend: {backend}")
        with generator as dm:
            end = time.time()
            logging.info("initialized mesh in {:.2f}s".format(end - start))
            if activate:
                with dm.activate():
                    yield dm
            else:
                yield dm
            # Restart the clock so the next log line measures only teardown.
            start = time.time()
        end = time.time()
        logging.info("shutdown mesh in {:.2f}s".format(end - start))

    def __enter__(self):
        """Start the rust process cache (and its log directory); return self."""
        start = time.time()
        try:
            self._log_dir = self.cleanup.enter_context(
                tempfile.TemporaryDirectory(prefix="rust_cached_workers.")
            )
            self._rust_process_cache = self.cleanup.enter_context(
                ProcessCache(
                    logging_location=LoggingLocation.DEFAULT,
                    logging_dir=self._log_dir,
                )
            )
        except BaseException:
            # __exit__ is not invoked when __enter__ raises, so unwind whatever
            # was already registered (e.g. the temporary directory) here.
            self.cleanup.close()
            raise
        end = time.time()
        logging.info("started process caches in {:.2f}s".format(end - start))
        return self

    def __exit__(self, *args):
        """Tear down every cached resource registered on ``self.cleanup``."""
        start = time.time()
        self.cleanup.__exit__(*args)
        end = time.time()
        logging.info("shutdown process caches in {:.2f}s".format(end - start))
203
+
204
+
205
def mock_mesh(hosts: int, gpus: int):
    """Build a ``hosts`` x ``gpus`` DeviceMesh driven by a MockController.

    The returned mesh has dimensions named ``("host", "gpu")`` and an ``exit``
    attribute that shuts down the underlying client, optionally passing along
    an error.
    """
    controller = MockController(hosts * gpus)
    mesh_client = Client(controller, controller.world_size, controller.gpu_per_host)
    # Row-major layout: one row per host, one column per gpu.
    layout = NDSlice(offset=0, sizes=[hosts, gpus], strides=[gpus, 1])
    mesh = DeviceMesh(mesh_client, layout, ("host", "gpu"))

    def exit(
        error: Optional[RemoteException | DeviceException | Exception] = None,
    ) -> None:
        mesh_client.shutdown(True, error)

    mesh.exit = exit
    return mesh
226
+
227
+
228
class BackendType:
    """String constants for the ``backend`` argument of
    ``TestingContext.local_device_mesh``."""

    PY = "py"
    RS = "rs"
    # local_device_mesh also accepts "mesh"; expose it alongside the others.
    MESH = "mesh"