torchmonarch-nightly 2025.6.4__cp310-cp310-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157) hide show
  1. monarch/__init__.py +189 -0
  2. monarch/_monarch/__init__.py +5 -0
  3. monarch/_monarch/hyperactor/__init__.py +74 -0
  4. monarch/_monarch/selection/__init__.py +13 -0
  5. monarch/_monarch/worker/__init__.py +0 -0
  6. monarch/_monarch/worker/debugger.py +117 -0
  7. monarch/_monarch/worker/logging.py +107 -0
  8. monarch/_rust_bindings.so +0 -0
  9. monarch/_testing.py +198 -0
  10. monarch/actor_mesh.py +692 -0
  11. monarch/allocator.py +62 -0
  12. monarch/bootstrap_main.py +75 -0
  13. monarch/builtins/__init__.py +14 -0
  14. monarch/builtins/log.py +22 -0
  15. monarch/builtins/random.py +69 -0
  16. monarch/cached_remote_function.py +257 -0
  17. monarch/common/_C.pyi +11 -0
  18. monarch/common/_C.so +0 -0
  19. monarch/common/__init__.py +0 -0
  20. monarch/common/_coalescing.py +308 -0
  21. monarch/common/_device_utils.py +18 -0
  22. monarch/common/_tensor_to_table.py +172 -0
  23. monarch/common/base_tensor.py +28 -0
  24. monarch/common/borrows.py +143 -0
  25. monarch/common/client.py +646 -0
  26. monarch/common/constants.py +10 -0
  27. monarch/common/context_manager.py +40 -0
  28. monarch/common/controller_api.py +104 -0
  29. monarch/common/device_mesh.py +443 -0
  30. monarch/common/fake.py +55 -0
  31. monarch/common/function.py +160 -0
  32. monarch/common/function_caching.py +164 -0
  33. monarch/common/future.py +168 -0
  34. monarch/common/invocation.py +125 -0
  35. monarch/common/mast.py +221 -0
  36. monarch/common/messages.py +572 -0
  37. monarch/common/mock_cuda.py +41 -0
  38. monarch/common/opaque_ref.py +98 -0
  39. monarch/common/pickle_flatten.py +48 -0
  40. monarch/common/pipe.py +152 -0
  41. monarch/common/process_group.py +55 -0
  42. monarch/common/recording.py +127 -0
  43. monarch/common/reference.py +33 -0
  44. monarch/common/remote.py +304 -0
  45. monarch/common/selection.py +9 -0
  46. monarch/common/shape.py +204 -0
  47. monarch/common/stream.py +111 -0
  48. monarch/common/tensor.py +793 -0
  49. monarch/common/tensor_factory.py +31 -0
  50. monarch/common/tree.py +73 -0
  51. monarch/controller/__init__.py +7 -0
  52. monarch/controller/backend.py +223 -0
  53. monarch/controller/controller.py +223 -0
  54. monarch/controller/debugger.py +47 -0
  55. monarch/controller/history.py +90 -0
  56. monarch/controller/rust_backend/__init__.py +7 -0
  57. monarch/controller/rust_backend/controller.py +245 -0
  58. monarch/fetch.py +55 -0
  59. monarch/future.py +25 -0
  60. monarch/gradient/__init__.py +11 -0
  61. monarch/gradient/_gradient_generator.pyi +22 -0
  62. monarch/gradient/_gradient_generator.so +0 -0
  63. monarch/gradient_generator.py +185 -0
  64. monarch/memory.py +43 -0
  65. monarch/monarch_controller +0 -0
  66. monarch/notebook.py +761 -0
  67. monarch/opaque_module.py +235 -0
  68. monarch/opaque_object.py +88 -0
  69. monarch/parallel/__init__.py +9 -0
  70. monarch/parallel/pipelining/__init__.py +7 -0
  71. monarch/parallel/pipelining/runtime.py +847 -0
  72. monarch/parallel/pipelining/schedule_ir.py +692 -0
  73. monarch/parallel/pipelining/scheduler.py +249 -0
  74. monarch/proc_mesh.py +188 -0
  75. monarch/profiler.py +160 -0
  76. monarch/python_local_mesh.py +107 -0
  77. monarch/random.py +61 -0
  78. monarch/rdma.py +190 -0
  79. monarch/remote_class.py +114 -0
  80. monarch/rust_backend_mesh.py +280 -0
  81. monarch/rust_local_mesh.py +1402 -0
  82. monarch/sim_mesh.py +357 -0
  83. monarch/simulator/__init__.py +7 -0
  84. monarch/simulator/command_history.py +424 -0
  85. monarch/simulator/config.py +21 -0
  86. monarch/simulator/interface.py +59 -0
  87. monarch/simulator/ir.py +770 -0
  88. monarch/simulator/mock_controller.py +214 -0
  89. monarch/simulator/profiling.py +424 -0
  90. monarch/simulator/simulator.py +1052 -0
  91. monarch/simulator/task.py +255 -0
  92. monarch/simulator/tensor.py +373 -0
  93. monarch/simulator/trace.py +395 -0
  94. monarch/simulator/utils.py +41 -0
  95. monarch/simulator/worker.py +389 -0
  96. monarch/tensor_worker_main.py +260 -0
  97. monarch/tensorboard.py +84 -0
  98. monarch/timer/__init__.py +21 -0
  99. monarch/timer/example_monarch.py +78 -0
  100. monarch/timer/example_spmd.py +55 -0
  101. monarch/timer/execution_timer.py +199 -0
  102. monarch/timer/execution_timer_test.py +131 -0
  103. monarch/tools/__init__.py +7 -0
  104. monarch/tools/cli.py +167 -0
  105. monarch/tools/commands.py +189 -0
  106. monarch/tools/components/__init__.py +7 -0
  107. monarch/tools/components/hyperactor.py +57 -0
  108. monarch/tools/config/__init__.py +20 -0
  109. monarch/tools/config/defaults.py +54 -0
  110. monarch/tools/mesh_spec.py +121 -0
  111. monarch/worker/__init__.py +7 -0
  112. monarch/worker/_testing_function.py +481 -0
  113. monarch/worker/compiled_block.py +270 -0
  114. monarch/worker/debugger.py +125 -0
  115. monarch/worker/lines.py +47 -0
  116. monarch/worker/monitor.py +53 -0
  117. monarch/worker/worker.py +1191 -0
  118. monarch/world_mesh.py +34 -0
  119. monarch_supervisor/__init__.py +1044 -0
  120. monarch_supervisor/_testing.py +44 -0
  121. monarch_supervisor/function_call.py +30 -0
  122. monarch_supervisor/host.py +386 -0
  123. monarch_supervisor/launchers.py +145 -0
  124. monarch_supervisor/log_pstree.py +48 -0
  125. monarch_supervisor/logging.py +103 -0
  126. monarch_supervisor/python_executable.py +42 -0
  127. tests/__init__.py +0 -0
  128. tests/dispatch_bench.py +124 -0
  129. tests/dispatch_bench_helper.py +25 -0
  130. tests/error_test_binary.py +139 -0
  131. tests/simulator/__init__.py +0 -0
  132. tests/simulator/test_profiling.py +136 -0
  133. tests/simulator/test_simulator.py +411 -0
  134. tests/simulator/test_task.py +64 -0
  135. tests/simulator/test_worker.py +102 -0
  136. tests/sleep_binary.py +35 -0
  137. tests/test_actor_error.py +112 -0
  138. tests/test_alloc.py +25 -0
  139. tests/test_coalescing.py +492 -0
  140. tests/test_controller.py +835 -0
  141. tests/test_device_mesh.py +132 -0
  142. tests/test_fault_tolerance.py +398 -0
  143. tests/test_future.py +94 -0
  144. tests/test_grad_generator.py +121 -0
  145. tests/test_mock_cuda.py +74 -0
  146. tests/test_pdb_actor.py +110 -0
  147. tests/test_python_actors.py +372 -0
  148. tests/test_remote_functions.py +1271 -0
  149. tests/test_rust_backend.py +182 -0
  150. tests/test_signal_safe_block_on.py +103 -0
  151. tests/test_sim_backend.py +54 -0
  152. torchmonarch_nightly-2025.6.4.dist-info/METADATA +94 -0
  153. torchmonarch_nightly-2025.6.4.dist-info/RECORD +157 -0
  154. torchmonarch_nightly-2025.6.4.dist-info/WHEEL +5 -0
  155. torchmonarch_nightly-2025.6.4.dist-info/entry_points.txt +3 -0
  156. torchmonarch_nightly-2025.6.4.dist-info/licenses/LICENSE +29 -0
  157. torchmonarch_nightly-2025.6.4.dist-info/top_level.txt +3 -0
monarch/_testing.py ADDED
@@ -0,0 +1,198 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # pyre-unsafe
8
+
9
+ import logging
10
+ import tempfile
11
+ import time
12
+ from contextlib import contextmanager, ExitStack
13
+ from typing import Callable, Generator, Optional
14
+
15
+ import monarch_supervisor
16
+ from monarch.common.client import Client
17
+ from monarch.common.device_mesh import DeviceMesh
18
+ from monarch.common.invocation import DeviceException, RemoteException
19
+ from monarch.common.shape import NDSlice
20
+ from monarch.controller.backend import ProcessBackend
21
+ from monarch.python_local_mesh import PythonLocalContext
22
+ from monarch.rust_local_mesh import (
23
+ local_mesh,
24
+ LoggingLocation,
25
+ ProcessCache,
26
+ SocketType,
27
+ )
28
+ from monarch.simulator.mock_controller import MockController
29
+ from monarch.world_mesh import world_mesh
30
+
31
+
32
class TestingContext:
    """
    Context manager for testing.
    Creates a local device mesh for a given number of hosts and gpus per host.
    Importantly, it also caches the worker processes so that tests can reuse them
    without having to reinitialize torch/NCCL.

    Python-backend worker processes are created lazily, cached per
    ``(num_hosts, gpu_per_host)`` pair, and released when the context exits.
    The Rust process cache is created in ``__enter__``, so Rust meshes can only
    be requested inside the ``with TestingContext()`` block.

    Example::
        with TestingContext() as c:
            with c.local_device_mesh(2, 2) as local_mesh:
                # the mesh is already active because ``activate`` defaults to True
                x = torch.rand(3, 4)
                local_tensor = fetch_shard(x).result()
    """

    # The class name starts with "Test"; this flag tells pytest-style collectors
    # not to treat it as a test case.
    __test__ = False

    def __init__(self):
        # Everything registered on this stack is released by __exit__.
        self.cleanup = ExitStack()
        # (num_hosts, gpu_per_host) -> (ctx, hosts, processes) from _get_context.
        self._py_process_cache = {}
        # Set by __enter__; stays None until the context has been entered.
        self._rust_process_cache = None

    @contextmanager
    def _get_context(self, num_hosts, gpu_per_host):
        """
        Start a local Python context plus its process group.

        Yields a ``(ctx, hosts, processes)`` tuple and shuts the local context
        down when the generator is closed, whether or not an error occurred.
        """
        # since we are local, there isn't a lot of latency involved.
        # Make the host managers exit if they go 0.5 seconds without
        # hearing from supervisor.
        # NOTE(review): the comment above says 0.5 seconds but the value assigned
        # is 1 -- confirm how HEARTBEAT_INTERVAL maps to the exit timeout.
        # This assigns a module-level attribute and is never restored here.
        monarch_supervisor.HEARTBEAT_INTERVAL = 1
        ctx = PythonLocalContext(N=num_hosts)
        try:
            store = ProcessBackend._create_store()
            processes = ProcessBackend._create_pg(
                ctx.ctx, ctx.hosts, gpu_per_host, store, _restartable=True
            )
            yield ctx.ctx, ctx.hosts, processes
        finally:
            # Runs on the error path too (ExitStack throws the active exception
            # into this generator), so the local context is always shut down.
            ctx.shutdown()

    def _processes(self, num_hosts, gpu_per_host):
        """Return the cached ``(ctx, hosts, processes)`` tuple, creating it on first use."""
        key = (num_hosts, gpu_per_host)
        if key not in self._py_process_cache:
            # Registered on the cleanup stack so it is torn down with this context.
            self._py_process_cache[key] = self.cleanup.enter_context(
                self._get_context(num_hosts, gpu_per_host)
            )
        return self._py_process_cache[key]

    @contextmanager
    def local_py_device_mesh(
        self, num_hosts, gpu_per_host, activate=True
    ) -> Generator["DeviceMesh", None, None]:
        """
        Yield a device mesh backed by the cached Python worker processes.

        When ``activate`` is true the mesh is activated for the duration of the
        ``with`` body. On normal exit the client is shut down with
        ``destroy_pg=False``.
        """
        ctx, hosts, processes = self._processes(num_hosts, gpu_per_host)
        dm = world_mesh(ctx, hosts, gpu_per_host, _processes=processes)
        try:
            if activate:
                with dm.activate():
                    yield dm
            else:
                yield dm
            dm.client.shutdown(destroy_pg=False)
        except Exception:
            # abnormal exit, so we just make sure we do not try to communicate in destructors,
            # but we do not wait for workers to exit since we do not know what state they are in.
            dm.client._shutdown = True
            raise

    @contextmanager
    def local_rust_device_mesh(
        self,
        num_hosts,
        gpu_per_host,
        activate: bool = True,
        controller_params=None,
    ) -> Generator["DeviceMesh", None, None]:
        """
        Yield a device mesh backed by the Rust process cache.

        Raises:
            RuntimeError: if this ``TestingContext`` has not been entered yet,
                because the Rust process cache is only created in ``__enter__``.
        """
        cache = self._rust_process_cache
        if cache is None:
            raise RuntimeError(
                "TestingContext must be entered (use `with TestingContext() as c:`) "
                "before creating a Rust device mesh"
            )
        # Create a new system and mesh for test.
        with local_mesh(
            hosts=num_hosts,
            gpus_per_host=gpu_per_host,
            socket_type=SocketType.UNIX,
            logging_location=LoggingLocation.DEFAULT,
            system_factory=cache.get_system_server(),
            controller_factory=cache.get_controller_server(),
            worker_factory=cache.get_worker_servers(
                num_worker_procs=num_hosts * gpu_per_host,
                gpus_per_host=gpu_per_host,
            ),
            controller_params=controller_params,
        ) as dm:
            try:
                if activate:
                    with dm.activate():
                        yield dm
                else:
                    yield dm
                dm.exit()
            except Exception:
                # Abnormal exit: flag the client as shut down instead of calling dm.exit().
                dm.client._shutdown = True
                raise
            finally:
                # Shutdown the system.
                # pyre-ignore: Undefined attribute
                dm.client.inner._actor.stop()

    @contextmanager
    def local_device_mesh(
        self, num_hosts, gpu_per_host, activate=True, rust=False, controller_params=None
    ) -> Generator["DeviceMesh", None, None]:
        """
        Yield a local device mesh and log how long setup and teardown took.

        ``rust`` selects ``local_rust_device_mesh`` (which also receives
        ``controller_params``); otherwise ``local_py_device_mesh`` is used and
        ``controller_params`` is ignored.
        """
        start = time.perf_counter()
        if rust:
            generator = self.local_rust_device_mesh(
                num_hosts, gpu_per_host, activate, controller_params=controller_params
            )
        else:
            generator = self.local_py_device_mesh(num_hosts, gpu_per_host, activate)
        with generator as dm:
            logging.info("initialized mesh in %.2fs", time.perf_counter() - start)
            yield dm
            # Restart the clock so the next measurement covers only the teardown
            # performed when the `with` block exits.
            start = time.perf_counter()
        logging.info("shutdown mesh in %.2fs", time.perf_counter() - start)

    def __enter__(self):
        """Create the log directory and start the Rust process cache."""
        start = time.perf_counter()
        try:
            self._log_dir = self.cleanup.enter_context(
                tempfile.TemporaryDirectory(prefix="rust_cached_workers.")
            )
            self._rust_process_cache = self.cleanup.enter_context(
                ProcessCache(
                    logging_location=LoggingLocation.DEFAULT,
                    logging_dir=self._log_dir,
                )
            )
        except BaseException:
            # __exit__ is not called when __enter__ fails, so release whatever
            # was already registered (e.g. the temporary directory) here.
            self.cleanup.close()
            raise
        logging.info("started process caches in %.2fs", time.perf_counter() - start)
        return self

    def __exit__(self, *args):
        """Release everything registered on the cleanup stack and log the duration."""
        start = time.perf_counter()
        self.cleanup.__exit__(*args)
        logging.info("shutdown process caches in %.2fs", time.perf_counter() - start)
171
+
172
+
173
def mock_mesh(hosts: int, gpus: int):
    """
    Build a ``DeviceMesh`` driven by a ``MockController`` instead of real workers.

    The mesh has dimensions ``("host", "gpu")`` with sizes ``[hosts, gpus]`` and
    strides ``[gpus, 1]``. An ``exit`` callable is attached to the returned mesh;
    calling it forwards to ``client.shutdown(True, error)``.
    """
    controller = MockController(hosts * gpus)
    mesh_client = Client(controller, controller.world_size, controller.gpu_per_host)
    layout = NDSlice(offset=0, sizes=[hosts, gpus], strides=[gpus, 1])
    mesh = DeviceMesh(mesh_client, layout, ("host", "gpu"))

    def _shutdown_mesh(
        error: Optional[RemoteException | DeviceException | Exception] = None,
    ) -> None:
        # Closes over the client created above; `error` is optional.
        mesh_client.shutdown(True, error)

    mesh.exit = _shutdown_mesh
    return mesh
194
+
195
+
196
class BackendType:
    """String identifiers used to select a backend."""

    # Presumably the Python backend -- confirm against callers.
    PY = "py"
    # Presumably the Rust backend -- confirm against callers.
    RS = "rs"