torchmonarch-nightly 2025.7.1__cp310-cp310-manylinux2014_x86_64.whl → 2025.7.26__cp310-cp310-manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- monarch/__init__.py +13 -9
- monarch/_rust_bindings.so +0 -0
- monarch/{_monarch/selection → _src/actor}/__init__.py +3 -7
- monarch/_src/actor/actor_mesh.py +878 -0
- monarch/{allocator.py → _src/actor/allocator.py} +26 -17
- monarch/_src/actor/bootstrap_main.py +73 -0
- monarch/{code_sync.py → _src/actor/code_sync/__init__.py} +3 -1
- monarch/_src/actor/code_sync/auto_reload.py +223 -0
- monarch/_src/actor/debugger.py +565 -0
- monarch/_src/actor/endpoint.py +303 -0
- monarch/_src/actor/event_loop.py +97 -0
- monarch/_src/actor/future.py +100 -0
- monarch/{pdb_wrapper.py → _src/actor/pdb_wrapper.py} +47 -46
- monarch/{common/pickle_flatten.py → _src/actor/pickle.py} +26 -2
- monarch/_src/actor/proc_mesh.py +508 -0
- monarch/_src/actor/sync_state.py +18 -0
- monarch/{telemetry.py → _src/actor/telemetry/__init__.py} +1 -1
- monarch/_src/actor/telemetry/rust_span_tracing.py +159 -0
- monarch/_src/actor/tensor_engine_shim.py +59 -0
- monarch/_src/tensor_engine/rdma.py +180 -0
- monarch/_testing.py +3 -2
- monarch/actor/__init__.py +53 -0
- monarch/actor_mesh.py +6 -765
- monarch/bootstrap_main.py +8 -47
- monarch/common/client.py +1 -1
- monarch/common/controller_api.py +2 -1
- monarch/common/device_mesh.py +12 -2
- monarch/common/messages.py +21 -1
- monarch/common/recording.py +4 -3
- monarch/common/remote.py +135 -52
- monarch/common/tensor.py +2 -1
- monarch/controller/backend.py +2 -2
- monarch/controller/controller.py +2 -1
- monarch/controller/rust_backend/controller.py +2 -1
- monarch/fetch.py +3 -5
- monarch/gradient/_gradient_generator.so +0 -0
- monarch/mesh_controller.py +263 -139
- monarch/monarch_controller +0 -0
- monarch/opaque_module.py +4 -6
- monarch/opaque_object.py +3 -3
- monarch/proc_mesh.py +6 -309
- monarch/python_local_mesh.py +1 -1
- monarch/rust_backend_mesh.py +2 -1
- monarch/rust_local_mesh.py +4 -2
- monarch/sim_mesh.py +10 -19
- monarch/simulator/command_history.py +1 -1
- monarch/simulator/interface.py +2 -1
- monarch/simulator/mock_controller.py +1 -1
- monarch/simulator/simulator.py +1 -1
- monarch/tensor_engine/__init__.py +23 -0
- monarch/tensor_worker_main.py +3 -1
- monarch/tools/cli.py +3 -1
- monarch/tools/commands.py +129 -47
- monarch/tools/components/hyperactor.py +5 -3
- monarch/tools/config/__init__.py +18 -1
- monarch/tools/config/defaults.py +2 -2
- monarch/tools/mesh_spec.py +59 -1
- monarch/tools/utils.py +38 -0
- monarch/worker/worker.py +1 -1
- monarch/world_mesh.py +2 -1
- monarch_supervisor/python_executable.py +6 -3
- tests/error_test_binary.py +48 -10
- tests/test_actor_error.py +370 -21
- tests/test_alloc.py +1 -1
- tests/test_allocator.py +369 -17
- tests/test_controller.py +2 -0
- tests/test_debugger.py +416 -0
- tests/test_env_before_cuda.py +161 -0
- tests/test_python_actors.py +184 -333
- tests/test_rdma.py +198 -0
- tests/test_remote_functions.py +40 -12
- tests/test_rust_backend.py +7 -5
- tests/test_sim_backend.py +1 -4
- tests/test_tensor_engine.py +81 -1
- {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.26.dist-info}/METADATA +39 -1
- {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.26.dist-info}/RECORD +84 -72
- torchmonarch_nightly-2025.7.26.dist-info/entry_points.txt +3 -0
- monarch/_monarch/hyperactor/__init__.py +0 -58
- monarch/_monarch/worker/debugger.py +0 -117
- monarch/_monarch/worker/logging.py +0 -107
- monarch/debugger.py +0 -379
- monarch/future.py +0 -76
- monarch/rdma.py +0 -162
- torchmonarch_nightly-2025.7.1.dist-info/entry_points.txt +0 -3
- /monarch/{_monarch/worker → _src}/__init__.py +0 -0
- /monarch/{common/_device_utils.py → _src/actor/device_utils.py} +0 -0
- /monarch/{common → _src/actor}/shape.py +0 -0
- /monarch/{_monarch → _src/tensor_engine}/__init__.py +0 -0
- {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.26.dist-info}/WHEEL +0 -0
- {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.26.dist-info}/licenses/LICENSE +0 -0
- {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.26.dist-info}/top_level.txt +0 -0
@@ -10,18 +10,17 @@ import abc
|
|
10
10
|
import logging
|
11
11
|
from typing import final, Optional
|
12
12
|
|
13
|
-
from monarch import
|
14
|
-
from monarch._rust_bindings.hyperactor_extension.alloc import ( # @manual=//monarch/monarch_extension:monarch_extension
|
13
|
+
from monarch._rust_bindings.monarch_hyperactor.alloc import ( # @manual=//monarch/monarch_extension:monarch_extension
|
15
14
|
Alloc,
|
16
15
|
AllocSpec,
|
17
|
-
)
|
18
|
-
|
19
|
-
from monarch._rust_bindings.monarch_hyperactor.alloc import ( # @manual=//monarch/monarch_extension:monarch_extension
|
20
16
|
LocalAllocatorBase,
|
21
17
|
ProcessAllocatorBase,
|
22
18
|
RemoteAllocatorBase,
|
19
|
+
SimAllocatorBase,
|
23
20
|
)
|
24
21
|
|
22
|
+
from monarch._src.actor.future import Future
|
23
|
+
|
25
24
|
ALLOC_LABEL_PROC_MESH_NAME = "procmesh.monarch.meta.com/name"
|
26
25
|
|
27
26
|
logger: logging.Logger = logging.getLogger(__name__)
|
@@ -43,10 +42,7 @@ class ProcessAllocator(ProcessAllocatorBase):
|
|
43
42
|
Returns:
|
44
43
|
- A future that will be fulfilled when the requested allocation is fulfilled.
|
45
44
|
"""
|
46
|
-
return Future(
|
47
|
-
lambda: self.allocate_nonblocking(spec),
|
48
|
-
lambda: self.allocate_blocking(spec),
|
49
|
-
)
|
45
|
+
return Future(impl=lambda: self.allocate_nonblocking(spec), requires_loop=False)
|
50
46
|
|
51
47
|
|
52
48
|
@final
|
@@ -65,10 +61,26 @@ class LocalAllocator(LocalAllocatorBase):
|
|
65
61
|
Returns:
|
66
62
|
- A future that will be fulfilled when the requested allocation is fulfilled.
|
67
63
|
"""
|
68
|
-
return Future(
|
69
|
-
|
70
|
-
|
71
|
-
|
64
|
+
return Future(impl=lambda: self.allocate_nonblocking(spec), requires_loop=False)
|
65
|
+
|
66
|
+
|
67
|
+
@final
|
68
|
+
class SimAllocator(SimAllocatorBase):
|
69
|
+
"""
|
70
|
+
An allocator that allocates by spawning actors into the current process using simulated channels for transport
|
71
|
+
"""
|
72
|
+
|
73
|
+
def allocate(self, spec: AllocSpec) -> Future[Alloc]:
|
74
|
+
"""
|
75
|
+
Allocate a process according to the provided spec.
|
76
|
+
|
77
|
+
Arguments:
|
78
|
+
- `spec`: The spec to allocate according to.
|
79
|
+
|
80
|
+
Returns:
|
81
|
+
- A future that will be fulfilled when the requested allocation is fulfilled.
|
82
|
+
"""
|
83
|
+
return Future(impl=lambda: self.allocate_nonblocking(spec), requires_loop=False)
|
72
84
|
|
73
85
|
|
74
86
|
class RemoteAllocInitializer(abc.ABC):
|
@@ -214,7 +226,4 @@ class RemoteAllocator(RemoteAllocatorBase):
|
|
214
226
|
Returns:
|
215
227
|
- A future that will be fulfilled when the requested allocation is fulfilled.
|
216
228
|
"""
|
217
|
-
return Future(
|
218
|
-
lambda: self.allocate_nonblocking(spec),
|
219
|
-
lambda: self.allocate_blocking(spec),
|
220
|
-
)
|
229
|
+
return Future(impl=lambda: self.allocate_nonblocking(spec), requires_loop=False)
|
@@ -0,0 +1,73 @@
|
|
1
|
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
2
|
+
# All rights reserved.
|
3
|
+
#
|
4
|
+
# This source code is licensed under the BSD-style license found in the
|
5
|
+
# LICENSE file in the root directory of this source tree.
|
6
|
+
|
7
|
+
"""
|
8
|
+
This is the main function for bootstrapping a new process using a ProcessAllocator.
|
9
|
+
"""
|
10
|
+
|
11
|
+
import asyncio
|
12
|
+
import importlib.resources
|
13
|
+
import logging
|
14
|
+
import multiprocessing
|
15
|
+
import os
|
16
|
+
import sys
|
17
|
+
|
18
|
+
# Import torch to avoid import-time races if a spawned actor tries to import torch.
|
19
|
+
try:
|
20
|
+
import torch # @manual
|
21
|
+
except ImportError:
|
22
|
+
pass
|
23
|
+
|
24
|
+
|
25
|
+
async def main():
|
26
|
+
from monarch._rust_bindings.monarch_hyperactor.bootstrap import bootstrap_main
|
27
|
+
|
28
|
+
await bootstrap_main()
|
29
|
+
|
30
|
+
|
31
|
+
def invoke_main():
|
32
|
+
# if this is invoked with the stdout piped somewhere, then print
|
33
|
+
# changes its buffering behavior. So we default to the standard
|
34
|
+
# behavior of stdout as if it were a terminal.
|
35
|
+
sys.stdout.reconfigure(line_buffering=True)
|
36
|
+
global bootstrap_main
|
37
|
+
|
38
|
+
# TODO: figure out what from worker_main.py we should reproduce here.
|
39
|
+
from monarch._src.actor.telemetry import TracingForwarder
|
40
|
+
|
41
|
+
if os.environ.get("MONARCH_ERROR_DURING_BOOTSTRAP_FOR_TESTING") == "1":
|
42
|
+
raise RuntimeError("Error during bootstrap for testing")
|
43
|
+
|
44
|
+
# forward logs to rust tracing. Defaults to on.
|
45
|
+
if os.environ.get("MONARCH_PYTHON_LOG_TRACING", "1") == "1":
|
46
|
+
logging.root.addHandler(TracingForwarder(level=logging.DEBUG))
|
47
|
+
# install opentelemetry tracing
|
48
|
+
|
49
|
+
try:
|
50
|
+
with (
|
51
|
+
importlib.resources.as_file(
|
52
|
+
importlib.resources.files("monarch") / "py-spy"
|
53
|
+
) as pyspy,
|
54
|
+
):
|
55
|
+
if pyspy.exists():
|
56
|
+
os.environ["PYSPY_BIN"] = str(pyspy)
|
57
|
+
# fallback to using local py-spy
|
58
|
+
except Exception as e:
|
59
|
+
logging.warning(f"Failed to set up py-spy: {e}")
|
60
|
+
|
61
|
+
from monarch._src.actor.debugger import remote_breakpointhook
|
62
|
+
|
63
|
+
sys.breakpointhook = remote_breakpointhook
|
64
|
+
|
65
|
+
# Start an event loop for PythonActors to use.
|
66
|
+
asyncio.run(main())
|
67
|
+
|
68
|
+
|
69
|
+
if __name__ == "__main__":
|
70
|
+
# Ensure that processes started via `multiprocessing` are spawned, not forked.
|
71
|
+
# forking is a terrible default, see: https://github.com/python/cpython/issues/84559
|
72
|
+
multiprocessing.set_start_method("spawn", force=True)
|
73
|
+
invoke_main() # pragma: no cover
|
@@ -0,0 +1,223 @@
|
|
1
|
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
2
|
+
# All rights reserved.
|
3
|
+
#
|
4
|
+
# This source code is licensed under the BSD-style license found in the
|
5
|
+
# LICENSE file in the root directory of this source tree.
|
6
|
+
|
7
|
+
import contextlib
|
8
|
+
import dataclasses
|
9
|
+
import importlib
|
10
|
+
import importlib.abc
|
11
|
+
import importlib.util
|
12
|
+
import itertools
|
13
|
+
import site
|
14
|
+
import sys
|
15
|
+
import threading
|
16
|
+
from pathlib import Path
|
17
|
+
from types import ModuleType
|
18
|
+
from typing import Dict, List, Optional, Tuple
|
19
|
+
|
20
|
+
from monarch._src.actor.actor_mesh import Actor
|
21
|
+
from monarch._src.actor.endpoint import endpoint
|
22
|
+
|
23
|
+
|
24
|
+
class SysAuditHookGuard(contextlib.AbstractContextManager):
|
25
|
+
"""
|
26
|
+
A guard (and context manager), which will unregister an import hook when
|
27
|
+
closed or deleted.
|
28
|
+
"""
|
29
|
+
|
30
|
+
def __init__(self, hooks, idx):
|
31
|
+
self._hooks = hooks
|
32
|
+
self._idx = idx
|
33
|
+
|
34
|
+
def close(self):
|
35
|
+
self._hooks.pop(self._idx, None)
|
36
|
+
|
37
|
+
def __enter__(self):
|
38
|
+
return self
|
39
|
+
|
40
|
+
def __exit__(self, *args):
|
41
|
+
self.close()
|
42
|
+
|
43
|
+
def __del__(self):
|
44
|
+
self.close()
|
45
|
+
|
46
|
+
|
47
|
+
class SysAuditHookMultiplexer:
|
48
|
+
"""
|
49
|
+
Multiplexes import hooks to multiple hooks.
|
50
|
+
|
51
|
+
`sys.addaudithook`s can only be added and not removed, so this class provides
|
52
|
+
a global singleton that can be used to multiplex multiple hooks which support
|
53
|
+
removal.
|
54
|
+
"""
|
55
|
+
|
56
|
+
def __init__(self):
|
57
|
+
self._idx = itertools.count()
|
58
|
+
self._hooks = {}
|
59
|
+
|
60
|
+
def _callback(self, event, args):
|
61
|
+
for hook in self._hooks.values():
|
62
|
+
hook(event, args)
|
63
|
+
|
64
|
+
def add(self, hook) -> SysAuditHookGuard:
|
65
|
+
idx = next(self._idx)
|
66
|
+
self._hooks[idx] = hook
|
67
|
+
return SysAuditHookGuard(self._hooks, idx)
|
68
|
+
|
69
|
+
_instance_lock = threading.Lock()
|
70
|
+
_instance = None
|
71
|
+
|
72
|
+
@classmethod
|
73
|
+
def singleton(cls):
|
74
|
+
if cls._instance is None:
|
75
|
+
with cls._instance_lock:
|
76
|
+
if cls._instance is None:
|
77
|
+
cls._instance = SysAuditHookMultiplexer()
|
78
|
+
sys.addaudithook(cls._instance._callback)
|
79
|
+
return cls._instance
|
80
|
+
|
81
|
+
|
82
|
+
@dataclasses.dataclass
|
83
|
+
class ThreadLocalState(threading.local):
|
84
|
+
last_import: Optional[str] = None
|
85
|
+
|
86
|
+
|
87
|
+
class SysAuditImportHook:
|
88
|
+
"""
|
89
|
+
An audit hook that processes and coalesces import/exec events and calls a
|
90
|
+
user-defined callback with the module name and module object which was
|
91
|
+
imported.
|
92
|
+
"""
|
93
|
+
|
94
|
+
def __init__(self, callback):
|
95
|
+
self._callback = callback
|
96
|
+
self._state = ThreadLocalState()
|
97
|
+
|
98
|
+
@classmethod
|
99
|
+
def install(cls, callback) -> SysAuditHookGuard:
|
100
|
+
return SysAuditHookMultiplexer.singleton().add(SysAuditImportHook(callback))
|
101
|
+
|
102
|
+
def _py_filename(self, filename: Path) -> Path:
|
103
|
+
if filename.suffix in (".pyc", ".pyo"):
|
104
|
+
return filename.with_suffix(".py")
|
105
|
+
return filename
|
106
|
+
|
107
|
+
def __call__(self, event, args):
|
108
|
+
if event == "import":
|
109
|
+
# While `filename` is specified as an argument to the import event, it's
|
110
|
+
# almost always `None`, so we need to wait for a subsequent exec event
|
111
|
+
# to get the filename.
|
112
|
+
module, _, _, _, _ = args
|
113
|
+
self._state.last_import = module
|
114
|
+
elif event == "exec":
|
115
|
+
module_name = self._state.last_import
|
116
|
+
if module_name is None:
|
117
|
+
return
|
118
|
+
# We always expect an exec right after an import, so we can clear the
|
119
|
+
# last import module name we store.
|
120
|
+
self._state.last_import = None
|
121
|
+
module = sys.modules.get(module_name)
|
122
|
+
if module is None:
|
123
|
+
return
|
124
|
+
if getattr(module, "__file__", None) is None:
|
125
|
+
return
|
126
|
+
(code_obj,) = args
|
127
|
+
if code_obj.co_filename is None:
|
128
|
+
return
|
129
|
+
# code objects store the original source name, not the pyc
|
130
|
+
if self._py_filename(Path(module.__file__)) != Path(code_obj.co_filename):
|
131
|
+
return
|
132
|
+
self._callback(module_name, module)
|
133
|
+
|
134
|
+
|
135
|
+
@dataclasses.dataclass(frozen=True, kw_only=True)
|
136
|
+
class Fingerprint:
|
137
|
+
mtime: float
|
138
|
+
size: int
|
139
|
+
|
140
|
+
@classmethod
|
141
|
+
def for_path(cls, path: Path) -> "Fingerprint":
|
142
|
+
stat = path.stat()
|
143
|
+
return Fingerprint(mtime=stat.st_mtime, size=stat.st_size)
|
144
|
+
|
145
|
+
|
146
|
+
class AutoReloader:
|
147
|
+
"""
|
148
|
+
Tracks changes to modules and reloads them when they change.
|
149
|
+
"""
|
150
|
+
|
151
|
+
def __init__(self, reload=importlib.reload):
|
152
|
+
self._reload = reload
|
153
|
+
self._tracked_modules: Dict[str, Tuple[Path, Fingerprint]] = {}
|
154
|
+
self._track_all_imported()
|
155
|
+
|
156
|
+
def _maybe_track_module(self, name: str, module: ModuleType):
|
157
|
+
filename = getattr(module, "__file__", None)
|
158
|
+
if filename is None:
|
159
|
+
return
|
160
|
+
if filename == "static-extension":
|
161
|
+
return
|
162
|
+
filename = Path(filename)
|
163
|
+
|
164
|
+
# It's rare for modules to have relative path names, but can happen in
|
165
|
+
# weird special situations (e.g. `_ops.py` from `torch.ops`).
|
166
|
+
if not filename.is_absolute():
|
167
|
+
return
|
168
|
+
|
169
|
+
# Ignore builtin modules.
|
170
|
+
if filename.is_relative_to(sys.prefix):
|
171
|
+
for dirpath in site.getsitepackages():
|
172
|
+
if filename.is_relative_to(dirpath):
|
173
|
+
break
|
174
|
+
else:
|
175
|
+
return
|
176
|
+
|
177
|
+
self._tracked_modules[name] = (
|
178
|
+
filename,
|
179
|
+
Fingerprint.for_path(filename),
|
180
|
+
)
|
181
|
+
|
182
|
+
def _track_all_imported(self):
|
183
|
+
for name, module in sys.modules.items():
|
184
|
+
if module is None:
|
185
|
+
continue
|
186
|
+
self._maybe_track_module(name, module)
|
187
|
+
|
188
|
+
def import_callback(self, name: str, module: ModuleType):
|
189
|
+
"""
|
190
|
+
Callback for when a module has been imported.
|
191
|
+
"""
|
192
|
+
|
193
|
+
self._maybe_track_module(name, module)
|
194
|
+
|
195
|
+
def reload_changes(self) -> List[str]:
|
196
|
+
"""
|
197
|
+
Reload all modules that have changed since they were last imported.
|
198
|
+
"""
|
199
|
+
|
200
|
+
reloaded = []
|
201
|
+
|
202
|
+
for module_name, (filename, stored_fingerprint) in list(
|
203
|
+
self._tracked_modules.items()
|
204
|
+
):
|
205
|
+
fingerprint = Fingerprint.for_path(filename)
|
206
|
+
if fingerprint == stored_fingerprint:
|
207
|
+
continue
|
208
|
+
reloaded.append(module_name)
|
209
|
+
self._reload(sys.modules[module_name])
|
210
|
+
self._tracked_modules[module_name] = (filename, fingerprint)
|
211
|
+
|
212
|
+
return reloaded
|
213
|
+
|
214
|
+
|
215
|
+
class AutoReloadActor(Actor):
|
216
|
+
def __init__(self):
|
217
|
+
self._reloader = AutoReloader()
|
218
|
+
self._hook_guard = SysAuditImportHook.install(self._reloader.import_callback)
|
219
|
+
|
220
|
+
@endpoint
|
221
|
+
async def reload(self) -> None:
|
222
|
+
changed = self._reloader.reload_changes()
|
223
|
+
print(f"reloaded modules: {changed}")
|