torchmonarch-nightly 2025.7.1__cp311-cp311-manylinux2014_x86_64.whl → 2025.7.25__cp311-cp311-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. monarch/__init__.py +13 -9
  2. monarch/_rust_bindings.so +0 -0
  3. monarch/{_monarch/selection → _src/actor}/__init__.py +3 -7
  4. monarch/_src/actor/actor_mesh.py +874 -0
  5. monarch/{allocator.py → _src/actor/allocator.py} +26 -17
  6. monarch/_src/actor/bootstrap_main.py +73 -0
  7. monarch/{code_sync.py → _src/actor/code_sync/__init__.py} +3 -1
  8. monarch/_src/actor/code_sync/auto_reload.py +223 -0
  9. monarch/_src/actor/debugger.py +565 -0
  10. monarch/_src/actor/endpoint.py +270 -0
  11. monarch/_src/actor/event_loop.py +97 -0
  12. monarch/_src/actor/future.py +100 -0
  13. monarch/{pdb_wrapper.py → _src/actor/pdb_wrapper.py} +47 -46
  14. monarch/{common/pickle_flatten.py → _src/actor/pickle.py} +26 -2
  15. monarch/_src/actor/proc_mesh.py +500 -0
  16. monarch/_src/actor/sync_state.py +18 -0
  17. monarch/{telemetry.py → _src/actor/telemetry/__init__.py} +1 -1
  18. monarch/_src/actor/telemetry/rust_span_tracing.py +159 -0
  19. monarch/_src/actor/tensor_engine_shim.py +56 -0
  20. monarch/_src/tensor_engine/rdma.py +180 -0
  21. monarch/_testing.py +3 -2
  22. monarch/actor/__init__.py +51 -0
  23. monarch/actor_mesh.py +6 -765
  24. monarch/bootstrap_main.py +8 -47
  25. monarch/common/client.py +1 -1
  26. monarch/common/controller_api.py +2 -1
  27. monarch/common/device_mesh.py +12 -2
  28. monarch/common/messages.py +12 -1
  29. monarch/common/recording.py +4 -3
  30. monarch/common/remote.py +135 -52
  31. monarch/common/tensor.py +2 -1
  32. monarch/controller/backend.py +2 -2
  33. monarch/controller/controller.py +2 -1
  34. monarch/controller/rust_backend/controller.py +2 -1
  35. monarch/fetch.py +3 -5
  36. monarch/mesh_controller.py +201 -139
  37. monarch/monarch_controller +0 -0
  38. monarch/opaque_module.py +4 -6
  39. monarch/opaque_object.py +3 -3
  40. monarch/proc_mesh.py +6 -309
  41. monarch/python_local_mesh.py +1 -1
  42. monarch/rust_backend_mesh.py +2 -1
  43. monarch/rust_local_mesh.py +4 -2
  44. monarch/sim_mesh.py +10 -19
  45. monarch/simulator/command_history.py +1 -1
  46. monarch/simulator/interface.py +2 -1
  47. monarch/simulator/mock_controller.py +1 -1
  48. monarch/simulator/simulator.py +1 -1
  49. monarch/tensor_engine/__init__.py +23 -0
  50. monarch/tensor_worker_main.py +3 -1
  51. monarch/tools/cli.py +3 -1
  52. monarch/tools/commands.py +95 -35
  53. monarch/tools/mesh_spec.py +55 -0
  54. monarch/tools/utils.py +38 -0
  55. monarch/worker/worker.py +1 -1
  56. monarch/world_mesh.py +2 -1
  57. monarch_supervisor/python_executable.py +6 -3
  58. tests/error_test_binary.py +48 -10
  59. tests/test_actor_error.py +370 -21
  60. tests/test_alloc.py +1 -1
  61. tests/test_allocator.py +373 -17
  62. tests/test_controller.py +2 -0
  63. tests/test_debugger.py +416 -0
  64. tests/test_env_before_cuda.py +162 -0
  65. tests/test_python_actors.py +184 -333
  66. tests/test_rdma.py +198 -0
  67. tests/test_remote_functions.py +40 -12
  68. tests/test_rust_backend.py +7 -5
  69. tests/test_sim_backend.py +1 -4
  70. tests/test_tensor_engine.py +55 -1
  71. {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/METADATA +6 -1
  72. {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/RECORD +80 -68
  73. torchmonarch_nightly-2025.7.25.dist-info/entry_points.txt +3 -0
  74. monarch/_monarch/hyperactor/__init__.py +0 -58
  75. monarch/_monarch/worker/debugger.py +0 -117
  76. monarch/_monarch/worker/logging.py +0 -107
  77. monarch/debugger.py +0 -379
  78. monarch/future.py +0 -76
  79. monarch/rdma.py +0 -162
  80. torchmonarch_nightly-2025.7.1.dist-info/entry_points.txt +0 -3
  81. /monarch/{_monarch/worker → _src}/__init__.py +0 -0
  82. /monarch/{common/_device_utils.py → _src/actor/device_utils.py} +0 -0
  83. /monarch/{common → _src/actor}/shape.py +0 -0
  84. /monarch/{_monarch → _src/tensor_engine}/__init__.py +0 -0
  85. {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/WHEEL +0 -0
  86. {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/licenses/LICENSE +0 -0
  87. {torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/top_level.txt +0 -0
@@ -10,18 +10,17 @@ import abc
10
10
  import logging
11
11
  from typing import final, Optional
12
12
 
13
- from monarch import ActorFuture as Future
14
- from monarch._rust_bindings.hyperactor_extension.alloc import ( # @manual=//monarch/monarch_extension:monarch_extension
13
+ from monarch._rust_bindings.monarch_hyperactor.alloc import ( # @manual=//monarch/monarch_extension:monarch_extension
15
14
  Alloc,
16
15
  AllocSpec,
17
- )
18
-
19
- from monarch._rust_bindings.monarch_hyperactor.alloc import ( # @manual=//monarch/monarch_extension:monarch_extension
20
16
  LocalAllocatorBase,
21
17
  ProcessAllocatorBase,
22
18
  RemoteAllocatorBase,
19
+ SimAllocatorBase,
23
20
  )
24
21
 
22
+ from monarch._src.actor.future import Future
23
+
25
24
  ALLOC_LABEL_PROC_MESH_NAME = "procmesh.monarch.meta.com/name"
26
25
 
27
26
  logger: logging.Logger = logging.getLogger(__name__)
@@ -43,10 +42,7 @@ class ProcessAllocator(ProcessAllocatorBase):
43
42
  Returns:
44
43
  - A future that will be fulfilled when the requested allocation is fulfilled.
45
44
  """
46
- return Future(
47
- lambda: self.allocate_nonblocking(spec),
48
- lambda: self.allocate_blocking(spec),
49
- )
45
+ return Future(impl=lambda: self.allocate_nonblocking(spec), requires_loop=False)
50
46
 
51
47
 
52
48
  @final
@@ -65,10 +61,26 @@ class LocalAllocator(LocalAllocatorBase):
65
61
  Returns:
66
62
  - A future that will be fulfilled when the requested allocation is fulfilled.
67
63
  """
68
- return Future(
69
- lambda: self.allocate_nonblocking(spec),
70
- lambda: self.allocate_blocking(spec),
71
- )
64
+ return Future(impl=lambda: self.allocate_nonblocking(spec), requires_loop=False)
65
+
66
+
67
+ @final
68
+ class SimAllocator(SimAllocatorBase):
69
+ """
70
+ An allocator that allocates by spawning actors into the current process using simulated channels for transport
71
+ """
72
+
73
+ def allocate(self, spec: AllocSpec) -> Future[Alloc]:
74
+ """
75
+ Allocate a process according to the provided spec.
76
+
77
+ Arguments:
78
+ - `spec`: The spec to allocate according to.
79
+
80
+ Returns:
81
+ - A future that will be fulfilled when the requested allocation is fulfilled.
82
+ """
83
+ return Future(impl=lambda: self.allocate_nonblocking(spec), requires_loop=False)
72
84
 
73
85
 
74
86
  class RemoteAllocInitializer(abc.ABC):
@@ -214,7 +226,4 @@ class RemoteAllocator(RemoteAllocatorBase):
214
226
  Returns:
215
227
  - A future that will be fulfilled when the requested allocation is fulfilled.
216
228
  """
217
- return Future(
218
- lambda: self.allocate_nonblocking(spec),
219
- lambda: self.allocate_blocking(spec),
220
- )
229
+ return Future(impl=lambda: self.allocate_nonblocking(spec), requires_loop=False)
@@ -0,0 +1,73 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ This is the main function for bootstrapping a new process using a ProcessAllocator.
9
+ """
10
+
11
+ import asyncio
12
+ import importlib.resources
13
+ import logging
14
+ import multiprocessing
15
+ import os
16
+ import sys
17
+
18
+ # Import torch to avoid import-time races if a spawned actor tries to import torch.
19
+ try:
20
+ import torch # @manual
21
+ except ImportError:
22
+ pass
23
+
24
+
25
+ async def main():
26
+ from monarch._rust_bindings.monarch_hyperactor.bootstrap import bootstrap_main
27
+
28
+ await bootstrap_main()
29
+
30
+
31
+ def invoke_main():
32
+ # if this is invoked with the stdout piped somewhere, then print
33
+ # changes its buffering behavior. So we default to the standard
34
+ # behavior of std out as if it were a terminal.
35
+ sys.stdout.reconfigure(line_buffering=True)
36
+ global bootstrap_main
37
+
38
+ # TODO: figure out what from worker_main.py we should reproduce here.
39
+ from monarch._src.actor.telemetry import TracingForwarder
40
+
41
+ if os.environ.get("MONARCH_ERROR_DURING_BOOTSTRAP_FOR_TESTING") == "1":
42
+ raise RuntimeError("Error during bootstrap for testing")
43
+
44
+ # forward logs to rust tracing. Defaults to on.
45
+ if os.environ.get("MONARCH_PYTHON_LOG_TRACING", "1") == "1":
46
+ logging.root.addHandler(TracingForwarder(level=logging.DEBUG))
47
+ # install opentelemetry tracing
48
+
49
+ try:
50
+ with (
51
+ importlib.resources.as_file(
52
+ importlib.resources.files("monarch") / "py-spy"
53
+ ) as pyspy,
54
+ ):
55
+ if pyspy.exists():
56
+ os.environ["PYSPY_BIN"] = str(pyspy)
57
+ # fallback to using local py-spy
58
+ except Exception as e:
59
+ logging.warning(f"Failed to set up py-spy: {e}")
60
+
61
+ from monarch._src.actor.debugger import remote_breakpointhook
62
+
63
+ sys.breakpointhook = remote_breakpointhook
64
+
65
+ # Start an event loop for PythonActors to use.
66
+ asyncio.run(main())
67
+
68
+
69
+ if __name__ == "__main__":
70
+ # Ensure that processes started via `multiprocessing` are spawned, not forked.
71
+ # forking is a terrible default, see: https://github.com/python/cpython/issues/84559
72
+ multiprocessing.set_start_method("spawn", force=True)
73
+ invoke_main() # pragma: no cover
@@ -5,6 +5,8 @@
5
5
  # LICENSE file in the root directory of this source tree.
6
6
 
7
7
  from monarch._rust_bindings.monarch_extension.code_sync import ( # noqa: F401
8
+ CodeSyncMeshClient,
8
9
  RemoteWorkspace,
9
- RsyncMeshClient,
10
+ WorkspaceLocation,
11
+ WorkspaceShape,
10
12
  )
@@ -0,0 +1,223 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import contextlib
8
+ import dataclasses
9
+ import importlib
10
+ import importlib.abc
11
+ import importlib.util
12
+ import itertools
13
+ import site
14
+ import sys
15
+ import threading
16
+ from pathlib import Path
17
+ from types import ModuleType
18
+ from typing import Dict, List, Optional, Tuple
19
+
20
+ from monarch._src.actor.actor_mesh import Actor
21
+ from monarch._src.actor.endpoint import endpoint
22
+
23
+
24
+ class SysAuditHookGuard(contextlib.AbstractContextManager):
25
+ """
26
+ A guard (and context manager), which will unregister an import hook when
27
+ closed or deleted.
28
+ """
29
+
30
+ def __init__(self, hooks, idx):
31
+ self._hooks = hooks
32
+ self._idx = idx
33
+
34
+ def close(self):
35
+ self._hooks.pop(self._idx, None)
36
+
37
+ def __enter__(self):
38
+ return self
39
+
40
+ def __exit__(self, *args):
41
+ self.close()
42
+
43
+ def __del__(self):
44
+ self.close()
45
+
46
+
47
+ class SysAuditHookMultiplexer:
48
+ """
49
+ Multiplexes audit events to multiple hooks.
50
+
51
+ `sys.addaudithook`s can only be added and not removed, so this class provides
52
+ a global singleton that can be used to multiplex multiple hooks which support
53
+ removal.
54
+ """
55
+
56
+ def __init__(self):
57
+ self._idx = itertools.count()
58
+ self._hooks = {}
59
+
60
+ def _callback(self, event, args):
61
+ for hook in self._hooks.values():
62
+ hook(event, args)
63
+
64
+ def add(self, hook) -> SysAuditHookGuard:
65
+ idx = next(self._idx)
66
+ self._hooks[idx] = hook
67
+ return SysAuditHookGuard(self._hooks, idx)
68
+
69
+ _instance_lock = threading.Lock()
70
+ _instance = None
71
+
72
+ @classmethod
73
+ def singleton(cls):
74
+ if cls._instance is None:
75
+ with cls._instance_lock:
76
+ if cls._instance is None:
77
+ cls._instance = SysAuditHookMultiplexer()
78
+ sys.addaudithook(cls._instance._callback)
79
+ return cls._instance
80
+
81
+
82
+ @dataclasses.dataclass
83
+ class ThreadLocalState(threading.local):
84
+ last_import: Optional[str] = None
85
+
86
+
87
+ class SysAuditImportHook:
88
+ """
89
+ An audit hook that processes and coalesces import/exec events and calls a
90
+ user-defined callback with the module name and module object which was
91
+ imported.
92
+ """
93
+
94
+ def __init__(self, callback):
95
+ self._callback = callback
96
+ self._state = ThreadLocalState()
97
+
98
+ @classmethod
99
+ def install(cls, callback) -> SysAuditHookGuard:
100
+ return SysAuditHookMultiplexer.singleton().add(SysAuditImportHook(callback))
101
+
102
+ def _py_filename(self, filename: Path) -> Path:
103
+ if filename.suffix in (".pyc", ".pyo"):
104
+ return filename.with_suffix(".py")
105
+ return filename
106
+
107
+ def __call__(self, event, args):
108
+ if event == "import":
109
+ # While `filename` is specified as an argument to the import event, it's
110
+ # almost always `None`, so we need to wait for a subsequent exec event
111
+ # to get the filename.
112
+ module, _, _, _, _ = args
113
+ self._state.last_import = module
114
+ elif event == "exec":
115
+ module_name = self._state.last_import
116
+ if module_name is None:
117
+ return
118
+ # We always expect an exec right after an import, so we can clear the
119
+ # last import module name we store.
120
+ self._state.last_import = None
121
+ module = sys.modules.get(module_name)
122
+ if module is None:
123
+ return
124
+ if getattr(module, "__file__", None) is None:
125
+ return
126
+ (code_obj,) = args
127
+ if code_obj.co_filename is None:
128
+ return
129
+ # code objects store the original source name, not the pyc
130
+ if self._py_filename(Path(module.__file__)) != Path(code_obj.co_filename):
131
+ return
132
+ self._callback(module_name, module)
133
+
134
+
135
+ @dataclasses.dataclass(frozen=True, kw_only=True)
136
+ class Fingerprint:
137
+ mtime: float
138
+ size: int
139
+
140
+ @classmethod
141
+ def for_path(cls, path: Path) -> "Fingerprint":
142
+ stat = path.stat()
143
+ return Fingerprint(mtime=stat.st_mtime, size=stat.st_size)
144
+
145
+
146
+ class AutoReloader:
147
+ """
148
+ Tracks changes to modules and reloads them when they change.
149
+ """
150
+
151
+ def __init__(self, reload=importlib.reload):
152
+ self._reload = reload
153
+ self._tracked_modules: Dict[str, Tuple[Path, Fingerprint]] = {}
154
+ self._track_all_imported()
155
+
156
+ def _maybe_track_module(self, name: str, module: ModuleType):
157
+ filename = getattr(module, "__file__", None)
158
+ if filename is None:
159
+ return
160
+ if filename == "static-extension":
161
+ return
162
+ filename = Path(filename)
163
+
164
+ # It's rare for modules to have relative path names, but can happen in
165
+ # weird special situations (e.g. `_ops.py` from `torch.ops`).
166
+ if not filename.is_absolute():
167
+ return
168
+
169
+ # Ignore builtin modules.
170
+ if filename.is_relative_to(sys.prefix):
171
+ for dirpath in site.getsitepackages():
172
+ if filename.is_relative_to(dirpath):
173
+ break
174
+ else:
175
+ return
176
+
177
+ self._tracked_modules[name] = (
178
+ filename,
179
+ Fingerprint.for_path(filename),
180
+ )
181
+
182
+ def _track_all_imported(self):
183
+ for name, module in sys.modules.items():
184
+ if module is None:
185
+ continue
186
+ self._maybe_track_module(name, module)
187
+
188
+ def import_callback(self, name: str, module: ModuleType):
189
+ """
190
+ Callback for when a module has been imported.
191
+ """
192
+
193
+ self._maybe_track_module(name, module)
194
+
195
+ def reload_changes(self) -> List[str]:
196
+ """
197
+ Reload all modules that have changed since they were last imported.
198
+ """
199
+
200
+ reloaded = []
201
+
202
+ for module_name, (filename, stored_fingerprint) in list(
203
+ self._tracked_modules.items()
204
+ ):
205
+ fingerprint = Fingerprint.for_path(filename)
206
+ if fingerprint == stored_fingerprint:
207
+ continue
208
+ reloaded.append(module_name)
209
+ self._reload(sys.modules[module_name])
210
+ self._tracked_modules[module_name] = (filename, fingerprint)
211
+
212
+ return reloaded
213
+
214
+
215
+ class AutoReloadActor(Actor):
216
+ def __init__(self):
217
+ self._reloader = AutoReloader()
218
+ self._hook_guard = SysAuditImportHook.install(self._reloader.import_callback)
219
+
220
+ @endpoint
221
+ async def reload(self) -> None:
222
+ changed = self._reloader.reload_changes()
223
+ print(f"reloaded modules: {changed}")