torchmonarch-nightly 2025.8.1__cp311-cp311-manylinux2014_x86_64.whl → 2025.9.3__cp311-cp311-manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- monarch/_rust_bindings.so +0 -0
- monarch/_src/actor/actor_mesh.py +414 -216
- monarch/_src/actor/allocator.py +75 -6
- monarch/_src/actor/bootstrap_main.py +7 -4
- monarch/_src/actor/code_sync/__init__.py +2 -0
- monarch/_src/actor/debugger/__init__.py +7 -0
- monarch/_src/actor/{debugger.py → debugger/debugger.py} +246 -135
- monarch/_src/actor/{pdb_wrapper.py → debugger/pdb_wrapper.py} +62 -23
- monarch/_src/actor/endpoint.py +27 -45
- monarch/_src/actor/future.py +86 -24
- monarch/_src/actor/host_mesh.py +125 -0
- monarch/_src/actor/logging.py +94 -0
- monarch/_src/actor/pickle.py +25 -0
- monarch/_src/actor/proc_mesh.py +423 -156
- monarch/_src/actor/python_extension_methods.py +90 -0
- monarch/_src/actor/shape.py +8 -1
- monarch/_src/actor/source_loader.py +45 -0
- monarch/_src/actor/telemetry/__init__.py +172 -0
- monarch/_src/actor/telemetry/rust_span_tracing.py +6 -39
- monarch/_src/debug_cli/__init__.py +7 -0
- monarch/_src/debug_cli/debug_cli.py +43 -0
- monarch/_src/tensor_engine/rdma.py +64 -9
- monarch/_testing.py +1 -3
- monarch/actor/__init__.py +24 -4
- monarch/common/_C.so +0 -0
- monarch/common/device_mesh.py +14 -0
- monarch/common/future.py +10 -0
- monarch/common/remote.py +14 -25
- monarch/common/tensor.py +12 -0
- monarch/debug_cli/__init__.py +7 -0
- monarch/debug_cli/__main__.py +12 -0
- monarch/fetch.py +2 -2
- monarch/gradient/_gradient_generator.so +0 -0
- monarch/gradient_generator.py +4 -2
- monarch/mesh_controller.py +34 -14
- monarch/monarch_controller +0 -0
- monarch/tools/colors.py +25 -0
- monarch/tools/commands.py +42 -7
- monarch/tools/components/hyperactor.py +1 -1
- monarch/tools/config/__init__.py +31 -4
- monarch/tools/config/defaults.py +13 -3
- monarch/tools/config/environment.py +45 -0
- monarch/tools/config/workspace.py +165 -0
- monarch/tools/mesh_spec.py +2 -0
- monarch/utils/__init__.py +9 -0
- monarch/utils/utils.py +78 -0
- tests/error_test_binary.py +5 -3
- tests/python_actor_test_binary.py +52 -0
- tests/test_actor_error.py +142 -14
- tests/test_alloc.py +1 -1
- tests/test_allocator.py +59 -72
- tests/test_coalescing.py +1 -1
- tests/test_debugger.py +639 -45
- tests/test_env_before_cuda.py +4 -4
- tests/test_mesh_trait.py +38 -0
- tests/test_python_actors.py +979 -75
- tests/test_rdma.py +7 -6
- tests/test_tensor_engine.py +6 -6
- {torchmonarch_nightly-2025.8.1.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/METADATA +82 -4
- {torchmonarch_nightly-2025.8.1.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/RECORD +64 -48
- {torchmonarch_nightly-2025.8.1.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/WHEEL +0 -0
- {torchmonarch_nightly-2025.8.1.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/entry_points.txt +0 -0
- {torchmonarch_nightly-2025.8.1.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/licenses/LICENSE +0 -0
- {torchmonarch_nightly-2025.8.1.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/top_level.txt +0 -0
monarch/_src/actor/allocator.py
CHANGED
@@ -8,7 +8,8 @@
|
|
8
8
|
|
9
9
|
import abc
|
10
10
|
import logging
|
11
|
-
from
|
11
|
+
from dataclasses import dataclass
|
12
|
+
from typing import Dict, final, Literal, Optional
|
12
13
|
|
13
14
|
from monarch._rust_bindings.monarch_hyperactor.alloc import ( # @manual=//monarch/monarch_extension:monarch_extension
|
14
15
|
Alloc,
|
@@ -18,21 +19,59 @@ from monarch._rust_bindings.monarch_hyperactor.alloc import ( # @manual=//monar
|
|
18
19
|
RemoteAllocatorBase,
|
19
20
|
SimAllocatorBase,
|
20
21
|
)
|
21
|
-
from monarch._src.actor.future import Future
|
22
22
|
|
23
|
-
|
24
|
-
|
23
|
+
from monarch._rust_bindings.monarch_hyperactor.pytokio import PythonTask, Shared
|
24
|
+
from monarch._src.actor.future import DeprecatedNotAFuture, Future
|
25
|
+
|
25
26
|
|
26
27
|
ALLOC_LABEL_PROC_MESH_NAME = "procmesh.monarch.meta.com/name"
|
27
28
|
|
28
29
|
logger: logging.Logger = logging.getLogger(__name__)
|
29
30
|
|
30
31
|
|
32
|
+
@dataclass
|
33
|
+
class AllocHandle(DeprecatedNotAFuture):
|
34
|
+
_hy_alloc: "Shared[Alloc]"
|
35
|
+
_extent: Dict[str, int]
|
36
|
+
_stream_logs: bool
|
37
|
+
|
38
|
+
def reshape(self, extent: Dict[str, int]) -> "AllocHandle":
|
39
|
+
async def task() -> Alloc:
|
40
|
+
alloc = await self._hy_alloc
|
41
|
+
return alloc.reshape(extent)
|
42
|
+
|
43
|
+
return AllocHandle(
|
44
|
+
PythonTask.from_coroutine(task()).spawn(), extent, self._stream_logs
|
45
|
+
)
|
46
|
+
|
47
|
+
@property
|
48
|
+
def initialized(self) -> Future[Literal[True]]:
|
49
|
+
"""
|
50
|
+
Future completes with 'True' when the alloc has initialized.
|
51
|
+
Because allocs are remote objects, there is no guarantee that the alloc is
|
52
|
+
still usable after this completes, only that at some point in the past it was usable.
|
53
|
+
"""
|
54
|
+
|
55
|
+
async def task() -> Literal[True]:
|
56
|
+
await self._hy_alloc
|
57
|
+
return True
|
58
|
+
|
59
|
+
return Future(coro=task())
|
60
|
+
|
61
|
+
@property
|
62
|
+
def stream_logs(self) -> bool:
|
63
|
+
"""
|
64
|
+
Whether to stream stdout/stderr logs from the allocated processes back to the client.
|
65
|
+
The default behavior is determined by the underlying allocator.
|
66
|
+
"""
|
67
|
+
return self._stream_logs
|
68
|
+
|
69
|
+
|
31
70
|
class AllocateMixin(abc.ABC):
|
32
71
|
@abc.abstractmethod
|
33
72
|
def allocate_nonblocking(self, spec: AllocSpec) -> "PythonTask[Alloc]": ...
|
34
73
|
|
35
|
-
def allocate(self, spec: AllocSpec) -> "
|
74
|
+
def allocate(self, spec: AllocSpec) -> "AllocHandle":
|
36
75
|
"""
|
37
76
|
Allocate a process according to the provided spec.
|
38
77
|
|
@@ -42,7 +81,25 @@ class AllocateMixin(abc.ABC):
|
|
42
81
|
Returns:
|
43
82
|
- A future that will be fulfilled when the requested allocation is fulfilled.
|
44
83
|
"""
|
45
|
-
return
|
84
|
+
return AllocHandle(
|
85
|
+
self.allocate_nonblocking(spec).spawn(),
|
86
|
+
spec.extent,
|
87
|
+
self._stream_logs(),
|
88
|
+
)
|
89
|
+
|
90
|
+
@abc.abstractmethod
|
91
|
+
def _stream_logs(self) -> bool:
|
92
|
+
"""
|
93
|
+
Whether to stream stdout/stderr logs from the allocated processes back to the client.
|
94
|
+
A common pattern is if the processes are allocated on the same host as the client,
|
95
|
+
then it is not necessary to stream logs back. But if the processes are remotely allocated,
|
96
|
+
it is recommended to stream logs back. It is up to each allocator to decide the default behavior.
|
97
|
+
|
98
|
+
Returns:
|
99
|
+
- A boolean indicating whether to stream logs back to the client.
|
100
|
+
"""
|
101
|
+
|
102
|
+
...
|
46
103
|
|
47
104
|
|
48
105
|
@final
|
@@ -51,6 +108,9 @@ class ProcessAllocator(ProcessAllocatorBase, AllocateMixin):
|
|
51
108
|
An allocator that allocates by spawning local processes.
|
52
109
|
"""
|
53
110
|
|
111
|
+
def _stream_logs(self) -> bool:
|
112
|
+
return False
|
113
|
+
|
54
114
|
|
55
115
|
@final
|
56
116
|
class LocalAllocator(LocalAllocatorBase, AllocateMixin):
|
@@ -58,6 +118,9 @@ class LocalAllocator(LocalAllocatorBase, AllocateMixin):
|
|
58
118
|
An allocator that allocates by spawning actors into the current process.
|
59
119
|
"""
|
60
120
|
|
121
|
+
def _stream_logs(self) -> bool:
|
122
|
+
return False
|
123
|
+
|
61
124
|
|
62
125
|
@final
|
63
126
|
class SimAllocator(SimAllocatorBase, AllocateMixin):
|
@@ -65,6 +128,9 @@ class SimAllocator(SimAllocatorBase, AllocateMixin):
|
|
65
128
|
An allocator that allocates by spawning actors into the current process using simulated channels for transport
|
66
129
|
"""
|
67
130
|
|
131
|
+
def _stream_logs(self) -> bool:
|
132
|
+
return False
|
133
|
+
|
68
134
|
|
69
135
|
class RemoteAllocInitializer(abc.ABC):
|
70
136
|
"""Subclass-able Python interface for `hyperactor_mesh::alloc::remoteprocess:RemoteProcessAllocInitializer`.
|
@@ -198,3 +264,6 @@ class RemoteAllocator(RemoteAllocatorBase, AllocateMixin):
|
|
198
264
|
An allocator that allocates by spawning actors on a remote host.
|
199
265
|
The remote host must be running hyperactor's remote-process-allocator.
|
200
266
|
"""
|
267
|
+
|
268
|
+
def _stream_logs(self) -> bool:
|
269
|
+
return True
|
@@ -4,6 +4,8 @@
|
|
4
4
|
# This source code is licensed under the BSD-style license found in the
|
5
5
|
# LICENSE file in the root directory of this source tree.
|
6
6
|
|
7
|
+
# pyre-unsafe
|
8
|
+
|
7
9
|
"""
|
8
10
|
This is the main function for bootstrapping a new process using a ProcessAllocator.
|
9
11
|
"""
|
@@ -17,7 +19,7 @@ import sys
|
|
17
19
|
|
18
20
|
# Import torch to avoid import-time races if a spawned actor tries to import torch.
|
19
21
|
try:
|
20
|
-
import torch # @manual
|
22
|
+
import torch # @manual # noqa: F401
|
21
23
|
except ImportError:
|
22
24
|
pass
|
23
25
|
|
@@ -36,14 +38,15 @@ def invoke_main():
|
|
36
38
|
global bootstrap_main
|
37
39
|
|
38
40
|
# TODO: figure out what from worker_main.py we should reproduce here.
|
39
|
-
from monarch._src.actor.telemetry import TracingForwarder
|
41
|
+
from monarch._src.actor.telemetry import TracingForwarder # noqa
|
40
42
|
|
41
43
|
if os.environ.get("MONARCH_ERROR_DURING_BOOTSTRAP_FOR_TESTING") == "1":
|
42
44
|
raise RuntimeError("Error during bootstrap for testing")
|
43
45
|
|
44
46
|
# forward logs to rust tracing. Defaults to on.
|
45
47
|
if os.environ.get("MONARCH_PYTHON_LOG_TRACING", "1") == "1":
|
46
|
-
|
48
|
+
# we can stream python logs now; no need to forward them to rust processes
|
49
|
+
pass
|
47
50
|
# install opentelemetry tracing
|
48
51
|
|
49
52
|
try:
|
@@ -58,7 +61,7 @@ def invoke_main():
|
|
58
61
|
except Exception as e:
|
59
62
|
logging.warning(f"Failed to set up py-spy: {e}")
|
60
63
|
|
61
|
-
from monarch._src.actor.debugger import remote_breakpointhook
|
64
|
+
from monarch._src.actor.debugger.debugger import remote_breakpointhook
|
62
65
|
|
63
66
|
sys.breakpointhook = remote_breakpointhook
|
64
67
|
|