torchmonarch-nightly 2025.6.5__cp310-cp310-manylinux2014_x86_64.whl → 2025.6.7__cp310-cp310-manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- monarch/_rust_bindings.so +0 -0
- monarch/actor_mesh.py +33 -20
- monarch/bootstrap_main.py +3 -0
- monarch/gradient/_gradient_generator.so +0 -0
- monarch/monarch_controller +0 -0
- monarch/rdma.py +0 -28
- monarch/sim_mesh.py +1 -1
- tests/error_test_binary.py +22 -11
- tests/test_actor_error.py +55 -34
- {torchmonarch_nightly-2025.6.5.dist-info → torchmonarch_nightly-2025.6.7.dist-info}/METADATA +7 -7
- {torchmonarch_nightly-2025.6.5.dist-info → torchmonarch_nightly-2025.6.7.dist-info}/RECORD +15 -15
- {torchmonarch_nightly-2025.6.5.dist-info → torchmonarch_nightly-2025.6.7.dist-info}/WHEEL +0 -0
- {torchmonarch_nightly-2025.6.5.dist-info → torchmonarch_nightly-2025.6.7.dist-info}/entry_points.txt +0 -0
- {torchmonarch_nightly-2025.6.5.dist-info → torchmonarch_nightly-2025.6.7.dist-info}/licenses/LICENSE +0 -0
- {torchmonarch_nightly-2025.6.5.dist-info → torchmonarch_nightly-2025.6.7.dist-info}/top_level.txt +0 -0
monarch/_rust_bindings.so
CHANGED
Binary file
|
monarch/actor_mesh.py
CHANGED
@@ -39,7 +39,7 @@ from typing import (
|
|
39
39
|
import monarch
|
40
40
|
from monarch import ActorFuture as Future
|
41
41
|
|
42
|
-
from monarch._rust_bindings.monarch_hyperactor.actor import PythonMessage
|
42
|
+
from monarch._rust_bindings.monarch_hyperactor.actor import PanicFlag, PythonMessage
|
43
43
|
from monarch._rust_bindings.monarch_hyperactor.actor_mesh import PythonActorMesh
|
44
44
|
from monarch._rust_bindings.monarch_hyperactor.mailbox import (
|
45
45
|
Mailbox,
|
@@ -50,7 +50,7 @@ from monarch._rust_bindings.monarch_hyperactor.mailbox import (
|
|
50
50
|
from monarch._rust_bindings.monarch_hyperactor.proc import ActorId
|
51
51
|
from monarch._rust_bindings.monarch_hyperactor.shape import Point as HyPoint, Shape
|
52
52
|
from monarch.common.pickle_flatten import flatten, unflatten
|
53
|
-
from monarch.common.shape import MeshTrait, NDSlice
|
53
|
+
from monarch.common.shape import MeshTrait, NDSlice
|
54
54
|
|
55
55
|
logger = logging.getLogger(__name__)
|
56
56
|
|
@@ -158,7 +158,7 @@ class _ActorMeshRefImpl:
|
|
158
158
|
mailbox,
|
159
159
|
hy_actor_mesh,
|
160
160
|
hy_actor_mesh.shape,
|
161
|
-
[cast(ActorId, hy_actor_mesh.get(i)) for i in range(len(shape
|
161
|
+
[cast(ActorId, hy_actor_mesh.get(i)) for i in range(len(shape))],
|
162
162
|
)
|
163
163
|
|
164
164
|
@staticmethod
|
@@ -204,7 +204,7 @@ class _ActorMeshRefImpl:
|
|
204
204
|
# The fix is to provide a first-class reference into Python, and always call "cast"
|
205
205
|
# on it, including for load balanced requests.
|
206
206
|
if selection == "choose":
|
207
|
-
idx = _load_balancing_seed.randrange(len(self._shape
|
207
|
+
idx = _load_balancing_seed.randrange(len(self._shape))
|
208
208
|
actor_rank = self._shape.ndslice[idx]
|
209
209
|
self._mailbox.post(self._please_replace_me_actor_ids[actor_rank], message)
|
210
210
|
return
|
@@ -223,9 +223,8 @@ class _ActorMeshRefImpl:
|
|
223
223
|
else:
|
224
224
|
raise ValueError(f"invalid selection: {selection}")
|
225
225
|
|
226
|
-
|
227
|
-
|
228
|
-
return len(self._shape.ndslice)
|
226
|
+
def __len__(self) -> int:
|
227
|
+
return len(self._shape)
|
229
228
|
|
230
229
|
|
231
230
|
class Endpoint(Generic[P, R]):
|
@@ -258,7 +257,7 @@ class Endpoint(Generic[P, R]):
|
|
258
257
|
return r.recv()
|
259
258
|
|
260
259
|
def call_one(self, *args: P.args, **kwargs: P.kwargs) -> Future[R]:
|
261
|
-
if self._actor_mesh
|
260
|
+
if len(self._actor_mesh) != 1:
|
262
261
|
raise ValueError(
|
263
262
|
f"Can only use 'call_one' on a single Actor but this actor has shape {self._actor_mesh._shape}"
|
264
263
|
)
|
@@ -270,8 +269,8 @@ class Endpoint(Generic[P, R]):
|
|
270
269
|
send(self, args, kwargs, port=p)
|
271
270
|
|
272
271
|
async def process():
|
273
|
-
results = [None] * self._actor_mesh
|
274
|
-
for _ in range(self._actor_mesh
|
272
|
+
results = [None] * len(self._actor_mesh)
|
273
|
+
for _ in range(len(self._actor_mesh)):
|
275
274
|
rank, value = await r.recv()
|
276
275
|
results[rank] = value
|
277
276
|
call_shape = Shape(
|
@@ -292,7 +291,7 @@ class Endpoint(Generic[P, R]):
|
|
292
291
|
p, r = port(self)
|
293
292
|
# pyre-ignore
|
294
293
|
send(self, args, kwargs, port=p)
|
295
|
-
for _ in range(self._actor_mesh
|
294
|
+
for _ in range(len(self._actor_mesh)):
|
296
295
|
yield await r.recv()
|
297
296
|
|
298
297
|
def broadcast(self, *args: P.args, **kwargs: P.kwargs) -> None:
|
@@ -346,6 +345,9 @@ class ValueMesh(MeshTrait, Generic[R]):
|
|
346
345
|
for rank in self._shape.ranks():
|
347
346
|
yield Point(rank, self._shape), self._values[rank]
|
348
347
|
|
348
|
+
def __len__(self):
|
349
|
+
return len(self._shape)
|
350
|
+
|
349
351
|
@property
|
350
352
|
def _ndslice(self) -> NDSlice:
|
351
353
|
return self._shape.ndslice
|
@@ -460,12 +462,12 @@ class _Actor:
|
|
460
462
|
def __init__(self) -> None:
|
461
463
|
self.instance: object | None = None
|
462
464
|
self.active_requests: asyncio.Queue[asyncio.Future[object]] = asyncio.Queue()
|
463
|
-
self.complete_task:
|
465
|
+
self.complete_task: asyncio.Task | None = None
|
464
466
|
|
465
467
|
def handle(
|
466
|
-
self, mailbox: Mailbox, message: PythonMessage
|
468
|
+
self, mailbox: Mailbox, message: PythonMessage, panic_flag: PanicFlag
|
467
469
|
) -> Optional[Coroutine[Any, Any, Any]]:
|
468
|
-
return self.handle_cast(mailbox, 0, singleton_shape, message)
|
470
|
+
return self.handle_cast(mailbox, 0, singleton_shape, message, panic_flag)
|
469
471
|
|
470
472
|
def handle_cast(
|
471
473
|
self,
|
@@ -473,6 +475,7 @@ class _Actor:
|
|
473
475
|
rank: int,
|
474
476
|
shape: Shape,
|
475
477
|
message: PythonMessage,
|
478
|
+
panic_flag: PanicFlag,
|
476
479
|
) -> Optional[Coroutine[Any, Any, Any]]:
|
477
480
|
port = None
|
478
481
|
try:
|
@@ -493,10 +496,10 @@ class _Actor:
|
|
493
496
|
port.send("result", result)
|
494
497
|
return None
|
495
498
|
|
496
|
-
return self.run_async(ctx, self.run_task(port, result))
|
499
|
+
return self.run_async(ctx, self.run_task(port, result, panic_flag))
|
497
500
|
except Exception as e:
|
498
501
|
traceback.print_exc()
|
499
|
-
s =
|
502
|
+
s = ActorError(e)
|
500
503
|
|
501
504
|
# The exception is delivered to exactly one of:
|
502
505
|
# (1) our caller, (2) our supervisor
|
@@ -508,17 +511,17 @@ class _Actor:
|
|
508
511
|
async def run_async(self, ctx, coroutine):
|
509
512
|
_context.set(ctx)
|
510
513
|
if self.complete_task is None:
|
511
|
-
asyncio.create_task(self._complete())
|
514
|
+
self.complete_task = asyncio.create_task(self._complete())
|
512
515
|
await self.active_requests.put(create_eager_task(coroutine))
|
513
516
|
|
514
|
-
async def run_task(self, port, coroutine):
|
517
|
+
async def run_task(self, port, coroutine, panic_flag):
|
515
518
|
try:
|
516
519
|
result = await coroutine
|
517
520
|
if port is not None:
|
518
521
|
port.send("result", result)
|
519
522
|
except Exception as e:
|
520
523
|
traceback.print_exc()
|
521
|
-
s =
|
524
|
+
s = ActorError(e)
|
522
525
|
|
523
526
|
# The exception is delivered to exactly one of:
|
524
527
|
# (1) our caller, (2) our supervisor
|
@@ -526,6 +529,16 @@ class _Actor:
|
|
526
529
|
port.send("exception", s)
|
527
530
|
else:
|
528
531
|
raise s from None
|
532
|
+
except BaseException as e:
|
533
|
+
# A BaseException can be thrown in the case of a Rust panic.
|
534
|
+
# In this case, we need a way to signal the panic to the Rust side.
|
535
|
+
# See [Panics in async endpoints]
|
536
|
+
try:
|
537
|
+
panic_flag.signal_panic(e)
|
538
|
+
except Exception:
|
539
|
+
# The channel might be closed if the Rust side has already detected the error
|
540
|
+
pass
|
541
|
+
raise
|
529
542
|
|
530
543
|
async def _complete(self) -> None:
|
531
544
|
while True:
|
@@ -653,7 +666,7 @@ class ActorMeshRef(MeshTrait):
|
|
653
666
|
)
|
654
667
|
|
655
668
|
|
656
|
-
class
|
669
|
+
class ActorError(Exception):
|
657
670
|
"""
|
658
671
|
Deterministic problem with the user's code.
|
659
672
|
For example, an OOM resulting in trying to allocate too much GPU memory, or violating
|
monarch/bootstrap_main.py
CHANGED
@@ -53,6 +53,9 @@ def invoke_main():
|
|
53
53
|
record.levelno,
|
54
54
|
)
|
55
55
|
|
56
|
+
if os.environ.get("MONARCH_ERROR_DURING_BOOTSTRAP_FOR_TESTING") == "1":
|
57
|
+
raise RuntimeError("Error during bootstrap for testing")
|
58
|
+
|
56
59
|
# forward logs to rust tracing. Defaults to on.
|
57
60
|
if os.environ.get("MONARCH_PYTHON_LOG_TRACING", "1") == "1":
|
58
61
|
logging.root.addHandler(TracingForwarder())
|
Binary file
|
monarch/monarch_controller
CHANGED
Binary file
|
monarch/rdma.py
CHANGED
@@ -6,10 +6,7 @@
|
|
6
6
|
|
7
7
|
import ctypes
|
8
8
|
|
9
|
-
import traceback
|
10
|
-
|
11
9
|
from dataclasses import dataclass
|
12
|
-
from traceback import extract_tb, StackSummary
|
13
10
|
from typing import cast, Dict, Optional, Tuple
|
14
11
|
|
15
12
|
import torch
|
@@ -163,28 +160,3 @@ class RDMABuffer:
|
|
163
160
|
src.numel(),
|
164
161
|
)
|
165
162
|
await RDMAManager.on_proc(self.proc_id).put.call_one(self.addr, offset, bytes)
|
166
|
-
|
167
|
-
|
168
|
-
class ActorMeshRefCallFailedException(Exception):
|
169
|
-
"""
|
170
|
-
Deterministic problem with the user's code.
|
171
|
-
For example, an OOM resulting in trying to allocate too much GPU memory, or violating
|
172
|
-
some invariant enforced by the various APIs.
|
173
|
-
"""
|
174
|
-
|
175
|
-
def __init__(
|
176
|
-
self,
|
177
|
-
exception: Exception,
|
178
|
-
message: str = "A remote service call has failed asynchronously.",
|
179
|
-
) -> None:
|
180
|
-
self.exception = exception
|
181
|
-
self.actor_mesh_ref_frames: StackSummary = extract_tb(exception.__traceback__)
|
182
|
-
self.message = message
|
183
|
-
|
184
|
-
def __str__(self) -> str:
|
185
|
-
exe = str(self.exception)
|
186
|
-
actor_mesh_ref_tb = "".join(traceback.format_list(self.actor_mesh_ref_frames))
|
187
|
-
return (
|
188
|
-
f"{self.message}\n"
|
189
|
-
f"Traceback of where the service call failed (most recent call last):\n{actor_mesh_ref_tb}{type(self.exception).__name__}: {exe}"
|
190
|
-
)
|
monarch/sim_mesh.py
CHANGED
@@ -205,7 +205,7 @@ class Bootstrap:
|
|
205
205
|
self.client_bootstrap_addr: str = (
|
206
206
|
f"sim!unix!@client,{proxy_addr},unix!@system,{proxy_addr}"
|
207
207
|
)
|
208
|
-
bootstrap_simulator_backend(self.bootstrap_addr, world_size)
|
208
|
+
bootstrap_simulator_backend(self.bootstrap_addr, proxy_addr, world_size)
|
209
209
|
|
210
210
|
self._simulator_client = SimulatorClient(proxy_addr)
|
211
211
|
for i in range(num_meshes):
|
tests/error_test_binary.py
CHANGED
@@ -7,6 +7,8 @@
|
|
7
7
|
import ctypes
|
8
8
|
import sys
|
9
9
|
|
10
|
+
import click
|
11
|
+
|
10
12
|
from monarch._rust_bindings.monarch_extension.panic import panicking_function
|
11
13
|
|
12
14
|
from monarch.actor_mesh import Actor, endpoint
|
@@ -115,24 +117,33 @@ def _run_error_test(num_procs, sync_endpoint, endpoint_name):
|
|
115
117
|
asyncio.run(run_test())
|
116
118
|
|
117
119
|
|
120
|
+
@click.group()
|
118
121
|
def main():
|
119
|
-
|
122
|
+
pass
|
120
123
|
|
121
|
-
parser = argparse.ArgumentParser()
|
122
|
-
parser.add_argument("--num-procs", type=int)
|
123
|
-
parser.add_argument("--sync-test-impl", type=bool)
|
124
|
-
parser.add_argument("--sync-endpoint", type=bool)
|
125
|
-
parser.add_argument("--endpoint-name", type=str)
|
126
|
-
args = parser.parse_args()
|
127
124
|
|
125
|
+
@main.command("error-endpoint")
|
126
|
+
@click.option("--num-procs", type=int, required=True)
|
127
|
+
@click.option("--sync-test-impl", type=bool, required=True)
|
128
|
+
@click.option("--sync-endpoint", type=bool, required=True)
|
129
|
+
@click.option("--endpoint-name", type=str, required=True)
|
130
|
+
def error_endpoint(num_procs, sync_test_impl, sync_endpoint, endpoint_name):
|
128
131
|
print(
|
129
|
-
f"Running segfault test: {
|
132
|
+
f"Running segfault test: {num_procs=} {sync_test_impl=} {sync_endpoint=}, {endpoint_name=}"
|
130
133
|
)
|
131
134
|
|
132
|
-
if
|
133
|
-
_run_error_test_sync(
|
135
|
+
if sync_test_impl:
|
136
|
+
_run_error_test_sync(num_procs, sync_endpoint, endpoint_name)
|
134
137
|
else:
|
135
|
-
_run_error_test(
|
138
|
+
_run_error_test(num_procs, sync_endpoint, endpoint_name)
|
139
|
+
|
140
|
+
|
141
|
+
@main.command("error-bootstrap")
|
142
|
+
def error_bootstrap():
|
143
|
+
print("I actually ran")
|
144
|
+
sys.stdout.flush()
|
145
|
+
|
146
|
+
proc_mesh(gpus=4, env={"MONARCH_ERROR_DURING_BOOTSTRAP_FOR_TESTING": "1"}).get()
|
136
147
|
|
137
148
|
|
138
149
|
if __name__ == "__main__":
|
tests/test_actor_error.py
CHANGED
@@ -8,47 +8,36 @@ import importlib.resources
|
|
8
8
|
import subprocess
|
9
9
|
|
10
10
|
import pytest
|
11
|
-
from monarch.actor_mesh import Actor,
|
11
|
+
from monarch.actor_mesh import Actor, ActorError, endpoint
|
12
12
|
|
13
13
|
from monarch.proc_mesh import proc_mesh
|
14
14
|
|
15
15
|
|
16
16
|
class ExceptionActor(Actor):
|
17
|
-
"""An actor that has endpoints which raise exceptions."""
|
18
|
-
|
19
17
|
@endpoint
|
20
18
|
async def raise_exception(self) -> None:
|
21
|
-
"""Endpoint that raises an exception."""
|
22
19
|
raise Exception("This is a test exception")
|
23
20
|
|
24
21
|
|
25
22
|
class ExceptionActorSync(Actor):
|
26
|
-
"""An actor that has endpoints which raise exceptions."""
|
27
|
-
|
28
23
|
@endpoint # pyre-ignore
|
29
24
|
def raise_exception(self) -> None:
|
30
|
-
"""Endpoint that raises an exception."""
|
31
25
|
raise Exception("This is a test exception")
|
32
26
|
|
33
27
|
|
34
28
|
@pytest.mark.parametrize(
|
35
|
-
"actor_class
|
36
|
-
[
|
37
|
-
(ExceptionActor, "exception_actor_async_call"),
|
38
|
-
(ExceptionActorSync, "exception_actor_sync_call"),
|
39
|
-
],
|
29
|
+
"actor_class",
|
30
|
+
[ExceptionActor, ExceptionActorSync],
|
40
31
|
)
|
41
32
|
@pytest.mark.parametrize("num_procs", [1, 2])
|
42
|
-
async def test_actor_exception(actor_class,
|
33
|
+
async def test_actor_exception(actor_class, num_procs):
|
43
34
|
"""
|
44
35
|
Test that exceptions raised in actor endpoints are propagated to the client.
|
45
36
|
"""
|
46
37
|
proc = await proc_mesh(gpus=num_procs)
|
47
|
-
exception_actor = await proc.spawn(
|
38
|
+
exception_actor = await proc.spawn("exception_actor", actor_class)
|
48
39
|
|
49
|
-
with pytest.raises(
|
50
|
-
ActorMeshRefCallFailedException, match="This is a test exception"
|
51
|
-
):
|
40
|
+
with pytest.raises(ActorError, match="This is a test exception"):
|
52
41
|
if num_procs == 1:
|
53
42
|
await exception_actor.raise_exception.call_one()
|
54
43
|
else:
|
@@ -56,23 +45,18 @@ async def test_actor_exception(actor_class, actor_name, num_procs):
|
|
56
45
|
|
57
46
|
|
58
47
|
@pytest.mark.parametrize(
|
59
|
-
"actor_class
|
60
|
-
[
|
61
|
-
(ExceptionActor, "exception_actor_async_call"),
|
62
|
-
(ExceptionActorSync, "exception_actor_sync_call"),
|
63
|
-
],
|
48
|
+
"actor_class",
|
49
|
+
[ExceptionActor, ExceptionActorSync],
|
64
50
|
)
|
65
51
|
@pytest.mark.parametrize("num_procs", [1, 2])
|
66
|
-
def test_actor_exception_sync(actor_class,
|
52
|
+
def test_actor_exception_sync(actor_class, num_procs):
|
67
53
|
"""
|
68
54
|
Test that exceptions raised in actor endpoints are propagated to the client.
|
69
55
|
"""
|
70
56
|
proc = proc_mesh(gpus=num_procs).get()
|
71
|
-
exception_actor = proc.spawn(
|
57
|
+
exception_actor = proc.spawn("exception_actor", actor_class).get()
|
72
58
|
|
73
|
-
with pytest.raises(
|
74
|
-
ActorMeshRefCallFailedException, match="This is a test exception"
|
75
|
-
):
|
59
|
+
with pytest.raises(ActorError, match="This is a test exception"):
|
76
60
|
if num_procs == 1:
|
77
61
|
exception_actor.raise_exception.call_one().get()
|
78
62
|
else:
|
@@ -85,25 +69,62 @@ def test_actor_exception_sync(actor_class, actor_name, num_procs):
|
|
85
69
|
@pytest.mark.parametrize("sync_endpoint", [False, True])
|
86
70
|
@pytest.mark.parametrize("sync_test_impl", [False, True])
|
87
71
|
@pytest.mark.parametrize("endpoint_name", ["cause_segfault", "cause_panic"])
|
88
|
-
def
|
72
|
+
def test_actor_supervision(num_procs, sync_endpoint, sync_test_impl, endpoint_name):
|
89
73
|
"""
|
90
|
-
Test that
|
91
|
-
This test spawns a subprocess that will segfault and checks its exit code.
|
74
|
+
Test that an endpoint causing spontaneous process exit is handled by the supervisor.
|
92
75
|
|
93
|
-
|
76
|
+
Today, these events are delivered to the client and cause the client process
|
77
|
+
to exit with a non-zero code, so the only way we can test it is via a
|
78
|
+
subprocess harness.
|
94
79
|
"""
|
95
80
|
# Run the segfault test in a subprocess
|
96
81
|
test_bin = importlib.resources.files("monarch.python.tests").joinpath("test_bin")
|
97
82
|
cmd = [
|
98
83
|
str(test_bin),
|
84
|
+
"error-endpoint",
|
99
85
|
f"--num-procs={num_procs}",
|
100
86
|
f"--sync-endpoint={sync_endpoint}",
|
101
87
|
f"--sync-test-impl={sync_test_impl}",
|
102
88
|
f"--endpoint-name={endpoint_name}",
|
103
89
|
]
|
104
|
-
|
105
|
-
|
106
|
-
|
90
|
+
try:
|
91
|
+
process = subprocess.run(cmd, capture_output=True, timeout=180)
|
92
|
+
except subprocess.TimeoutExpired as e:
|
93
|
+
print("timeout expired")
|
94
|
+
if e.stdout is not None:
|
95
|
+
print(e.stdout.decode())
|
96
|
+
if e.stderr is not None:
|
97
|
+
print(e.stderr.decode())
|
98
|
+
raise
|
99
|
+
|
100
|
+
# Assert that the subprocess exited with a non-zero code
|
101
|
+
assert "I actually ran" in process.stdout.decode()
|
102
|
+
assert (
|
103
|
+
process.returncode != 0
|
104
|
+
), f"Expected non-zero exit code, got {process.returncode}"
|
105
|
+
|
106
|
+
|
107
|
+
# oss_skip: importlib not pulling resource correctly in git CI, needs to be revisited
|
108
|
+
@pytest.mark.oss_skip
|
109
|
+
def test_proc_mesh_bootstrap_error():
|
110
|
+
"""
|
111
|
+
Test that attempts to spawn a ProcMesh with a failure during bootstrap.
|
112
|
+
"""
|
113
|
+
# Run the bootstrap-error test in a subprocess
|
114
|
+
test_bin = importlib.resources.files("monarch.python.tests").joinpath("test_bin")
|
115
|
+
cmd = [
|
116
|
+
str(test_bin),
|
117
|
+
"error-bootstrap",
|
118
|
+
]
|
119
|
+
try:
|
120
|
+
process = subprocess.run(cmd, capture_output=True, timeout=180)
|
121
|
+
except subprocess.TimeoutExpired as e:
|
122
|
+
print("timeout expired")
|
123
|
+
if e.stdout is not None:
|
124
|
+
print(e.stdout.decode())
|
125
|
+
if e.stderr is not None:
|
126
|
+
print(e.stderr.decode())
|
127
|
+
raise
|
107
128
|
|
108
129
|
# Assert that the subprocess exited with a non-zero code
|
109
130
|
assert "I actually ran" in process.stdout.decode()
|
{torchmonarch_nightly-2025.6.5.dist-info → torchmonarch_nightly-2025.6.7.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: torchmonarch-nightly
|
3
|
-
Version: 2025.6.
|
3
|
+
Version: 2025.6.7
|
4
4
|
Summary: Monarch: Single controller library
|
5
5
|
Author: Meta
|
6
6
|
Author-email: oncall+monarch@xmail.facebook.com
|
@@ -60,13 +60,13 @@ rustup default nightly
|
|
60
60
|
# Install non-python dependencies
|
61
61
|
conda install libunwind -y
|
62
62
|
|
63
|
-
# Install the correct cuda and cuda-toolkit versions for your machine
|
64
|
-
sudo dnf install cuda-toolkit-12-0 cuda-12-0
|
63
|
+
# Install the correct cuda and cuda-toolkit versions for your machine
|
64
|
+
sudo dnf install cuda-toolkit-12-0 cuda-12-0
|
65
65
|
|
66
|
-
# Install clang dev
|
67
|
-
sudo dnf install clang-devel
|
68
|
-
#
|
69
|
-
conda install conda-forge
|
66
|
+
# Install clang-dev and nccl-dev
|
67
|
+
sudo dnf install clang-devel libnccl-devel
|
68
|
+
# Or, in some environments, the following may be necessary instead
|
69
|
+
conda install -c conda-forge clangdev nccl
|
70
70
|
conda update -n monarchenv --all -c conda-forge -y
|
71
71
|
|
72
72
|
# Install build dependencies
|
@@ -1,15 +1,15 @@
|
|
1
1
|
monarch/__init__.py,sha256=iUvWHc0-7Q2tovRoRxOIiA3TsefMXCbWl-jEfQ2djew,6897
|
2
|
-
monarch/_rust_bindings.so,sha256=
|
2
|
+
monarch/_rust_bindings.so,sha256=seyf4m6FoGBVbC4DBiG2dJdoSqsVRTWTbH9KeBvc1Is,39128520
|
3
3
|
monarch/_testing.py,sha256=MN8DK1e-wzV0-R_nFW1b_7-O5oKfWvZ12BMGD4Z7PQk,6755
|
4
|
-
monarch/actor_mesh.py,sha256=
|
4
|
+
monarch/actor_mesh.py,sha256=_IVpdQ1HIC5JumB7cwcamdeWREPKqeMYjiz7NOV9Klw,22842
|
5
5
|
monarch/allocator.py,sha256=_2DKFP9pSD33zDgH7xZJC8Tq7BQrCeQEUmMB7_xCT0Y,1784
|
6
|
-
monarch/bootstrap_main.py,sha256=
|
6
|
+
monarch/bootstrap_main.py,sha256=SYTOz-pTXiJNk78PPD5HAOJDSb8t2JfitRWdmWB3ogo,2559
|
7
7
|
monarch/cached_remote_function.py,sha256=kYdB6r4OHx_T_uX4q3tCNcp1t2DJwF8tPTIahUiT2pU,8785
|
8
8
|
monarch/fetch.py,sha256=61jxo7sx4QNUTkc0_rF5NaJROen4tKbAaiIjrXWLOvg,1705
|
9
9
|
monarch/future.py,sha256=lcdFEe7m1shYPPuvZ1RkS6JUIChEKGBWe3v7x_nu4Hg,731
|
10
10
|
monarch/gradient_generator.py,sha256=Rl3dmXGceTdCc1mYBg2JciR88ywGPnW7TVkL86KwqEA,6366
|
11
11
|
monarch/memory.py,sha256=ol86dBhFAJqg78iF25-BuK0wuwj1onR8FIioZ_B0gjw,1377
|
12
|
-
monarch/monarch_controller,sha256=
|
12
|
+
monarch/monarch_controller,sha256=ptqqARjqzjjVP0R-1ZPAd0y_K1-0XHFQhE-HR4J6MOo,20389704
|
13
13
|
monarch/notebook.py,sha256=zu9MKDFKf1-rCM2TqFSRJjMBeiWuKcJSyUFLvoZRQzs,25949
|
14
14
|
monarch/opaque_module.py,sha256=oajOu_WD1hD4hxE8HDdO-tvWY7KDHWd7VaAhJEa5L2I,10446
|
15
15
|
monarch/opaque_object.py,sha256=IVpll4pyuKZMo_EnPh4s0qnx8RlAcJrJ1yoLX6E75wQ,2782
|
@@ -17,11 +17,11 @@ monarch/proc_mesh.py,sha256=sTMmwQLKqM0h-yY0mn8uSzOb9B_MX9DKWCI9EsyfD6s,6384
|
|
17
17
|
monarch/profiler.py,sha256=TQ9fnVM8H7smBWtYdB_6Irtzz8DBOmcp7U1T3wlUmco,4911
|
18
18
|
monarch/python_local_mesh.py,sha256=YsureIzR9uGlNVrKd4vRghxOXBeYabkt9lICRErfRAI,3536
|
19
19
|
monarch/random.py,sha256=f9QR7Esu4Vxqxs-KCf5QYyVqlWvXJ3-UtG90L_h4j40,1527
|
20
|
-
monarch/rdma.py,sha256=
|
20
|
+
monarch/rdma.py,sha256=1pNh11S_FWeETRgkdUpauTMUlodrRohIq1UfQjKVnN8,5418
|
21
21
|
monarch/remote_class.py,sha256=-OAowzU1aDP6i4ik_SjXntVUC9h4dqAzgqwohkQ6Grc,4167
|
22
22
|
monarch/rust_backend_mesh.py,sha256=1htC62of4MgFtkezWGlsxSFtKJdc0CIeqeSuOx7yu3M,9944
|
23
23
|
monarch/rust_local_mesh.py,sha256=7ASptybn3wy4J7eoBc7LhGW4j4AA6bigl5Kuhyflw8s,47405
|
24
|
-
monarch/sim_mesh.py,sha256=
|
24
|
+
monarch/sim_mesh.py,sha256=9wkS99L0EpG2Gldi-nzA-3ww7z__DQ7Qp2uReMfn188,12183
|
25
25
|
monarch/tensor_worker_main.py,sha256=Nbarl2sJKIddLeaRFsaUnqOerLHjzggUr9SqCr2_GYI,8300
|
26
26
|
monarch/tensorboard.py,sha256=MnLgH5lbqeUJauEuirEgR6L_qYl2NGdtwZOWIAuOZao,2587
|
27
27
|
monarch/world_mesh.py,sha256=GqZpFoVNJPxYa70rLYgv0vu8Vg1nXqx_GYERRb1E9Pc,975
|
@@ -77,7 +77,7 @@ monarch/controller/rust_backend/__init__.py,sha256=J8qjUOysmcMAek2KFN13mViOXZxTY
|
|
77
77
|
monarch/controller/rust_backend/controller.py,sha256=-bZYE6u5sB9C0Cnc6NiBoBit9TvolKHRn05I-LUpB8I,9516
|
78
78
|
monarch/gradient/__init__.py,sha256=kqmzwt16mMpk0M3GhpgP_f7da4DGnaV9chDzbt66k4Q,308
|
79
79
|
monarch/gradient/_gradient_generator.pyi,sha256=6cX0UxaDt9NAlwgIhTgnweqGOf6qRhHiGnUzSWNCxdU,630
|
80
|
-
monarch/gradient/_gradient_generator.so,sha256=
|
80
|
+
monarch/gradient/_gradient_generator.so,sha256=RCslwjx2Ji9uqcA9M0IqnEsKSYAnS6NdExwyqfM71YA,11456536
|
81
81
|
monarch/parallel/__init__.py,sha256=6920kIkhiX7AiyjYvyc1ad8ccP-bStJJ1sS5KkeN2P0,352
|
82
82
|
monarch/parallel/pipelining/__init__.py,sha256=J8qjUOysmcMAek2KFN13mViOXZxTYc5vCrF02t3VuFU,223
|
83
83
|
monarch/parallel/pipelining/runtime.py,sha256=KK8TG1gUYEzSsquiZoPTWGSIC74mlncD7cYknKxfb3c,32470
|
@@ -127,9 +127,9 @@ monarch_supervisor/python_executable.py,sha256=WfCiK3wdAvm9Jxx5jgjGF991NgGc9-oHU
|
|
127
127
|
tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
128
128
|
tests/dispatch_bench.py,sha256=sU_m-8KAjQgYTsxI5khV664NdgLLutidni69Rtowk98,3933
|
129
129
|
tests/dispatch_bench_helper.py,sha256=1ORgAMrRgjAjmmWeCHLLQd_bda9mJk0rS2ucEbRu28s,633
|
130
|
-
tests/error_test_binary.py,sha256=
|
130
|
+
tests/error_test_binary.py,sha256=64H-ucdkQ2i7GD8sidStl227cOy7gyeqvO4kTm1y7Ic,4817
|
131
131
|
tests/sleep_binary.py,sha256=XfLYaAfwm9xgzM-svs8fhAeFhwYIg6SyVEnx4e6wbUw,1009
|
132
|
-
tests/test_actor_error.py,sha256=
|
132
|
+
tests/test_actor_error.py,sha256=K4buy0Z3MfCF7uSgIMRCpw7A2fTl3iRh8g_aNiJHnBU,4530
|
133
133
|
tests/test_alloc.py,sha256=D6DdQbtOZEvvnnc7LV-WyWFMk0Xb77eblH6Oz90zJTA,745
|
134
134
|
tests/test_coalescing.py,sha256=-KtAWzTaeXbyzltplfojavx0iFeeZnvej-tFTlu2p5k,15616
|
135
135
|
tests/test_controller.py,sha256=yxuVp2DG3TDKJlwuE3cFm9dbWMlbrYtG1uHfvVWRYbw,30935
|
@@ -149,9 +149,9 @@ tests/simulator/test_profiling.py,sha256=TGYCfzTLdkpIwnOuO6KApprmrgPIRQe60KRX3wk
|
|
149
149
|
tests/simulator/test_simulator.py,sha256=LO8lA0ssY-OGEBL5ipEu74f97Y765TEwfUOv-DtIptM,14568
|
150
150
|
tests/simulator/test_task.py,sha256=ipqBDuDAysuo1xOB9S5psaFvwe6VATD43IovCTSs0t4,2327
|
151
151
|
tests/simulator/test_worker.py,sha256=QrWWIJ3HDgDLkBPRc2mwYPlOQoXQcj1qRfc0WUfKkFY,3507
|
152
|
-
torchmonarch_nightly-2025.6.
|
153
|
-
torchmonarch_nightly-2025.6.
|
154
|
-
torchmonarch_nightly-2025.6.
|
155
|
-
torchmonarch_nightly-2025.6.
|
156
|
-
torchmonarch_nightly-2025.6.
|
157
|
-
torchmonarch_nightly-2025.6.
|
152
|
+
torchmonarch_nightly-2025.6.7.dist-info/licenses/LICENSE,sha256=e0Eotbf_rHOYPuEUlppIbvwy4SN98CZnl_hqwvbDA4Q,1530
|
153
|
+
torchmonarch_nightly-2025.6.7.dist-info/METADATA,sha256=8082irkhKa1D8z8Dq0GBZAfdQXh_JXcirAjEAI-A2_8,2771
|
154
|
+
torchmonarch_nightly-2025.6.7.dist-info/WHEEL,sha256=_wZSFk0d90K9wOBp8Q-UGxshyiJ987JoPiyUBNC6VLk,104
|
155
|
+
torchmonarch_nightly-2025.6.7.dist-info/entry_points.txt,sha256=sqfQ16oZqjEvttUI-uj9BBXIIE6jt05bYFSmy-2hyXI,106
|
156
|
+
torchmonarch_nightly-2025.6.7.dist-info/top_level.txt,sha256=E-ZssZzyM17glpVrh-S9--qJ-w9p2EjuYOuNw9tQ4Eg,33
|
157
|
+
torchmonarch_nightly-2025.6.7.dist-info/RECORD,,
|
File without changes
|
{torchmonarch_nightly-2025.6.5.dist-info → torchmonarch_nightly-2025.6.7.dist-info}/entry_points.txt
RENAMED
File without changes
|
{torchmonarch_nightly-2025.6.5.dist-info → torchmonarch_nightly-2025.6.7.dist-info}/licenses/LICENSE
RENAMED
File without changes
|
{torchmonarch_nightly-2025.6.5.dist-info → torchmonarch_nightly-2025.6.7.dist-info}/top_level.txt
RENAMED
File without changes
|