torchmonarch-nightly 2025.6.28__cp310-cp310-manylinux2014_x86_64.whl → 2025.6.30__cp310-cp310-manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- monarch/_rust_bindings.so +0 -0
- monarch/monarch_controller +0 -0
- monarch/proc_mesh.py +22 -1
- tests/test_actor_error.py +35 -2
- {torchmonarch_nightly-2025.6.28.dist-info → torchmonarch_nightly-2025.6.30.dist-info}/METADATA +1 -1
- {torchmonarch_nightly-2025.6.28.dist-info → torchmonarch_nightly-2025.6.30.dist-info}/RECORD +10 -10
- {torchmonarch_nightly-2025.6.28.dist-info → torchmonarch_nightly-2025.6.30.dist-info}/WHEEL +0 -0
- {torchmonarch_nightly-2025.6.28.dist-info → torchmonarch_nightly-2025.6.30.dist-info}/entry_points.txt +0 -0
- {torchmonarch_nightly-2025.6.28.dist-info → torchmonarch_nightly-2025.6.30.dist-info}/licenses/LICENSE +0 -0
- {torchmonarch_nightly-2025.6.28.dist-info → torchmonarch_nightly-2025.6.30.dist-info}/top_level.txt +0 -0
monarch/_rust_bindings.so
CHANGED
Binary file
|
monarch/monarch_controller
CHANGED
Binary file
|
monarch/proc_mesh.py
CHANGED
@@ -38,7 +38,10 @@ from monarch._rust_bindings.hyperactor_extension.alloc import ( # @manual=//mon
|
|
38
38
|
AllocSpec,
|
39
39
|
)
|
40
40
|
from monarch._rust_bindings.monarch_hyperactor.mailbox import Mailbox
|
41
|
-
from monarch._rust_bindings.monarch_hyperactor.proc_mesh import ProcMesh as HyProcMesh
|
41
|
+
from monarch._rust_bindings.monarch_hyperactor.proc_mesh import (
|
42
|
+
ProcMesh as HyProcMesh,
|
43
|
+
ProcMeshMonitor,
|
44
|
+
)
|
42
45
|
from monarch._rust_bindings.monarch_hyperactor.shape import Shape, Slice
|
43
46
|
from monarch.actor_mesh import _Actor, _ActorMeshRefImpl, Actor, ActorMeshRef
|
44
47
|
|
@@ -117,6 +120,24 @@ class ProcMesh(MeshTrait):
|
|
117
120
|
lambda: self._spawn_blocking(name, Class, *args, **kwargs),
|
118
121
|
)
|
119
122
|
|
123
|
+
async def monitor(self) -> ProcMeshMonitor:
|
124
|
+
"""
|
125
|
+
Get a monitor (async iterator) of the proc mesh, it is used to
|
126
|
+
monitor the status of the proc mesh. This function can be called at most once.
|
127
|
+
|
128
|
+
Note: This API is experimental and subject to change.
|
129
|
+
|
130
|
+
Example:
|
131
|
+
|
132
|
+
async def monitor_loop(monitor):
|
133
|
+
async for event in monitor:
|
134
|
+
await handle_exception_event(event)
|
135
|
+
|
136
|
+
# Kick off in background
|
137
|
+
asyncio.create_task(monitor_loop(monitor))
|
138
|
+
"""
|
139
|
+
return await self._proc_mesh.monitor()
|
140
|
+
|
120
141
|
@classmethod
|
121
142
|
def from_alloc(self, alloc: Alloc) -> Future["ProcMesh"]:
|
122
143
|
return Future(
|
tests/test_actor_error.py
CHANGED
@@ -9,9 +9,9 @@ import importlib.resources
|
|
9
9
|
import subprocess
|
10
10
|
|
11
11
|
import pytest
|
12
|
+
from monarch._rust_bindings.monarch_hyperactor.proc_mesh import ProcEvent
|
12
13
|
from monarch.actor_mesh import Actor, ActorError, endpoint, send
|
13
|
-
|
14
|
-
from monarch.proc_mesh import proc_mesh
|
14
|
+
from monarch.proc_mesh import local_proc_mesh, proc_mesh
|
15
15
|
|
16
16
|
|
17
17
|
class ExceptionActor(Actor):
|
@@ -238,3 +238,36 @@ async def test_exception_after_wait_unmonitored():
|
|
238
238
|
assert (
|
239
239
|
process.returncode != 0
|
240
240
|
), f"Expected non-zero exit code, got {process.returncode}"
|
241
|
+
|
242
|
+
|
243
|
+
class ErrorActor(Actor):
|
244
|
+
def __init__(self, message):
|
245
|
+
raise RuntimeError("fail on init")
|
246
|
+
|
247
|
+
@endpoint
|
248
|
+
async def check(self) -> None:
|
249
|
+
pass
|
250
|
+
|
251
|
+
|
252
|
+
async def test_proc_mesh_redundant_monitoring():
|
253
|
+
proc = await local_proc_mesh(hosts=1, gpus=1)
|
254
|
+
await proc.monitor()
|
255
|
+
|
256
|
+
with pytest.raises(
|
257
|
+
Exception, match="user already registered a monitor for this proc mesh"
|
258
|
+
):
|
259
|
+
await proc.monitor()
|
260
|
+
|
261
|
+
|
262
|
+
async def test_proc_mesh_monitoring():
|
263
|
+
proc = await local_proc_mesh(hosts=1, gpus=1)
|
264
|
+
monitor = await proc.monitor()
|
265
|
+
|
266
|
+
with pytest.raises(Exception):
|
267
|
+
e = await proc.spawn("error", ErrorActor, "failed to init the actor")
|
268
|
+
await asyncio.wait_for(e.check.call_one(), timeout=15)
|
269
|
+
|
270
|
+
event = await anext(monitor)
|
271
|
+
assert isinstance(event, ProcEvent.Crashed)
|
272
|
+
assert event[0] == 0 # check rank
|
273
|
+
assert "fail on init" in event[1] # check error message
|
{torchmonarch_nightly-2025.6.28.dist-info → torchmonarch_nightly-2025.6.30.dist-info}/RECORD
RENAMED
@@ -1,5 +1,5 @@
|
|
1
1
|
monarch/__init__.py,sha256=iUvWHc0-7Q2tovRoRxOIiA3TsefMXCbWl-jEfQ2djew,6897
|
2
|
-
monarch/_rust_bindings.so,sha256=
|
2
|
+
monarch/_rust_bindings.so,sha256=zpX8IO7B17UrHgsMt5Uhenq3Rsl-K-aQRiUqDwZYlLU,43688456
|
3
3
|
monarch/_testing.py,sha256=jOIOG6jcZBzvEvG_DwSnwCkaMVXvSun6sJAG6nXemww,7859
|
4
4
|
monarch/actor_mesh.py,sha256=QqKHVTJk9H_I-v7GoxgOdOL8-ymnRpGvNFdda0-cNrE,24534
|
5
5
|
monarch/allocator.py,sha256=l0_mN43AH3K2aCchb5fk8ml95rvdgR31nRC_PqRmZWg,7865
|
@@ -12,12 +12,12 @@ monarch/future.py,sha256=g1VYJl8ReBBS6VbikwWilnFqEr5qJDiSKid92AnWFV4,2058
|
|
12
12
|
monarch/gradient_generator.py,sha256=Rl3dmXGceTdCc1mYBg2JciR88ywGPnW7TVkL86KwqEA,6366
|
13
13
|
monarch/memory.py,sha256=ol86dBhFAJqg78iF25-BuK0wuwj1onR8FIioZ_B0gjw,1377
|
14
14
|
monarch/mesh_controller.py,sha256=JIg2-MIGhqeFCXn793j8ivEu5-8ePksu--H_Pv0-Ltk,10377
|
15
|
-
monarch/monarch_controller,sha256=
|
15
|
+
monarch/monarch_controller,sha256=BtSoP1uMMH7kbn-EjN9V7inG1K7J-HXo7xkBHdxI93M,21705312
|
16
16
|
monarch/notebook.py,sha256=zu9MKDFKf1-rCM2TqFSRJjMBeiWuKcJSyUFLvoZRQzs,25949
|
17
17
|
monarch/opaque_module.py,sha256=oajOu_WD1hD4hxE8HDdO-tvWY7KDHWd7VaAhJEa5L2I,10446
|
18
18
|
monarch/opaque_object.py,sha256=IVpll4pyuKZMo_EnPh4s0qnx8RlAcJrJ1yoLX6E75wQ,2782
|
19
19
|
monarch/pdb_wrapper.py,sha256=gm46AZnfR4amH1vYFWnWivEv5MaU3Nb6KIWjSM8KjWM,4052
|
20
|
-
monarch/proc_mesh.py,sha256=
|
20
|
+
monarch/proc_mesh.py,sha256=UX8qthL0RSPwOf7I5dLHejVAQrZtYAERGhBeUDR4Xfw,10950
|
21
21
|
monarch/profiler.py,sha256=TQ9fnVM8H7smBWtYdB_6Irtzz8DBOmcp7U1T3wlUmco,4911
|
22
22
|
monarch/python_local_mesh.py,sha256=YsureIzR9uGlNVrKd4vRghxOXBeYabkt9lICRErfRAI,3536
|
23
23
|
monarch/random.py,sha256=f9QR7Esu4Vxqxs-KCf5QYyVqlWvXJ3-UtG90L_h4j40,1527
|
@@ -135,7 +135,7 @@ tests/dispatch_bench.py,sha256=sU_m-8KAjQgYTsxI5khV664NdgLLutidni69Rtowk98,3933
|
|
135
135
|
tests/dispatch_bench_helper.py,sha256=1ORgAMrRgjAjmmWeCHLLQd_bda9mJk0rS2ucEbRu28s,633
|
136
136
|
tests/error_test_binary.py,sha256=BRj13wAROsUWx4jcxc07HYN2n-xyBNhnnRAhjqah-A0,5582
|
137
137
|
tests/sleep_binary.py,sha256=XfLYaAfwm9xgzM-svs8fhAeFhwYIg6SyVEnx4e6wbUw,1009
|
138
|
-
tests/test_actor_error.py,sha256
|
138
|
+
tests/test_actor_error.py,sha256=hMpaJDkvowBM_fKuitQdKmq3iyvySaom8KNLmxEsZ-o,9534
|
139
139
|
tests/test_alloc.py,sha256=D6DdQbtOZEvvnnc7LV-WyWFMk0Xb77eblH6Oz90zJTA,745
|
140
140
|
tests/test_allocator.py,sha256=c7b4ylEjFV2WDhB8fbWiDuGi-vrBeD1E0Rpu-efrSVQ,14478
|
141
141
|
tests/test_coalescing.py,sha256=JZ4YgQNlWWs7N-Z8KCCXQPANcuyyXEKjeHIXYbPnQhk,15606
|
@@ -157,9 +157,9 @@ tests/simulator/test_profiling.py,sha256=TGYCfzTLdkpIwnOuO6KApprmrgPIRQe60KRX3wk
|
|
157
157
|
tests/simulator/test_simulator.py,sha256=LO8lA0ssY-OGEBL5ipEu74f97Y765TEwfUOv-DtIptM,14568
|
158
158
|
tests/simulator/test_task.py,sha256=ipqBDuDAysuo1xOB9S5psaFvwe6VATD43IovCTSs0t4,2327
|
159
159
|
tests/simulator/test_worker.py,sha256=QrWWIJ3HDgDLkBPRc2mwYPlOQoXQcj1qRfc0WUfKkFY,3507
|
160
|
-
torchmonarch_nightly-2025.6.
|
161
|
-
torchmonarch_nightly-2025.6.
|
162
|
-
torchmonarch_nightly-2025.6.
|
163
|
-
torchmonarch_nightly-2025.6.
|
164
|
-
torchmonarch_nightly-2025.6.
|
165
|
-
torchmonarch_nightly-2025.6.
|
160
|
+
torchmonarch_nightly-2025.6.30.dist-info/licenses/LICENSE,sha256=e0Eotbf_rHOYPuEUlppIbvwy4SN98CZnl_hqwvbDA4Q,1530
|
161
|
+
torchmonarch_nightly-2025.6.30.dist-info/METADATA,sha256=Nq_Nx8bK1tiR5Kf-o4uoRmSZUsZ-u5hEp4u2cwUqFSM,2780
|
162
|
+
torchmonarch_nightly-2025.6.30.dist-info/WHEEL,sha256=_wZSFk0d90K9wOBp8Q-UGxshyiJ987JoPiyUBNC6VLk,104
|
163
|
+
torchmonarch_nightly-2025.6.30.dist-info/entry_points.txt,sha256=sqfQ16oZqjEvttUI-uj9BBXIIE6jt05bYFSmy-2hyXI,106
|
164
|
+
torchmonarch_nightly-2025.6.30.dist-info/top_level.txt,sha256=E-ZssZzyM17glpVrh-S9--qJ-w9p2EjuYOuNw9tQ4Eg,33
|
165
|
+
torchmonarch_nightly-2025.6.30.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
{torchmonarch_nightly-2025.6.28.dist-info → torchmonarch_nightly-2025.6.30.dist-info}/top_level.txt
RENAMED
File without changes
|