torchmonarch-nightly 2025.6.28__cp310-cp310-manylinux2014_x86_64.whl → 2025.6.30__cp310-cp310-manylinux2014_x86_64.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
monarch/_rust_bindings.so CHANGED
Binary file
Binary file
monarch/proc_mesh.py CHANGED
@@ -38,7 +38,10 @@ from monarch._rust_bindings.hyperactor_extension.alloc import ( # @manual=//mon
38
38
  AllocSpec,
39
39
  )
40
40
  from monarch._rust_bindings.monarch_hyperactor.mailbox import Mailbox
41
- from monarch._rust_bindings.monarch_hyperactor.proc_mesh import ProcMesh as HyProcMesh
41
+ from monarch._rust_bindings.monarch_hyperactor.proc_mesh import (
42
+ ProcMesh as HyProcMesh,
43
+ ProcMeshMonitor,
44
+ )
42
45
  from monarch._rust_bindings.monarch_hyperactor.shape import Shape, Slice
43
46
  from monarch.actor_mesh import _Actor, _ActorMeshRefImpl, Actor, ActorMeshRef
44
47
 
@@ -117,6 +120,24 @@ class ProcMesh(MeshTrait):
117
120
  lambda: self._spawn_blocking(name, Class, *args, **kwargs),
118
121
  )
119
122
 
123
+ async def monitor(self) -> ProcMeshMonitor:
124
+ """
125
+ Get a monitor (async iterator) of the proc mesh, it is used to
126
+ monitor the status of the proc mesh. This function can be called at most once.
127
+
128
+ Note: This API is experimental and subject to change.
129
+
130
+ Example:
131
+
132
+ async def monitor_loop(monitor):
133
+ async for event in monitor:
134
+ await handle_exception_event(event)
135
+
136
+ # Kick off in background
137
+ asyncio.create_task(monitor_loop(monitor))
138
+ """
139
+ return await self._proc_mesh.monitor()
140
+
120
141
  @classmethod
121
142
  def from_alloc(self, alloc: Alloc) -> Future["ProcMesh"]:
122
143
  return Future(
tests/test_actor_error.py CHANGED
@@ -9,9 +9,9 @@ import importlib.resources
9
9
  import subprocess
10
10
 
11
11
  import pytest
12
+ from monarch._rust_bindings.monarch_hyperactor.proc_mesh import ProcEvent
12
13
  from monarch.actor_mesh import Actor, ActorError, endpoint, send
13
-
14
- from monarch.proc_mesh import proc_mesh
14
+ from monarch.proc_mesh import local_proc_mesh, proc_mesh
15
15
 
16
16
 
17
17
  class ExceptionActor(Actor):
@@ -238,3 +238,36 @@ async def test_exception_after_wait_unmonitored():
238
238
  assert (
239
239
  process.returncode != 0
240
240
  ), f"Expected non-zero exit code, got {process.returncode}"
241
+
242
+
243
+ class ErrorActor(Actor):
244
+ def __init__(self, message):
245
+ raise RuntimeError("fail on init")
246
+
247
+ @endpoint
248
+ async def check(self) -> None:
249
+ pass
250
+
251
+
252
+ async def test_proc_mesh_redundant_monitoring():
253
+ proc = await local_proc_mesh(hosts=1, gpus=1)
254
+ await proc.monitor()
255
+
256
+ with pytest.raises(
257
+ Exception, match="user already registered a monitor for this proc mesh"
258
+ ):
259
+ await proc.monitor()
260
+
261
+
262
+ async def test_proc_mesh_monitoring():
263
+ proc = await local_proc_mesh(hosts=1, gpus=1)
264
+ monitor = await proc.monitor()
265
+
266
+ with pytest.raises(Exception):
267
+ e = await proc.spawn("error", ErrorActor, "failed to init the actor")
268
+ await asyncio.wait_for(e.check.call_one(), timeout=15)
269
+
270
+ event = await anext(monitor)
271
+ assert isinstance(event, ProcEvent.Crashed)
272
+ assert event[0] == 0 # check rank
273
+ assert "fail on init" in event[1] # check error message
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: torchmonarch-nightly
3
- Version: 2025.6.28
3
+ Version: 2025.6.30
4
4
  Summary: Monarch: Single controller library
5
5
  Author: Meta
6
6
  Author-email: oncall+monarch@xmail.facebook.com
@@ -1,5 +1,5 @@
1
1
  monarch/__init__.py,sha256=iUvWHc0-7Q2tovRoRxOIiA3TsefMXCbWl-jEfQ2djew,6897
2
- monarch/_rust_bindings.so,sha256=qizsJb28BPG3JD2WG0Hsgj1Hp_QSXRbsJ_stV18uhKU,43303992
2
+ monarch/_rust_bindings.so,sha256=zpX8IO7B17UrHgsMt5Uhenq3Rsl-K-aQRiUqDwZYlLU,43688456
3
3
  monarch/_testing.py,sha256=jOIOG6jcZBzvEvG_DwSnwCkaMVXvSun6sJAG6nXemww,7859
4
4
  monarch/actor_mesh.py,sha256=QqKHVTJk9H_I-v7GoxgOdOL8-ymnRpGvNFdda0-cNrE,24534
5
5
  monarch/allocator.py,sha256=l0_mN43AH3K2aCchb5fk8ml95rvdgR31nRC_PqRmZWg,7865
@@ -12,12 +12,12 @@ monarch/future.py,sha256=g1VYJl8ReBBS6VbikwWilnFqEr5qJDiSKid92AnWFV4,2058
12
12
  monarch/gradient_generator.py,sha256=Rl3dmXGceTdCc1mYBg2JciR88ywGPnW7TVkL86KwqEA,6366
13
13
  monarch/memory.py,sha256=ol86dBhFAJqg78iF25-BuK0wuwj1onR8FIioZ_B0gjw,1377
14
14
  monarch/mesh_controller.py,sha256=JIg2-MIGhqeFCXn793j8ivEu5-8ePksu--H_Pv0-Ltk,10377
15
- monarch/monarch_controller,sha256=MkzPFU-UyGlKvJI0KGxduDkjTh7uCHkCw8Yi7sAOFoE,21739240
15
+ monarch/monarch_controller,sha256=BtSoP1uMMH7kbn-EjN9V7inG1K7J-HXo7xkBHdxI93M,21705312
16
16
  monarch/notebook.py,sha256=zu9MKDFKf1-rCM2TqFSRJjMBeiWuKcJSyUFLvoZRQzs,25949
17
17
  monarch/opaque_module.py,sha256=oajOu_WD1hD4hxE8HDdO-tvWY7KDHWd7VaAhJEa5L2I,10446
18
18
  monarch/opaque_object.py,sha256=IVpll4pyuKZMo_EnPh4s0qnx8RlAcJrJ1yoLX6E75wQ,2782
19
19
  monarch/pdb_wrapper.py,sha256=gm46AZnfR4amH1vYFWnWivEv5MaU3Nb6KIWjSM8KjWM,4052
20
- monarch/proc_mesh.py,sha256=oMHr6UiOY-pzgbvqjsBZBhtRYPrxCmPyYVrzb6kZzEM,10342
20
+ monarch/proc_mesh.py,sha256=UX8qthL0RSPwOf7I5dLHejVAQrZtYAERGhBeUDR4Xfw,10950
21
21
  monarch/profiler.py,sha256=TQ9fnVM8H7smBWtYdB_6Irtzz8DBOmcp7U1T3wlUmco,4911
22
22
  monarch/python_local_mesh.py,sha256=YsureIzR9uGlNVrKd4vRghxOXBeYabkt9lICRErfRAI,3536
23
23
  monarch/random.py,sha256=f9QR7Esu4Vxqxs-KCf5QYyVqlWvXJ3-UtG90L_h4j40,1527
@@ -135,7 +135,7 @@ tests/dispatch_bench.py,sha256=sU_m-8KAjQgYTsxI5khV664NdgLLutidni69Rtowk98,3933
135
135
  tests/dispatch_bench_helper.py,sha256=1ORgAMrRgjAjmmWeCHLLQd_bda9mJk0rS2ucEbRu28s,633
136
136
  tests/error_test_binary.py,sha256=BRj13wAROsUWx4jcxc07HYN2n-xyBNhnnRAhjqah-A0,5582
137
137
  tests/sleep_binary.py,sha256=XfLYaAfwm9xgzM-svs8fhAeFhwYIg6SyVEnx4e6wbUw,1009
138
- tests/test_actor_error.py,sha256=-0UJCEpyzsBh-RdbGhDiG1-sRtu7bJPQWmtjUD0ad48,8526
138
+ tests/test_actor_error.py,sha256=hMpaJDkvowBM_fKuitQdKmq3iyvySaom8KNLmxEsZ-o,9534
139
139
  tests/test_alloc.py,sha256=D6DdQbtOZEvvnnc7LV-WyWFMk0Xb77eblH6Oz90zJTA,745
140
140
  tests/test_allocator.py,sha256=c7b4ylEjFV2WDhB8fbWiDuGi-vrBeD1E0Rpu-efrSVQ,14478
141
141
  tests/test_coalescing.py,sha256=JZ4YgQNlWWs7N-Z8KCCXQPANcuyyXEKjeHIXYbPnQhk,15606
@@ -157,9 +157,9 @@ tests/simulator/test_profiling.py,sha256=TGYCfzTLdkpIwnOuO6KApprmrgPIRQe60KRX3wk
157
157
  tests/simulator/test_simulator.py,sha256=LO8lA0ssY-OGEBL5ipEu74f97Y765TEwfUOv-DtIptM,14568
158
158
  tests/simulator/test_task.py,sha256=ipqBDuDAysuo1xOB9S5psaFvwe6VATD43IovCTSs0t4,2327
159
159
  tests/simulator/test_worker.py,sha256=QrWWIJ3HDgDLkBPRc2mwYPlOQoXQcj1qRfc0WUfKkFY,3507
160
- torchmonarch_nightly-2025.6.28.dist-info/licenses/LICENSE,sha256=e0Eotbf_rHOYPuEUlppIbvwy4SN98CZnl_hqwvbDA4Q,1530
161
- torchmonarch_nightly-2025.6.28.dist-info/METADATA,sha256=uhqmZckBXouJsgJ6XHRYObRlsA-v8iYdr0zhaFKcphE,2780
162
- torchmonarch_nightly-2025.6.28.dist-info/WHEEL,sha256=_wZSFk0d90K9wOBp8Q-UGxshyiJ987JoPiyUBNC6VLk,104
163
- torchmonarch_nightly-2025.6.28.dist-info/entry_points.txt,sha256=sqfQ16oZqjEvttUI-uj9BBXIIE6jt05bYFSmy-2hyXI,106
164
- torchmonarch_nightly-2025.6.28.dist-info/top_level.txt,sha256=E-ZssZzyM17glpVrh-S9--qJ-w9p2EjuYOuNw9tQ4Eg,33
165
- torchmonarch_nightly-2025.6.28.dist-info/RECORD,,
160
+ torchmonarch_nightly-2025.6.30.dist-info/licenses/LICENSE,sha256=e0Eotbf_rHOYPuEUlppIbvwy4SN98CZnl_hqwvbDA4Q,1530
161
+ torchmonarch_nightly-2025.6.30.dist-info/METADATA,sha256=Nq_Nx8bK1tiR5Kf-o4uoRmSZUsZ-u5hEp4u2cwUqFSM,2780
162
+ torchmonarch_nightly-2025.6.30.dist-info/WHEEL,sha256=_wZSFk0d90K9wOBp8Q-UGxshyiJ987JoPiyUBNC6VLk,104
163
+ torchmonarch_nightly-2025.6.30.dist-info/entry_points.txt,sha256=sqfQ16oZqjEvttUI-uj9BBXIIE6jt05bYFSmy-2hyXI,106
164
+ torchmonarch_nightly-2025.6.30.dist-info/top_level.txt,sha256=E-ZssZzyM17glpVrh-S9--qJ-w9p2EjuYOuNw9tQ4Eg,33
165
+ torchmonarch_nightly-2025.6.30.dist-info/RECORD,,