torchmonarch-nightly 2025.6.27__cp312-cp312-manylinux2014_x86_64.whl → 2025.6.29__cp312-cp312-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
monarch/_rust_bindings.so CHANGED
Binary file
@@ -202,7 +202,6 @@ def _initialize_env(worker_point: Point, proc_id: str) -> None:
202
202
  num_worker_procs = len(worker_point.shape)
203
203
  process_env = {
204
204
  **worker_env,
205
- "HYPERACTOR_MANAGED_SUBPROCESS": "1",
206
205
  "CUDA_VISIBLE_DEVICES": str(local_rank),
207
206
  "NCCL_HOSTID": f"{proc_id}_host_{worker_rank // gpus_per_host}",
208
207
  # This is needed to avoid a hard failure in ncclx when we do not
Binary file
monarch/proc_mesh.py CHANGED
@@ -38,7 +38,10 @@ from monarch._rust_bindings.hyperactor_extension.alloc import ( # @manual=//mon
38
38
  AllocSpec,
39
39
  )
40
40
  from monarch._rust_bindings.monarch_hyperactor.mailbox import Mailbox
41
- from monarch._rust_bindings.monarch_hyperactor.proc_mesh import ProcMesh as HyProcMesh
41
+ from monarch._rust_bindings.monarch_hyperactor.proc_mesh import (
42
+ ProcMesh as HyProcMesh,
43
+ ProcMeshMonitor,
44
+ )
42
45
  from monarch._rust_bindings.monarch_hyperactor.shape import Shape, Slice
43
46
  from monarch.actor_mesh import _Actor, _ActorMeshRefImpl, Actor, ActorMeshRef
44
47
 
@@ -117,6 +120,24 @@ class ProcMesh(MeshTrait):
117
120
  lambda: self._spawn_blocking(name, Class, *args, **kwargs),
118
121
  )
119
122
 
123
+ async def monitor(self) -> ProcMeshMonitor:
124
+ """
125
+ Get a monitor (async iterator) of the proc mesh, it is used to
126
+ monitor the status of the proc mesh. This function can be called at most once.
127
+
128
+ Note: This API is experimental and subject to change.
129
+
130
+ Example:
131
+
132
+ async def monitor_loop(monitor):
133
+ async for event in monitor:
134
+ await handle_exception_event(event)
135
+
136
+ # Kick off in background
137
+ asyncio.create_task(monitor_loop(monitor))
138
+ """
139
+ return await self._proc_mesh.monitor()
140
+
120
141
  @classmethod
121
142
  def from_alloc(self, alloc: Alloc) -> Future["ProcMesh"]:
122
143
  return Future(
@@ -269,7 +290,6 @@ async def proc_mesh_nonblocking(
269
290
  env = env or {}
270
291
  cmd, args, base_env = _get_bootstrap_args()
271
292
  env.update(base_env)
272
- env["HYPERACTOR_MANAGED_SUBPROCESS"] = "1"
273
293
  allocator = monarch.ProcessAllocator(cmd, args, env)
274
294
  alloc = await allocator.allocate(spec)
275
295
  return await ProcMesh.from_alloc(alloc)
@@ -284,7 +304,6 @@ def proc_mesh_blocking(
284
304
  env = env or {}
285
305
  cmd, args, base_env = _get_bootstrap_args()
286
306
  env.update(base_env)
287
- env["HYPERACTOR_MANAGED_SUBPROCESS"] = "1"
288
307
  allocator = monarch.ProcessAllocator(cmd, args, env)
289
308
  alloc = allocator.allocate(spec).get()
290
309
  return ProcMesh.from_alloc(alloc).get()
@@ -117,9 +117,7 @@ class ControllerParams(NamedTuple):
117
117
  fail_on_worker_timeout: bool
118
118
 
119
119
 
120
- _PROC_ENV = {
121
- "HYPERACTOR_MANAGED_SUBPROCESS": str(1),
122
- }
120
+ _PROC_ENV: dict[str, str] = {}
123
121
 
124
122
 
125
123
  def get_controller_main() -> tuple[Path, dict[str, str]]:
@@ -988,7 +986,6 @@ class Bootstrap:
988
986
  raise ValueError(f"Unknown socket type: {socket_type}")
989
987
 
990
988
  env = os.environ.copy()
991
- env["HYPERACTOR_MANAGED_SUBPROCESS"] = "1"
992
989
  self.env: dict[str, str] = env
993
990
 
994
991
  # Launch a single system globally
monarch/sim_mesh.py CHANGED
@@ -194,7 +194,6 @@ class Bootstrap:
194
194
  fake_call(lambda: 0)
195
195
 
196
196
  env = os.environ.copy()
197
- env["HYPERACTOR_MANAGED_SUBPROCESS"] = "1"
198
197
  self.env: dict[str, str] = env
199
198
 
200
199
  self._mesh_world_state: Dict[MeshWorld, Optional[DeviceMesh]] = mesh_world_state
tests/test_actor_error.py CHANGED
@@ -9,9 +9,9 @@ import importlib.resources
9
9
  import subprocess
10
10
 
11
11
  import pytest
12
+ from monarch._rust_bindings.monarch_hyperactor.proc_mesh import ProcEvent
12
13
  from monarch.actor_mesh import Actor, ActorError, endpoint, send
13
-
14
- from monarch.proc_mesh import proc_mesh
14
+ from monarch.proc_mesh import local_proc_mesh, proc_mesh
15
15
 
16
16
 
17
17
  class ExceptionActor(Actor):
@@ -238,3 +238,36 @@ async def test_exception_after_wait_unmonitored():
238
238
  assert (
239
239
  process.returncode != 0
240
240
  ), f"Expected non-zero exit code, got {process.returncode}"
241
+
242
+
243
+ class ErrorActor(Actor):
244
+ def __init__(self, message):
245
+ raise RuntimeError("fail on init")
246
+
247
+ @endpoint
248
+ async def check(self) -> None:
249
+ pass
250
+
251
+
252
+ async def test_proc_mesh_redundant_monitoring():
253
+ proc = await local_proc_mesh(hosts=1, gpus=1)
254
+ await proc.monitor()
255
+
256
+ with pytest.raises(
257
+ Exception, match="user already registered a monitor for this proc mesh"
258
+ ):
259
+ await proc.monitor()
260
+
261
+
262
+ async def test_proc_mesh_monitoring():
263
+ proc = await local_proc_mesh(hosts=1, gpus=1)
264
+ monitor = await proc.monitor()
265
+
266
+ with pytest.raises(Exception):
267
+ e = await proc.spawn("error", ErrorActor, "failed to init the actor")
268
+ await asyncio.wait_for(e.check.call_one(), timeout=15)
269
+
270
+ event = await anext(monitor)
271
+ assert isinstance(event, ProcEvent.Crashed)
272
+ assert event[0] == 0 # check rank
273
+ assert "fail on init" in event[1] # check error message
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: torchmonarch-nightly
3
- Version: 2025.6.27
3
+ Version: 2025.6.29
4
4
  Summary: Monarch: Single controller library
5
5
  Author: Meta
6
6
  Author-email: oncall+monarch@xmail.facebook.com
@@ -1,5 +1,5 @@
1
1
  monarch/__init__.py,sha256=iUvWHc0-7Q2tovRoRxOIiA3TsefMXCbWl-jEfQ2djew,6897
2
- monarch/_rust_bindings.so,sha256=oMDFcOj81KxjZCQCBum3L1giy97jfyGw4fXdpix-xS4,43392968
2
+ monarch/_rust_bindings.so,sha256=120L31EpmFrwc2rDVm_8lOry_zNeAcdU91Au78_yrmc,43743920
3
3
  monarch/_testing.py,sha256=jOIOG6jcZBzvEvG_DwSnwCkaMVXvSun6sJAG6nXemww,7859
4
4
  monarch/actor_mesh.py,sha256=QqKHVTJk9H_I-v7GoxgOdOL8-ymnRpGvNFdda0-cNrE,24534
5
5
  monarch/allocator.py,sha256=l0_mN43AH3K2aCchb5fk8ml95rvdgR31nRC_PqRmZWg,7865
@@ -11,21 +11,21 @@ monarch/fetch.py,sha256=61jxo7sx4QNUTkc0_rF5NaJROen4tKbAaiIjrXWLOvg,1705
11
11
  monarch/future.py,sha256=g1VYJl8ReBBS6VbikwWilnFqEr5qJDiSKid92AnWFV4,2058
12
12
  monarch/gradient_generator.py,sha256=Rl3dmXGceTdCc1mYBg2JciR88ywGPnW7TVkL86KwqEA,6366
13
13
  monarch/memory.py,sha256=ol86dBhFAJqg78iF25-BuK0wuwj1onR8FIioZ_B0gjw,1377
14
- monarch/mesh_controller.py,sha256=am1QP7dvn0OH1z9ADSKm41APs1HY_dHcBAhOVP-QDmE,10427
15
- monarch/monarch_controller,sha256=a28zj3g-ze3sihp4xppXnCfQAXrjPD92X_EwTxPpT9c,21678216
14
+ monarch/mesh_controller.py,sha256=JIg2-MIGhqeFCXn793j8ivEu5-8ePksu--H_Pv0-Ltk,10377
15
+ monarch/monarch_controller,sha256=sUs7I26272aWQdarhsmG1rER5EBhe8FEALbNXmzkVVI,21723248
16
16
  monarch/notebook.py,sha256=zu9MKDFKf1-rCM2TqFSRJjMBeiWuKcJSyUFLvoZRQzs,25949
17
17
  monarch/opaque_module.py,sha256=oajOu_WD1hD4hxE8HDdO-tvWY7KDHWd7VaAhJEa5L2I,10446
18
18
  monarch/opaque_object.py,sha256=IVpll4pyuKZMo_EnPh4s0qnx8RlAcJrJ1yoLX6E75wQ,2782
19
19
  monarch/pdb_wrapper.py,sha256=gm46AZnfR4amH1vYFWnWivEv5MaU3Nb6KIWjSM8KjWM,4052
20
- monarch/proc_mesh.py,sha256=ZnNWjINoFTdkRVbu_ikos2jV4Ham-I9jqeWdEN-1ZtQ,10436
20
+ monarch/proc_mesh.py,sha256=UX8qthL0RSPwOf7I5dLHejVAQrZtYAERGhBeUDR4Xfw,10950
21
21
  monarch/profiler.py,sha256=TQ9fnVM8H7smBWtYdB_6Irtzz8DBOmcp7U1T3wlUmco,4911
22
22
  monarch/python_local_mesh.py,sha256=YsureIzR9uGlNVrKd4vRghxOXBeYabkt9lICRErfRAI,3536
23
23
  monarch/random.py,sha256=f9QR7Esu4Vxqxs-KCf5QYyVqlWvXJ3-UtG90L_h4j40,1527
24
24
  monarch/rdma.py,sha256=1pNh11S_FWeETRgkdUpauTMUlodrRohIq1UfQjKVnN8,5418
25
25
  monarch/remote_class.py,sha256=-OAowzU1aDP6i4ik_SjXntVUC9h4dqAzgqwohkQ6Grc,4167
26
26
  monarch/rust_backend_mesh.py,sha256=1htC62of4MgFtkezWGlsxSFtKJdc0CIeqeSuOx7yu3M,9944
27
- monarch/rust_local_mesh.py,sha256=7ASptybn3wy4J7eoBc7LhGW4j4AA6bigl5Kuhyflw8s,47405
28
- monarch/sim_mesh.py,sha256=kDsbubv28YFg9ZQN4urt3oJGzR3CnnUiATnjUiSxrkE,12256
27
+ monarch/rust_local_mesh.py,sha256=hcVBlX73UZkXBZM2AzV2Fp01ftM5WRNc5LVV81LFMr8,47324
28
+ monarch/sim_mesh.py,sha256=omx_dmHOjPQqj15PgphN-OOzSgfZ9MllOWhO_AYW-8U,12205
29
29
  monarch/telemetry.py,sha256=7JUZWaoD2Yn5Ae_7kNhkLFRBLYaSGfH071_m_qfVehI,525
30
30
  monarch/tensor_worker_main.py,sha256=Nbarl2sJKIddLeaRFsaUnqOerLHjzggUr9SqCr2_GYI,8300
31
31
  monarch/tensorboard.py,sha256=MnLgH5lbqeUJauEuirEgR6L_qYl2NGdtwZOWIAuOZao,2587
@@ -135,7 +135,7 @@ tests/dispatch_bench.py,sha256=sU_m-8KAjQgYTsxI5khV664NdgLLutidni69Rtowk98,3933
135
135
  tests/dispatch_bench_helper.py,sha256=1ORgAMrRgjAjmmWeCHLLQd_bda9mJk0rS2ucEbRu28s,633
136
136
  tests/error_test_binary.py,sha256=BRj13wAROsUWx4jcxc07HYN2n-xyBNhnnRAhjqah-A0,5582
137
137
  tests/sleep_binary.py,sha256=XfLYaAfwm9xgzM-svs8fhAeFhwYIg6SyVEnx4e6wbUw,1009
138
- tests/test_actor_error.py,sha256=-0UJCEpyzsBh-RdbGhDiG1-sRtu7bJPQWmtjUD0ad48,8526
138
+ tests/test_actor_error.py,sha256=hMpaJDkvowBM_fKuitQdKmq3iyvySaom8KNLmxEsZ-o,9534
139
139
  tests/test_alloc.py,sha256=D6DdQbtOZEvvnnc7LV-WyWFMk0Xb77eblH6Oz90zJTA,745
140
140
  tests/test_allocator.py,sha256=c7b4ylEjFV2WDhB8fbWiDuGi-vrBeD1E0Rpu-efrSVQ,14478
141
141
  tests/test_coalescing.py,sha256=JZ4YgQNlWWs7N-Z8KCCXQPANcuyyXEKjeHIXYbPnQhk,15606
@@ -157,9 +157,9 @@ tests/simulator/test_profiling.py,sha256=TGYCfzTLdkpIwnOuO6KApprmrgPIRQe60KRX3wk
157
157
  tests/simulator/test_simulator.py,sha256=LO8lA0ssY-OGEBL5ipEu74f97Y765TEwfUOv-DtIptM,14568
158
158
  tests/simulator/test_task.py,sha256=ipqBDuDAysuo1xOB9S5psaFvwe6VATD43IovCTSs0t4,2327
159
159
  tests/simulator/test_worker.py,sha256=QrWWIJ3HDgDLkBPRc2mwYPlOQoXQcj1qRfc0WUfKkFY,3507
160
- torchmonarch_nightly-2025.6.27.dist-info/licenses/LICENSE,sha256=e0Eotbf_rHOYPuEUlppIbvwy4SN98CZnl_hqwvbDA4Q,1530
161
- torchmonarch_nightly-2025.6.27.dist-info/METADATA,sha256=0PKqq2myfJJjhPa9nAZVJCp4vymD0dBmC1w-RmRKgYI,2780
162
- torchmonarch_nightly-2025.6.27.dist-info/WHEEL,sha256=lduYNUEDASmtUEDemd8SmeX1qOMvvA6YKAbAo1Qbwk8,104
163
- torchmonarch_nightly-2025.6.27.dist-info/entry_points.txt,sha256=sqfQ16oZqjEvttUI-uj9BBXIIE6jt05bYFSmy-2hyXI,106
164
- torchmonarch_nightly-2025.6.27.dist-info/top_level.txt,sha256=E-ZssZzyM17glpVrh-S9--qJ-w9p2EjuYOuNw9tQ4Eg,33
165
- torchmonarch_nightly-2025.6.27.dist-info/RECORD,,
160
+ torchmonarch_nightly-2025.6.29.dist-info/licenses/LICENSE,sha256=e0Eotbf_rHOYPuEUlppIbvwy4SN98CZnl_hqwvbDA4Q,1530
161
+ torchmonarch_nightly-2025.6.29.dist-info/METADATA,sha256=pfxXahOAFVZWSQpbC-6VgvxpOnSxa8Acic1RsONWV6U,2780
162
+ torchmonarch_nightly-2025.6.29.dist-info/WHEEL,sha256=lduYNUEDASmtUEDemd8SmeX1qOMvvA6YKAbAo1Qbwk8,104
163
+ torchmonarch_nightly-2025.6.29.dist-info/entry_points.txt,sha256=sqfQ16oZqjEvttUI-uj9BBXIIE6jt05bYFSmy-2hyXI,106
164
+ torchmonarch_nightly-2025.6.29.dist-info/top_level.txt,sha256=E-ZssZzyM17glpVrh-S9--qJ-w9p2EjuYOuNw9tQ4Eg,33
165
+ torchmonarch_nightly-2025.6.29.dist-info/RECORD,,