torchmonarch-nightly 2025.6.27__cp312-cp312-manylinux2014_x86_64.whl → 2025.6.29__cp312-cp312-manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- monarch/_rust_bindings.so +0 -0
- monarch/mesh_controller.py +0 -1
- monarch/monarch_controller +0 -0
- monarch/proc_mesh.py +22 -3
- monarch/rust_local_mesh.py +1 -4
- monarch/sim_mesh.py +0 -1
- tests/test_actor_error.py +35 -2
- {torchmonarch_nightly-2025.6.27.dist-info → torchmonarch_nightly-2025.6.29.dist-info}/METADATA +1 -1
- {torchmonarch_nightly-2025.6.27.dist-info → torchmonarch_nightly-2025.6.29.dist-info}/RECORD +13 -13
- {torchmonarch_nightly-2025.6.27.dist-info → torchmonarch_nightly-2025.6.29.dist-info}/WHEEL +0 -0
- {torchmonarch_nightly-2025.6.27.dist-info → torchmonarch_nightly-2025.6.29.dist-info}/entry_points.txt +0 -0
- {torchmonarch_nightly-2025.6.27.dist-info → torchmonarch_nightly-2025.6.29.dist-info}/licenses/LICENSE +0 -0
- {torchmonarch_nightly-2025.6.27.dist-info → torchmonarch_nightly-2025.6.29.dist-info}/top_level.txt +0 -0
monarch/_rust_bindings.so
CHANGED
Binary file
|
monarch/mesh_controller.py
CHANGED
@@ -202,7 +202,6 @@ def _initialize_env(worker_point: Point, proc_id: str) -> None:
|
|
202
202
|
num_worker_procs = len(worker_point.shape)
|
203
203
|
process_env = {
|
204
204
|
**worker_env,
|
205
|
-
"HYPERACTOR_MANAGED_SUBPROCESS": "1",
|
206
205
|
"CUDA_VISIBLE_DEVICES": str(local_rank),
|
207
206
|
"NCCL_HOSTID": f"{proc_id}_host_{worker_rank // gpus_per_host}",
|
208
207
|
# This is needed to avoid a hard failure in ncclx when we do not
|
monarch/monarch_controller
CHANGED
Binary file
|
monarch/proc_mesh.py
CHANGED
@@ -38,7 +38,10 @@ from monarch._rust_bindings.hyperactor_extension.alloc import ( # @manual=//mon
|
|
38
38
|
AllocSpec,
|
39
39
|
)
|
40
40
|
from monarch._rust_bindings.monarch_hyperactor.mailbox import Mailbox
|
41
|
-
from monarch._rust_bindings.monarch_hyperactor.proc_mesh import ProcMesh as HyProcMesh
|
41
|
+
from monarch._rust_bindings.monarch_hyperactor.proc_mesh import (
|
42
|
+
ProcMesh as HyProcMesh,
|
43
|
+
ProcMeshMonitor,
|
44
|
+
)
|
42
45
|
from monarch._rust_bindings.monarch_hyperactor.shape import Shape, Slice
|
43
46
|
from monarch.actor_mesh import _Actor, _ActorMeshRefImpl, Actor, ActorMeshRef
|
44
47
|
|
@@ -117,6 +120,24 @@ class ProcMesh(MeshTrait):
|
|
117
120
|
lambda: self._spawn_blocking(name, Class, *args, **kwargs),
|
118
121
|
)
|
119
122
|
|
123
|
+
async def monitor(self) -> ProcMeshMonitor:
|
124
|
+
"""
|
125
|
+
Get a monitor (async iterator) of the proc mesh, it is used to
|
126
|
+
monitor the status of the proc mesh. This function can be called at most once.
|
127
|
+
|
128
|
+
Note: This API is experimental and subject to change.
|
129
|
+
|
130
|
+
Example:
|
131
|
+
|
132
|
+
async def monitor_loop(monitor):
|
133
|
+
async for event in monitor:
|
134
|
+
await handle_exception_event(event)
|
135
|
+
|
136
|
+
# Kick off in background
|
137
|
+
asyncio.create_task(monitor_loop(monitor))
|
138
|
+
"""
|
139
|
+
return await self._proc_mesh.monitor()
|
140
|
+
|
120
141
|
@classmethod
|
121
142
|
def from_alloc(self, alloc: Alloc) -> Future["ProcMesh"]:
|
122
143
|
return Future(
|
@@ -269,7 +290,6 @@ async def proc_mesh_nonblocking(
|
|
269
290
|
env = env or {}
|
270
291
|
cmd, args, base_env = _get_bootstrap_args()
|
271
292
|
env.update(base_env)
|
272
|
-
env["HYPERACTOR_MANAGED_SUBPROCESS"] = "1"
|
273
293
|
allocator = monarch.ProcessAllocator(cmd, args, env)
|
274
294
|
alloc = await allocator.allocate(spec)
|
275
295
|
return await ProcMesh.from_alloc(alloc)
|
@@ -284,7 +304,6 @@ def proc_mesh_blocking(
|
|
284
304
|
env = env or {}
|
285
305
|
cmd, args, base_env = _get_bootstrap_args()
|
286
306
|
env.update(base_env)
|
287
|
-
env["HYPERACTOR_MANAGED_SUBPROCESS"] = "1"
|
288
307
|
allocator = monarch.ProcessAllocator(cmd, args, env)
|
289
308
|
alloc = allocator.allocate(spec).get()
|
290
309
|
return ProcMesh.from_alloc(alloc).get()
|
monarch/rust_local_mesh.py
CHANGED
@@ -117,9 +117,7 @@ class ControllerParams(NamedTuple):
|
|
117
117
|
fail_on_worker_timeout: bool
|
118
118
|
|
119
119
|
|
120
|
-
_PROC_ENV = {
|
121
|
-
"HYPERACTOR_MANAGED_SUBPROCESS": str(1),
|
122
|
-
}
|
120
|
+
_PROC_ENV: dict[str, str] = {}
|
123
121
|
|
124
122
|
|
125
123
|
def get_controller_main() -> tuple[Path, dict[str, str]]:
|
@@ -988,7 +986,6 @@ class Bootstrap:
|
|
988
986
|
raise ValueError(f"Unknown socket type: {socket_type}")
|
989
987
|
|
990
988
|
env = os.environ.copy()
|
991
|
-
env["HYPERACTOR_MANAGED_SUBPROCESS"] = "1"
|
992
989
|
self.env: dict[str, str] = env
|
993
990
|
|
994
991
|
# Launch a single system globally
|
monarch/sim_mesh.py
CHANGED
tests/test_actor_error.py
CHANGED
@@ -9,9 +9,9 @@ import importlib.resources
|
|
9
9
|
import subprocess
|
10
10
|
|
11
11
|
import pytest
|
12
|
+
from monarch._rust_bindings.monarch_hyperactor.proc_mesh import ProcEvent
|
12
13
|
from monarch.actor_mesh import Actor, ActorError, endpoint, send
|
13
|
-
|
14
|
-
from monarch.proc_mesh import proc_mesh
|
14
|
+
from monarch.proc_mesh import local_proc_mesh, proc_mesh
|
15
15
|
|
16
16
|
|
17
17
|
class ExceptionActor(Actor):
|
@@ -238,3 +238,36 @@ async def test_exception_after_wait_unmonitored():
|
|
238
238
|
assert (
|
239
239
|
process.returncode != 0
|
240
240
|
), f"Expected non-zero exit code, got {process.returncode}"
|
241
|
+
|
242
|
+
|
243
|
+
class ErrorActor(Actor):
|
244
|
+
def __init__(self, message):
|
245
|
+
raise RuntimeError("fail on init")
|
246
|
+
|
247
|
+
@endpoint
|
248
|
+
async def check(self) -> None:
|
249
|
+
pass
|
250
|
+
|
251
|
+
|
252
|
+
async def test_proc_mesh_redundant_monitoring():
|
253
|
+
proc = await local_proc_mesh(hosts=1, gpus=1)
|
254
|
+
await proc.monitor()
|
255
|
+
|
256
|
+
with pytest.raises(
|
257
|
+
Exception, match="user already registered a monitor for this proc mesh"
|
258
|
+
):
|
259
|
+
await proc.monitor()
|
260
|
+
|
261
|
+
|
262
|
+
async def test_proc_mesh_monitoring():
|
263
|
+
proc = await local_proc_mesh(hosts=1, gpus=1)
|
264
|
+
monitor = await proc.monitor()
|
265
|
+
|
266
|
+
with pytest.raises(Exception):
|
267
|
+
e = await proc.spawn("error", ErrorActor, "failed to init the actor")
|
268
|
+
await asyncio.wait_for(e.check.call_one(), timeout=15)
|
269
|
+
|
270
|
+
event = await anext(monitor)
|
271
|
+
assert isinstance(event, ProcEvent.Crashed)
|
272
|
+
assert event[0] == 0 # check rank
|
273
|
+
assert "fail on init" in event[1] # check error message
|
{torchmonarch_nightly-2025.6.27.dist-info → torchmonarch_nightly-2025.6.29.dist-info}/RECORD
RENAMED
@@ -1,5 +1,5 @@
|
|
1
1
|
monarch/__init__.py,sha256=iUvWHc0-7Q2tovRoRxOIiA3TsefMXCbWl-jEfQ2djew,6897
|
2
|
-
monarch/_rust_bindings.so,sha256=
|
2
|
+
monarch/_rust_bindings.so,sha256=120L31EpmFrwc2rDVm_8lOry_zNeAcdU91Au78_yrmc,43743920
|
3
3
|
monarch/_testing.py,sha256=jOIOG6jcZBzvEvG_DwSnwCkaMVXvSun6sJAG6nXemww,7859
|
4
4
|
monarch/actor_mesh.py,sha256=QqKHVTJk9H_I-v7GoxgOdOL8-ymnRpGvNFdda0-cNrE,24534
|
5
5
|
monarch/allocator.py,sha256=l0_mN43AH3K2aCchb5fk8ml95rvdgR31nRC_PqRmZWg,7865
|
@@ -11,21 +11,21 @@ monarch/fetch.py,sha256=61jxo7sx4QNUTkc0_rF5NaJROen4tKbAaiIjrXWLOvg,1705
|
|
11
11
|
monarch/future.py,sha256=g1VYJl8ReBBS6VbikwWilnFqEr5qJDiSKid92AnWFV4,2058
|
12
12
|
monarch/gradient_generator.py,sha256=Rl3dmXGceTdCc1mYBg2JciR88ywGPnW7TVkL86KwqEA,6366
|
13
13
|
monarch/memory.py,sha256=ol86dBhFAJqg78iF25-BuK0wuwj1onR8FIioZ_B0gjw,1377
|
14
|
-
monarch/mesh_controller.py,sha256=
|
15
|
-
monarch/monarch_controller,sha256=
|
14
|
+
monarch/mesh_controller.py,sha256=JIg2-MIGhqeFCXn793j8ivEu5-8ePksu--H_Pv0-Ltk,10377
|
15
|
+
monarch/monarch_controller,sha256=sUs7I26272aWQdarhsmG1rER5EBhe8FEALbNXmzkVVI,21723248
|
16
16
|
monarch/notebook.py,sha256=zu9MKDFKf1-rCM2TqFSRJjMBeiWuKcJSyUFLvoZRQzs,25949
|
17
17
|
monarch/opaque_module.py,sha256=oajOu_WD1hD4hxE8HDdO-tvWY7KDHWd7VaAhJEa5L2I,10446
|
18
18
|
monarch/opaque_object.py,sha256=IVpll4pyuKZMo_EnPh4s0qnx8RlAcJrJ1yoLX6E75wQ,2782
|
19
19
|
monarch/pdb_wrapper.py,sha256=gm46AZnfR4amH1vYFWnWivEv5MaU3Nb6KIWjSM8KjWM,4052
|
20
|
-
monarch/proc_mesh.py,sha256=
|
20
|
+
monarch/proc_mesh.py,sha256=UX8qthL0RSPwOf7I5dLHejVAQrZtYAERGhBeUDR4Xfw,10950
|
21
21
|
monarch/profiler.py,sha256=TQ9fnVM8H7smBWtYdB_6Irtzz8DBOmcp7U1T3wlUmco,4911
|
22
22
|
monarch/python_local_mesh.py,sha256=YsureIzR9uGlNVrKd4vRghxOXBeYabkt9lICRErfRAI,3536
|
23
23
|
monarch/random.py,sha256=f9QR7Esu4Vxqxs-KCf5QYyVqlWvXJ3-UtG90L_h4j40,1527
|
24
24
|
monarch/rdma.py,sha256=1pNh11S_FWeETRgkdUpauTMUlodrRohIq1UfQjKVnN8,5418
|
25
25
|
monarch/remote_class.py,sha256=-OAowzU1aDP6i4ik_SjXntVUC9h4dqAzgqwohkQ6Grc,4167
|
26
26
|
monarch/rust_backend_mesh.py,sha256=1htC62of4MgFtkezWGlsxSFtKJdc0CIeqeSuOx7yu3M,9944
|
27
|
-
monarch/rust_local_mesh.py,sha256=
|
28
|
-
monarch/sim_mesh.py,sha256=
|
27
|
+
monarch/rust_local_mesh.py,sha256=hcVBlX73UZkXBZM2AzV2Fp01ftM5WRNc5LVV81LFMr8,47324
|
28
|
+
monarch/sim_mesh.py,sha256=omx_dmHOjPQqj15PgphN-OOzSgfZ9MllOWhO_AYW-8U,12205
|
29
29
|
monarch/telemetry.py,sha256=7JUZWaoD2Yn5Ae_7kNhkLFRBLYaSGfH071_m_qfVehI,525
|
30
30
|
monarch/tensor_worker_main.py,sha256=Nbarl2sJKIddLeaRFsaUnqOerLHjzggUr9SqCr2_GYI,8300
|
31
31
|
monarch/tensorboard.py,sha256=MnLgH5lbqeUJauEuirEgR6L_qYl2NGdtwZOWIAuOZao,2587
|
@@ -135,7 +135,7 @@ tests/dispatch_bench.py,sha256=sU_m-8KAjQgYTsxI5khV664NdgLLutidni69Rtowk98,3933
|
|
135
135
|
tests/dispatch_bench_helper.py,sha256=1ORgAMrRgjAjmmWeCHLLQd_bda9mJk0rS2ucEbRu28s,633
|
136
136
|
tests/error_test_binary.py,sha256=BRj13wAROsUWx4jcxc07HYN2n-xyBNhnnRAhjqah-A0,5582
|
137
137
|
tests/sleep_binary.py,sha256=XfLYaAfwm9xgzM-svs8fhAeFhwYIg6SyVEnx4e6wbUw,1009
|
138
|
-
tests/test_actor_error.py,sha256
|
138
|
+
tests/test_actor_error.py,sha256=hMpaJDkvowBM_fKuitQdKmq3iyvySaom8KNLmxEsZ-o,9534
|
139
139
|
tests/test_alloc.py,sha256=D6DdQbtOZEvvnnc7LV-WyWFMk0Xb77eblH6Oz90zJTA,745
|
140
140
|
tests/test_allocator.py,sha256=c7b4ylEjFV2WDhB8fbWiDuGi-vrBeD1E0Rpu-efrSVQ,14478
|
141
141
|
tests/test_coalescing.py,sha256=JZ4YgQNlWWs7N-Z8KCCXQPANcuyyXEKjeHIXYbPnQhk,15606
|
@@ -157,9 +157,9 @@ tests/simulator/test_profiling.py,sha256=TGYCfzTLdkpIwnOuO6KApprmrgPIRQe60KRX3wk
|
|
157
157
|
tests/simulator/test_simulator.py,sha256=LO8lA0ssY-OGEBL5ipEu74f97Y765TEwfUOv-DtIptM,14568
|
158
158
|
tests/simulator/test_task.py,sha256=ipqBDuDAysuo1xOB9S5psaFvwe6VATD43IovCTSs0t4,2327
|
159
159
|
tests/simulator/test_worker.py,sha256=QrWWIJ3HDgDLkBPRc2mwYPlOQoXQcj1qRfc0WUfKkFY,3507
|
160
|
-
torchmonarch_nightly-2025.6.27.dist-info/licenses/LICENSE,sha256=e0Eotbf_rHOYPuEUlppIbvwy4SN98CZnl_hqwvbDA4Q,1530
|
161
|
-
torchmonarch_nightly-2025.6.
|
162
|
-
torchmonarch_nightly-2025.6.27.dist-info/WHEEL,sha256=lduYNUEDASmtUEDemd8SmeX1qOMvvA6YKAbAo1Qbwk8,104
|
163
|
-
torchmonarch_nightly-2025.6.27.dist-info/entry_points.txt,sha256=sqfQ16oZqjEvttUI-uj9BBXIIE6jt05bYFSmy-2hyXI,106
|
164
|
-
torchmonarch_nightly-2025.6.27.dist-info/top_level.txt,sha256=E-ZssZzyM17glpVrh-S9--qJ-w9p2EjuYOuNw9tQ4Eg,33
|
165
|
-
torchmonarch_nightly-2025.6.27.dist-info/RECORD,,
|
160
|
+
torchmonarch_nightly-2025.6.29.dist-info/licenses/LICENSE,sha256=e0Eotbf_rHOYPuEUlppIbvwy4SN98CZnl_hqwvbDA4Q,1530
|
161
|
+
torchmonarch_nightly-2025.6.29.dist-info/METADATA,sha256=pfxXahOAFVZWSQpbC-6VgvxpOnSxa8Acic1RsONWV6U,2780
|
162
|
+
torchmonarch_nightly-2025.6.29.dist-info/WHEEL,sha256=lduYNUEDASmtUEDemd8SmeX1qOMvvA6YKAbAo1Qbwk8,104
|
163
|
+
torchmonarch_nightly-2025.6.29.dist-info/entry_points.txt,sha256=sqfQ16oZqjEvttUI-uj9BBXIIE6jt05bYFSmy-2hyXI,106
|
164
|
+
torchmonarch_nightly-2025.6.29.dist-info/top_level.txt,sha256=E-ZssZzyM17glpVrh-S9--qJ-w9p2EjuYOuNw9tQ4Eg,33
|
165
|
+
torchmonarch_nightly-2025.6.29.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
{torchmonarch_nightly-2025.6.27.dist-info → torchmonarch_nightly-2025.6.29.dist-info}/top_level.txt
RENAMED
File without changes
|