torchmonarch-nightly 2025.6.16__cp310-cp310-manylinux2014_x86_64.whl → 2025.6.18__cp310-cp310-manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- monarch/_rust_bindings.so +0 -0
- monarch/actor_mesh.py +53 -95
- monarch/common/device_mesh.py +9 -5
- monarch/common/shape.py +3 -0
- monarch/common/tensor.py +25 -4
- monarch/mesh_controller.py +10 -5
- monarch/monarch_controller +0 -0
- monarch/proc_mesh.py +62 -8
- monarch/sim_mesh.py +4 -2
- monarch/tools/cli.py +1 -1
- monarch/tools/components/hyperactor.py +4 -3
- monarch/tools/network.py +62 -0
- tests/error_test_binary.py +31 -1
- tests/test_actor_error.py +31 -1
- tests/test_python_actors.py +78 -8
- {torchmonarch_nightly-2025.6.16.dist-info → torchmonarch_nightly-2025.6.18.dist-info}/METADATA +1 -1
- {torchmonarch_nightly-2025.6.16.dist-info → torchmonarch_nightly-2025.6.18.dist-info}/RECORD +21 -20
- {torchmonarch_nightly-2025.6.16.dist-info → torchmonarch_nightly-2025.6.18.dist-info}/WHEEL +0 -0
- {torchmonarch_nightly-2025.6.16.dist-info → torchmonarch_nightly-2025.6.18.dist-info}/entry_points.txt +0 -0
- {torchmonarch_nightly-2025.6.16.dist-info → torchmonarch_nightly-2025.6.18.dist-info}/licenses/LICENSE +0 -0
- {torchmonarch_nightly-2025.6.16.dist-info → torchmonarch_nightly-2025.6.18.dist-info}/top_level.txt +0 -0
monarch/_rust_bindings.so
CHANGED
Binary file
|
monarch/actor_mesh.py
CHANGED
@@ -6,7 +6,6 @@
|
|
6
6
|
|
7
7
|
# pyre-unsafe
|
8
8
|
|
9
|
-
import asyncio
|
10
9
|
import collections
|
11
10
|
import contextvars
|
12
11
|
import functools
|
@@ -27,9 +26,7 @@ from typing import (
|
|
27
26
|
Callable,
|
28
27
|
cast,
|
29
28
|
Concatenate,
|
30
|
-
Coroutine,
|
31
29
|
Dict,
|
32
|
-
Generator,
|
33
30
|
Generic,
|
34
31
|
Iterable,
|
35
32
|
List,
|
@@ -99,39 +96,6 @@ _context: contextvars.ContextVar[MonarchContext] = contextvars.ContextVar(
|
|
99
96
|
)
|
100
97
|
|
101
98
|
|
102
|
-
# this was implemented in python 3.12 as an argument to task
|
103
|
-
# but I have to backport to 3.10/3.11.
|
104
|
-
def create_eager_task(coro: Awaitable[None]) -> asyncio.Future:
|
105
|
-
iter = coro.__await__()
|
106
|
-
try:
|
107
|
-
first_yield = next(iter)
|
108
|
-
return asyncio.create_task(RestOfCoroutine(first_yield, iter).run())
|
109
|
-
except StopIteration as e:
|
110
|
-
t = asyncio.Future()
|
111
|
-
t.set_result(e.value)
|
112
|
-
return t
|
113
|
-
|
114
|
-
|
115
|
-
class RestOfCoroutine(Generic[T1, T2]):
|
116
|
-
def __init__(self, first_yield: T1, iter: Generator[T2, None, T2]) -> None:
|
117
|
-
self.first_yield: T1 | None = first_yield
|
118
|
-
self.iter: Generator[T2, None, T2] = iter
|
119
|
-
|
120
|
-
def __await__(self) -> Generator[T1, None, T1] | Generator[T2, None, T2]:
|
121
|
-
first_yield = self.first_yield
|
122
|
-
assert first_yield is not None
|
123
|
-
yield first_yield
|
124
|
-
self.first_yield = None
|
125
|
-
while True:
|
126
|
-
try:
|
127
|
-
yield next(self.iter)
|
128
|
-
except StopIteration as e:
|
129
|
-
return e.value
|
130
|
-
|
131
|
-
async def run(self) -> T1 | T2:
|
132
|
-
return await self
|
133
|
-
|
134
|
-
|
135
99
|
T = TypeVar("T")
|
136
100
|
P = ParamSpec("P")
|
137
101
|
R = TypeVar("R")
|
@@ -285,7 +249,18 @@ class Endpoint(Generic[P, R]):
|
|
285
249
|
async def process() -> ValueMesh[R]:
|
286
250
|
results: List[R] = [None] * len(self._actor_mesh) # pyre-fixme[9]
|
287
251
|
for _ in range(len(self._actor_mesh)):
|
288
|
-
rank, value = await r.recv()
|
252
|
+
rank, value = await r.recv()
|
253
|
+
results[rank] = value
|
254
|
+
call_shape = Shape(
|
255
|
+
self._actor_mesh._shape.labels,
|
256
|
+
NDSlice.new_row_major(self._actor_mesh._shape.ndslice.sizes),
|
257
|
+
)
|
258
|
+
return ValueMesh(call_shape, results)
|
259
|
+
|
260
|
+
def process_blocking() -> ValueMesh[R]:
|
261
|
+
results: List[R] = [None] * len(self._actor_mesh) # pyre-fixme[9]
|
262
|
+
for _ in range(len(self._actor_mesh)):
|
263
|
+
rank, value = r.recv().get()
|
289
264
|
results[rank] = value
|
290
265
|
call_shape = Shape(
|
291
266
|
self._actor_mesh._shape.labels,
|
@@ -293,7 +268,7 @@ class Endpoint(Generic[P, R]):
|
|
293
268
|
)
|
294
269
|
return ValueMesh(call_shape, results)
|
295
270
|
|
296
|
-
return Future(process)
|
271
|
+
return Future(process, process_blocking)
|
297
272
|
|
298
273
|
async def stream(self, *args: P.args, **kwargs: P.kwargs) -> AsyncGenerator[R, R]:
|
299
274
|
"""
|
@@ -362,6 +337,9 @@ class ValueMesh(MeshTrait, Generic[R]):
|
|
362
337
|
def __len__(self) -> int:
|
363
338
|
return len(self._shape)
|
364
339
|
|
340
|
+
def __repr__(self) -> str:
|
341
|
+
return f"ValueMesh({self._shape})"
|
342
|
+
|
365
343
|
@property
|
366
344
|
def _ndslice(self) -> NDSlice:
|
367
345
|
return self._shape.ndslice
|
@@ -485,24 +463,36 @@ singleton_shape = Shape([], NDSlice(offset=0, sizes=[], strides=[]))
|
|
485
463
|
|
486
464
|
|
487
465
|
class _Actor:
|
466
|
+
"""
|
467
|
+
This is the message handling implementation of a Python actor.
|
468
|
+
|
469
|
+
The layering goes:
|
470
|
+
Rust `PythonActor` -> `_Actor` -> user-provided `Actor` instance
|
471
|
+
|
472
|
+
Messages are received from the Rust backend, and forwarded to the `handle`
|
473
|
+
methods on this class.
|
474
|
+
|
475
|
+
This class wraps the actual `Actor` instance provided by the user, and
|
476
|
+
routes messages to it, managing argument serialization/deserialization and
|
477
|
+
error handling.
|
478
|
+
"""
|
479
|
+
|
488
480
|
def __init__(self) -> None:
|
489
481
|
self.instance: object | None = None
|
490
|
-
self.active_requests: asyncio.Queue[asyncio.Future[object]] = asyncio.Queue()
|
491
|
-
self.complete_task: asyncio.Task | None = None
|
492
482
|
|
493
|
-
def handle(
|
483
|
+
async def handle(
|
494
484
|
self, mailbox: Mailbox, message: PythonMessage, panic_flag: PanicFlag
|
495
|
-
) ->
|
496
|
-
return self.handle_cast(mailbox, 0, singleton_shape, message, panic_flag)
|
485
|
+
) -> None:
|
486
|
+
return await self.handle_cast(mailbox, 0, singleton_shape, message, panic_flag)
|
497
487
|
|
498
|
-
def handle_cast(
|
488
|
+
async def handle_cast(
|
499
489
|
self,
|
500
490
|
mailbox: Mailbox,
|
501
491
|
rank: int,
|
502
492
|
shape: Shape,
|
503
493
|
message: PythonMessage,
|
504
494
|
panic_flag: PanicFlag,
|
505
|
-
) ->
|
495
|
+
) -> None:
|
506
496
|
port = (
|
507
497
|
Port(message.response_port, mailbox, rank)
|
508
498
|
if message.response_port
|
@@ -515,26 +505,21 @@ class _Actor:
|
|
515
505
|
_context.set(ctx)
|
516
506
|
|
517
507
|
args, kwargs = _unpickle(message.message, mailbox)
|
508
|
+
|
518
509
|
if message.method == "__init__":
|
519
510
|
Class, *args = args
|
520
511
|
self.instance = Class(*args, **kwargs)
|
521
512
|
return None
|
522
|
-
else:
|
523
|
-
the_method = getattr(self.instance, message.method)._method
|
524
513
|
|
525
|
-
|
526
|
-
|
527
|
-
|
528
|
-
)
|
529
|
-
result = the_method(self.instance, *args, **kwargs)
|
530
|
-
exit_span()
|
531
|
-
if port is not None:
|
532
|
-
port.send("result", result)
|
533
|
-
return None
|
514
|
+
the_method = getattr(self.instance, message.method)._method
|
515
|
+
|
516
|
+
if inspect.iscoroutinefunction(the_method):
|
534
517
|
|
535
518
|
async def instrumented():
|
536
519
|
enter_span(
|
537
|
-
the_method.__module__,
|
520
|
+
the_method.__module__,
|
521
|
+
message.method,
|
522
|
+
str(ctx.mailbox.actor_id),
|
538
523
|
)
|
539
524
|
try:
|
540
525
|
result = await the_method(self.instance, *args, **kwargs)
|
@@ -547,39 +532,14 @@ class _Actor:
|
|
547
532
|
exit_span()
|
548
533
|
return result
|
549
534
|
|
550
|
-
|
551
|
-
ctx,
|
552
|
-
self.run_task(port, instrumented(), panic_flag),
|
553
|
-
)
|
554
|
-
except Exception as e:
|
555
|
-
traceback.print_exc()
|
556
|
-
s = ActorError(e)
|
557
|
-
|
558
|
-
# The exception is delivered to exactly one of:
|
559
|
-
# (1) our caller, (2) our supervisor
|
560
|
-
if port is not None:
|
561
|
-
port.send("exception", s)
|
535
|
+
result = await instrumented()
|
562
536
|
else:
|
563
|
-
|
564
|
-
|
565
|
-
|
566
|
-
|
567
|
-
|
568
|
-
coroutine: Awaitable[None],
|
569
|
-
) -> None:
|
570
|
-
_context.set(ctx)
|
571
|
-
if self.complete_task is None:
|
572
|
-
self.complete_task = asyncio.create_task(self._complete())
|
573
|
-
await self.active_requests.put(create_eager_task(coroutine))
|
537
|
+
enter_span(
|
538
|
+
the_method.__module__, message.method, str(ctx.mailbox.actor_id)
|
539
|
+
)
|
540
|
+
result = the_method(self.instance, *args, **kwargs)
|
541
|
+
exit_span()
|
574
542
|
|
575
|
-
async def run_task(
|
576
|
-
self,
|
577
|
-
port: Port | None,
|
578
|
-
coroutine: Awaitable[Any],
|
579
|
-
panic_flag: PanicFlag,
|
580
|
-
) -> None:
|
581
|
-
try:
|
582
|
-
result = await coroutine
|
583
543
|
if port is not None:
|
584
544
|
port.send("result", result)
|
585
545
|
except Exception as e:
|
@@ -603,11 +563,6 @@ class _Actor:
|
|
603
563
|
pass
|
604
564
|
raise
|
605
565
|
|
606
|
-
async def _complete(self) -> None:
|
607
|
-
while True:
|
608
|
-
task = await self.active_requests.get()
|
609
|
-
await task
|
610
|
-
|
611
566
|
|
612
567
|
def _is_mailbox(x: object) -> bool:
|
613
568
|
return isinstance(x, Mailbox)
|
@@ -648,8 +603,8 @@ class Actor(MeshTrait):
|
|
648
603
|
"actor implementations are not meshes, but we can't convince the typechecker of it..."
|
649
604
|
)
|
650
605
|
|
651
|
-
@endpoint
|
652
|
-
|
606
|
+
@endpoint # pyre-ignore
|
607
|
+
def _set_debug_client(self, client: "DebugClient") -> None:
|
653
608
|
point = MonarchContext.get().point
|
654
609
|
# For some reason, using a lambda instead of functools.partial
|
655
610
|
# confuses the pdb wrapper implementation.
|
@@ -750,6 +705,9 @@ class ActorMeshRef(MeshTrait):
|
|
750
705
|
self._mailbox,
|
751
706
|
)
|
752
707
|
|
708
|
+
def __repr__(self) -> str:
|
709
|
+
return f"ActorMeshRef(class={self._class}, shape={self._actor_mesh_ref._shape})"
|
710
|
+
|
753
711
|
|
754
712
|
class ActorError(Exception):
|
755
713
|
"""
|
monarch/common/device_mesh.py
CHANGED
@@ -244,24 +244,24 @@ class DeviceMesh(Referenceable, MeshTrait):
|
|
244
244
|
def rotate(self, **kwargs: Dict[str, int]):
|
245
245
|
raise NotImplementedError()
|
246
246
|
|
247
|
-
def rank(self, dims: Union[str, Sequence[str]]) ->
|
247
|
+
def rank(self, dims: Union[str, Sequence[str]]) -> torch.Tensor:
|
248
248
|
self.define_remotely()
|
249
249
|
if isinstance(dims, str):
|
250
250
|
if dims not in self.names:
|
251
251
|
raise KeyError(f"{self} does not have dimension {repr(dims)}")
|
252
252
|
return _remote(
|
253
|
-
|
253
|
+
_rank,
|
254
254
|
propagate=lambda _self, _dims: torch.full((), 0, dtype=torch.long),
|
255
255
|
)(self, dims)
|
256
256
|
|
257
|
-
combined_rank = 0
|
257
|
+
combined_rank: Any = 0
|
258
258
|
for dim in dims:
|
259
259
|
combined_rank *= self.size(dim)
|
260
260
|
combined_rank += self.rank(dim)
|
261
261
|
return combined_rank
|
262
262
|
|
263
263
|
@property
|
264
|
-
def ranks(self) -> dict[str,
|
264
|
+
def ranks(self) -> dict[str, torch.Tensor]:
|
265
265
|
return {dim: self.rank(dim) for dim in self.names}
|
266
266
|
|
267
267
|
def process_idx(self):
|
@@ -334,6 +334,10 @@ class _ActiveMesh(TorchDispatchMode):
|
|
334
334
|
return _remote(func, propagate=func)(*args, **kwargs)
|
335
335
|
|
336
336
|
|
337
|
+
def _rank(mesh, dim):
|
338
|
+
return torch.full((), mesh.dims[dim].rank, dtype=torch.long)
|
339
|
+
|
340
|
+
|
337
341
|
@contextmanager
|
338
342
|
def _dispatch():
|
339
343
|
global _dispatch_enabled
|
@@ -401,7 +405,7 @@ def to_mesh(
|
|
401
405
|
|
402
406
|
def slice_mesh(
|
403
407
|
tensors: Any,
|
404
|
-
**kwargs:
|
408
|
+
**kwargs: Union[int, slice],
|
405
409
|
) -> Any:
|
406
410
|
"""
|
407
411
|
Performs the slice_mesh operation for each tensor in tensors.
|
monarch/common/shape.py
CHANGED
@@ -44,6 +44,9 @@ class MeshTrait(ABC):
|
|
44
44
|
@abstractmethod
|
45
45
|
def _labels(self) -> Tuple[str, ...]: ...
|
46
46
|
|
47
|
+
# mesh trait guarentees that its own calls to _new_with_shape
|
48
|
+
# will only ever select a shape that is a subspace of the
|
49
|
+
# current _ndslice.
|
47
50
|
@abstractmethod
|
48
51
|
def _new_with_shape(self, shape: Shape) -> Self: ...
|
49
52
|
|
monarch/common/tensor.py
CHANGED
@@ -7,17 +7,20 @@
|
|
7
7
|
# pyre-unsafe
|
8
8
|
import itertools
|
9
9
|
import traceback
|
10
|
+
import typing
|
10
11
|
import warnings
|
11
12
|
from collections import defaultdict
|
12
13
|
from typing import (
|
13
14
|
Any,
|
14
15
|
Callable,
|
16
|
+
cast,
|
15
17
|
Dict,
|
16
18
|
Iterable,
|
17
19
|
List,
|
18
20
|
Literal,
|
19
21
|
NamedTuple,
|
20
22
|
Optional,
|
23
|
+
runtime_checkable,
|
21
24
|
Sequence,
|
22
25
|
TYPE_CHECKING,
|
23
26
|
TypeVar,
|
@@ -35,7 +38,8 @@ from .base_tensor import BaseTensor
|
|
35
38
|
from .borrows import StorageAliases
|
36
39
|
|
37
40
|
if TYPE_CHECKING:
|
38
|
-
from .device_mesh import DeviceMesh
|
41
|
+
from monarch.common.device_mesh import DeviceMesh
|
42
|
+
|
39
43
|
from .fake import fake_call
|
40
44
|
from .function import Propagator, ResolvableFunction
|
41
45
|
from .invocation import Invocation
|
@@ -52,6 +56,12 @@ _valid_reduce = Literal[
|
|
52
56
|
T = TypeVar("T")
|
53
57
|
|
54
58
|
|
59
|
+
@runtime_checkable
|
60
|
+
class HasDeviceMesh(typing.Protocol):
|
61
|
+
@property
|
62
|
+
def _device_mesh(self) -> "DeviceMesh": ...
|
63
|
+
|
64
|
+
|
55
65
|
class DropLocation(NamedTuple):
|
56
66
|
tensor_id: int
|
57
67
|
traceback: List[traceback.FrameSummary]
|
@@ -167,7 +177,11 @@ class Tensor(Referenceable, BaseTensor):
|
|
167
177
|
self._on_first_use(self)
|
168
178
|
self._on_first_use = None
|
169
179
|
|
170
|
-
def to_mesh(
|
180
|
+
def to_mesh(
|
181
|
+
self,
|
182
|
+
mesh: Union["DeviceMesh", "HasDeviceMesh"],
|
183
|
+
stream: Optional["Stream"] = None,
|
184
|
+
):
|
171
185
|
"""
|
172
186
|
Move data between one device mesh and another. Sizes of named dimensions must match.
|
173
187
|
If mesh has dimensions that self.mesh does not, it will broadcast to those dimensions.
|
@@ -177,6 +191,8 @@ class Tensor(Referenceable, BaseTensor):
|
|
177
191
|
t.slice_mesh(batch=0).to_mesh(t.mesh)
|
178
192
|
|
179
193
|
"""
|
194
|
+
if isinstance(mesh, HasDeviceMesh):
|
195
|
+
mesh = mesh._device_mesh
|
180
196
|
return MeshSliceTensor(self, self.mesh).to_mesh(mesh, stream)
|
181
197
|
|
182
198
|
def reduce_(
|
@@ -344,7 +360,7 @@ class Tensor(Referenceable, BaseTensor):
|
|
344
360
|
)
|
345
361
|
return r
|
346
362
|
|
347
|
-
def slice_mesh(self, **kwargs:
|
363
|
+
def slice_mesh(self, **kwargs: Union[int, slice]) -> "MeshSliceTensor":
|
348
364
|
# technically a slice of a device mesh and a device mesh are not same thing
|
349
365
|
# because a device mesh also has caches for doing collectives.
|
350
366
|
# but this is an easy way to create a MeshSliceTensor until we optimize
|
@@ -368,8 +384,13 @@ class MeshSliceTensor:
|
|
368
384
|
self.slicing = slicing
|
369
385
|
|
370
386
|
def to_mesh(
|
371
|
-
self,
|
387
|
+
self,
|
388
|
+
mesh: Union["DeviceMesh", "HasDeviceMesh"],
|
389
|
+
stream: Optional["Stream"] = None,
|
372
390
|
) -> "Tensor":
|
391
|
+
if isinstance(mesh, HasDeviceMesh):
|
392
|
+
mesh = mesh._device_mesh
|
393
|
+
|
373
394
|
if stream is None:
|
374
395
|
stream = self.tensor.stream
|
375
396
|
|
monarch/mesh_controller.py
CHANGED
@@ -11,7 +11,7 @@ import time
|
|
11
11
|
import traceback
|
12
12
|
from collections import deque
|
13
13
|
from logging import Logger
|
14
|
-
from typing import List, NamedTuple, Optional, Union
|
14
|
+
from typing import List, NamedTuple, Optional, TYPE_CHECKING, Union
|
15
15
|
|
16
16
|
import torch.utils._python_dispatch
|
17
17
|
|
@@ -24,7 +24,13 @@ from monarch._rust_bindings.monarch_extension.mesh_controller import _Controller
|
|
24
24
|
from monarch._rust_bindings.monarch_hyperactor.proc import ( # @manual=//monarch/monarch_extension:monarch_extension
|
25
25
|
ActorId,
|
26
26
|
)
|
27
|
-
|
27
|
+
|
28
|
+
if TYPE_CHECKING:
|
29
|
+
from monarch._rust_bindings.monarch_hyperactor.proc_mesh import (
|
30
|
+
ProcMesh as HyProcMesh,
|
31
|
+
)
|
32
|
+
from monarch.proc_mesh import ProcMesh
|
33
|
+
|
28
34
|
from monarch._rust_bindings.monarch_hyperactor.shape import Point
|
29
35
|
|
30
36
|
from monarch._rust_bindings.monarch_messages.debugger import DebuggerAction
|
@@ -33,7 +39,6 @@ from monarch.common.controller_api import LogMessage, MessageResult
|
|
33
39
|
from monarch.common.device_mesh import DeviceMesh, no_mesh
|
34
40
|
from monarch.common.invocation import DeviceException, RemoteException
|
35
41
|
from monarch.controller.debugger import read as debugger_read, write as debugger_write
|
36
|
-
from monarch.proc_mesh import ProcMesh
|
37
42
|
from monarch.rust_local_mesh import _get_worker_exec_info
|
38
43
|
from pyre_extensions import none_throws
|
39
44
|
|
@@ -41,7 +46,7 @@ logger: Logger = logging.getLogger(__name__)
|
|
41
46
|
|
42
47
|
|
43
48
|
class Controller(_Controller):
|
44
|
-
def __init__(self, workers: HyProcMesh) -> None:
|
49
|
+
def __init__(self, workers: "HyProcMesh") -> None:
|
45
50
|
super().__init__()
|
46
51
|
# Buffer for messages unrelated to debugging that are received while a
|
47
52
|
# debugger session is active.
|
@@ -250,7 +255,7 @@ class MeshClient(Client):
|
|
250
255
|
self.inner.drain_and_stop()
|
251
256
|
|
252
257
|
|
253
|
-
def spawn_tensor_engine(proc_mesh: ProcMesh) -> DeviceMesh:
|
258
|
+
def spawn_tensor_engine(proc_mesh: "ProcMesh") -> DeviceMesh:
|
254
259
|
# This argument to Controller
|
255
260
|
# is currently only used for debug printing. It should be fixed to
|
256
261
|
# report the proc ID instead of the rank it currently does.
|
monarch/monarch_controller
CHANGED
Binary file
|
monarch/proc_mesh.py
CHANGED
@@ -7,8 +7,22 @@
|
|
7
7
|
# pyre-strict
|
8
8
|
|
9
9
|
import sys
|
10
|
+
from contextlib import AbstractContextManager
|
11
|
+
|
12
|
+
from typing import (
|
13
|
+
Any,
|
14
|
+
cast,
|
15
|
+
Dict,
|
16
|
+
List,
|
17
|
+
Optional,
|
18
|
+
Sequence,
|
19
|
+
Type,
|
20
|
+
TYPE_CHECKING,
|
21
|
+
TypeVar,
|
22
|
+
)
|
10
23
|
|
11
|
-
|
24
|
+
if TYPE_CHECKING:
|
25
|
+
import torch
|
12
26
|
|
13
27
|
import monarch
|
14
28
|
from monarch import ActorFuture as Future
|
@@ -24,7 +38,9 @@ from monarch._rust_bindings.monarch_hyperactor.shape import Shape, Slice
|
|
24
38
|
from monarch.actor_mesh import _Actor, _ActorMeshRefImpl, Actor, ActorMeshRef
|
25
39
|
|
26
40
|
from monarch.common._device_utils import _local_device_count
|
41
|
+
from monarch.common.device_mesh import DeviceMesh
|
27
42
|
from monarch.common.shape import MeshTrait
|
43
|
+
from monarch.mesh_controller import spawn_tensor_engine
|
28
44
|
from monarch.rdma import RDMAManager
|
29
45
|
|
30
46
|
T = TypeVar("T")
|
@@ -45,25 +61,43 @@ def _allocate_blocking(alloc: Alloc) -> "ProcMesh":
|
|
45
61
|
|
46
62
|
|
47
63
|
class ProcMesh(MeshTrait):
|
48
|
-
def __init__(
|
64
|
+
def __init__(
|
65
|
+
self,
|
66
|
+
hy_proc_mesh: HyProcMesh,
|
67
|
+
_mock_shape: Optional[Shape] = None,
|
68
|
+
_device_mesh: Optional[DeviceMesh] = None,
|
69
|
+
) -> None:
|
49
70
|
self._proc_mesh = hy_proc_mesh
|
71
|
+
self._mock_shape: Optional[Shape] = _mock_shape
|
50
72
|
self._mailbox: Mailbox = self._proc_mesh.client
|
51
|
-
self._rdma_manager: RDMAManager =
|
52
|
-
|
53
|
-
|
73
|
+
self._rdma_manager: Optional[RDMAManager] = None
|
74
|
+
self._maybe_device_mesh: Optional[DeviceMesh] = _device_mesh
|
75
|
+
if _mock_shape is None:
|
76
|
+
self._rdma_manager = self._spawn_blocking("rdma_manager", RDMAManager)
|
77
|
+
|
78
|
+
@property
|
79
|
+
def _shape(self) -> Shape:
|
80
|
+
return self._proc_mesh.shape if self._mock_shape is None else self._mock_shape
|
54
81
|
|
55
82
|
@property
|
56
83
|
def _ndslice(self) -> Slice:
|
57
|
-
return self.
|
84
|
+
return self._shape.ndslice
|
58
85
|
|
59
86
|
@property
|
60
87
|
def _labels(self) -> List[str]:
|
61
|
-
return self.
|
88
|
+
return self._shape.labels
|
62
89
|
|
63
90
|
def _new_with_shape(self, shape: Shape) -> "ProcMesh":
|
64
|
-
|
91
|
+
device_mesh = (
|
92
|
+
None
|
93
|
+
if self._device_mesh is None
|
94
|
+
else self._device_mesh._new_with_shape(shape)
|
95
|
+
)
|
96
|
+
return ProcMesh(self._proc_mesh, _mock_shape=shape, _device_mesh=device_mesh)
|
65
97
|
|
66
98
|
def spawn(self, name: str, Class: Type[T], *args: Any, **kwargs: Any) -> Future[T]:
|
99
|
+
if self._mock_shape is not None:
|
100
|
+
raise NotImplementedError("NYI: spawn on slice of a proc mesh.")
|
67
101
|
return Future(
|
68
102
|
lambda: self._spawn_nonblocking(name, Class, *args, **kwargs),
|
69
103
|
lambda: self._spawn_blocking(name, Class, *args, **kwargs),
|
@@ -120,6 +154,26 @@ class ProcMesh(MeshTrait):
|
|
120
154
|
service._create(args, kwargs)
|
121
155
|
return cast(T, service)
|
122
156
|
|
157
|
+
@property
|
158
|
+
def _device_mesh(self) -> "DeviceMesh":
|
159
|
+
if self._maybe_device_mesh is None:
|
160
|
+
if self._mock_shape is not None:
|
161
|
+
raise NotImplementedError(
|
162
|
+
"NYI: activating a proc mesh must first happen on the root proc_mesh until we fix spawning on submeshes."
|
163
|
+
)
|
164
|
+
self._maybe_device_mesh = spawn_tensor_engine(self)
|
165
|
+
return self._maybe_device_mesh
|
166
|
+
|
167
|
+
# pyre-ignore
|
168
|
+
def activate(self) -> AbstractContextManager:
|
169
|
+
return self._device_mesh.activate()
|
170
|
+
|
171
|
+
def rank_tensor(self, dim: str | Sequence[str]) -> "torch.Tensor":
|
172
|
+
return self._device_mesh.rank(dim)
|
173
|
+
|
174
|
+
def rank_tensors(self) -> Dict[str, "torch.Tensor"]:
|
175
|
+
return self._device_mesh.ranks
|
176
|
+
|
123
177
|
|
124
178
|
async def local_proc_mesh_nonblocking(
|
125
179
|
*, gpus: Optional[int] = None, hosts: int = 1
|
monarch/sim_mesh.py
CHANGED
@@ -201,9 +201,11 @@ class Bootstrap:
|
|
201
201
|
|
202
202
|
proxy_addr = proxy_addr or f"unix!@{_random_id()}-proxy"
|
203
203
|
self.bootstrap_addr: str = f"sim!unix!@system,{proxy_addr}"
|
204
|
-
|
204
|
+
|
205
|
+
client_proxy_addr = f"unix!@{_random_id()}-proxy"
|
206
|
+
self.client_listen_addr: str = f"sim!unix!@client,{client_proxy_addr}"
|
205
207
|
self.client_bootstrap_addr: str = (
|
206
|
-
f"sim!unix!@client,{
|
208
|
+
f"sim!unix!@client,{client_proxy_addr},unix!@system,{proxy_addr}"
|
207
209
|
)
|
208
210
|
bootstrap_simulator_backend(self.bootstrap_addr, proxy_addr, world_size)
|
209
211
|
|
monarch/tools/cli.py
CHANGED
@@ -25,6 +25,7 @@ def proc_mesh(
|
|
25
25
|
meshes: list[str] = _DEFAULT_MESHES,
|
26
26
|
env: Optional[dict[str, str]] = None,
|
27
27
|
port: int = mesh_spec.DEFAULT_REMOTE_ALLOCATOR_PORT,
|
28
|
+
program: str = "monarch_bootstrap", # installed with monarch wheel (as console script)
|
28
29
|
) -> specs.AppDef:
|
29
30
|
"""
|
30
31
|
Args:
|
@@ -33,6 +34,7 @@ def proc_mesh(
|
|
33
34
|
meshes: list of mesh specs of the form "{name}:{num_hosts}:{host_type}"
|
34
35
|
env: environment variables to be passed to the main command (e.g. ENV1=v1,ENV2=v2,ENV3=v3)
|
35
36
|
port: the port that the remote process allocator runs on (must be reachable from the client)
|
37
|
+
program: path to the binary that the remote process allocator spawns on an allocation request
|
36
38
|
"""
|
37
39
|
|
38
40
|
appdef = specs.AppDef(name)
|
@@ -41,11 +43,10 @@ def proc_mesh(
|
|
41
43
|
mesh_role = specs.Role(
|
42
44
|
name=mesh.name,
|
43
45
|
image=image,
|
44
|
-
entrypoint="process_allocator", #
|
46
|
+
entrypoint="process_allocator", # run "cargo install monarch_hyperactor" to get this binary
|
45
47
|
args=[
|
46
|
-
"mesh-worker",
|
47
48
|
f"--port={port}",
|
48
|
-
"--program=
|
49
|
+
f"--program={program}",
|
49
50
|
],
|
50
51
|
num_replicas=mesh.num_hosts,
|
51
52
|
resource=specs.resource(h=mesh.host_type),
|
monarch/tools/network.py
ADDED
@@ -0,0 +1,62 @@
|
|
1
|
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
2
|
+
# All rights reserved.
|
3
|
+
#
|
4
|
+
# This source code is licensed under the BSD-style license found in the
|
5
|
+
# LICENSE file in the root directory of this source tree.
|
6
|
+
|
7
|
+
# pyre-strict
|
8
|
+
import logging
|
9
|
+
import socket
|
10
|
+
from typing import Optional
|
11
|
+
|
12
|
+
logger: logging.Logger = logging.getLogger(__name__)
|
13
|
+
|
14
|
+
|
15
|
+
def get_ip_addr(hostname: str) -> str:
|
16
|
+
"""Resolves and returns the ip address of the given hostname.
|
17
|
+
|
18
|
+
This function will return an ipv6 address if one that can bind a
|
19
|
+
`SOCK_STREAM` (TCP) socket is found. Otherwise it will fall back
|
20
|
+
to resolving an ipv4 `SOCK_STREAM` address.
|
21
|
+
|
22
|
+
Raises a `RuntimeError` if neither ipv6 nor ipv4 ip can be resolved from hostname.
|
23
|
+
"""
|
24
|
+
|
25
|
+
def get_sockaddr(family: socket.AddressFamily) -> Optional[str]:
|
26
|
+
try:
|
27
|
+
# patternlint-disable-next-line python-dns-deps (only used for oss)
|
28
|
+
addrs = socket.getaddrinfo(
|
29
|
+
hostname, port=None, family=family, type=socket.SOCK_STREAM
|
30
|
+
) # tcp
|
31
|
+
if addrs:
|
32
|
+
# socket.getaddrinfo returns a list of 5-tuple addr infos
|
33
|
+
_, _, _, _, sockaddr = addrs[0] # use the first address
|
34
|
+
|
35
|
+
# sockaddr is a 2-tuple (ipv4) or a 4-tuple (ipv6) where the first element is the ip addr
|
36
|
+
ipaddr = str(sockaddr[0])
|
37
|
+
|
38
|
+
logger.info(
|
39
|
+
"Resolved %s address: `%s` for host: `%s`",
|
40
|
+
family.name,
|
41
|
+
ipaddr,
|
42
|
+
hostname,
|
43
|
+
)
|
44
|
+
return str(ipaddr)
|
45
|
+
else:
|
46
|
+
return None
|
47
|
+
except socket.gaierror as e:
|
48
|
+
logger.info(
|
49
|
+
"No %s address that can bind TCP sockets for host: %s. %s",
|
50
|
+
family.name,
|
51
|
+
hostname,
|
52
|
+
e,
|
53
|
+
)
|
54
|
+
return None
|
55
|
+
|
56
|
+
ipaddr = get_sockaddr(socket.AF_INET6) or get_sockaddr(socket.AF_INET)
|
57
|
+
if not ipaddr:
|
58
|
+
raise RuntimeError(
|
59
|
+
f"Unable to resolve `{hostname}` to ipv6 or ipv4 address that can bind TCP socket."
|
60
|
+
" Check the network configuration on the host."
|
61
|
+
)
|
62
|
+
return ipaddr
|
tests/error_test_binary.py
CHANGED
@@ -4,6 +4,7 @@
|
|
4
4
|
# This source code is licensed under the BSD-style license found in the
|
5
5
|
# LICENSE file in the root directory of this source tree.
|
6
6
|
|
7
|
+
import asyncio
|
7
8
|
import ctypes
|
8
9
|
import sys
|
9
10
|
|
@@ -11,7 +12,7 @@ import click
|
|
11
12
|
|
12
13
|
from monarch._rust_bindings.monarch_extension.panic import panicking_function
|
13
14
|
|
14
|
-
from monarch.actor_mesh import Actor, endpoint
|
15
|
+
from monarch.actor_mesh import Actor, endpoint, send
|
15
16
|
from monarch.proc_mesh import proc_mesh
|
16
17
|
|
17
18
|
|
@@ -35,6 +36,12 @@ class ErrorActor(Actor):
|
|
35
36
|
"""Endpoint that calls a Rust function that panics."""
|
36
37
|
panicking_function()
|
37
38
|
|
39
|
+
@endpoint
|
40
|
+
async def await_then_error(self) -> None:
|
41
|
+
await asyncio.sleep(0.1)
|
42
|
+
await asyncio.sleep(0.1)
|
43
|
+
raise RuntimeError("oh noez")
|
44
|
+
|
38
45
|
|
39
46
|
class ErrorActorSync(Actor):
|
40
47
|
"""An actor that has endpoints cause segfaults."""
|
@@ -146,5 +153,28 @@ def error_bootstrap():
|
|
146
153
|
proc_mesh(gpus=4, env={"MONARCH_ERROR_DURING_BOOTSTRAP_FOR_TESTING": "1"}).get()
|
147
154
|
|
148
155
|
|
156
|
+
async def _error_unmonitored():
|
157
|
+
print("I actually ran")
|
158
|
+
sys.stdout.flush()
|
159
|
+
|
160
|
+
proc = await proc_mesh(gpus=1)
|
161
|
+
actor = await proc.spawn("error_actor", ErrorActor)
|
162
|
+
|
163
|
+
# fire and forget
|
164
|
+
send(actor.await_then_error, (), {}, None, "all")
|
165
|
+
|
166
|
+
# Wait. Eventually a supervision event will get propagated and the process
|
167
|
+
# will exit.
|
168
|
+
#
|
169
|
+
# If an event is not delivered, the test will time out before this sleep
|
170
|
+
# finishes.
|
171
|
+
await asyncio.sleep(300)
|
172
|
+
|
173
|
+
|
174
|
+
@main.command("error-unmonitored")
|
175
|
+
def error_unmonitored():
|
176
|
+
asyncio.run(_error_unmonitored())
|
177
|
+
|
178
|
+
|
149
179
|
if __name__ == "__main__":
|
150
180
|
main()
|
tests/test_actor_error.py
CHANGED
@@ -4,11 +4,12 @@
|
|
4
4
|
# This source code is licensed under the BSD-style license found in the
|
5
5
|
# LICENSE file in the root directory of this source tree.
|
6
6
|
|
7
|
+
import asyncio
|
7
8
|
import importlib.resources
|
8
9
|
import subprocess
|
9
10
|
|
10
11
|
import pytest
|
11
|
-
from monarch.actor_mesh import Actor, ActorError, endpoint
|
12
|
+
from monarch.actor_mesh import Actor, ActorError, endpoint, send
|
12
13
|
|
13
14
|
from monarch.proc_mesh import proc_mesh
|
14
15
|
|
@@ -128,6 +129,7 @@ def test_actor_supervision(num_procs, sync_endpoint, sync_test_impl, endpoint_na
|
|
128
129
|
f"--endpoint-name={endpoint_name}",
|
129
130
|
]
|
130
131
|
try:
|
132
|
+
print("running cmd", " ".join(cmd))
|
131
133
|
process = subprocess.run(cmd, capture_output=True, timeout=180)
|
132
134
|
except subprocess.TimeoutExpired as e:
|
133
135
|
print("timeout expired")
|
@@ -157,6 +159,7 @@ def test_proc_mesh_bootstrap_error():
|
|
157
159
|
"error-bootstrap",
|
158
160
|
]
|
159
161
|
try:
|
162
|
+
print("running cmd", " ".join(cmd))
|
160
163
|
process = subprocess.run(cmd, capture_output=True, timeout=180)
|
161
164
|
except subprocess.TimeoutExpired as e:
|
162
165
|
print("timeout expired")
|
@@ -208,3 +211,30 @@ async def test_broken_pickle_class(raise_on_getstate, raise_on_setstate, num_pro
|
|
208
211
|
await exception_actor.print_value.call_one(broken_obj)
|
209
212
|
else:
|
210
213
|
await exception_actor.print_value.call(broken_obj)
|
214
|
+
|
215
|
+
|
216
|
+
# oss_skip: importlib not pulling resource correctly in git CI, needs to be revisited
|
217
|
+
@pytest.mark.oss_skip
|
218
|
+
async def test_exception_after_wait_unmonitored():
|
219
|
+
# Run the test in a subprocess
|
220
|
+
test_bin = importlib.resources.files("monarch.python.tests").joinpath("test_bin")
|
221
|
+
cmd = [
|
222
|
+
str(test_bin),
|
223
|
+
"error-unmonitored",
|
224
|
+
]
|
225
|
+
try:
|
226
|
+
print("running cmd", " ".join(cmd))
|
227
|
+
process = subprocess.run(cmd, capture_output=True, timeout=180)
|
228
|
+
except subprocess.TimeoutExpired as e:
|
229
|
+
print("timeout expired")
|
230
|
+
if e.stdout is not None:
|
231
|
+
print(e.stdout.decode())
|
232
|
+
if e.stderr is not None:
|
233
|
+
print(e.stderr.decode())
|
234
|
+
raise
|
235
|
+
|
236
|
+
# Assert that the subprocess exited with a non-zero code
|
237
|
+
assert "I actually ran" in process.stdout.decode()
|
238
|
+
assert (
|
239
|
+
process.returncode != 0
|
240
|
+
), f"Expected non-zero exit code, got {process.returncode}"
|
tests/test_python_actors.py
CHANGED
@@ -391,10 +391,13 @@ def test_rust_binding_modules_correct() -> None:
|
|
391
391
|
check(bindings, "monarch._rust_bindings")
|
392
392
|
|
393
393
|
|
394
|
-
|
394
|
+
two_gpu = pytest.mark.skipif(
|
395
395
|
torch.cuda.device_count() < 2,
|
396
396
|
reason="Not enough GPUs, this test requires at least 2 GPUs",
|
397
397
|
)
|
398
|
+
|
399
|
+
|
400
|
+
@two_gpu
|
398
401
|
def test_tensor_engine() -> None:
|
399
402
|
pm = proc_mesh(gpus=2).get()
|
400
403
|
|
@@ -581,13 +584,80 @@ async def test_actor_tls() -> None:
|
|
581
584
|
pm = await proc_mesh(gpus=1)
|
582
585
|
am = await pm.spawn("tls", TLSActor)
|
583
586
|
await am.increment.call_one()
|
584
|
-
|
585
|
-
|
586
|
-
|
587
|
+
await am.increment_async.call_one()
|
588
|
+
await am.increment.call_one()
|
589
|
+
await am.increment_async.call_one()
|
590
|
+
|
591
|
+
assert 4 == await am.get.call_one()
|
592
|
+
assert 4 == await am.get_async.call_one()
|
593
|
+
|
594
|
+
|
595
|
+
class TLSActorFullSync(Actor):
|
596
|
+
"""An actor that manages thread-local state."""
|
597
|
+
|
598
|
+
def __init__(self):
|
599
|
+
self.local = threading.local()
|
600
|
+
self.local.value = 0
|
601
|
+
|
602
|
+
@endpoint
|
603
|
+
def increment(self):
|
604
|
+
self.local.value += 1
|
605
|
+
|
606
|
+
@endpoint
|
607
|
+
def get(self):
|
608
|
+
return self.local.value
|
609
|
+
|
587
610
|
|
588
|
-
|
611
|
+
async def test_actor_tls_full_sync() -> None:
|
612
|
+
"""Test that thread-local state is respected."""
|
613
|
+
pm = await proc_mesh(gpus=1)
|
614
|
+
am = await pm.spawn("tls", TLSActorFullSync)
|
615
|
+
await am.increment.call_one()
|
616
|
+
await am.increment.call_one()
|
617
|
+
await am.increment.call_one()
|
589
618
|
await am.increment.call_one()
|
590
|
-
# await am.increment_async.call_one()
|
591
619
|
|
592
|
-
assert
|
593
|
-
|
620
|
+
assert 4 == await am.get.call_one()
|
621
|
+
|
622
|
+
|
623
|
+
@two_gpu
|
624
|
+
def test_proc_mesh_tensor_engine() -> None:
|
625
|
+
pm = proc_mesh(gpus=2).get()
|
626
|
+
with pm.activate():
|
627
|
+
f = 10 * pm.rank_tensor("gpus").cuda()
|
628
|
+
a = monarch.inspect(f, hosts=0, gpus=0)
|
629
|
+
b = monarch.inspect(f, hosts=0, gpus=1)
|
630
|
+
|
631
|
+
one = pm.slice(gpus=1)
|
632
|
+
with one.activate():
|
633
|
+
sliced_b = monarch.slice_mesh(f, gpus=1).to_mesh(one)
|
634
|
+
c = monarch.inspect(sliced_b * 10)
|
635
|
+
assert a == 0
|
636
|
+
assert b == 10
|
637
|
+
assert c == 100
|
638
|
+
|
639
|
+
|
640
|
+
class AsyncActor(Actor):
|
641
|
+
def __init__(self):
|
642
|
+
self.should_exit = False
|
643
|
+
|
644
|
+
@endpoint
|
645
|
+
async def sleep(self) -> None:
|
646
|
+
while True and not self.should_exit:
|
647
|
+
await asyncio.sleep(1)
|
648
|
+
|
649
|
+
@endpoint
|
650
|
+
async def no_more(self) -> None:
|
651
|
+
self.should_exit = True
|
652
|
+
|
653
|
+
|
654
|
+
@pytest.mark.timeout(15)
|
655
|
+
async def test_async_concurrency():
|
656
|
+
"""Test that async endpoints will be processed concurrently."""
|
657
|
+
pm = await proc_mesh(gpus=1)
|
658
|
+
am = await pm.spawn("async", AsyncActor)
|
659
|
+
fut = am.sleep.call()
|
660
|
+
# This call should go through and exit the sleep loop, as long as we are
|
661
|
+
# actually concurrently processing messages.
|
662
|
+
await am.no_more.call()
|
663
|
+
await fut
|
{torchmonarch_nightly-2025.6.16.dist-info → torchmonarch_nightly-2025.6.18.dist-info}/RECORD
RENAMED
@@ -1,7 +1,7 @@
|
|
1
1
|
monarch/__init__.py,sha256=iUvWHc0-7Q2tovRoRxOIiA3TsefMXCbWl-jEfQ2djew,6897
|
2
|
-
monarch/_rust_bindings.so,sha256=
|
2
|
+
monarch/_rust_bindings.so,sha256=RlkNuWQ74oxTOEfmaVFsgESTEdMP84vug1sRY4xya60,40803008
|
3
3
|
monarch/_testing.py,sha256=jOIOG6jcZBzvEvG_DwSnwCkaMVXvSun6sJAG6nXemww,7859
|
4
|
-
monarch/actor_mesh.py,sha256=
|
4
|
+
monarch/actor_mesh.py,sha256=8hjIy0TSby33xfVXp_xZnqlPkxy3l6IGqEyPOhVtjvU,24197
|
5
5
|
monarch/allocator.py,sha256=ylvYTf31o-PT385cYJPhi17uNbC4yl_RAraqD0fVe4g,4112
|
6
6
|
monarch/bootstrap_main.py,sha256=RCUQhJk07yMFiKp6HzQuqZFUpkgsT9kVEyimiwjn6_E,1827
|
7
7
|
monarch/cached_remote_function.py,sha256=kYdB6r4OHx_T_uX4q3tCNcp1t2DJwF8tPTIahUiT2pU,8785
|
@@ -10,13 +10,13 @@ monarch/fetch.py,sha256=61jxo7sx4QNUTkc0_rF5NaJROen4tKbAaiIjrXWLOvg,1705
|
|
10
10
|
monarch/future.py,sha256=lcdFEe7m1shYPPuvZ1RkS6JUIChEKGBWe3v7x_nu4Hg,731
|
11
11
|
monarch/gradient_generator.py,sha256=Rl3dmXGceTdCc1mYBg2JciR88ywGPnW7TVkL86KwqEA,6366
|
12
12
|
monarch/memory.py,sha256=ol86dBhFAJqg78iF25-BuK0wuwj1onR8FIioZ_B0gjw,1377
|
13
|
-
monarch/mesh_controller.py,sha256=
|
14
|
-
monarch/monarch_controller,sha256=
|
13
|
+
monarch/mesh_controller.py,sha256=am1QP7dvn0OH1z9ADSKm41APs1HY_dHcBAhOVP-QDmE,10427
|
14
|
+
monarch/monarch_controller,sha256=HucZG4CSJhkVpbHElarAp2LUz1xW5bMNnAR3TNjWKks,20335344
|
15
15
|
monarch/notebook.py,sha256=zu9MKDFKf1-rCM2TqFSRJjMBeiWuKcJSyUFLvoZRQzs,25949
|
16
16
|
monarch/opaque_module.py,sha256=oajOu_WD1hD4hxE8HDdO-tvWY7KDHWd7VaAhJEa5L2I,10446
|
17
17
|
monarch/opaque_object.py,sha256=IVpll4pyuKZMo_EnPh4s0qnx8RlAcJrJ1yoLX6E75wQ,2782
|
18
18
|
monarch/pdb_wrapper.py,sha256=gm46AZnfR4amH1vYFWnWivEv5MaU3Nb6KIWjSM8KjWM,4052
|
19
|
-
monarch/proc_mesh.py,sha256=
|
19
|
+
monarch/proc_mesh.py,sha256=5RaKPQZJD-sKzEAbqMorKsZA7SOUzIflk3Fn6cdfzvw,8607
|
20
20
|
monarch/profiler.py,sha256=TQ9fnVM8H7smBWtYdB_6Irtzz8DBOmcp7U1T3wlUmco,4911
|
21
21
|
monarch/python_local_mesh.py,sha256=YsureIzR9uGlNVrKd4vRghxOXBeYabkt9lICRErfRAI,3536
|
22
22
|
monarch/random.py,sha256=f9QR7Esu4Vxqxs-KCf5QYyVqlWvXJ3-UtG90L_h4j40,1527
|
@@ -24,7 +24,7 @@ monarch/rdma.py,sha256=1pNh11S_FWeETRgkdUpauTMUlodrRohIq1UfQjKVnN8,5418
|
|
24
24
|
monarch/remote_class.py,sha256=-OAowzU1aDP6i4ik_SjXntVUC9h4dqAzgqwohkQ6Grc,4167
|
25
25
|
monarch/rust_backend_mesh.py,sha256=1htC62of4MgFtkezWGlsxSFtKJdc0CIeqeSuOx7yu3M,9944
|
26
26
|
monarch/rust_local_mesh.py,sha256=7ASptybn3wy4J7eoBc7LhGW4j4AA6bigl5Kuhyflw8s,47405
|
27
|
-
monarch/sim_mesh.py,sha256=
|
27
|
+
monarch/sim_mesh.py,sha256=kDsbubv28YFg9ZQN4urt3oJGzR3CnnUiATnjUiSxrkE,12256
|
28
28
|
monarch/telemetry.py,sha256=7JUZWaoD2Yn5Ae_7kNhkLFRBLYaSGfH071_m_qfVehI,525
|
29
29
|
monarch/tensor_worker_main.py,sha256=Nbarl2sJKIddLeaRFsaUnqOerLHjzggUr9SqCr2_GYI,8300
|
30
30
|
monarch/tensorboard.py,sha256=MnLgH5lbqeUJauEuirEgR6L_qYl2NGdtwZOWIAuOZao,2587
|
@@ -50,7 +50,7 @@ monarch/common/client.py,sha256=axo37s_z17nYQGOZG5fi_0zUEJ_8qw7INjs-Kw2vaVo,2493
|
|
50
50
|
monarch/common/constants.py,sha256=ohvsVYMpfeWopv3KXDAeHWDFLukwc-OY37VRxpKNBE8,300
|
51
51
|
monarch/common/context_manager.py,sha256=GOeyaFbyCqvQmkJ0oI7q6IxRd8_0mVyYKZRccI8iaug,1067
|
52
52
|
monarch/common/controller_api.py,sha256=djGkK5aSd-V6pBkr3uBCXbfJv3OKf2o2VbBXJgFF2WI,3202
|
53
|
-
monarch/common/device_mesh.py,sha256=
|
53
|
+
monarch/common/device_mesh.py,sha256=jo_qEIRlX6KzBlP2BUSS4XEELL-6_H08a47bUz8QYsA,12159
|
54
54
|
monarch/common/fake.py,sha256=h57Cggz2qXNqImZ7yPuOZOSe9-l9i553ki1z-YHlgQA,1801
|
55
55
|
monarch/common/function.py,sha256=V8kdgSRTvild2SpcewWa5IETX3QiWDZQ2BEIDFa5zz8,4374
|
56
56
|
monarch/common/function_caching.py,sha256=HVdbWtv6Eea7ENMWi8iv36w1G1TaVuUJhkUX_JxGx5A,5060
|
@@ -67,9 +67,9 @@ monarch/common/recording.py,sha256=hoI9VY_FyW_xVx-jmfsKydqX5vW2GulwcDWsBdUVOm8,4
|
|
67
67
|
monarch/common/reference.py,sha256=O26lkzEeVwj0S1xEy-OLqdHVnACmmlbQCUmXRrW4n1Q,938
|
68
68
|
monarch/common/remote.py,sha256=qZWXkShX20l07TseQSpVECh2yXZaVKYUvQXkeEM-zvY,9220
|
69
69
|
monarch/common/selection.py,sha256=lpWFbZs3ArYy29e-53eoAVAjQFksf1RvZz9NvM0CUW4,308
|
70
|
-
monarch/common/shape.py,sha256=
|
70
|
+
monarch/common/shape.py,sha256=B-7DI768ZhT8ECUNCJcI7DfCB7iDFGFH0r-HmXaAfcM,8296
|
71
71
|
monarch/common/stream.py,sha256=_ejoxafHtdD10lLzznRCXKwrkZ_ZH9k_VTgiA5yfBrI,3583
|
72
|
-
monarch/common/tensor.py,sha256=
|
72
|
+
monarch/common/tensor.py,sha256=G26E8-qv7HnjZfz3Ka5a-u3vb6DadcDChOn6wpjkeZo,29273
|
73
73
|
monarch/common/tensor_factory.py,sha256=qm8NZx-5ezMAFjNLiXQvb66okm5XgdboB_GRarGOdN0,801
|
74
74
|
monarch/common/tree.py,sha256=1DG3siiE7ixBV6v5cwN8RT_17aJhYZTE-L3i7wZe2_c,2282
|
75
75
|
monarch/controller/__init__.py,sha256=J8qjUOysmcMAek2KFN13mViOXZxTYc5vCrF02t3VuFU,223
|
@@ -106,11 +106,12 @@ monarch/timer/example_spmd.py,sha256=p8i3_tO1AmpwSkZryiSjgkh7qaEZ6QXp2Fy1qtPpECA
|
|
106
106
|
monarch/timer/execution_timer.py,sha256=1YsrLIZirdohKOeFAU2H4UcONhQXHuctJbYcoX8I6gY,6985
|
107
107
|
monarch/timer/execution_timer_test.py,sha256=CSxTv44fFZQURJlCBmYvysQI1aS_zEGZs_uxl9SOHak,4486
|
108
108
|
monarch/tools/__init__.py,sha256=J8qjUOysmcMAek2KFN13mViOXZxTYc5vCrF02t3VuFU,223
|
109
|
-
monarch/tools/cli.py,sha256=
|
109
|
+
monarch/tools/cli.py,sha256=EIdarsfuFX0WqRCe29_5GNKWJBhxx0lABalw3zPSagw,4977
|
110
110
|
monarch/tools/commands.py,sha256=BfmXndJmU_cZP4cMPlknkxGca1NjqYd8_ReDePWksXw,6908
|
111
111
|
monarch/tools/mesh_spec.py,sha256=JLykhgy1dClXiNbH1Qsl2fX5MbqplQAhl8LGoragvbo,3702
|
112
|
+
monarch/tools/network.py,sha256=bRj-jOs5qDqnM3BcE9MSXCLS01hiMN4YSWfKZ_d7bc4,2182
|
112
113
|
monarch/tools/components/__init__.py,sha256=J8qjUOysmcMAek2KFN13mViOXZxTYc5vCrF02t3VuFU,223
|
113
|
-
monarch/tools/components/hyperactor.py,sha256=
|
114
|
+
monarch/tools/components/hyperactor.py,sha256=Ryi1X07VLcaQVlpc4af65JNBbZtOb9IAlKxSKMZ1AW4,2120
|
114
115
|
monarch/tools/config/__init__.py,sha256=OPSflEmJB2zxAaRVzzWSWXV5M5vlknLgpulGdW1ze5U,510
|
115
116
|
monarch/tools/config/defaults.py,sha256=34a3HQhyXqt9qR2SYMVCROoNsnwk37rIwLXXiKwqtog,1894
|
116
117
|
monarch/worker/__init__.py,sha256=J8qjUOysmcMAek2KFN13mViOXZxTYc5vCrF02t3VuFU,223
|
@@ -131,9 +132,9 @@ monarch_supervisor/python_executable.py,sha256=WfCiK3wdAvm9Jxx5jgjGF991NgGc9-oHU
|
|
131
132
|
tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
132
133
|
tests/dispatch_bench.py,sha256=sU_m-8KAjQgYTsxI5khV664NdgLLutidni69Rtowk98,3933
|
133
134
|
tests/dispatch_bench_helper.py,sha256=1ORgAMrRgjAjmmWeCHLLQd_bda9mJk0rS2ucEbRu28s,633
|
134
|
-
tests/error_test_binary.py,sha256=
|
135
|
+
tests/error_test_binary.py,sha256=BRj13wAROsUWx4jcxc07HYN2n-xyBNhnnRAhjqah-A0,5582
|
135
136
|
tests/sleep_binary.py,sha256=XfLYaAfwm9xgzM-svs8fhAeFhwYIg6SyVEnx4e6wbUw,1009
|
136
|
-
tests/test_actor_error.py,sha256
|
137
|
+
tests/test_actor_error.py,sha256=-0UJCEpyzsBh-RdbGhDiG1-sRtu7bJPQWmtjUD0ad48,8526
|
137
138
|
tests/test_alloc.py,sha256=D6DdQbtOZEvvnnc7LV-WyWFMk0Xb77eblH6Oz90zJTA,745
|
138
139
|
tests/test_allocator.py,sha256=P11sQ95ADjzC_-CfPs3CEP80nP8sn7wW8vVPsmpSVoM,8164
|
139
140
|
tests/test_coalescing.py,sha256=JZ4YgQNlWWs7N-Z8KCCXQPANcuyyXEKjeHIXYbPnQhk,15606
|
@@ -144,7 +145,7 @@ tests/test_future.py,sha256=cXzaNi2YDwVyjR541ScXmgktX1YFsKzbl8wep0DMVbk,3032
|
|
144
145
|
tests/test_grad_generator.py,sha256=p4Pm4kMEeGldt2jUVAkGKCB0mLccKI28pltH6OTGbQA,3412
|
145
146
|
tests/test_mock_cuda.py,sha256=5hisElxeLJ5MHw3KM9gwxBiXiMaG-Rm382u3AsQcDOI,3068
|
146
147
|
tests/test_pdb_actor.py,sha256=5KJhuhcZDPWMdjC6eAtDdwnz1W7jNFXvIrMSFaCWaPw,3858
|
147
|
-
tests/test_python_actors.py,sha256=
|
148
|
+
tests/test_python_actors.py,sha256=ls0x_ie4i9KLuouecfxG_fHHZSZc2g_mQSAPJg70pgw,18949
|
148
149
|
tests/test_remote_functions.py,sha256=5nxYB8dfA9NT9f9Od9O3htgQtPbiRNiXZ1Kgtn75sOQ,50056
|
149
150
|
tests/test_rust_backend.py,sha256=94S3R995ZkyIhEiBsM5flcjf5X7bscEAHBtInbTRFe8,7776
|
150
151
|
tests/test_signal_safe_block_on.py,sha256=bmal0XgzJowZXJV6T1Blow5a-vZluYWusCThLMGxyTE,3336
|
@@ -154,9 +155,9 @@ tests/simulator/test_profiling.py,sha256=TGYCfzTLdkpIwnOuO6KApprmrgPIRQe60KRX3wk
|
|
154
155
|
tests/simulator/test_simulator.py,sha256=LO8lA0ssY-OGEBL5ipEu74f97Y765TEwfUOv-DtIptM,14568
|
155
156
|
tests/simulator/test_task.py,sha256=ipqBDuDAysuo1xOB9S5psaFvwe6VATD43IovCTSs0t4,2327
|
156
157
|
tests/simulator/test_worker.py,sha256=QrWWIJ3HDgDLkBPRc2mwYPlOQoXQcj1qRfc0WUfKkFY,3507
|
157
|
-
torchmonarch_nightly-2025.6.
|
158
|
-
torchmonarch_nightly-2025.6.
|
159
|
-
torchmonarch_nightly-2025.6.
|
160
|
-
torchmonarch_nightly-2025.6.
|
161
|
-
torchmonarch_nightly-2025.6.
|
162
|
-
torchmonarch_nightly-2025.6.
|
158
|
+
torchmonarch_nightly-2025.6.18.dist-info/licenses/LICENSE,sha256=e0Eotbf_rHOYPuEUlppIbvwy4SN98CZnl_hqwvbDA4Q,1530
|
159
|
+
torchmonarch_nightly-2025.6.18.dist-info/METADATA,sha256=lPDac3GQrS5MmEp41wt6YCWHIluJzBgFfPY37x0cKJM,2772
|
160
|
+
torchmonarch_nightly-2025.6.18.dist-info/WHEEL,sha256=_wZSFk0d90K9wOBp8Q-UGxshyiJ987JoPiyUBNC6VLk,104
|
161
|
+
torchmonarch_nightly-2025.6.18.dist-info/entry_points.txt,sha256=sqfQ16oZqjEvttUI-uj9BBXIIE6jt05bYFSmy-2hyXI,106
|
162
|
+
torchmonarch_nightly-2025.6.18.dist-info/top_level.txt,sha256=E-ZssZzyM17glpVrh-S9--qJ-w9p2EjuYOuNw9tQ4Eg,33
|
163
|
+
torchmonarch_nightly-2025.6.18.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
{torchmonarch_nightly-2025.6.16.dist-info → torchmonarch_nightly-2025.6.18.dist-info}/top_level.txt
RENAMED
File without changes
|