torchmonarch-nightly 2025.9.3-cp311-cp311-manylinux2014_x86_64.whl → 2025.9.5-cp311-cp311-manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- monarch/_rust_bindings.so +0 -0
- monarch/_src/actor/actor_mesh.py +92 -3
- monarch/_src/actor/host_mesh.py +21 -1
- monarch/_src/actor/proc_mesh.py +99 -18
- monarch/actor/__init__.py +4 -0
- monarch/common/tensor.py +0 -4
- monarch/gradient/_gradient_generator.so +0 -0
- monarch/monarch_controller +0 -0
- monarch/tools/components/hyperactor.py +5 -3
- monarch/tools/config/__init__.py +10 -14
- monarch/tools/config/defaults.py +2 -2
- monarch/tools/mesh_spec.py +1 -3
- {torchmonarch_nightly-2025.9.3.dist-info → torchmonarch_nightly-2025.9.5.dist-info}/METADATA +11 -4
- {torchmonarch_nightly-2025.9.3.dist-info → torchmonarch_nightly-2025.9.5.dist-info}/RECORD +18 -18
- {torchmonarch_nightly-2025.9.3.dist-info → torchmonarch_nightly-2025.9.5.dist-info}/WHEEL +0 -0
- {torchmonarch_nightly-2025.9.3.dist-info → torchmonarch_nightly-2025.9.5.dist-info}/entry_points.txt +0 -0
- {torchmonarch_nightly-2025.9.3.dist-info → torchmonarch_nightly-2025.9.5.dist-info}/licenses/LICENSE +0 -0
- {torchmonarch_nightly-2025.9.3.dist-info → torchmonarch_nightly-2025.9.5.dist-info}/top_level.txt +0 -0
monarch/_rust_bindings.so
CHANGED
Binary file

monarch/_src/actor/actor_mesh.py
CHANGED
@@ -529,14 +529,38 @@ def as_endpoint(
 
 
 class Accumulator(Generic[P, R, A]):
+    """
+    Accumulate the result of a broadcast invocation of an endpoint
+    across a sliced mesh.
+
+    Usage:
+        >>> counter = Accumulator(Actor.increment, 0, lambda x, y: x + y)
+    """
+
     def __init__(
         self, endpoint: Endpoint[P, R], identity: A, combine: Callable[[A, R], A]
     ) -> None:
+        """
+        Args:
+            endpoint: Endpoint to accumulate the result of.
+            identity: Initial value of the accumulated value before the first combine invocation.
+            combine: Lambda invoked for combining the result of the endpoint with the accumulated value.
+        """
         self._endpoint: Endpoint[P, R] = endpoint
         self._identity: A = identity
         self._combine: Callable[[A, R], A] = combine
 
     def accumulate(self, *args: P.args, **kwargs: P.kwargs) -> "Future[A]":
+        """
+        Accumulate the result of the endpoint invocation.
+
+        Args:
+            args: Arguments to pass to the endpoint.
+            kwargs: Keyword arguments to pass to the endpoint.
+
+        Returns:
+            Future that resolves to the accumulated value.
+        """
         gen: Generator[Future[R], None, None] = self._endpoint.stream(*args, **kwargs)
 
         async def impl() -> A:
@@ -550,7 +574,7 @@ class Accumulator(Generic[P, R, A]):
 
 class ValueMesh(MeshTrait, Generic[R]):
     """
-
+    A mesh that holds the result of an endpoint invocation.
     """
 
     def __init__(self, shape: Shape, values: List[R]) -> None:
@@ -561,6 +585,18 @@ class ValueMesh(MeshTrait, Generic[R]):
         return ValueMesh(shape, self._values)
 
     def item(self, **kwargs) -> R:
+        """
+        Get the value at the given coordinates.
+
+        Args:
+            kwargs: Coordinates to get the value at.
+
+        Returns:
+            Value at the given coordinate.
+
+        Raises:
+            KeyError: If invalid coordinates are provided.
+        """
         coordinates = [kwargs.pop(label) for label in self._labels]
         if kwargs:
             raise KeyError(f"item has extra dimensions: {list(kwargs.keys())}")
@@ -568,6 +604,12 @@ class ValueMesh(MeshTrait, Generic[R]):
         return self._values[self._ndslice.nditem(coordinates)]
 
     def items(self) -> Iterable[Tuple[Point, R]]:
+        """
+        Generator that returns values for the provided coordinates.
+
+        Returns:
+            Values at all coordinates.
+        """
         extent = self._shape.extent
         for i, rank in enumerate(self._shape.ranks()):
             yield Point(i, extent), self._values[rank]
@@ -596,14 +638,27 @@ def send(
     selection: Selection = "all",
 ) -> None:
     """
-
+    Fire-and-forget broadcast invocation of the endpoint across a given selection of the mesh.
+
+    This sends the message to all actors but does not wait for any result. Use the port provided to
+    send the response back to the caller.
 
-
+    Args:
+        endpoint: Endpoint to invoke.
+        args: Arguments to pass to the endpoint.
+        kwargs: Keyword arguments to pass to the endpoint.
+        port: Handle to send the response to.
+        selection: Selection query representing a subset of the mesh.
     """
     endpoint._send(args, kwargs, port, selection)
 
 
 class Port(Generic[R]):
+    """
+    Handle used to send reliable in-order messages through a channel to
+    a PortReceiver.
+    """
+
     def __init__(
         self,
         port_ref: PortRef | OncePortRef,
@@ -615,6 +670,13 @@ class Port(Generic[R]):
         self._rank = rank
 
     def send(self, obj: R) -> None:
+        """
+        Fire-and-forget send R-typed objects in order
+        through a channel to its corresponding PortReceiver.
+
+        Args:
+            obj: R-typed object to send.
+        """
         self._port_ref.send(
             self._mailbox,
             PythonMessage(PythonMessageKind.Result(self._rank), _pickle(obj)),
@@ -656,8 +718,17 @@ T = TypeVar("T")
 # not part of the Endpoint API because they way it accepts arguments
 # and handles concerns is different.
 class Channel(Generic[R]):
+    """
+    An advanced low level API for a communication channel used for message passing
+    between actors.
+
+    Provides static methods to create communication channels with port pairs
+    for sending and receiving messages of type R.
+    """
+
     @staticmethod
     def open(once: bool = False) -> Tuple["Port[R]", "PortReceiver[R]"]:
+        """ """
         mailbox = context().actor_instance._mailbox
         handle, receiver = mailbox.open_once_port() if once else mailbox.open_port()
         port_ref = handle.bind()
@@ -673,6 +744,14 @@ class Channel(Generic[R]):
 
 
 class PortReceiver(Generic[R]):
+    """
+    Receiver for messages sent through a communication channel.
+
+    Handles receiving R-typed objects sent from a corresponding Port.
+    Asynchronously message reception with optional supervision
+    monitoring for error handling.
+    """
+
     def __init__(
         self,
         mailbox: Mailbox,
@@ -784,6 +863,7 @@ class _Actor:
         ins.rank = ctx.message_rank
         try:
             self.instance = Class(*args, **kwargs)
+            self._maybe_exit_debugger()
         except Exception as e:
             self._saved_error = ActorError(
                 e, f"Remote actor {Class}.__init__ call failed."
@@ -956,6 +1036,15 @@ class Actor(MeshTrait, DeprecatedNotAFuture):
 
 
 class ActorMesh(MeshTrait, Generic[T], DeprecatedNotAFuture):
+    """
+    A group of actor instances of the same class.
+
+    Represents a collection of T-typed actor instances spawned at most once per process
+    that can be communicated with collectively or individually. Provides
+    methods for spawning actors, managing their lifecycle, and creating
+    endpoints for method invocation across the mesh.
+    """
+
     def __init__(
         self,
         Class: Type[T],
monarch/_src/actor/host_mesh.py
CHANGED
@@ -35,11 +35,23 @@ def this_proc() -> "ProcMesh":
 
 
 def create_local_host_mesh() -> "HostMesh":
+    """
+    Create a local host mesh for the current machine.
+
+    Returns:
+        HostMesh: A single-host mesh configured for local process allocation.
+    """
     cmd, args, env = _get_bootstrap_args()
     return HostMesh(Shape.unity(), ProcessAllocator(cmd, args, env))
 
 
 class HostMesh(MeshTrait):
+    """
+    HostMesh represents a collection of compute hosts that can be used to spawn
+    processes and actors. The class requires you to provide your AllocateMixin that
+    interfaces with the underlying resource allocator of your choice.
+    """
+
     def __init__(self, shape: Shape, allocator: AllocateMixin):
         self._allocator = allocator
         self._shape = shape
@@ -57,12 +69,14 @@ class HostMesh(MeshTrait):
         """
         Start new processes on this host mesh. By default this starts one proc
         on each host in the mesh. Additional procs can be started using `per_host` to
-        specify the local shape, e.g
+        specify the local shape, e.g.`
             per_host = {'gpus': 8}
         Will create a proc mesh with an additional 'gpus' dimension.
 
         `bootstrap` is a function that will be run at startup on each proc and can be used to e.g.
         configure CUDA or NCCL. We guarantee that CUDA has not been initialized before boostrap is called.
+
+        TODO: For now, a new allocator is created for every new ProcMesh.
         """
         if per_host is None:
             per_host = {}
@@ -107,6 +121,12 @@ class HostMesh(MeshTrait):
 
 
 def fake_in_process_host() -> "HostMesh":
+    """
+    Create a host mesh for testing and development using a local allocator.
+
+    Returns:
+        HostMesh: A host mesh configured with local allocation for in-process use.
+    """
     return HostMesh(Shape.unity(), LocalAllocator())
 
 
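A short sketch of the `spawn_procs` contract documented above, assuming `this_host()` is exposed via `monarch.actor` (an assumption; the proc_mesh deprecation messages in this release point at it). The `bootstrap` hook runs on each proc before CUDA is initialized:

```python
import os

from monarch.actor import this_host  # assumed import path


def bootstrap() -> None:
    # Runs once per proc at startup, before any CUDA initialization,
    # so it is a safe place to pin NCCL/CUDA environment knobs.
    os.environ["NCCL_DEBUG"] = "WARN"


# One proc per host by default; per_host adds a local 'gpus' dimension.
procs = this_host().spawn_procs(per_host={"gpus": 8}, bootstrap=bootstrap)
```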
monarch/_src/actor/proc_mesh.py
CHANGED
@@ -95,14 +95,10 @@ if TYPE_CHECKING:
 
 class SetupActor(Actor):
     """
-    A helper actor to
-    Typically used to setup the environment variables.
+    A helper actor to set up the actor mesh with user defined setup method.
     """
 
     def __init__(self, env: Callable[[], None]) -> None:
-        """
-        Initialize the setup actor with the user defined setup method.
-        """
         self._setup_method = env
 
     @endpoint
@@ -133,8 +129,12 @@ def _use_standin_mesh() -> bool:
     return os.getenv("USE_STANDIN_ACTOR_MESH", default="0") != "0"
 
 
-# Ultra-hack to allow actors to identify proc meshes but with no real functionality.
 class ProcMeshRef:
+    """
+    A serializable remote reference to a ProcMesh. The reference is weak: No support
+    for refcount'ing. Spawning actors on a ProcMeshRef a stopped or a failed mesh will fail.
+    """
+
     def __init__(self, proc_mesh_id: int) -> None:
         self._proc_mesh_id = proc_mesh_id
         self._host_mesh: Optional["HostMesh"] = None
@@ -179,6 +179,17 @@ def _deref_proc_mesh(proc_mesh: ProcMeshRef) -> "ProcMesh":
 
 
 class ProcMesh(MeshTrait, DeprecatedNotAFuture):
+    """
+    A distributed mesh of processes for actor computation.
+
+    ProcMesh represents a collection of processes that can spawn and manage actors.
+    It provides the foundation for distributed actor systems by managing process
+    allocation, lifecycle, and communication across multiple hosts and devices.
+
+    The ProcMesh supports spawning actors, monitoring process health, logging
+    configuration, and code synchronization across distributed processes.
+    """
+
     def __init__(
         self,
         hy_proc_mesh: "Shared[HyProcMesh]",
@@ -249,6 +260,22 @@ class ProcMesh(MeshTrait, DeprecatedNotAFuture):
         return pm
 
     def spawn(self, name: str, Class: Type[T], *args: Any, **kwargs: Any) -> T:
+        """
+        Spawn a T-typed actor mesh on the process mesh.
+
+        Args:
+        - `name`: The name of the actor.
+        - `Class`: The class of the actor to spawn.
+        - `args`: Positional arguments to pass to the actor's constructor.
+        - `kwargs`: Keyword arguments to pass to the actor's constructor.
+
+        Returns:
+        - The actor instance.
+
+        Usage:
+            >>> procs: ProcMesh = host_mesh.spawn_procs(per_host={"gpus": 8})
+            >>> counters: Counter = procs.spawn("counters", Counter, 0)
+        """
         if self._slice:
             raise NotImplementedError("NYI: spawn on slice of a proc mesh.")
         return self._spawn_nonblocking(name, Class, *args, **kwargs)
@@ -294,19 +321,9 @@ class ProcMesh(MeshTrait, DeprecatedNotAFuture):
         Allocate a process mesh according to the provided alloc.
         Returns when the mesh is fully allocated.
 
-
-        - `alloc`:
+        Args:
+        - `alloc`: A generator that yields a list of allocations.
         - `setup`: An optional lambda function to configure environment variables on the allocated mesh.
-            Use the `current_rank()` method within the lambda to obtain the rank.
-
-        Example of a setup method to initialize torch distributed environment variables:
-        ```
-        def setup():
-            rank = current_rank()
-            os.environ["RANK"] = str(rank)
-            os.environ["WORLD_SIZE"] = str(len(rank.shape))
-            os.environ["LOCAL_RANK"] = str(rank["gpus"])
-        ```
         """
 
         async def task() -> HyProcMesh:
@@ -432,6 +449,14 @@ class ProcMesh(MeshTrait, DeprecatedNotAFuture):
         conda: bool = False,
         auto_reload: bool = False,
     ) -> None:
+        """
+        Sync local code changes to the remote processes.
+
+        Args:
+            workspace: The workspace to sync.
+            conda: If True, also sync the currently activated conda env.
+            auto_reload: If True, automatically reload the workspace on changes.
+        """
         if self._code_sync_client is None:
             self._code_sync_client = CodeSyncMeshClient.spawn_blocking(
                 proc_mesh=await self._proc_mesh_for_asyncio_fixme,
@@ -525,6 +550,10 @@ class ProcMesh(MeshTrait, DeprecatedNotAFuture):
         return self
 
     def stop(self) -> Future[None]:
+        """
+        This will stop all processes (and actors) in the mesh and
+        release any resources associated with the mesh.
+        """
         self._logging_manager.stop()
 
         async def _stop_nonblocking() -> None:
@@ -574,6 +603,23 @@ class ProcMesh(MeshTrait, DeprecatedNotAFuture):
 
 
 def local_proc_mesh(*, gpus: Optional[int] = None, hosts: int = 1) -> ProcMesh:
+    """
+    Create a local process mesh for testing and development.
+
+    This function creates a process mesh using local allocation instead of
+    distributed process allocation. Primarily used for testing scenarios.
+
+    Args:
+        gpus: Number of GPUs to allocate per host. If None, uses local device count.
+        hosts: Number of hosts to allocate. Defaults to 1.
+
+    Returns:
+        ProcMesh: A locally allocated process mesh.
+
+    Warning:
+        This function is deprecated. Use `fake_in_process_host().spawn_procs()`
+        for testing or `this_proc().spawn_procs()` for current process actors.
+    """
     warnings.warn(
         "Use monarch._src.actor.host_mesh.fake_in_process_host().spawn_procs for testing. For launching an actor in the current process use this_proc().spawn_procs()",
         DeprecationWarning,
@@ -596,6 +642,22 @@ def sim_proc_mesh(
     dcs: int = 1,
     regions: int = 1,
 ) -> ProcMesh:
+    """Create a simulated process mesh for testing distributed scenarios.
+
+    This function creates a process mesh using simulation allocation to test
+    distributed behavior without requiring actual remote resources.
+
+    Args:
+        gpus: Number of GPUs per host. Defaults to 1.
+        hosts: Number of hosts. Defaults to 1.
+        racks: Number of racks. Defaults to 1.
+        zones: Number of zones. Defaults to 1.
+        dcs: Number of data centers. Defaults to 1.
+        regions: Number of regions. Defaults to 1.
+
+    Returns:
+        ProcMesh: A simulated process mesh with the specified topology.
+    """
     spec: AllocSpec = AllocSpec(
         AllocConstraints(),
         hosts=hosts,
@@ -658,6 +720,25 @@ def proc_mesh(
     env: dict[str, str] | None = None,
     setup: Callable[[], None] | None = None,
 ) -> ProcMesh:
+    """
+    Create a distributed process mesh across hosts.
+
+    This function creates a process mesh using distributed process allocation
+    across multiple hosts and GPUs. Used for production distributed computing.
+
+    Args:
+        gpus: Number of GPUs per host. If None, uses local device count.
+        hosts: Number of hosts to allocate. Defaults to 1.
+        env: Environment variables to set on remote processes.
+        setup: Optional setup function to run on each process at startup.
+
+    Returns:
+        ProcMesh: A distributed process mesh with the specified configuration.
+
+    Warning:
+        This function is deprecated. Use `this_host().spawn_procs()` with
+        appropriate per_host configuration instead.
+    """
     warnings.warn(
         "use this_host().spawn_procs(per_host = {'hosts': 2, 'gpus': 3}) instead of monarch.actor.proc_mesh(hosts=2, gpus=3)",
         DeprecationWarning,
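`from_alloc` loses its inline example in this release; for reference, the snippet removed from its docstring showed a `setup` hook deriving torch.distributed-style environment variables from `current_rank()`:

```python
import os

from monarch.actor import current_rank


def setup() -> None:
    # Reproduced from the docstring removed above: map each proc's mesh
    # coordinates onto torch.distributed environment variables.
    rank = current_rank()
    os.environ["RANK"] = str(rank)
    os.environ["WORLD_SIZE"] = str(len(rank.shape))
    os.environ["LOCAL_RANK"] = str(rank["gpus"])
```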
monarch/actor/__init__.py
CHANGED
@@ -9,6 +9,7 @@
 Monarch Actor API - Public interface for actor functionality.
 """
 
+from monarch._rust_bindings.monarch_hyperactor.shape import Extent
 from monarch._src.actor.actor_mesh import (
     Accumulator,
     Actor,
@@ -19,6 +20,7 @@ from monarch._src.actor.actor_mesh import (
     current_actor_name,
     current_rank,
     current_size,
+    Endpoint,
     Point,
     Port,
     PortReceiver,
@@ -70,4 +72,6 @@ __all__ = [
     "hosts_from_config",
     "Port",
     "PortReceiver",
+    "Endpoint",
+    "Extent",
 ]
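With `Endpoint` and `Extent` now re-exported, user code can type helpers against the public package instead of reaching into `monarch._src`; a small sketch (the `sum_over_mesh` helper is hypothetical):

```python
from monarch.actor import Accumulator, Endpoint


def sum_over_mesh(ep: Endpoint, identity: int = 0) -> int:
    # Hypothetical helper: fold an int-returning endpoint across its mesh.
    acc = Accumulator(ep, identity, lambda a, r: a + r)
    return acc.accumulate().get()
```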
monarch/common/tensor.py
CHANGED
@@ -80,10 +80,6 @@ class Tensor(Referenceable, BaseTensor):
     in a device mesh. It provides the same interface as PyTorch tensors but
     enables distributed operations and communication patterns.
 
-    Args:
-        fake (torch.Tensor): A fake tensor representing the shape and type
-        mesh (DeviceMesh): The device mesh this tensor is distributed across
-        stream (Stream): The computation stream for this tensor
     """
 
     # pyre-fixme[13]: Attribute `stream` is never initialized.

monarch/gradient/_gradient_generator.so
CHANGED
Binary file

monarch/monarch_controller
CHANGED
Binary file

monarch/tools/components/hyperactor.py
CHANGED
@@ -9,7 +9,8 @@ import getpass
 from typing import Optional
 
 from monarch.tools import mesh_spec
-
+
+from monarch.tools.config import NOT_SET
 from monarch.tools.mesh_spec import mesh_spec_from_str
 from torchx import specs
 
@@ -19,6 +20,7 @@ _USER: str = getpass.getuser()
 
 DEFAULT_NAME: str = f"monarch-{_USER}"
 
+
 __version__ = "latest" # TODO get version from monarch.__version_
 
 
@@ -28,7 +30,7 @@ def host_mesh(
     env: Optional[dict[str, str]] = None,
     port: int = mesh_spec.DEFAULT_REMOTE_ALLOCATOR_PORT,
     program: str = "monarch_bootstrap", # installed with monarch wheel (as console script)
-) -> UnnamedAppDef:
+) -> specs.AppDef:
     """
     Args:
         name: the name of the monarch server job
@@ -39,7 +41,7 @@ def host_mesh(
         program: path to the binary that the remote process allocator spawns on an allocation request
     """
 
-    appdef = UnnamedAppDef()
+    appdef = specs.AppDef(name=NOT_SET)
 
     for mesh in [mesh_spec_from_str(mesh) for mesh in meshes]:
         mesh_role = specs.Role(
monarch/tools/config/__init__.py
CHANGED
@@ -7,26 +7,22 @@
 # pyre-strict
 import warnings
 from dataclasses import dataclass, field
-from typing import Any, Dict, List
+from typing import Any
 
 from monarch.tools.config.workspace import Workspace
 
-#
-
-
-
+# Gracefully handle cases where torchx might not be installed
+# NOTE: this can be removed once torchx.specs moves to monarch.session
+try:
+    from torchx import specs
+except ImportError:
+    pass
 
 NOT_SET: str = "__NOT_SET__"
 
 
-@dataclass
-class UnnamedAppDef:
-    """
-    A TorchX AppDef without a name.
-    """
-
-    roles: List["Role"] = field(default_factory=list)
-    metadata: Dict[str, str] = field(default_factory=dict)
+def _empty_appdef() -> "specs.AppDef":
+    return specs.AppDef(name=NOT_SET)
 
 
 @dataclass
@@ -39,7 +35,7 @@ class Config:
     scheduler_args: dict[str, Any] = field(default_factory=dict)
     workspace: Workspace = field(default_factory=Workspace.null)
     dryrun: bool = False
-    appdef:
+    appdef: "specs.AppDef" = field(default_factory=_empty_appdef)
 
     def __post_init__(self) -> None:
         # workspace used to be Optional[str]
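The guarded import above works because nothing in this module evaluates `specs` at import time: the return annotations are strings and `_empty_appdef` only touches `specs` when called. A standalone sketch of the same optional-dependency pattern (the `__NOT_SET__` literal stands in for the module's NOT_SET constant):

```python
from dataclasses import dataclass, field
from typing import Any

try:
    from torchx import specs  # optional dependency
except ImportError:
    pass


def _empty_appdef() -> "specs.AppDef":
    # The string annotation is never evaluated on import, so this module
    # loads without torchx; the body only runs if a caller needs an AppDef.
    return specs.AppDef(name="__NOT_SET__")


@dataclass
class Config:
    scheduler_args: dict[str, Any] = field(default_factory=dict)
    appdef: "specs.AppDef" = field(default_factory=_empty_appdef)
```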
monarch/tools/config/defaults.py
CHANGED
@@ -12,7 +12,7 @@ import warnings
 from typing import Callable
 
 from monarch.tools.components import hyperactor
-from monarch.tools.config import Config, UnnamedAppDef
+from monarch.tools.config import Config
 from monarch.tools.config.workspace import Workspace
 
 from torchx import specs
@@ -25,7 +25,7 @@ from torchx.schedulers import (
 )
 
 
-def component_fn(scheduler: str) -> Callable[..., UnnamedAppDef]:
+def component_fn(scheduler: str) -> Callable[..., specs.AppDef]:
     """The default TorchX component function for the scheduler"""
     return hyperactor.host_mesh
 
monarch/tools/mesh_spec.py
CHANGED
@@ -9,8 +9,6 @@ import string
 from dataclasses import dataclass, field
 from typing import Any, Optional
 
-from monarch.tools.config import UnnamedAppDef
-
 from monarch.tools.network import get_sockaddr
 from torchx import specs
 from torchx.specs.api import is_terminal
@@ -72,7 +70,7 @@ def _tag(mesh_name: str, tag_template: str) -> str:
     return string.Template(tag_template).substitute(mesh_name=mesh_name)
 
 
-def tag_as_metadata(mesh_spec: MeshSpec, appdef: UnnamedAppDef) -> None:
+def tag_as_metadata(mesh_spec: MeshSpec, appdef: specs.AppDef) -> None:
     appdef.metadata[_tag(mesh_spec.name, _TAG_HOST_TYPE)] = mesh_spec.host_type
     appdef.metadata[_tag(mesh_spec.name, _TAG_GPUS)] = str(mesh_spec.gpus)
     appdef.metadata[_tag(mesh_spec.name, _TAG_TRANSPORT)] = mesh_spec.transport
{torchmonarch_nightly-2025.9.3.dist-info → torchmonarch_nightly-2025.9.5.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: torchmonarch-nightly
-Version: 2025.9.3
+Version: 2025.9.5
 Summary: Monarch: Single controller library
 Author: Meta
 Author-email: oncall+monarch@xmail.facebook.com
@@ -9,6 +9,7 @@ Requires-Python: >= 3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: torch
+Requires-Dist: torchshow
 Requires-Dist: pyzmq
 Requires-Dist: requests
 Requires-Dist: numpy
@@ -64,7 +65,7 @@ fut.get()
 ```
 
 
-The [introduction to monarch concepts](getting_started.html) provides an introduction to using these features.
+The [introduction to monarch concepts](https://meta-pytorch.org/monarch/generated/examples/getting_started.html) provides an introduction to using these features.
 
 > ⚠️ **Early Development Warning** Monarch is currently in an experimental
 > stage. You should expect bugs, incomplete features, and APIs that may change
@@ -73,18 +74,24 @@ The [introduction to monarch concepts](getting_started.html) provides an introdu
 > work. It's recommended that you signal your intention to contribute in the
 > issue tracker, either by filing a new issue or by claiming an existing one.
 
-Note: Monarch is currently only supported on Linux systems
-
 ## 📖 Documentation
 
 View Monarch's hosted documentation [at this link](https://meta-pytorch.org/monarch/).
 
 ## Installation
+Note for running distributed tensors, the local torch version must match the version that monarch was built with.
 
 ### On Fedora distributions
+## Stable
+`pip install torchmonarch`
+
+torchmonarch stable is built with the latest stable torch.
 
+## Nightly
 `pip install torchmonarch-nightly`
 
+torchmonarch-nightly is built with torch nightly.
+
 or manually
 
 ```sh

{torchmonarch_nightly-2025.9.3.dist-info → torchmonarch_nightly-2025.9.5.dist-info}/RECORD
RENAMED
@@ -1,5 +1,5 @@
 monarch/__init__.py,sha256=mgKiyD1kxky-1pvhMlNfF4VmxWnhi-FSYZNFzkW1BEM,7052
-monarch/_rust_bindings.so,sha256=
+monarch/_rust_bindings.so,sha256=4PSOldY67JPiHo6-GBN1Cd5zhbVOxcCU5sTSrMACKAc,61372560
 monarch/_testing.py,sha256=5BDMVA4hBMo780rsJ39vRmUZi6mTN8aYY7I9grJRjJ8,7841
 monarch/actor_mesh.py,sha256=VtPU9syi_vUdwDSJJ639Z4Y_EcWZUScyoj0lQ88RQPs,421
 monarch/bootstrap_main.py,sha256=39OZpNMrfvvNJf-iwuNzgslzYA_ItaRPHfXGn_V74N0,524
@@ -8,7 +8,7 @@ monarch/fetch.py,sha256=CssP25dMqyJnJAWoC41lwkMnSbvS-f2DL9PRbudJXfc,1704
 monarch/gradient_generator.py,sha256=b7PmoN_F3c5hQglfHeW_v5htYnePKvJGkzZN-tpHR4A,6396
 monarch/memory.py,sha256=ol86dBhFAJqg78iF25-BuK0wuwj1onR8FIioZ_B0gjw,1377
 monarch/mesh_controller.py,sha256=Y_26Cnmp72TccNbWdDQhq18j7de7pSw83E_fREJX9Yo,15372
-monarch/monarch_controller,sha256=
+monarch/monarch_controller,sha256=8pl9HaAiJvsa1X8MsUQnbVKiD6oClL730oHsUUdkzwo,32446984
 monarch/notebook.py,sha256=zu9MKDFKf1-rCM2TqFSRJjMBeiWuKcJSyUFLvoZRQzs,25949
 monarch/opaque_module.py,sha256=jCcg0DjbcEVXA9WNG0NhUzGteLHOJLTZEBvrIYJIAns,10436
 monarch/opaque_object.py,sha256=x1LoX6RIMGh4ux52xIfhPgoh6PhZHdkf9bMccHW3DW0,2808
@@ -25,17 +25,17 @@ monarch/tensorboard.py,sha256=MnLgH5lbqeUJauEuirEgR6L_qYl2NGdtwZOWIAuOZao,2587
 monarch/world_mesh.py,sha256=ob5dJWaC49Uw0xqClHBm8CQLvL4xKnjd4TGzk7k8NxI,980
 monarch/_src/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 monarch/_src/actor/__init__.py,sha256=4iK3nzQZmEPe0HGNY70fABBenw3lCVVaaF0xddF5Fa0,235
-monarch/_src/actor/actor_mesh.py,sha256=
+monarch/_src/actor/actor_mesh.py,sha256=P9b4CvxYeYfJecPxeEtohAasvGgzzElcvxh9chALeAA,40526
 monarch/_src/actor/allocator.py,sha256=UVGhrkPQMqPQp6vUngPI361s6yCEfZ0gfz8WTtG2om4,9392
 monarch/_src/actor/bootstrap_main.py,sha256=7T7ARumcHLZ5RI-k5jej9tBil0J7-BUSVFKwAZO2tJU,2413
 monarch/_src/actor/device_utils.py,sha256=gBpl23wMjppVAEzzj8U9HyX-B7Bs2_3ftiMAkzUS4j4,577
 monarch/_src/actor/endpoint.py,sha256=_VaPHc0Fcj1P5nDzUXt8xnS6iw7-HO6hGx7W5RtU3eU,10916
 monarch/_src/actor/event_loop.py,sha256=2i4fKIkemBzua_t47BqVa2roZ6fWB6sbmMFPNx2zKN0,2832
 monarch/_src/actor/future.py,sha256=idgqzU_B5qWfClIP5dTLapmBflWq5va-ujAzUbT1Asc,7490
-monarch/_src/actor/host_mesh.py,sha256=
+monarch/_src/actor/host_mesh.py,sha256=8SOkg_LhHuzLyhpwxT7Yw1_h8QrIlwfWhrSwHyAvfnk,5083
 monarch/_src/actor/logging.py,sha256=9aguohqCtvLVwWGTFM7o-rBptT26BjI2s6E5Of2imM4,3311
 monarch/_src/actor/pickle.py,sha256=FhdbAEsGrsD7f25bxF7HlROLm6j2TTvmToq8P1kyhB8,2913
-monarch/_src/actor/proc_mesh.py,sha256=
+monarch/_src/actor/proc_mesh.py,sha256=lYrRMQNOGAdFXuFvc3lQ68xIS01YJWMkpi8qH5HHAHE,27791
 monarch/_src/actor/python_extension_methods.py,sha256=QujLWOQQbDdGCin8tZfDxyIwkM-Md4t9QtcTGTHOE_s,3493
 monarch/_src/actor/shape.py,sha256=PJqxpQEISHlxK8rrlKWpcNMEHiGxBbc6TsHcGZCOsyE,8472
 monarch/_src/actor/source_loader.py,sha256=TGHmExLyxPDcCyuG254zo6aUqHMpl-j0VWzxa9rkJYQ,1405
@@ -52,7 +52,7 @@ monarch/_src/debug_cli/__init__.py,sha256=NNrKh5KdiYdbxOhin8x-gw_-tvcuGex2UbS_z7
 monarch/_src/debug_cli/debug_cli.py,sha256=OJqqVFXcMkj-bnrxcE2VnjIgA5xrlKjEtCstrsdPcm0,1146
 monarch/_src/tensor_engine/__init__.py,sha256=Md3cCHD7Ano9kV15PqGbicgUO-RMdh4aVy1yKiDt_xE,208
 monarch/_src/tensor_engine/rdma.py,sha256=62saqcXo6oUxH9rEZShNGLYdRZ_zizLQKhub7LDSaBg,8418
-monarch/actor/__init__.py,sha256=
+monarch/actor/__init__.py,sha256=hHf8ri2czQwi-Z23Z9cYZ2FvkVbYOcDA_GTLW_rju7k,1569
 monarch/builtins/__init__.py,sha256=QcfnHZGbc2qktBg7DyZt2ruE6VahnIt4S8lEZLHdJqU,443
 monarch/builtins/log.py,sha256=H1QkuVzwxyi36Zyv-XR0VN0QsNimBWwxE1__fjs0_2o,554
 monarch/builtins/random.py,sha256=wPbvscg7u53EXpMFo885fO2XOlsyjrNAJ4rBxLzfxdg,1839
@@ -84,7 +84,7 @@ monarch/common/reference.py,sha256=O26lkzEeVwj0S1xEy-OLqdHVnACmmlbQCUmXRrW4n1Q,9
 monarch/common/remote.py,sha256=uc2JUbYHMnEZRnw9ZuS4mgvK_UHCuRaIIFbzZpx89hQ,11485
 monarch/common/selection.py,sha256=lpWFbZs3ArYy29e-53eoAVAjQFksf1RvZz9NvM0CUW4,308
 monarch/common/stream.py,sha256=_ejoxafHtdD10lLzznRCXKwrkZ_ZH9k_VTgiA5yfBrI,3583
-monarch/common/tensor.py,sha256=
+monarch/common/tensor.py,sha256=9FIUn5--VeacYWhEhMvysdG0yc_zq4eW3X3526RCw3w,29598
 monarch/common/tensor_factory.py,sha256=qm8NZx-5ezMAFjNLiXQvb66okm5XgdboB_GRarGOdN0,801
 monarch/common/tree.py,sha256=1DG3siiE7ixBV6v5cwN8RT_17aJhYZTE-L3i7wZe2_c,2282
 monarch/controller/__init__.py,sha256=J8qjUOysmcMAek2KFN13mViOXZxTYc5vCrF02t3VuFU,223
@@ -98,7 +98,7 @@ monarch/debug_cli/__init__.py,sha256=NNrKh5KdiYdbxOhin8x-gw_-tvcuGex2UbS_z7MV9g0
 monarch/debug_cli/__main__.py,sha256=FGsQn54RkC_3gpRrm_UFrGiDDHRbMeGzXXsGANr5UHU,317
 monarch/gradient/__init__.py,sha256=kqmzwt16mMpk0M3GhpgP_f7da4DGnaV9chDzbt66k4Q,308
 monarch/gradient/_gradient_generator.pyi,sha256=6cX0UxaDt9NAlwgIhTgnweqGOf6qRhHiGnUzSWNCxdU,630
-monarch/gradient/_gradient_generator.so,sha256=
+monarch/gradient/_gradient_generator.so,sha256=kGzXSoii5ODi8ZA7gz9D38Lt5sxGi5POXcGYmidS0-Q,12174336
 monarch/parallel/__init__.py,sha256=6920kIkhiX7AiyjYvyc1ad8ccP-bStJJ1sS5KkeN2P0,352
 monarch/parallel/pipelining/__init__.py,sha256=J8qjUOysmcMAek2KFN13mViOXZxTYc5vCrF02t3VuFU,223
 monarch/parallel/pipelining/runtime.py,sha256=KK8TG1gUYEzSsquiZoPTWGSIC74mlncD7cYknKxfb3c,32470
@@ -127,13 +127,13 @@ monarch/tools/__init__.py,sha256=J8qjUOysmcMAek2KFN13mViOXZxTYc5vCrF02t3VuFU,223
 monarch/tools/cli.py,sha256=b3mKZnK-MwP7JwskTxHI0KcJXxSU6498jEb2ntVr_VM,5001
 monarch/tools/colors.py,sha256=XrBkslKoaoDeXqiTluiiuvFLYd-weKp1sjw7DYWz2RY,581
 monarch/tools/commands.py,sha256=z4vCPtn_Ypic7L4_Jd3nMJWyyE4olUPqDe4cpJsDKZ4,13873
-monarch/tools/mesh_spec.py,sha256=
+monarch/tools/mesh_spec.py,sha256=lkKZ7RxuJKY19X6kdiU_V6IWlH1GHidynOaTbuCOsAY,7983
 monarch/tools/network.py,sha256=mN8Fx9mervxM3VdFHRn4ZXt4z7yWxZp52BTxx2tfpus,2455
 monarch/tools/utils.py,sha256=gcZyalfoBC6Y3v65h-QMngwXsn24ejXh2TH8RxlgXkA,1888
 monarch/tools/components/__init__.py,sha256=J8qjUOysmcMAek2KFN13mViOXZxTYc5vCrF02t3VuFU,223
-monarch/tools/components/hyperactor.py,sha256=
-monarch/tools/config/__init__.py,sha256=
-monarch/tools/config/defaults.py,sha256=
+monarch/tools/components/hyperactor.py,sha256=OR5JtH2UCao3ke3vMohzVbuo_L0gZ_jTw8ud82qLj3M,2175
+monarch/tools/config/__init__.py,sha256=wCw2qwGJL1gFuo9Wpvnrva6NKDLyjf2Yglm6Q9UJYkI,2224
+monarch/tools/config/defaults.py,sha256=twUF6eT9HjJyxEZYrz2SoROHHXi3YPUDSeAelJRLBSU,2187
 monarch/tools/config/environment.py,sha256=ikEZKATa2e_8h9pN4_3TzhIHWb4ZZfRT5XtOVoOmHjI,1628
 monarch/tools/config/workspace.py,sha256=a2YzFBTLUB_VrO3kt6dCV5TlmhCH4LyRX3JCMzu7Iv0,6049
 monarch/utils/__init__.py,sha256=9ofjBGAMZo1VGsn7ufiDlrVheMw4Ye34p-isDfveUxc,295
@@ -186,9 +186,9 @@ tests/simulator/test_profiling.py,sha256=TGYCfzTLdkpIwnOuO6KApprmrgPIRQe60KRX3wk
 tests/simulator/test_simulator.py,sha256=LO8lA0ssY-OGEBL5ipEu74f97Y765TEwfUOv-DtIptM,14568
 tests/simulator/test_task.py,sha256=ipqBDuDAysuo1xOB9S5psaFvwe6VATD43IovCTSs0t4,2327
 tests/simulator/test_worker.py,sha256=QrWWIJ3HDgDLkBPRc2mwYPlOQoXQcj1qRfc0WUfKkFY,3507
-torchmonarch_nightly-2025.9.3.dist-info/licenses/LICENSE,sha256=
-torchmonarch_nightly-2025.9.3.dist-info/METADATA,sha256=
-torchmonarch_nightly-2025.9.3.dist-info/WHEEL,sha256=
-torchmonarch_nightly-2025.9.3.dist-info/entry_points.txt,sha256=
-torchmonarch_nightly-2025.9.3.dist-info/top_level.txt,sha256=
-torchmonarch_nightly-2025.9.3.dist-info/RECORD,,
+torchmonarch_nightly-2025.9.5.dist-info/licenses/LICENSE,sha256=e0Eotbf_rHOYPuEUlppIbvwy4SN98CZnl_hqwvbDA4Q,1530
+torchmonarch_nightly-2025.9.5.dist-info/METADATA,sha256=6WKzpl0pCJnxLhSxTckSsA7C5ncuUZJ2_NzujIuxhiQ,6474
+torchmonarch_nightly-2025.9.5.dist-info/WHEEL,sha256=JC9FVdjbTDi9l3EyrqUd11CgmN9LkBi1g5dFHayafwA,104
+torchmonarch_nightly-2025.9.5.dist-info/entry_points.txt,sha256=60QVSpYVzkzS4iDOiLp0fsLxVp47X3J2l3v7W-59LMo,117
+torchmonarch_nightly-2025.9.5.dist-info/top_level.txt,sha256=E-ZssZzyM17glpVrh-S9--qJ-w9p2EjuYOuNw9tQ4Eg,33
+torchmonarch_nightly-2025.9.5.dist-info/RECORD,,

{torchmonarch_nightly-2025.9.3.dist-info → torchmonarch_nightly-2025.9.5.dist-info}/WHEEL
RENAMED
File without changes

{torchmonarch_nightly-2025.9.3.dist-info → torchmonarch_nightly-2025.9.5.dist-info}/entry_points.txt
RENAMED
File without changes

{torchmonarch_nightly-2025.9.3.dist-info → torchmonarch_nightly-2025.9.5.dist-info}/licenses/LICENSE
RENAMED
File without changes

{torchmonarch_nightly-2025.9.3.dist-info → torchmonarch_nightly-2025.9.5.dist-info}/top_level.txt
RENAMED
File without changes