torchmonarch-nightly 2025.6.20__cp310-cp310-manylinux2014_x86_64.whl → 2025.6.27__cp310-cp310-manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- monarch/_rust_bindings.so +0 -0
- monarch/actor_mesh.py +13 -5
- monarch/allocator.py +87 -1
- monarch/code_sync.py +10 -0
- monarch/debugger.py +4 -2
- monarch/monarch_controller +0 -0
- monarch/proc_mesh.py +43 -3
- monarch/tools/mesh_spec.py +42 -4
- monarch/tools/network.py +34 -27
- tests/test_allocator.py +154 -6
- tests/test_python_actors.py +8 -44
- tests/test_tensor_engine.py +52 -0
- {torchmonarch_nightly-2025.6.20.dist-info → torchmonarch_nightly-2025.6.27.dist-info}/METADATA +2 -2
- {torchmonarch_nightly-2025.6.20.dist-info → torchmonarch_nightly-2025.6.27.dist-info}/RECORD +18 -16
- {torchmonarch_nightly-2025.6.20.dist-info → torchmonarch_nightly-2025.6.27.dist-info}/WHEEL +0 -0
- {torchmonarch_nightly-2025.6.20.dist-info → torchmonarch_nightly-2025.6.27.dist-info}/entry_points.txt +0 -0
- {torchmonarch_nightly-2025.6.20.dist-info → torchmonarch_nightly-2025.6.27.dist-info}/licenses/LICENSE +0 -0
- {torchmonarch_nightly-2025.6.20.dist-info → torchmonarch_nightly-2025.6.27.dist-info}/top_level.txt +0 -0
monarch/_rust_bindings.so
CHANGED
Binary file
monarch/actor_mesh.py
CHANGED
@@ -288,11 +288,11 @@ class Endpoint(Generic[P, R]):
 
     def broadcast(self, *args: P.args, **kwargs: P.kwargs) -> None:
         """
-
+        Fire-and-forget broadcast to all actors without waiting for actors to
+        acknowledge receipt.
 
-
-
-        return any results.
+        In other words, the return of this method does not guarrantee the
+        delivery of the message.
         """
         # pyre-ignore
         send(self, args, kwargs)
@@ -319,6 +319,10 @@ class Accumulator(Generic[P, R, A]):
 
 
 class ValueMesh(MeshTrait, Generic[R]):
+    """
+    Container of return values, indexed by rank.
+    """
+
     def __init__(self, shape: Shape, values: List[R]) -> None:
         self._shape = shape
         self._values = values
@@ -516,6 +520,10 @@ class _Actor:
             self.instance = Class(*args, **kwargs)
             return None
 
+        if self.instance is None:
+            raise AssertionError(
+                "__init__ failed earlier and no Actor object is available"
+            )
         the_method = getattr(self.instance, message.method)._method
 
         if inspect.iscoroutinefunction(the_method):
@@ -622,7 +630,7 @@ class Actor(MeshTrait):
         )
 
 
-class ActorMeshRef(MeshTrait):
+class ActorMeshRef(MeshTrait, Generic[T]):
     def __init__(
         self, Class: Type[T], actor_mesh_ref: _ActorMeshRefImpl, mailbox: Mailbox
     ) -> None:
monarch/allocator.py
CHANGED
@@ -7,7 +7,8 @@
 # pyre-strict
 
 import abc
-
+import logging
+from typing import final, Optional
 
 from monarch import ActorFuture as Future
 from monarch._rust_bindings.hyperactor_extension.alloc import (  # @manual=//monarch/monarch_extension:monarch_extension
@@ -21,6 +22,10 @@ from monarch._rust_bindings.monarch_hyperactor.alloc import (  # @manual=//monar
     RemoteAllocatorBase,
 )
 
+ALLOC_LABEL_PROC_MESH_NAME = "procmesh.monarch.meta.com/name"
+
+logger: logging.Logger = logging.getLogger(__name__)
+
 
 @final
 class ProcessAllocator(ProcessAllocatorBase):
@@ -111,6 +116,87 @@ class StaticRemoteAllocInitializer(RemoteAllocInitializer):
         return list(self.addrs)
 
 
+class TorchXRemoteAllocInitializer(RemoteAllocInitializer):
+    """
+    For monarch runtimes running as a job on a supported scheduler.
+    Such runtimes are typically launched using the monarch CLI (e.g `monarch create --scheduler slurm ...`).
+
+    Returns the server addresses of a specific monarch runtime by using TorchX's status API
+    to get the hostnames of the nodes.
+    """
+
+    def __init__(
+        self,
+        server_handle: str,
+        /,
+        transport: Optional[str] = None,
+        port: Optional[int] = None,
+    ) -> None:
+        """
+        NOTE: If `transport` and `port` specified, they are used over the `transport` and `port`
+        information that is tagged as metadata on the server's job. This is useful in two specific
+        situations:
+            1) The job was NOT created wit monarch CLI (hence no metadata tags exist)
+            2) The scheduler does not support job metadata tagging
+
+        Arguments:
+        - `server_handle`: points to a monarch runtime. Of the form `{scheduler}://{namespace}/{job_id}`.
+            the `{namespace}` can be empty if not configured (e.g. `slurm:///1234` - notice the triple slashes).
+        - `transport`: the channel transport that should be used to connect to the remote process allocator address
+        - `port`: the port that the remote process allocator is running on
+
+        """
+        self.server_handle = server_handle
+        self.transport = transport
+        self.port = port
+
+    async def initialize_alloc(self, match_labels: dict[str, str]) -> list[str]:
+        # lazy import since torchx-fb is not included in `fbcode//monarch/python/monarch:monarch.whl`
+        # nor any of the base conda environments
+        from monarch.tools.commands import server_ready
+
+        mesh_name = match_labels.get(ALLOC_LABEL_PROC_MESH_NAME)
+
+        server = await server_ready(self.server_handle)
+
+        # job does not exist or it is in a terminal state (SUCCEEDED, FAILED, CANCELLED)
+        if not (server and server.is_running):
+            raise ValueError(
+                f"{self.server_handle} does not exist or is in a terminal state"
+            )
+
+        if not mesh_name:
+            logger.info(
+                "no match label `%s` specified in alloc constraints",
+                ALLOC_LABEL_PROC_MESH_NAME,
+            )
+
+            num_meshes = len(server.meshes)
+
+            if num_meshes == 1:
+                logger.info(
+                    "found a single proc mesh `%s` in %s, will allocate on it",
+                    server.meshes[0].name,
+                    self.server_handle,
+                )
+            else:
+                raise RuntimeError(
+                    f"{num_meshes} proc meshes in {self.server_handle},"
+                    f" please specify the mesh name as a match label `{ALLOC_LABEL_PROC_MESH_NAME}`"
+                    f" in allocation constraints of the alloc spec"
+                )
+            mesh = server.meshes[0]
+        else:
+            mesh = server.get_mesh_spec(mesh_name)
+
+        server_addrs = mesh.server_addrs(self.transport, self.port)
+
+        logger.info(
+            "initializing alloc on remote allocator addresses: %s", server_addrs
+        )
+        return server_addrs
+
+
 @final
 class RemoteAllocator(RemoteAllocatorBase):
     """
monarch/code_sync.py
ADDED
@@ -0,0 +1,10 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from monarch._rust_bindings.monarch_extension.code_sync import (  # noqa: F401
+    RemoteWorkspace,
+    RsyncMeshClient,
+)
monarch/debugger.py
CHANGED
@@ -11,7 +11,7 @@ from dataclasses import dataclass
 from typing import Dict, List, Tuple, Union
 
 from monarch._rust_bindings.monarch_hyperactor.proc import ActorId
-from monarch.actor_mesh import Actor, endpoint
+from monarch.actor_mesh import Actor, ActorMeshRef, endpoint
 
 from monarch.pdb_wrapper import DebuggerWrite
 
@@ -370,7 +370,9 @@ class DebugClient(Actor):
         await session.debugger_write(write)
 
 
-async def init_debugging(
+async def init_debugging(
+    actor_mesh: ActorMeshRef,
+) -> ActorMeshRef[DebugClient]:
    debugger_proc_mesh = await local_proc_mesh(gpus=1, hosts=1)
    debug_client_mesh = await debugger_proc_mesh.spawn("debug_client", DebugClient)
    await actor_mesh._set_debug_client.call(debug_client_mesh)
monarch/monarch_controller
CHANGED
Binary file
monarch/proc_mesh.py
CHANGED
@@ -6,6 +6,7 @@
 
 # pyre-strict
 
+import os
 import sys
 from contextlib import AbstractContextManager
 
@@ -27,6 +28,10 @@ if TYPE_CHECKING:
 import monarch
 from monarch import ActorFuture as Future
 
+# Conditionally import DeviceMesh and spawn_tensor_engine only if tensor_engine is available
+# pyre-ignore[21]
+from monarch._rust_bindings import has_tensor_engine
+
 from monarch._rust_bindings.hyperactor_extension.alloc import (  # @manual=//monarch/monarch_extension:monarch_extension
     Alloc,
     AllocConstraints,
@@ -37,12 +42,18 @@ from monarch._rust_bindings.monarch_hyperactor.proc_mesh import ProcMesh as HyPr
 from monarch._rust_bindings.monarch_hyperactor.shape import Shape, Slice
 from monarch.actor_mesh import _Actor, _ActorMeshRefImpl, Actor, ActorMeshRef
 
+from monarch.code_sync import RemoteWorkspace, RsyncMeshClient
 from monarch.common._device_utils import _local_device_count
-from monarch.common.device_mesh import DeviceMesh
 from monarch.common.shape import MeshTrait
-from monarch.mesh_controller import spawn_tensor_engine
 from monarch.rdma import RDMAManager
 
+if has_tensor_engine():
+    from monarch.common.device_mesh import DeviceMesh
+    from monarch.mesh_controller import spawn_tensor_engine
+else:
+    DeviceMesh = None
+    spawn_tensor_engine = None
+
 T = TypeVar("T")
 try:
     from __manifest__ import fbmake  # noqa
@@ -71,6 +82,7 @@ class ProcMesh(MeshTrait):
         self._mock_shape: Optional[Shape] = _mock_shape
         self._mailbox: Mailbox = self._proc_mesh.client
         self._rdma_manager: Optional[RDMAManager] = None
+        self._rsync_mesh_client: Optional[RsyncMeshClient] = None
         self._maybe_device_mesh: Optional[DeviceMesh] = _device_mesh
         if _mock_shape is None:
             self._rdma_manager = self._spawn_blocking("rdma_manager", RDMAManager)
@@ -95,7 +107,9 @@ class ProcMesh(MeshTrait):
         )
         return ProcMesh(self._proc_mesh, _mock_shape=shape, _device_mesh=device_mesh)
 
-    def spawn(
+    def spawn(
+        self, name: str, Class: Type[T], *args: Any, **kwargs: Any
+    ) -> Future[ActorMeshRef[T]]:
         if self._mock_shape is not None:
             raise NotImplementedError("NYI: spawn on slice of a proc mesh.")
         return Future(
@@ -156,6 +170,10 @@ class ProcMesh(MeshTrait):
 
     @property
     def _device_mesh(self) -> "DeviceMesh":
+        if spawn_tensor_engine is None:
+            raise RuntimeError(
+                "DeviceMesh is not available because tensor_engine was not compiled (USE_TENSOR_ENGINE=0)"
+            )
         if self._maybe_device_mesh is None:
             if self._mock_shape is not None:
                 raise NotImplementedError(
@@ -174,6 +192,28 @@ class ProcMesh(MeshTrait):
     def rank_tensors(self) -> Dict[str, "torch.Tensor"]:
         return self._device_mesh.ranks
 
+    async def sync_workspace(self) -> None:
+        if self._rsync_mesh_client is None:
+            # TODO(agallagher): We need some way to configure and pass this
+            # in -- right now we're assuming the `gpu` dimension, which isn't
+            # correct.
+            assert set(self._proc_mesh.shape.labels).issubset({"gpus", "hosts"})
+            # The workspace shape (i.e. only perform one rsync per host).
+            workspace_shape = self.slice(gpus=slice(0, 1, 1))._mock_shape
+            assert workspace_shape is not None
+            # TODO(agallagher): We should probably hide this behind something
+            # like a `Workspace` class and support abstracting/configuring
+            # different sync methods.
+            self._rsync_mesh_client = RsyncMeshClient.spawn_blocking(
+                proc_mesh=self._proc_mesh,
+                shape=workspace_shape,
+                # TODO(agallagher): Is there a better way to infer/set the local
+                # workspace dir, rather than use PWD?
+                local_workspace=os.getcwd(),
+                remote_workspace=RemoteWorkspace.FromEnvVar("WORKSPACE_DIR"),
+            )
+        await self._rsync_mesh_client.sync_workspace()
+
 
 async def local_proc_mesh_nonblocking(
     *, gpus: Optional[int] = None, hosts: int = 1
monarch/tools/mesh_spec.py
CHANGED
@@ -9,6 +9,7 @@ import string
 from dataclasses import dataclass, field
 from typing import Any, Optional
 
+from monarch.tools.network import get_sockaddr
 from torchx import specs
 
 DEFAULT_REMOTE_ALLOCATOR_PORT = 26600
@@ -16,6 +17,10 @@ DEFAULT_REMOTE_ALLOCATOR_PORT = 26600
 _TAG_MESHES_PREFIX = "monarch/meshes/${mesh_name}/"
 _TAG_HOST_TYPE: str = _TAG_MESHES_PREFIX + "host_type"
 _TAG_GPUS: str = _TAG_MESHES_PREFIX + "gpus"
+_TAG_TRANSPORT: str = _TAG_MESHES_PREFIX + "transport"
+
+_UNSET_INT = -1
+_UNSET_STR = "__UNSET__"
 
 
 @dataclass
@@ -26,11 +31,38 @@ class MeshSpec:
 
     name: str
     num_hosts: int
-    host_type: str
-    gpus: int
+    host_type: str = _UNSET_STR
+    gpus: int = _UNSET_INT
+    # NOTE: using str over monarch._rust_bindings.monarch_hyperactor.channel.ChannelTransport enum
+    # b/c the rust binding doesn't have Python enum semantics, hence doesn't serialize well
+    transport: str = "tcp"
     port: int = DEFAULT_REMOTE_ALLOCATOR_PORT
     hostnames: list[str] = field(default_factory=list)
 
+    def server_addrs(
+        self, transport: Optional[str] = None, port: Optional[int] = None
+    ) -> list[str]:
+        """
+        Returns the hostnames (servers) in channel address format.
+        `transport` and `port` is typically taken from this mesh spec's fields, but
+        the caller can override them when calling this function.
+        """
+
+        transport = transport or self.transport
+        port = port or self.port
+
+        if transport == "tcp":
+            # need to resolve hostnames to ip address for TCP
+            return [
+                f"tcp!{get_sockaddr(hostname, port)}" for hostname in self.hostnames
+            ]
+        elif transport == "metatls":
+            return [f"metatls!{hostname}:{port}" for hostname in self.hostnames]
+        else:
+            raise ValueError(
+                f"Unsupported transport: {transport}. Must be one of: 'tcp' or 'metatls'"
+            )
+
 
 def _tag(mesh_name: str, tag_template: str) -> str:
     return string.Template(tag_template).substitute(mesh_name=mesh_name)
@@ -39,6 +71,7 @@ def _tag(mesh_name: str, tag_template: str) -> str:
 def tag_as_metadata(mesh_spec: MeshSpec, appdef: specs.AppDef) -> None:
     appdef.metadata[_tag(mesh_spec.name, _TAG_HOST_TYPE)] = mesh_spec.host_type
     appdef.metadata[_tag(mesh_spec.name, _TAG_GPUS)] = str(mesh_spec.gpus)
+    appdef.metadata[_tag(mesh_spec.name, _TAG_TRANSPORT)] = mesh_spec.transport
 
 
 def mesh_spec_from_metadata(appdef: specs.AppDef, mesh_name: str) -> Optional[MeshSpec]:
@@ -47,8 +80,13 @@ def mesh_spec_from_metadata(appdef: specs.AppDef, mesh_name: str) -> Optional[Me
     return MeshSpec(
         name=mesh_name,
         num_hosts=role.num_replicas,
-        host_type=appdef.metadata.get(
-
+        host_type=appdef.metadata.get(
+            _tag(mesh_name, _TAG_HOST_TYPE), _UNSET_STR
+        ),
+        gpus=int(
+            appdef.metadata.get(_tag(mesh_name, _TAG_GPUS), str(_UNSET_INT))
+        ),
+        transport=appdef.metadata.get(_tag(mesh_name, _TAG_TRANSPORT), "tcp"),
         port=role.port_map.get("mesh", DEFAULT_REMOTE_ALLOCATOR_PORT),
     )
 
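A small illustration of the new MeshSpec.server_addrs() helper and how each transport is formatted; the hostname list is hypothetical.

from monarch.tools.mesh_spec import MeshSpec

spec = MeshSpec(name="trainer", num_hosts=1, hostnames=["localhost"])

# Default transport is "tcp": each hostname is resolved to an IP via
# get_sockaddr, e.g. ["tcp![::1]:26600"] or ["tcp!127.0.0.1:26600"].
print(spec.server_addrs())

# "metatls" keeps the hostname and only appends the port:
# ["metatls!localhost:26600"]
print(spec.server_addrs(transport="metatls"))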
monarch/tools/network.py
CHANGED
@@ -12,51 +12,58 @@ from typing import Optional
 logger: logging.Logger = logging.getLogger(__name__)
 
 
-def
-    """
+def get_sockaddr(hostname: str, port: int) -> str:
+    """Returns either an IPv6 or IPv4 socket address (that supports TCP) of the given hostname and port.
+    The socket address is of the form:
+    1. `{ipv4.address}:{port}` (e.g. `127.0.0.1:8080`)
+    2. `[{ipv6:address}]:{port}` (e.g. `[::1]:8080`)
 
-
-    `SOCK_STREAM` (TCP)
-    to resolving an ipv4 `SOCK_STREAM` address.
+    The hostname is resolved to an IPv6 (or IPv4 if IPv6 is not available on the host) address that
+    supports `SOCK_STREAM` (TCP).
 
     Raises a `RuntimeError` if neither ipv6 or ipv4 ip can be resolved from hostname.
     """
 
-    def
+    def resolve_sockaddr(family: socket.AddressFamily) -> Optional[str]:
         try:
             # patternlint-disable-next-line python-dns-deps (only used for oss)
-            addrs = socket.getaddrinfo(
-                hostname, port=None, family=family, type=socket.SOCK_STREAM
-            )  # tcp
+            addrs = socket.getaddrinfo(hostname, port, family, type=socket.SOCK_STREAM)
             if addrs:
-
-                _, _, _, _, sockaddr = addrs[0]  # use the first address
+                family, _, _, _, sockaddr = addrs[0]  # use the first address
 
-                # sockaddr is a tuple (ipv4) or a 4-tuple (ipv6)
+                # sockaddr is a tuple (ipv4) or a 4-tuple (ipv6)
+                # in both cases the first element is the ip addr
                 ipaddr = str(sockaddr[0])
 
+                if family == socket.AF_INET6:
+                    socket_address = f"[{ipaddr}]:{port}"
+                else:  # socket.AF_INET
+                    socket_address = f"{ipaddr}:{port}"
+
                 logger.info(
-                    "
+                    "resolved %s address `%s` for `%s:%d`",
                     family.name,
-
+                    socket_address,
                     hostname,
+                    port,
                 )
-
-
-                return None
+
+                return socket_address
         except socket.gaierror as e:
             logger.info(
-                "
+                "no %s address that can bind TCP sockets for `%s:%d` (error: %s)",
                 family.name,
                 hostname,
+                port,
                 e,
             )
-
-
-
-
-
-
-
-
-
+            return None
+
+    for family in [socket.AF_INET6, socket.AF_INET]:
+        if ipaddr := resolve_sockaddr(family):
+            return ipaddr
+
+    raise RuntimeError(
+        f"Unable to resolve `{hostname}` to ipv6 or ipv4 address that can bind TCP socket."
+        " Check the network configuration on the host."
+    )
tests/test_allocator.py
CHANGED
@@ -14,9 +14,11 @@ import subprocess
 import sys
 import unittest
 from datetime import timedelta
-from typing import Generator
+from typing import Generator, Optional
+from unittest import mock
 
 import cloudpickle
+import pytest
 
 import torch
 import torch.distributed as dist
@@ -26,20 +28,28 @@ from monarch._rust_bindings.hyperactor_extension.alloc import (
     AllocConstraints,
     AllocSpec,
 )
-
 from monarch._rust_bindings.monarch_hyperactor.channel import (
     ChannelAddr,
     ChannelTransport,
 )
 from monarch.actor_mesh import Actor, current_rank, current_size, endpoint, ValueMesh
-
-
+from monarch.allocator import (
+    ALLOC_LABEL_PROC_MESH_NAME,
+    RemoteAllocator,
+    StaticRemoteAllocInitializer,
+    TorchXRemoteAllocInitializer,
+)
 from monarch.proc_mesh import ProcMesh
+from monarch.tools.mesh_spec import MeshSpec, ServerSpec
+from monarch.tools.network import get_sockaddr
 
 from torch.distributed.elastic.utils.distributed import get_free_port
+from torchx.specs import AppState
 
 _100_MILLISECONDS = timedelta(milliseconds=100)
 
+SERVER_READY = "monarch.tools.commands.server_ready"
+
 
 class TestActor(Actor):
     """Silly actor that computes the world size by all-reducing rank-hot tensors"""
@@ -63,9 +73,9 @@ class TestActor(Actor):
 
 
 @contextlib.contextmanager
-def remote_process_allocator() -> Generator[str, None, None]:
+def remote_process_allocator(addr: Optional[str] = None) -> Generator[str, None, None]:
     with importlib.resources.path(__package__, "") as package_path:
-        addr = ChannelAddr.any(ChannelTransport.Unix)
+        addr = addr or ChannelAddr.any(ChannelTransport.Unix)
 
         process_allocator = subprocess.Popen(
             args=[
@@ -215,3 +225,141 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
 
         self.assert_computed_world_size(results_a, 2)  # a is a 1x2 mesh
         self.assert_computed_world_size(results_b, 6)  # b is a 1x6 mesh
+
+    async def test_torchx_remote_alloc_initializer_no_server(self) -> None:
+        with mock.patch(SERVER_READY, return_value=None):
+            initializer = TorchXRemoteAllocInitializer("slurm:///123")
+            allocator = RemoteAllocator(world_id="test", initializer=initializer)
+
+            with self.assertRaisesRegex(
+                RuntimeError,
+                r"slurm:///123 does not exist or is in a terminal state",
+            ):
+                await allocator.allocate(AllocSpec(AllocConstraints(), host=1, gpu=1))
+
+    async def test_torchx_remote_alloc_initializer_no_match_label_gt_1_meshes(
+        self,
+    ) -> None:
+        # asserts that an exception is raised if no match label is specified in alloc constraints
+        # but there are more than 1 mesh (hence ambiguous which mesh to allocate on)
+
+        server = ServerSpec(
+            name="__UNUSED__",
+            state=AppState.RUNNING,
+            meshes=[MeshSpec(name="x", num_hosts=1), MeshSpec(name="y", num_hosts=1)],
+        )
+
+        with mock.patch(SERVER_READY, return_value=server):
+            initializer = TorchXRemoteAllocInitializer("slurm:///123")
+            allocator = RemoteAllocator(world_id="test", initializer=initializer)
+
+            with self.assertRaisesRegex(
+                RuntimeError,
+                r"2 proc meshes in slurm:///123, please specify the mesh name as a match label `procmesh.monarch.meta.com/name`",
+            ):
+                await allocator.allocate(AllocSpec(AllocConstraints(), host=1, gpu=1))
+
+    @pytest.mark.oss_skip  # pyre-ignore[56] TODO T228752279
+    async def test_torchx_remote_alloc_initializer_no_match_label_1_mesh(self) -> None:
+        server = ServerSpec(
+            name="__UNUSED__",
+            state=AppState.RUNNING,
+            meshes=[
+                MeshSpec(
+                    name="x",
+                    num_hosts=1,
+                    transport="tcp",
+                    hostnames=["localhost"],
+                )
+            ],
+        )
+        port = get_free_port()
+        with remote_process_allocator(addr=f"tcp!{get_sockaddr('localhost', port)}"):
+            with mock.patch(SERVER_READY, return_value=server):
+                initializer = TorchXRemoteAllocInitializer("local:///test", port=port)
+                allocator = RemoteAllocator(
+                    world_id="test",
+                    initializer=initializer,
+                    heartbeat_interval=_100_MILLISECONDS,
+                )
+                alloc = await allocator.allocate(
+                    AllocSpec(AllocConstraints(), host=1, gpu=4)
+                )
+                proc_mesh = await ProcMesh.from_alloc(alloc)
+                actor = await proc_mesh.spawn("test_actor", TestActor)
+                results = await actor.compute_world_size.call(
+                    master_addr="0.0.0.0", master_port=get_free_port()
+                )
+                self.assert_computed_world_size(results, 4)  # 1x4 mesh
+
+    @pytest.mark.oss_skip  # pyre-ignore[56] TODO T228752279
+    async def test_torchx_remote_alloc_initializer_with_match_label(self) -> None:
+        server = ServerSpec(
+            name="__UNUSED__",
+            state=AppState.RUNNING,
+            meshes=[
+                MeshSpec(
+                    name="x",
+                    num_hosts=1,
+                    transport="tcp",
+                    hostnames=["localhost"],
+                )
+            ],
+        )
+        port = get_free_port()
+        with remote_process_allocator(addr=f"tcp!{get_sockaddr('localhost', port)}"):
+            with mock.patch(SERVER_READY, return_value=server):
+                initializer = TorchXRemoteAllocInitializer("local:///test", port=port)
+                allocator = RemoteAllocator(
+                    world_id="test",
+                    initializer=initializer,
+                    heartbeat_interval=_100_MILLISECONDS,
+                )
+                alloc = await allocator.allocate(
+                    AllocSpec(
+                        AllocConstraints(
+                            match_labels={ALLOC_LABEL_PROC_MESH_NAME: "x"}
+                        ),
+                        host=1,
+                        gpu=3,
+                    )
+                )
+                proc_mesh = await ProcMesh.from_alloc(alloc)
+                actor = await proc_mesh.spawn("test_actor", TestActor)
+                results = await actor.compute_world_size.call(
+                    master_addr="0.0.0.0", master_port=get_free_port()
+                )
+                self.assert_computed_world_size(results, 3)  # 1x3 mesh
+
+    async def test_torchx_remote_alloc_initializer_with_match_label_no_match(
+        self,
+    ) -> None:
+        # assert that match label with a mesh name that does not exist should error out
+
+        server = ServerSpec(
+            name="test",
+            state=AppState.RUNNING,
+            meshes=[
+                MeshSpec(
+                    name="x",
+                    num_hosts=1,
+                    transport="tcp",
+                    hostnames=["localhost"],
+                )
+            ],
+        )
+
+        with mock.patch(SERVER_READY, return_value=server):
+            with self.assertRaisesRegex(RuntimeError, r"'y' not found in job: test"):
+                initializer = TorchXRemoteAllocInitializer("local:///test")
+                allocator = RemoteAllocator(world_id="test", initializer=initializer)
+                alloc = await allocator.allocate(
+                    AllocSpec(
+                        AllocConstraints(
+                            match_labels={ALLOC_LABEL_PROC_MESH_NAME: "y"}
+                        ),
+                        host=1,
+                        gpu=1,
+                    )
+                )
+                await ProcMesh.from_alloc(alloc)
tests/test_python_actors.py
CHANGED
@@ -6,7 +6,6 @@
 
 import asyncio
 import operator
-import os
 import re
 import threading
 import time
@@ -31,11 +30,14 @@ from monarch.actor_mesh import (
 from monarch.debugger import init_debugging
 from monarch.future import ActorFuture
 
-from monarch.mesh_controller import spawn_tensor_engine
-
 from monarch.proc_mesh import local_proc_mesh, proc_mesh
 from monarch.rdma import RDMABuffer
 
+needs_cuda = pytest.mark.skipif(
+    not torch.cuda.is_available(),
+    reason="CUDA not available",
+)
+
 
 class Counter(Actor):
     def __init__(self, v: int):
@@ -116,6 +118,7 @@ class ParameterClient(Actor):
         return self.buffer
 
 
+@needs_cuda
 async def test_proc_mesh_rdma():
     proc = await proc_mesh(gpus=1)
     server = await proc.spawn("server", ParameterServer)
@@ -284,6 +287,7 @@ class GeneratorActor(Actor):
         ), f"{torch.sum(self.generator.weight.data)=}, {self.step=}"
 
 
+@needs_cuda
 async def test_gpu_trainer_generator():
     trainer_proc = await proc_mesh(gpus=1)
    gen_proc = await proc_mesh(gpus=1)
@@ -313,6 +317,7 @@ async def test_sync_actor():
     assert r == 5
 
 
+@needs_cuda
 def test_gpu_trainer_generator_sync() -> None:
     trainer_proc = proc_mesh(gpus=1).get()
     gen_proc = proc_mesh(gpus=1).get()
@@ -403,30 +408,6 @@ def test_proc_mesh_liveness() -> None:
     counter.value.call().get()
 
 
-two_gpu = pytest.mark.skipif(
-    torch.cuda.device_count() < 2,
-    reason="Not enough GPUs, this test requires at least 2 GPUs",
-)
-
-
-@two_gpu
-def test_tensor_engine() -> None:
-    pm = proc_mesh(gpus=2).get()
-
-    dm = spawn_tensor_engine(pm)
-    with dm.activate():
-        r = monarch.inspect(2 * torch.zeros(3, 4))
-
-    fm = dm.flatten("all")
-    with fm.activate():
-        f = monarch.inspect(2 * torch.zeros(3, 4), all=1)
-
-    assert torch.allclose(torch.zeros(3, 4), r)
-    assert torch.allclose(torch.zeros(3, 4), f)
-
-    dm.exit()
-
-
 def _debugee_actor_internal(rank):
     if rank == 0:
         breakpoint()  # noqa
@@ -632,23 +613,6 @@ async def test_actor_tls_full_sync() -> None:
     assert 4 == await am.get.call_one()
 
 
-@two_gpu
-def test_proc_mesh_tensor_engine() -> None:
-    pm = proc_mesh(gpus=2).get()
-    with pm.activate():
-        f = 10 * pm.rank_tensor("gpus").cuda()
-        a = monarch.inspect(f, hosts=0, gpus=0)
-        b = monarch.inspect(f, hosts=0, gpus=1)
-
-    one = pm.slice(gpus=1)
-    with one.activate():
-        sliced_b = monarch.slice_mesh(f, gpus=1).to_mesh(one)
-        c = monarch.inspect(sliced_b * 10)
-    assert a == 0
-    assert b == 10
-    assert c == 100
-
-
 class AsyncActor(Actor):
     def __init__(self):
         self.should_exit = False
tests/test_tensor_engine.py
ADDED
@@ -0,0 +1,52 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import monarch
+import pytest
+import torch
+from monarch.mesh_controller import spawn_tensor_engine
+from monarch.proc_mesh import proc_mesh
+
+
+two_gpu = pytest.mark.skipif(
+    torch.cuda.device_count() < 2,
+    reason="Not enough GPUs, this test requires at least 2 GPUs",
+)
+
+
+@two_gpu
+def test_tensor_engine() -> None:
+    pm = proc_mesh(gpus=2).get()
+
+    dm = spawn_tensor_engine(pm)
+    with dm.activate():
+        r = monarch.inspect(2 * torch.zeros(3, 4))
+
+    fm = dm.flatten("all")
+    with fm.activate():
+        f = monarch.inspect(2 * torch.zeros(3, 4), all=1)
+
+    assert torch.allclose(torch.zeros(3, 4), r)
+    assert torch.allclose(torch.zeros(3, 4), f)
+
+    dm.exit()
+
+
+@two_gpu
+def test_proc_mesh_tensor_engine() -> None:
+    pm = proc_mesh(gpus=2).get()
+    with pm.activate():
+        f = 10 * pm.rank_tensor("gpus").cuda()
+        a = monarch.inspect(f, hosts=0, gpus=0)
+        b = monarch.inspect(f, hosts=0, gpus=1)
+
+    one = pm.slice(gpus=1)
+    with one.activate():
+        sliced_b = monarch.slice_mesh(f, gpus=1).to_mesh(one)
+        c = monarch.inspect(sliced_b * 10)
+    assert a == 0
+    assert b == 10
+    assert c == 100
{torchmonarch_nightly-2025.6.20.dist-info → torchmonarch_nightly-2025.6.27.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: torchmonarch-nightly
-Version: 2025.6.
+Version: 2025.6.27
 Summary: Monarch: Single controller library
 Author: Meta
 Author-email: oncall+monarch@xmail.facebook.com
@@ -42,7 +42,7 @@ Note: Monarch is currently only supported on Linux systems
 
 ## Installation
 
-`pip install torchmonarch`
+`pip install torchmonarch-nightly`
 
 or manually
 
{torchmonarch_nightly-2025.6.20.dist-info → torchmonarch_nightly-2025.6.27.dist-info}/RECORD
RENAMED
@@ -1,22 +1,23 @@
 monarch/__init__.py,sha256=iUvWHc0-7Q2tovRoRxOIiA3TsefMXCbWl-jEfQ2djew,6897
-monarch/_rust_bindings.so,sha256=
+monarch/_rust_bindings.so,sha256=SCTdGchlMLPZEiF4SNSbLSczRY7ZC3f7t0e-YZHGNDk,43327072
 monarch/_testing.py,sha256=jOIOG6jcZBzvEvG_DwSnwCkaMVXvSun6sJAG6nXemww,7859
-monarch/actor_mesh.py,sha256=
-monarch/allocator.py,sha256=
+monarch/actor_mesh.py,sha256=QqKHVTJk9H_I-v7GoxgOdOL8-ymnRpGvNFdda0-cNrE,24534
+monarch/allocator.py,sha256=l0_mN43AH3K2aCchb5fk8ml95rvdgR31nRC_PqRmZWg,7865
 monarch/bootstrap_main.py,sha256=RCUQhJk07yMFiKp6HzQuqZFUpkgsT9kVEyimiwjn6_E,1827
 monarch/cached_remote_function.py,sha256=kYdB6r4OHx_T_uX4q3tCNcp1t2DJwF8tPTIahUiT2pU,8785
-monarch/
+monarch/code_sync.py,sha256=SIqXx-zAKx60s2LbS_e9XSSlE1YSEo75vE05tMrOyYo,332
+monarch/debugger.py,sha256=AizU8MWBdloe0wj1ysxlOXmUhCwGoShVH_xGfVBCQjs,13354
 monarch/fetch.py,sha256=61jxo7sx4QNUTkc0_rF5NaJROen4tKbAaiIjrXWLOvg,1705
 monarch/future.py,sha256=g1VYJl8ReBBS6VbikwWilnFqEr5qJDiSKid92AnWFV4,2058
 monarch/gradient_generator.py,sha256=Rl3dmXGceTdCc1mYBg2JciR88ywGPnW7TVkL86KwqEA,6366
 monarch/memory.py,sha256=ol86dBhFAJqg78iF25-BuK0wuwj1onR8FIioZ_B0gjw,1377
 monarch/mesh_controller.py,sha256=am1QP7dvn0OH1z9ADSKm41APs1HY_dHcBAhOVP-QDmE,10427
-monarch/monarch_controller,sha256=
+monarch/monarch_controller,sha256=Vr5ym1QWSWyd02YCd5q8tC9X_V-ony1v7v-pFfrXVQA,21664144
 monarch/notebook.py,sha256=zu9MKDFKf1-rCM2TqFSRJjMBeiWuKcJSyUFLvoZRQzs,25949
 monarch/opaque_module.py,sha256=oajOu_WD1hD4hxE8HDdO-tvWY7KDHWd7VaAhJEa5L2I,10446
 monarch/opaque_object.py,sha256=IVpll4pyuKZMo_EnPh4s0qnx8RlAcJrJ1yoLX6E75wQ,2782
 monarch/pdb_wrapper.py,sha256=gm46AZnfR4amH1vYFWnWivEv5MaU3Nb6KIWjSM8KjWM,4052
-monarch/proc_mesh.py,sha256=
+monarch/proc_mesh.py,sha256=ZnNWjINoFTdkRVbu_ikos2jV4Ham-I9jqeWdEN-1ZtQ,10436
 monarch/profiler.py,sha256=TQ9fnVM8H7smBWtYdB_6Irtzz8DBOmcp7U1T3wlUmco,4911
 monarch/python_local_mesh.py,sha256=YsureIzR9uGlNVrKd4vRghxOXBeYabkt9lICRErfRAI,3536
 monarch/random.py,sha256=f9QR7Esu4Vxqxs-KCf5QYyVqlWvXJ3-UtG90L_h4j40,1527
@@ -108,8 +109,8 @@ monarch/timer/execution_timer_test.py,sha256=CSxTv44fFZQURJlCBmYvysQI1aS_zEGZs_u
 monarch/tools/__init__.py,sha256=J8qjUOysmcMAek2KFN13mViOXZxTYc5vCrF02t3VuFU,223
 monarch/tools/cli.py,sha256=EIdarsfuFX0WqRCe29_5GNKWJBhxx0lABalw3zPSagw,4977
 monarch/tools/commands.py,sha256=OuFDVAcl5LvBdBZ-HyemErR0IiDtiMMNgmGPD4MWTHY,8996
-monarch/tools/mesh_spec.py,sha256=
-monarch/tools/network.py,sha256=
+monarch/tools/mesh_spec.py,sha256=gj3p4fqLOVAnkrCcE0gY8tGhGBNi1Eu3KpQv5xzWCZ0,5484
+monarch/tools/network.py,sha256=mN8Fx9mervxM3VdFHRn4ZXt4z7yWxZp52BTxx2tfpus,2455
 monarch/tools/components/__init__.py,sha256=J8qjUOysmcMAek2KFN13mViOXZxTYc5vCrF02t3VuFU,223
 monarch/tools/components/hyperactor.py,sha256=Ryi1X07VLcaQVlpc4af65JNBbZtOb9IAlKxSKMZ1AW4,2120
 monarch/tools/config/__init__.py,sha256=OPSflEmJB2zxAaRVzzWSWXV5M5vlknLgpulGdW1ze5U,510
@@ -136,7 +137,7 @@ tests/error_test_binary.py,sha256=BRj13wAROsUWx4jcxc07HYN2n-xyBNhnnRAhjqah-A0,55
 tests/sleep_binary.py,sha256=XfLYaAfwm9xgzM-svs8fhAeFhwYIg6SyVEnx4e6wbUw,1009
 tests/test_actor_error.py,sha256=-0UJCEpyzsBh-RdbGhDiG1-sRtu7bJPQWmtjUD0ad48,8526
 tests/test_alloc.py,sha256=D6DdQbtOZEvvnnc7LV-WyWFMk0Xb77eblH6Oz90zJTA,745
-tests/test_allocator.py,sha256=
+tests/test_allocator.py,sha256=c7b4ylEjFV2WDhB8fbWiDuGi-vrBeD1E0Rpu-efrSVQ,14478
 tests/test_coalescing.py,sha256=JZ4YgQNlWWs7N-Z8KCCXQPANcuyyXEKjeHIXYbPnQhk,15606
 tests/test_controller.py,sha256=Rp_kW20zYT8ocsK5LX0Ha3LB9azS2LSKpp8n_dBlzVU,31384
 tests/test_device_mesh.py,sha256=DrbezYOM0thfP9MgLXb5-F0VoLOmSz5GR0GwjR_3bE4,5290
@@ -145,19 +146,20 @@ tests/test_future.py,sha256=cXzaNi2YDwVyjR541ScXmgktX1YFsKzbl8wep0DMVbk,3032
 tests/test_grad_generator.py,sha256=p4Pm4kMEeGldt2jUVAkGKCB0mLccKI28pltH6OTGbQA,3412
 tests/test_mock_cuda.py,sha256=5hisElxeLJ5MHw3KM9gwxBiXiMaG-Rm382u3AsQcDOI,3068
 tests/test_pdb_actor.py,sha256=5KJhuhcZDPWMdjC6eAtDdwnz1W7jNFXvIrMSFaCWaPw,3858
-tests/test_python_actors.py,sha256=
+tests/test_python_actors.py,sha256=0kF3LQpvPnAqT6xbNaBQxaG3gsMyBzzM4Ou7om9ZhoE,20069
 tests/test_remote_functions.py,sha256=5nxYB8dfA9NT9f9Od9O3htgQtPbiRNiXZ1Kgtn75sOQ,50056
 tests/test_rust_backend.py,sha256=94S3R995ZkyIhEiBsM5flcjf5X7bscEAHBtInbTRFe8,7776
 tests/test_signal_safe_block_on.py,sha256=bmal0XgzJowZXJV6T1Blow5a-vZluYWusCThLMGxyTE,3336
 tests/test_sim_backend.py,sha256=RckCkHO3DxKsAGdZMcIzRnd6YJXwDim1D5-xbBbgKio,1473
+tests/test_tensor_engine.py,sha256=ZYQlr77d1txMfQ4w7qqyCLhHGRwt57bsHs6E9oAd7SQ,1361
 tests/simulator/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/simulator/test_profiling.py,sha256=TGYCfzTLdkpIwnOuO6KApprmrgPIRQe60KRX3wkB0sg,4565
 tests/simulator/test_simulator.py,sha256=LO8lA0ssY-OGEBL5ipEu74f97Y765TEwfUOv-DtIptM,14568
 tests/simulator/test_task.py,sha256=ipqBDuDAysuo1xOB9S5psaFvwe6VATD43IovCTSs0t4,2327
 tests/simulator/test_worker.py,sha256=QrWWIJ3HDgDLkBPRc2mwYPlOQoXQcj1qRfc0WUfKkFY,3507
-torchmonarch_nightly-2025.6.
-torchmonarch_nightly-2025.6.
-torchmonarch_nightly-2025.6.
-torchmonarch_nightly-2025.6.
-torchmonarch_nightly-2025.6.
-torchmonarch_nightly-2025.6.
+torchmonarch_nightly-2025.6.27.dist-info/licenses/LICENSE,sha256=e0Eotbf_rHOYPuEUlppIbvwy4SN98CZnl_hqwvbDA4Q,1530
+torchmonarch_nightly-2025.6.27.dist-info/METADATA,sha256=0PKqq2myfJJjhPa9nAZVJCp4vymD0dBmC1w-RmRKgYI,2780
+torchmonarch_nightly-2025.6.27.dist-info/WHEEL,sha256=_wZSFk0d90K9wOBp8Q-UGxshyiJ987JoPiyUBNC6VLk,104
+torchmonarch_nightly-2025.6.27.dist-info/entry_points.txt,sha256=sqfQ16oZqjEvttUI-uj9BBXIIE6jt05bYFSmy-2hyXI,106
+torchmonarch_nightly-2025.6.27.dist-info/top_level.txt,sha256=E-ZssZzyM17glpVrh-S9--qJ-w9p2EjuYOuNw9tQ4Eg,33
+torchmonarch_nightly-2025.6.27.dist-info/RECORD,,
{torchmonarch_nightly-2025.6.20.dist-info → torchmonarch_nightly-2025.6.27.dist-info}/WHEEL
RENAMED
File without changes
{torchmonarch_nightly-2025.6.20.dist-info → torchmonarch_nightly-2025.6.27.dist-info}/entry_points.txt
RENAMED
File without changes
{torchmonarch_nightly-2025.6.20.dist-info → torchmonarch_nightly-2025.6.27.dist-info}/licenses/LICENSE
RENAMED
File without changes
{torchmonarch_nightly-2025.6.20.dist-info → torchmonarch_nightly-2025.6.27.dist-info}/top_level.txt
RENAMED
File without changes