torchmonarch-nightly 2025.6.18__cp310-cp310-manylinux2014_x86_64.whl → 2025.6.20__cp310-cp310-manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- monarch/_rust_bindings.so +0 -0
- monarch/actor_mesh.py +13 -8
- monarch/allocator.py +7 -2
- monarch/common/client.py +35 -3
- monarch/common/messages.py +5 -4
- monarch/common/remote.py +14 -21
- monarch/future.py +59 -8
- monarch/monarch_controller +0 -0
- monarch/tools/commands.py +64 -2
- monarch/tools/mesh_spec.py +7 -1
- tests/test_allocator.py +4 -3
- tests/test_python_actors.py +109 -0
- {torchmonarch_nightly-2025.6.18.dist-info → torchmonarch_nightly-2025.6.20.dist-info}/METADATA +1 -1
- {torchmonarch_nightly-2025.6.18.dist-info → torchmonarch_nightly-2025.6.20.dist-info}/RECORD +18 -18
- {torchmonarch_nightly-2025.6.18.dist-info → torchmonarch_nightly-2025.6.20.dist-info}/WHEEL +0 -0
- {torchmonarch_nightly-2025.6.18.dist-info → torchmonarch_nightly-2025.6.20.dist-info}/entry_points.txt +0 -0
- {torchmonarch_nightly-2025.6.18.dist-info → torchmonarch_nightly-2025.6.20.dist-info}/licenses/LICENSE +0 -0
- {torchmonarch_nightly-2025.6.18.dist-info → torchmonarch_nightly-2025.6.20.dist-info}/top_level.txt +0 -0
monarch/_rust_bindings.so
CHANGED
Binary file
|
monarch/actor_mesh.py
CHANGED
@@ -48,8 +48,9 @@ from monarch._rust_bindings.monarch_hyperactor.actor_mesh import PythonActorMesh
|
|
48
48
|
from monarch._rust_bindings.monarch_hyperactor.mailbox import (
|
49
49
|
Mailbox,
|
50
50
|
OncePortReceiver,
|
51
|
-
|
51
|
+
OncePortRef,
|
52
52
|
PortReceiver as HyPortReceiver,
|
53
|
+
PortRef,
|
53
54
|
)
|
54
55
|
from monarch._rust_bindings.monarch_hyperactor.proc import ActorId
|
55
56
|
from monarch._rust_bindings.monarch_hyperactor.shape import Point as HyPoint, Shape
|
@@ -227,6 +228,8 @@ class Endpoint(Generic[P, R]):
|
|
227
228
|
|
228
229
|
Load balanced RPC-style entrypoint for request/response messaging.
|
229
230
|
"""
|
231
|
+
p: Port[R]
|
232
|
+
r: PortReceiver[R]
|
230
233
|
p, r = port(self, once=True)
|
231
234
|
# pyre-ignore
|
232
235
|
send(self, args, kwargs, port=p, selection="choose")
|
@@ -365,7 +368,7 @@ def send(
|
|
365
368
|
message = PythonMessage(
|
366
369
|
endpoint._name,
|
367
370
|
_pickle((args, kwargs)),
|
368
|
-
None if port is None else port.
|
371
|
+
None if port is None else port._port_ref,
|
369
372
|
None,
|
370
373
|
)
|
371
374
|
endpoint._actor_mesh.cast(message, selection)
|
@@ -389,14 +392,16 @@ def endpoint(
|
|
389
392
|
|
390
393
|
|
391
394
|
class Port(Generic[R]):
|
392
|
-
def __init__(
|
393
|
-
self
|
395
|
+
def __init__(
|
396
|
+
self, port_ref: PortRef | OncePortRef, mailbox: Mailbox, rank: Optional[int]
|
397
|
+
) -> None:
|
398
|
+
self._port_ref = port_ref
|
394
399
|
self._mailbox = mailbox
|
395
400
|
self._rank = rank
|
396
401
|
|
397
402
|
def send(self, method: str, obj: R) -> None:
|
398
|
-
self.
|
399
|
-
self.
|
403
|
+
self._port_ref.send(
|
404
|
+
self._mailbox,
|
400
405
|
PythonMessage(method, _pickle(obj), None, self._rank),
|
401
406
|
)
|
402
407
|
|
@@ -410,8 +415,8 @@ def port(
|
|
410
415
|
handle, receiver = (
|
411
416
|
endpoint._mailbox.open_once_port() if once else endpoint._mailbox.open_port()
|
412
417
|
)
|
413
|
-
|
414
|
-
return Port(
|
418
|
+
port_ref: PortRef | OncePortRef = handle.bind()
|
419
|
+
return Port(port_ref, endpoint._mailbox, rank=None), PortReceiver(
|
415
420
|
endpoint._mailbox, receiver
|
416
421
|
)
|
417
422
|
|
monarch/allocator.py
CHANGED
@@ -74,7 +74,7 @@ class RemoteAllocInitializer(abc.ABC):
|
|
74
74
|
"""
|
75
75
|
|
76
76
|
@abc.abstractmethod
|
77
|
-
async def initialize_alloc(self) -> list[str]:
|
77
|
+
async def initialize_alloc(self, match_labels: dict[str, str]) -> list[str]:
|
78
78
|
"""
|
79
79
|
Return the addresses of the servers that should be used to allocate processes
|
80
80
|
for the proc mesh. The addresses should be running hyperactor's RemoteProcessAllocator.
|
@@ -88,6 +88,10 @@ class RemoteAllocInitializer(abc.ABC):
|
|
88
88
|
in the future this method can be called multiple times and should return the current set of
|
89
89
|
addresses that are eligible to handle allocation requests.
|
90
90
|
|
91
|
+
Arguments:
|
92
|
+
- `match_labels`: The match labels specified in `AllocSpec.AllocConstraints`. Initializer implementations
|
93
|
+
can read specific labels for matching a set of hosts that will service `allocate()` requests.
|
94
|
+
|
91
95
|
"""
|
92
96
|
...
|
93
97
|
|
@@ -102,7 +106,8 @@ class StaticRemoteAllocInitializer(RemoteAllocInitializer):
|
|
102
106
|
super().__init__()
|
103
107
|
self.addrs: list[str] = list(addrs)
|
104
108
|
|
105
|
-
async def initialize_alloc(self) -> list[str]:
|
109
|
+
async def initialize_alloc(self, match_labels: dict[str, str]) -> list[str]:
|
110
|
+
_ = match_labels # Suppress unused variable warning
|
106
111
|
return list(self.addrs)
|
107
112
|
|
108
113
|
|
monarch/common/client.py
CHANGED
@@ -41,6 +41,8 @@ from monarch.common import messages
|
|
41
41
|
from monarch.common.borrows import Borrow, StorageAliases
|
42
42
|
from monarch.common.controller_api import LogMessage, MessageResult, TController
|
43
43
|
from monarch.common.device_mesh import DeviceMesh
|
44
|
+
|
45
|
+
from monarch.common.future import Future
|
44
46
|
from monarch.common.invocation import DeviceException, RemoteException, Seq
|
45
47
|
from monarch.common.recording import flatten_messages, Recording
|
46
48
|
|
@@ -52,9 +54,6 @@ from monarch.common.tree import tree_map
|
|
52
54
|
|
53
55
|
from . import _coalescing
|
54
56
|
|
55
|
-
if TYPE_CHECKING:
|
56
|
-
from monarch.common.future import Future
|
57
|
-
|
58
57
|
|
59
58
|
logger = logging.getLogger(__name__)
|
60
59
|
|
@@ -447,6 +446,39 @@ class Client:
|
|
447
446
|
def mesh_state(self) -> WorldState:
|
448
447
|
return self.inner.worker_world_state()
|
449
448
|
|
449
|
+
def fetch(
|
450
|
+
self,
|
451
|
+
mesh: "DeviceMesh",
|
452
|
+
stream: "StreamRef",
|
453
|
+
shard,
|
454
|
+
preprocess_message,
|
455
|
+
args,
|
456
|
+
kwargs,
|
457
|
+
defs: Tuple["Tensor", ...],
|
458
|
+
uses: Tuple["Tensor", ...],
|
459
|
+
) -> "Future":
|
460
|
+
fut = Future(self)
|
461
|
+
ident = self.new_node(defs, uses, fut)
|
462
|
+
process = mesh._process(shard)
|
463
|
+
self.send(
|
464
|
+
process,
|
465
|
+
messages.SendValue(
|
466
|
+
ident,
|
467
|
+
None,
|
468
|
+
defs,
|
469
|
+
preprocess_message,
|
470
|
+
args,
|
471
|
+
kwargs,
|
472
|
+
stream,
|
473
|
+
),
|
474
|
+
)
|
475
|
+
# we have to ask for status updates
|
476
|
+
# from workers to be sure they have finished
|
477
|
+
# enough work to count this future as finished,
|
478
|
+
# and all potential errors have been reported
|
479
|
+
self._request_status()
|
480
|
+
return fut
|
481
|
+
|
450
482
|
|
451
483
|
def tree_map_refs(first_ref: int, tree):
|
452
484
|
def translate_id(ref: int) -> int:
|
monarch/common/messages.py
CHANGED
@@ -25,7 +25,6 @@ from monarch._rust_bindings.monarch_extension import tensor_worker
|
|
25
25
|
from monarch.common.function import ResolvableFromCloudpickle, ResolvableFunction
|
26
26
|
from monarch.common.invocation import DeviceException, RemoteException
|
27
27
|
from monarch.common.reference import Referenceable
|
28
|
-
from monarch.common.stream import StreamRef
|
29
28
|
from monarch.common.tree import flattener
|
30
29
|
from pyre_extensions import none_throws
|
31
30
|
|
@@ -33,6 +32,8 @@ from .shape import NDSlice
|
|
33
32
|
from .tensor_factory import TensorFactory
|
34
33
|
|
35
34
|
if TYPE_CHECKING:
|
35
|
+
from monarch.common.stream import StreamRef
|
36
|
+
|
36
37
|
from .device_mesh import DeviceMesh, RemoteProcessGroup
|
37
38
|
from .pipe import Pipe
|
38
39
|
from .recording import Recording
|
@@ -98,7 +99,7 @@ class CreateDeviceMesh(NamedTuple):
|
|
98
99
|
|
99
100
|
|
100
101
|
class CreateStream(NamedTuple):
|
101
|
-
result: StreamRef
|
102
|
+
result: "StreamRef"
|
102
103
|
default: bool
|
103
104
|
|
104
105
|
def to_rust_message(self) -> tensor_worker.WorkerMessage:
|
@@ -132,7 +133,7 @@ class CallFunction(NamedTuple):
|
|
132
133
|
function: ResolvableFunction
|
133
134
|
args: Tuple[object, ...]
|
134
135
|
kwargs: Dict[str, object]
|
135
|
-
stream: StreamRef
|
136
|
+
stream: "StreamRef"
|
136
137
|
device_mesh: DeviceMesh
|
137
138
|
remote_process_groups: List[RemoteProcessGroup]
|
138
139
|
|
@@ -199,7 +200,7 @@ class RecordingFormal(NamedTuple):
|
|
199
200
|
class RecordingResult(NamedTuple):
|
200
201
|
input: Tensor | tensor_worker.Ref
|
201
202
|
output_index: int
|
202
|
-
stream: StreamRef
|
203
|
+
stream: "StreamRef"
|
203
204
|
|
204
205
|
def to_rust_message(self) -> tensor_worker.WorkerMessage:
|
205
206
|
return tensor_worker.RecordingResult(
|
monarch/common/remote.py
CHANGED
@@ -21,6 +21,7 @@ from typing import (
|
|
21
21
|
overload,
|
22
22
|
Protocol,
|
23
23
|
Tuple,
|
24
|
+
TYPE_CHECKING,
|
24
25
|
TypeVar,
|
25
26
|
)
|
26
27
|
|
@@ -30,6 +31,9 @@ import torch
|
|
30
31
|
|
31
32
|
from monarch.common import _coalescing, device_mesh, messages, stream
|
32
33
|
|
34
|
+
if TYPE_CHECKING:
|
35
|
+
from monarch.common.client import Client
|
36
|
+
|
33
37
|
from monarch.common.device_mesh import RemoteProcessGroup
|
34
38
|
from monarch.common.fake import fake_call
|
35
39
|
|
@@ -173,30 +177,19 @@ def _call_on_shard_and_fetch(
|
|
173
177
|
propagator, rfunction, args, kwargs, ambient_mesh, stream._active
|
174
178
|
)
|
175
179
|
|
176
|
-
client = mesh.client
|
180
|
+
client: "Client" = mesh.client
|
177
181
|
if _coalescing.is_active(client):
|
178
182
|
raise NotImplementedError("NYI: fetching results during a coalescing block")
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
preprocess_message,
|
189
|
-
args,
|
190
|
-
kwargs,
|
191
|
-
stream._active._to_ref(client),
|
192
|
-
),
|
183
|
+
return client.fetch(
|
184
|
+
mesh,
|
185
|
+
stream._active._to_ref(client),
|
186
|
+
shard,
|
187
|
+
preprocess_message,
|
188
|
+
args,
|
189
|
+
kwargs,
|
190
|
+
mutates,
|
191
|
+
dtensors,
|
193
192
|
)
|
194
|
-
# we have to ask for status updates
|
195
|
-
# from workers to be sure they have finished
|
196
|
-
# enough work to count this future as finished,
|
197
|
-
# and all potential errors have been reported
|
198
|
-
client._request_status()
|
199
|
-
return fut
|
200
193
|
|
201
194
|
|
202
195
|
@remote
|
monarch/future.py
CHANGED
@@ -5,21 +5,72 @@
|
|
5
5
|
# LICENSE file in the root directory of this source tree.
|
6
6
|
|
7
7
|
import asyncio
|
8
|
-
from
|
8
|
+
from functools import partial
|
9
|
+
from typing import Generator, Generic, Optional, TypeVar
|
9
10
|
|
10
11
|
R = TypeVar("R")
|
11
12
|
|
12
13
|
|
14
|
+
def _incomplete(impl, self):
|
15
|
+
try:
|
16
|
+
return self._set_result(impl())
|
17
|
+
except Exception as e:
|
18
|
+
self._set_exception(e)
|
19
|
+
raise
|
20
|
+
|
21
|
+
|
22
|
+
async def _aincomplete(impl, self):
|
23
|
+
try:
|
24
|
+
return self._set_result(await impl())
|
25
|
+
except Exception as e:
|
26
|
+
self._set_exception(e)
|
27
|
+
raise
|
28
|
+
|
29
|
+
|
13
30
|
# TODO: consolidate with monarch.common.future
|
14
31
|
class ActorFuture(Generic[R]):
|
15
32
|
def __init__(self, impl, blocking_impl=None):
|
16
|
-
|
17
|
-
|
33
|
+
if blocking_impl is None:
|
34
|
+
blocking_impl = partial(asyncio.run, impl())
|
35
|
+
self._get = partial(_incomplete, blocking_impl)
|
36
|
+
self._aget = partial(_aincomplete, impl)
|
18
37
|
|
19
|
-
def get(self) -> R:
|
20
|
-
if
|
21
|
-
return self.
|
22
|
-
return
|
38
|
+
def get(self, timeout: Optional[float] = None) -> R:
|
39
|
+
if timeout is not None:
|
40
|
+
return asyncio.run(asyncio.wait_for(self._aget(self), timeout))
|
41
|
+
return self._get(self)
|
23
42
|
|
24
43
|
def __await__(self) -> Generator[R, None, R]:
|
25
|
-
return self.
|
44
|
+
return self._aget(self).__await__()
|
45
|
+
|
46
|
+
def _set_result(self, result):
|
47
|
+
def f(self):
|
48
|
+
return result
|
49
|
+
|
50
|
+
async def af(self):
|
51
|
+
return result
|
52
|
+
|
53
|
+
self._get, self._aget = f, af
|
54
|
+
return result
|
55
|
+
|
56
|
+
def _set_exception(self, e):
|
57
|
+
def f(self):
|
58
|
+
raise e
|
59
|
+
|
60
|
+
async def af(self):
|
61
|
+
raise e
|
62
|
+
|
63
|
+
self._get, self._aget = f, af
|
64
|
+
|
65
|
+
# compatibility with old tensor engine Future objects
|
66
|
+
# hopefully we do not need done(), add_callback because
|
67
|
+
# they are harder to implement right.
|
68
|
+
def result(self, timeout: Optional[float] = None) -> R:
|
69
|
+
return self.get(timeout)
|
70
|
+
|
71
|
+
def exception(self, timeout: Optional[float] = None):
|
72
|
+
try:
|
73
|
+
self.get(timeout)
|
74
|
+
return None
|
75
|
+
except Exception as e:
|
76
|
+
return e
|
monarch/monarch_controller
CHANGED
Binary file
|
monarch/tools/commands.py
CHANGED
@@ -9,7 +9,10 @@
|
|
9
9
|
import argparse
|
10
10
|
import functools
|
11
11
|
import inspect
|
12
|
+
import logging
|
12
13
|
import os
|
14
|
+
import time
|
15
|
+
from datetime import timedelta
|
13
16
|
from typing import Any, Callable, Mapping, Optional, Union
|
14
17
|
|
15
18
|
from monarch.tools.config import ( # @manual=//monarch/python/monarch/tools/config/meta:defaults
|
@@ -18,12 +21,13 @@ from monarch.tools.config import ( # @manual=//monarch/python/monarch/tools/con
|
|
18
21
|
)
|
19
22
|
|
20
23
|
from monarch.tools.mesh_spec import mesh_spec_from_metadata, ServerSpec
|
21
|
-
|
22
24
|
from torchx.runner import Runner
|
23
|
-
from torchx.specs import AppDef, AppDryRunInfo, CfgVal
|
25
|
+
from torchx.specs import AppDef, AppDryRunInfo, AppState, CfgVal
|
24
26
|
from torchx.specs.builders import parse_args
|
25
27
|
from torchx.util.types import decode, decode_optional
|
26
28
|
|
29
|
+
logger: logging.Logger = logging.getLogger(__name__)
|
30
|
+
|
27
31
|
|
28
32
|
def torchx_runner() -> Runner:
|
29
33
|
# namespace is currently unused so make it empty str
|
@@ -165,15 +169,73 @@ def info(server_handle: str) -> Optional[ServerSpec]:
|
|
165
169
|
if appdef is None:
|
166
170
|
return None
|
167
171
|
|
172
|
+
# host status grouped by mesh (role) names
|
173
|
+
replica_status = {r.role: r.replicas for r in status.roles}
|
174
|
+
|
168
175
|
mesh_specs = []
|
169
176
|
for role in appdef.roles:
|
170
177
|
spec = mesh_spec_from_metadata(appdef, role.name)
|
171
178
|
assert spec is not None, "cannot be 'None' since we iterate over appdef's roles"
|
179
|
+
|
180
|
+
# null-guard since some schedulers do not fill replica_status
|
181
|
+
if host_status := replica_status.get(role.name):
|
182
|
+
spec.hostnames = [h.hostname for h in host_status]
|
183
|
+
|
172
184
|
mesh_specs.append(spec)
|
173
185
|
|
174
186
|
return ServerSpec(name=appdef.name, state=status.state, meshes=mesh_specs)
|
175
187
|
|
176
188
|
|
189
|
+
_5_SECONDS = timedelta(seconds=5)
|
190
|
+
|
191
|
+
|
192
|
+
async def server_ready(
|
193
|
+
server_handle: str, check_interval: timedelta = _5_SECONDS
|
194
|
+
) -> Optional[ServerSpec]:
|
195
|
+
"""Waits until the server's job is in RUNNING state, then returns the server spec.
|
196
|
+
Returns `None` if the server does not exist.
|
197
|
+
|
198
|
+
NOTE: Certain fields such as `hostnames` are only filled (and valid) when the server is RUNNING.
|
199
|
+
|
200
|
+
Usage:
|
201
|
+
|
202
|
+
.. code-block:: python
|
203
|
+
|
204
|
+
server_info = await server_ready("slurm:///123")
|
205
|
+
if not server_info:
|
206
|
+
print(f"Job does not exist")
|
207
|
+
else:
|
208
|
+
if server_info.is_running:
|
209
|
+
for mesh in server_info.meshes:
|
210
|
+
connect_to(mesh.hostnames)
|
211
|
+
else:
|
212
|
+
print(f"Job in {server_info.state} state. Hostnames are not available")
|
213
|
+
|
214
|
+
"""
|
215
|
+
|
216
|
+
while True:
|
217
|
+
server_spec = info(server_handle)
|
218
|
+
|
219
|
+
if not server_spec: # server not found
|
220
|
+
return None
|
221
|
+
|
222
|
+
if server_spec.state <= AppState.PENDING: # UNSUBMITTED or SUBMITTED or PENDING
|
223
|
+
# NOTE: TorchX currently does not have async APIs so need to loop-on-interval
|
224
|
+
# TODO maybe inverse exponential backoff instead of constant interval?
|
225
|
+
check_interval_seconds = check_interval.total_seconds()
|
226
|
+
logger.info(
|
227
|
+
"waiting for %s to be %s (current: %s), will check again in %g seconds...",
|
228
|
+
server_handle,
|
229
|
+
AppState.RUNNING,
|
230
|
+
server_spec.state,
|
231
|
+
check_interval_seconds,
|
232
|
+
)
|
233
|
+
time.sleep(check_interval_seconds)
|
234
|
+
continue
|
235
|
+
else:
|
236
|
+
return server_spec
|
237
|
+
|
238
|
+
|
177
239
|
def kill(server_handle: str) -> None:
|
178
240
|
with torchx_runner() as runner:
|
179
241
|
runner.cancel(server_handle)
|
monarch/tools/mesh_spec.py
CHANGED
@@ -6,7 +6,7 @@
|
|
6
6
|
|
7
7
|
# pyre-strict
|
8
8
|
import string
|
9
|
-
from dataclasses import dataclass
|
9
|
+
from dataclasses import dataclass, field
|
10
10
|
from typing import Any, Optional
|
11
11
|
|
12
12
|
from torchx import specs
|
@@ -29,6 +29,7 @@ class MeshSpec:
|
|
29
29
|
host_type: str
|
30
30
|
gpus: int
|
31
31
|
port: int = DEFAULT_REMOTE_ALLOCATOR_PORT
|
32
|
+
hostnames: list[str] = field(default_factory=list)
|
32
33
|
|
33
34
|
|
34
35
|
def _tag(mesh_name: str, tag_template: str) -> str:
|
@@ -84,6 +85,10 @@ class ServerSpec:
|
|
84
85
|
state: specs.AppState
|
85
86
|
meshes: list[MeshSpec]
|
86
87
|
|
88
|
+
@property
|
89
|
+
def is_running(self) -> bool:
|
90
|
+
return self.state == specs.AppState.RUNNING
|
91
|
+
|
87
92
|
def get_mesh_spec(self, mesh_name: str) -> MeshSpec:
|
88
93
|
for mesh_spec in self.meshes:
|
89
94
|
if mesh_spec.name == mesh_name:
|
@@ -115,6 +120,7 @@ class ServerSpec:
|
|
115
120
|
"host_type": mesh.host_type,
|
116
121
|
"hosts": mesh.num_hosts,
|
117
122
|
"gpus": mesh.gpus,
|
123
|
+
"hostnames": mesh.hostnames,
|
118
124
|
}
|
119
125
|
for mesh in self.meshes
|
120
126
|
},
|
tests/test_allocator.py
CHANGED
@@ -116,8 +116,8 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
|
|
116
116
|
used to test that the state of the initializer is preserved across calls to allocate()
|
117
117
|
"""
|
118
118
|
|
119
|
-
async def initialize_alloc(self) -> list[str]:
|
120
|
-
alloc = await super().initialize_alloc()
|
119
|
+
async def initialize_alloc(self, match_labels: dict[str, str]) -> list[str]:
|
120
|
+
alloc = await super().initialize_alloc(match_labels)
|
121
121
|
self.addrs.pop(-1)
|
122
122
|
return alloc
|
123
123
|
|
@@ -142,7 +142,8 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
|
|
142
142
|
class EmptyAllocInitializer(StaticRemoteAllocInitializer):
|
143
143
|
"""test initializer that returns an empty list of addresses"""
|
144
144
|
|
145
|
-
async def initialize_alloc(self) -> list[str]:
|
145
|
+
async def initialize_alloc(self, match_labels: dict[str, str]) -> list[str]:
|
146
|
+
_ = match_labels # Suppress unused variable warning
|
146
147
|
return []
|
147
148
|
|
148
149
|
empty_initializer = EmptyAllocInitializer()
|
tests/test_python_actors.py
CHANGED
@@ -9,6 +9,7 @@ import operator
|
|
9
9
|
import os
|
10
10
|
import re
|
11
11
|
import threading
|
12
|
+
import time
|
12
13
|
from types import ModuleType
|
13
14
|
from unittest.mock import AsyncMock, patch
|
14
15
|
|
@@ -28,6 +29,7 @@ from monarch.actor_mesh import (
|
|
28
29
|
MonarchContext,
|
29
30
|
)
|
30
31
|
from monarch.debugger import init_debugging
|
32
|
+
from monarch.future import ActorFuture
|
31
33
|
|
32
34
|
from monarch.mesh_controller import spawn_tensor_engine
|
33
35
|
|
@@ -391,6 +393,16 @@ def test_rust_binding_modules_correct() -> None:
|
|
391
393
|
check(bindings, "monarch._rust_bindings")
|
392
394
|
|
393
395
|
|
396
|
+
def test_proc_mesh_liveness() -> None:
|
397
|
+
mesh = proc_mesh(gpus=2).get()
|
398
|
+
counter = mesh.spawn("counter", Counter, 1).get()
|
399
|
+
del mesh
|
400
|
+
# Give some time for the mesh to have been shut down.
|
401
|
+
# (It only would if there were a bug.)
|
402
|
+
time.sleep(0.5)
|
403
|
+
counter.value.call().get()
|
404
|
+
|
405
|
+
|
394
406
|
two_gpu = pytest.mark.skipif(
|
395
407
|
torch.cuda.device_count() < 2,
|
396
408
|
reason="Not enough GPUs, this test requires at least 2 GPUs",
|
@@ -661,3 +673,100 @@ async def test_async_concurrency():
|
|
661
673
|
# actually concurrently processing messages.
|
662
674
|
await am.no_more.call()
|
663
675
|
await fut
|
676
|
+
|
677
|
+
|
678
|
+
async def awaitit(f):
|
679
|
+
return await f
|
680
|
+
|
681
|
+
|
682
|
+
def test_actor_future():
|
683
|
+
v = 0
|
684
|
+
|
685
|
+
async def incr():
|
686
|
+
nonlocal v
|
687
|
+
v += 1
|
688
|
+
return v
|
689
|
+
|
690
|
+
# can use async implementation from sync
|
691
|
+
# if no non-blocking is provided
|
692
|
+
f = ActorFuture(incr)
|
693
|
+
assert f.get() == 1
|
694
|
+
assert v == 1
|
695
|
+
assert f.get() == 1
|
696
|
+
assert asyncio.run(awaitit(f)) == 1
|
697
|
+
|
698
|
+
f = ActorFuture(incr)
|
699
|
+
assert asyncio.run(awaitit(f)) == 2
|
700
|
+
assert f.get() == 2
|
701
|
+
|
702
|
+
def incr2():
|
703
|
+
nonlocal v
|
704
|
+
v += 2
|
705
|
+
return v
|
706
|
+
|
707
|
+
# Use non-blocking optimization if provided
|
708
|
+
f = ActorFuture(incr, incr2)
|
709
|
+
assert f.get() == 4
|
710
|
+
assert asyncio.run(awaitit(f)) == 4
|
711
|
+
|
712
|
+
async def nope():
|
713
|
+
nonlocal v
|
714
|
+
v += 1
|
715
|
+
raise ValueError("nope")
|
716
|
+
|
717
|
+
f = ActorFuture(nope)
|
718
|
+
|
719
|
+
with pytest.raises(ValueError):
|
720
|
+
f.get()
|
721
|
+
|
722
|
+
assert v == 5
|
723
|
+
|
724
|
+
with pytest.raises(ValueError):
|
725
|
+
f.get()
|
726
|
+
|
727
|
+
assert v == 5
|
728
|
+
|
729
|
+
with pytest.raises(ValueError):
|
730
|
+
asyncio.run(awaitit(f))
|
731
|
+
|
732
|
+
assert v == 5
|
733
|
+
|
734
|
+
def nope():
|
735
|
+
nonlocal v
|
736
|
+
v += 1
|
737
|
+
raise ValueError("nope")
|
738
|
+
|
739
|
+
f = ActorFuture(incr, nope)
|
740
|
+
|
741
|
+
with pytest.raises(ValueError):
|
742
|
+
f.get()
|
743
|
+
|
744
|
+
assert v == 6
|
745
|
+
|
746
|
+
with pytest.raises(ValueError):
|
747
|
+
f.result()
|
748
|
+
|
749
|
+
assert f.exception() is not None
|
750
|
+
|
751
|
+
assert v == 6
|
752
|
+
|
753
|
+
with pytest.raises(ValueError):
|
754
|
+
asyncio.run(awaitit(f))
|
755
|
+
|
756
|
+
assert v == 6
|
757
|
+
|
758
|
+
async def seven():
|
759
|
+
return 7
|
760
|
+
|
761
|
+
f = ActorFuture(seven)
|
762
|
+
|
763
|
+
assert 7 == f.get(timeout=0.001)
|
764
|
+
|
765
|
+
async def neverfinish():
|
766
|
+
f = asyncio.Future()
|
767
|
+
await f
|
768
|
+
|
769
|
+
f = ActorFuture(neverfinish)
|
770
|
+
|
771
|
+
with pytest.raises(asyncio.exceptions.TimeoutError):
|
772
|
+
f.get(timeout=0.1)
|
{torchmonarch_nightly-2025.6.18.dist-info → torchmonarch_nightly-2025.6.20.dist-info}/RECORD
RENAMED
@@ -1,17 +1,17 @@
|
|
1
1
|
monarch/__init__.py,sha256=iUvWHc0-7Q2tovRoRxOIiA3TsefMXCbWl-jEfQ2djew,6897
|
2
|
-
monarch/_rust_bindings.so,sha256=
|
2
|
+
monarch/_rust_bindings.so,sha256=sDdg6RjptgNPmFnFiDAgv36k_Or_Kz47aYaZ2M5EAao,41088032
|
3
3
|
monarch/_testing.py,sha256=jOIOG6jcZBzvEvG_DwSnwCkaMVXvSun6sJAG6nXemww,7859
|
4
|
-
monarch/actor_mesh.py,sha256=
|
5
|
-
monarch/allocator.py,sha256=
|
4
|
+
monarch/actor_mesh.py,sha256=m6QapbZHqYujXya28jW1II2wkBUV_nKGvxmWPSW9lsQ,24327
|
5
|
+
monarch/allocator.py,sha256=UEaVLntH4xQ8Lr84TbgcXusvuK8FhSMJmav-omztUbw,4473
|
6
6
|
monarch/bootstrap_main.py,sha256=RCUQhJk07yMFiKp6HzQuqZFUpkgsT9kVEyimiwjn6_E,1827
|
7
7
|
monarch/cached_remote_function.py,sha256=kYdB6r4OHx_T_uX4q3tCNcp1t2DJwF8tPTIahUiT2pU,8785
|
8
8
|
monarch/debugger.py,sha256=AdlvOG3X-9Pw9c1DLQYEy4vjEfh0ZtwtsNJEFLFzN8o,13312
|
9
9
|
monarch/fetch.py,sha256=61jxo7sx4QNUTkc0_rF5NaJROen4tKbAaiIjrXWLOvg,1705
|
10
|
-
monarch/future.py,sha256=
|
10
|
+
monarch/future.py,sha256=g1VYJl8ReBBS6VbikwWilnFqEr5qJDiSKid92AnWFV4,2058
|
11
11
|
monarch/gradient_generator.py,sha256=Rl3dmXGceTdCc1mYBg2JciR88ywGPnW7TVkL86KwqEA,6366
|
12
12
|
monarch/memory.py,sha256=ol86dBhFAJqg78iF25-BuK0wuwj1onR8FIioZ_B0gjw,1377
|
13
13
|
monarch/mesh_controller.py,sha256=am1QP7dvn0OH1z9ADSKm41APs1HY_dHcBAhOVP-QDmE,10427
|
14
|
-
monarch/monarch_controller,sha256=
|
14
|
+
monarch/monarch_controller,sha256=sWOUMClz3JPUjZbppDWgdrPOAjbydygdRPDZ1kaAVC4,20328464
|
15
15
|
monarch/notebook.py,sha256=zu9MKDFKf1-rCM2TqFSRJjMBeiWuKcJSyUFLvoZRQzs,25949
|
16
16
|
monarch/opaque_module.py,sha256=oajOu_WD1hD4hxE8HDdO-tvWY7KDHWd7VaAhJEa5L2I,10446
|
17
17
|
monarch/opaque_object.py,sha256=IVpll4pyuKZMo_EnPh4s0qnx8RlAcJrJ1yoLX6E75wQ,2782
|
@@ -46,7 +46,7 @@ monarch/common/_device_utils.py,sha256=gBpl23wMjppVAEzzj8U9HyX-B7Bs2_3ftiMAkzUS4
|
|
46
46
|
monarch/common/_tensor_to_table.py,sha256=yRjCNwvtl188Z1Dwkx3ZU-Bh2mwYnQ0Lnue2RAztwvc,5753
|
47
47
|
monarch/common/base_tensor.py,sha256=ujRzR6lWaeCdPv2JX0vCR-VsCWn-3SHaJIkZH1Sw9FQ,1159
|
48
48
|
monarch/common/borrows.py,sha256=7KR62xoUat1T6FyADsdHsxVAVIJDvfJWUnPO-xx277U,5307
|
49
|
-
monarch/common/client.py,sha256=
|
49
|
+
monarch/common/client.py,sha256=abYQqi-yFzG0ERvh3gMC5UgiWSezmM20kbxzalKpnf4,25806
|
50
50
|
monarch/common/constants.py,sha256=ohvsVYMpfeWopv3KXDAeHWDFLukwc-OY37VRxpKNBE8,300
|
51
51
|
monarch/common/context_manager.py,sha256=GOeyaFbyCqvQmkJ0oI7q6IxRd8_0mVyYKZRccI8iaug,1067
|
52
52
|
monarch/common/controller_api.py,sha256=djGkK5aSd-V6pBkr3uBCXbfJv3OKf2o2VbBXJgFF2WI,3202
|
@@ -57,7 +57,7 @@ monarch/common/function_caching.py,sha256=HVdbWtv6Eea7ENMWi8iv36w1G1TaVuUJhkUX_J
|
|
57
57
|
monarch/common/future.py,sha256=D1UJ_8Rvb8-VG9vNE-z7xz2m2otMd2HgB0rnA02nlvA,4681
|
58
58
|
monarch/common/invocation.py,sha256=L4mSmzqlHMxo1Tb71hBU_M8aBZCRCOcb6vvPhvvewec,4195
|
59
59
|
monarch/common/mast.py,sha256=XTzYljGR0aZ7GjmNMPgU2HyuL4HWSAy4IwE3kEDqdOw,7735
|
60
|
-
monarch/common/messages.py,sha256=
|
60
|
+
monarch/common/messages.py,sha256=OFMd_4yBoMIHjdXcKcJDG88iERfViLG3QxTqzwV4Gnw,18289
|
61
61
|
monarch/common/mock_cuda.py,sha256=x6ho1Ton6BbKjBZ5ZxnFOUaQM032X70wnpoUNB7Ci2w,1039
|
62
62
|
monarch/common/opaque_ref.py,sha256=tWNvOC6CsjNPKD1JDx-8PSaeXqZC3eermgBExUPKML4,2871
|
63
63
|
monarch/common/pickle_flatten.py,sha256=2mc-dPiZy7kRqAstyfMLnPuoGJwsBftYYEHyF_HOZw4,1313
|
@@ -65,7 +65,7 @@ monarch/common/pipe.py,sha256=9pTf8--3yOv4HpnJEhgcmc_JM6Az4uL1y72TSQA55dw,5013
|
|
65
65
|
monarch/common/process_group.py,sha256=FbJ_AJRZYFkvQ68L2naRq64J_aNuAKe5kO0MWdn_x74,1662
|
66
66
|
monarch/common/recording.py,sha256=hoI9VY_FyW_xVx-jmfsKydqX5vW2GulwcDWsBdUVOm8,4637
|
67
67
|
monarch/common/reference.py,sha256=O26lkzEeVwj0S1xEy-OLqdHVnACmmlbQCUmXRrW4n1Q,938
|
68
|
-
monarch/common/remote.py,sha256=
|
68
|
+
monarch/common/remote.py,sha256=vklFYJvuaPpS8kAyFmRz-T-brfHvcZ1lPTC_-7DIwqM,8908
|
69
69
|
monarch/common/selection.py,sha256=lpWFbZs3ArYy29e-53eoAVAjQFksf1RvZz9NvM0CUW4,308
|
70
70
|
monarch/common/shape.py,sha256=B-7DI768ZhT8ECUNCJcI7DfCB7iDFGFH0r-HmXaAfcM,8296
|
71
71
|
monarch/common/stream.py,sha256=_ejoxafHtdD10lLzznRCXKwrkZ_ZH9k_VTgiA5yfBrI,3583
|
@@ -107,8 +107,8 @@ monarch/timer/execution_timer.py,sha256=1YsrLIZirdohKOeFAU2H4UcONhQXHuctJbYcoX8I
|
|
107
107
|
monarch/timer/execution_timer_test.py,sha256=CSxTv44fFZQURJlCBmYvysQI1aS_zEGZs_uxl9SOHak,4486
|
108
108
|
monarch/tools/__init__.py,sha256=J8qjUOysmcMAek2KFN13mViOXZxTYc5vCrF02t3VuFU,223
|
109
109
|
monarch/tools/cli.py,sha256=EIdarsfuFX0WqRCe29_5GNKWJBhxx0lABalw3zPSagw,4977
|
110
|
-
monarch/tools/commands.py,sha256=
|
111
|
-
monarch/tools/mesh_spec.py,sha256=
|
110
|
+
monarch/tools/commands.py,sha256=OuFDVAcl5LvBdBZ-HyemErR0IiDtiMMNgmGPD4MWTHY,8996
|
111
|
+
monarch/tools/mesh_spec.py,sha256=3Qp7Lu3pAa9tfaG-METsCmj-QXECQ6OsrPWiLydWvKc,3914
|
112
112
|
monarch/tools/network.py,sha256=bRj-jOs5qDqnM3BcE9MSXCLS01hiMN4YSWfKZ_d7bc4,2182
|
113
113
|
monarch/tools/components/__init__.py,sha256=J8qjUOysmcMAek2KFN13mViOXZxTYc5vCrF02t3VuFU,223
|
114
114
|
monarch/tools/components/hyperactor.py,sha256=Ryi1X07VLcaQVlpc4af65JNBbZtOb9IAlKxSKMZ1AW4,2120
|
@@ -136,7 +136,7 @@ tests/error_test_binary.py,sha256=BRj13wAROsUWx4jcxc07HYN2n-xyBNhnnRAhjqah-A0,55
|
|
136
136
|
tests/sleep_binary.py,sha256=XfLYaAfwm9xgzM-svs8fhAeFhwYIg6SyVEnx4e6wbUw,1009
|
137
137
|
tests/test_actor_error.py,sha256=-0UJCEpyzsBh-RdbGhDiG1-sRtu7bJPQWmtjUD0ad48,8526
|
138
138
|
tests/test_alloc.py,sha256=D6DdQbtOZEvvnnc7LV-WyWFMk0Xb77eblH6Oz90zJTA,745
|
139
|
-
tests/test_allocator.py,sha256=
|
139
|
+
tests/test_allocator.py,sha256=jaYWPVEFdcK0XmmEA1Y9uwkeBjhxb2iI1GUL6IZKh4s,8305
|
140
140
|
tests/test_coalescing.py,sha256=JZ4YgQNlWWs7N-Z8KCCXQPANcuyyXEKjeHIXYbPnQhk,15606
|
141
141
|
tests/test_controller.py,sha256=Rp_kW20zYT8ocsK5LX0Ha3LB9azS2LSKpp8n_dBlzVU,31384
|
142
142
|
tests/test_device_mesh.py,sha256=DrbezYOM0thfP9MgLXb5-F0VoLOmSz5GR0GwjR_3bE4,5290
|
@@ -145,7 +145,7 @@ tests/test_future.py,sha256=cXzaNi2YDwVyjR541ScXmgktX1YFsKzbl8wep0DMVbk,3032
|
|
145
145
|
tests/test_grad_generator.py,sha256=p4Pm4kMEeGldt2jUVAkGKCB0mLccKI28pltH6OTGbQA,3412
|
146
146
|
tests/test_mock_cuda.py,sha256=5hisElxeLJ5MHw3KM9gwxBiXiMaG-Rm382u3AsQcDOI,3068
|
147
147
|
tests/test_pdb_actor.py,sha256=5KJhuhcZDPWMdjC6eAtDdwnz1W7jNFXvIrMSFaCWaPw,3858
|
148
|
-
tests/test_python_actors.py,sha256=
|
148
|
+
tests/test_python_actors.py,sha256=du0AiGiKtVHOLkDUKu6gV75eYf_NoHDKV6utKzrplz4,21010
|
149
149
|
tests/test_remote_functions.py,sha256=5nxYB8dfA9NT9f9Od9O3htgQtPbiRNiXZ1Kgtn75sOQ,50056
|
150
150
|
tests/test_rust_backend.py,sha256=94S3R995ZkyIhEiBsM5flcjf5X7bscEAHBtInbTRFe8,7776
|
151
151
|
tests/test_signal_safe_block_on.py,sha256=bmal0XgzJowZXJV6T1Blow5a-vZluYWusCThLMGxyTE,3336
|
@@ -155,9 +155,9 @@ tests/simulator/test_profiling.py,sha256=TGYCfzTLdkpIwnOuO6KApprmrgPIRQe60KRX3wk
|
|
155
155
|
tests/simulator/test_simulator.py,sha256=LO8lA0ssY-OGEBL5ipEu74f97Y765TEwfUOv-DtIptM,14568
|
156
156
|
tests/simulator/test_task.py,sha256=ipqBDuDAysuo1xOB9S5psaFvwe6VATD43IovCTSs0t4,2327
|
157
157
|
tests/simulator/test_worker.py,sha256=QrWWIJ3HDgDLkBPRc2mwYPlOQoXQcj1qRfc0WUfKkFY,3507
|
158
|
-
torchmonarch_nightly-2025.6.
|
159
|
-
torchmonarch_nightly-2025.6.
|
160
|
-
torchmonarch_nightly-2025.6.
|
161
|
-
torchmonarch_nightly-2025.6.
|
162
|
-
torchmonarch_nightly-2025.6.
|
163
|
-
torchmonarch_nightly-2025.6.
|
158
|
+
torchmonarch_nightly-2025.6.20.dist-info/licenses/LICENSE,sha256=e0Eotbf_rHOYPuEUlppIbvwy4SN98CZnl_hqwvbDA4Q,1530
|
159
|
+
torchmonarch_nightly-2025.6.20.dist-info/METADATA,sha256=QKiDH01IYFpa492TDs5WzWeDRbjMKmpSAc3V9NpQ5YM,2772
|
160
|
+
torchmonarch_nightly-2025.6.20.dist-info/WHEEL,sha256=_wZSFk0d90K9wOBp8Q-UGxshyiJ987JoPiyUBNC6VLk,104
|
161
|
+
torchmonarch_nightly-2025.6.20.dist-info/entry_points.txt,sha256=sqfQ16oZqjEvttUI-uj9BBXIIE6jt05bYFSmy-2hyXI,106
|
162
|
+
torchmonarch_nightly-2025.6.20.dist-info/top_level.txt,sha256=E-ZssZzyM17glpVrh-S9--qJ-w9p2EjuYOuNw9tQ4Eg,33
|
163
|
+
torchmonarch_nightly-2025.6.20.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
{torchmonarch_nightly-2025.6.18.dist-info → torchmonarch_nightly-2025.6.20.dist-info}/top_level.txt
RENAMED
File without changes
|