torchmonarch-nightly 2025.7.25__cp310-cp310-manylinux2014_x86_64.whl → 2025.7.27__cp310-cp310-manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- monarch/_rust_bindings.so +0 -0
- monarch/_src/actor/actor_mesh.py +109 -52
- monarch/_src/actor/endpoint.py +99 -8
- monarch/_src/actor/event_loop.py +1 -1
- monarch/_src/actor/proc_mesh.py +17 -9
- monarch/_src/actor/tensor_engine_shim.py +5 -2
- monarch/actor/__init__.py +2 -0
- monarch/common/messages.py +9 -0
- monarch/common/remote.py +2 -2
- monarch/gradient/_gradient_generator.so +0 -0
- monarch/mesh_controller.py +76 -14
- monarch/monarch_controller +0 -0
- monarch/tools/cli.py +2 -2
- monarch/tools/commands.py +49 -27
- monarch/tools/components/hyperactor.py +5 -3
- monarch/tools/config/__init__.py +18 -1
- monarch/tools/config/defaults.py +2 -2
- monarch/tools/mesh_spec.py +4 -1
- tests/test_allocator.py +11 -15
- tests/test_env_before_cuda.py +2 -3
- tests/test_python_actors.py +12 -0
- tests/test_tensor_engine.py +27 -1
- {torchmonarch_nightly-2025.7.25.dist-info → torchmonarch_nightly-2025.7.27.dist-info}/METADATA +34 -1
- {torchmonarch_nightly-2025.7.25.dist-info → torchmonarch_nightly-2025.7.27.dist-info}/RECORD +28 -28
- {torchmonarch_nightly-2025.7.25.dist-info → torchmonarch_nightly-2025.7.27.dist-info}/WHEEL +0 -0
- {torchmonarch_nightly-2025.7.25.dist-info → torchmonarch_nightly-2025.7.27.dist-info}/entry_points.txt +0 -0
- {torchmonarch_nightly-2025.7.25.dist-info → torchmonarch_nightly-2025.7.27.dist-info}/licenses/LICENSE +0 -0
- {torchmonarch_nightly-2025.7.25.dist-info → torchmonarch_nightly-2025.7.27.dist-info}/top_level.txt +0 -0
monarch/mesh_controller.py
CHANGED
@@ -30,6 +30,7 @@ from monarch._rust_bindings.monarch_extension.client import ( # @manual=//monar
     WorldState,
 )
 from monarch._rust_bindings.monarch_extension.mesh_controller import _Controller
+from monarch._rust_bindings.monarch_extension.tensor_worker import Ref
 from monarch._rust_bindings.monarch_hyperactor.actor import (
     PythonMessage,
     PythonMessageKind,
@@ -44,10 +45,12 @@ from monarch._src.actor.endpoint import Selection
 from monarch._src.actor.shape import NDSlice
 from monarch.common import device_mesh, messages, stream
 from monarch.common.controller_api import TController
+from monarch.common.function import ResolvableFunction
 from monarch.common.invocation import Seq
 from monarch.common.messages import Referenceable, SendResultOfActorCall
 from monarch.common.stream import StreamRef
-from monarch.common.tensor import InputChecker, Tensor
+from monarch.common.tensor import dtensor_check, InputChecker, Tensor
+from monarch.common.tree import flatten
 from monarch.tensor_worker_main import _set_trace

 if TYPE_CHECKING:
@@ -265,17 +268,36 @@ class RemoteException(Exception):
             return "<exception formatting RemoteException>"


-def actor_send(
+def _cast_call_method_indirect(
     endpoint: ActorEndpoint,
+    selection: Selection,
+    client: MeshClient,
+    seq: Seq,
     args_kwargs_tuple: bytes,
     refs: Sequence[Any],
-    port: Optional[Port[Any]],
-    selection: Selection,
-):
+) -> Tuple[str, int]:
     unflatten_args = [
         UnflattenArg.PyObject if isinstance(ref, Tensor) else UnflattenArg.Mailbox
         for ref in refs
     ]
+    broker_id: Tuple[str, int] = client._mesh_controller.broker_id
+    actor_msg = PythonMessage(
+        PythonMessageKind.CallMethodIndirect(
+            endpoint._name, broker_id, seq, unflatten_args
+        ),
+        args_kwargs_tuple,
+    )
+    endpoint._actor_mesh.cast(actor_msg, selection)
+    return broker_id
+
+
+def actor_send(
+    endpoint: ActorEndpoint,
+    args_kwargs_tuple: bytes,
+    refs: Sequence[Any],
+    port: Optional[Port[Any]],
+    selection: Selection,
+):
     tensors = [ref for ref in refs if isinstance(ref, Tensor)]
     # we have some monarch references, we need to ensure their
     # proc_mesh matches that of the tensors we sent to it
@@ -284,7 +306,7 @@ def actor_send(
         if hasattr(t, "stream"):
             chosen_stream = t.stream
             break
-    with InputChecker(
+    with InputChecker(tensors, lambda x: f"actor_call({x})") as checker:
         checker.check_mesh_stream_local(device_mesh._active, chosen_stream)
         # TODO: move propagators into Endpoint abstraction and run the propagator to get the
         # mutates
@@ -300,8 +322,6 @@ def actor_send(

     client = cast(MeshClient, checker.mesh.client)

-    broker_id: Tuple[str, int] = client._mesh_controller.broker_id
-
     stream_ref = chosen_stream._to_ref(client)

     fut = (port, checker.mesh._ndslice) if port is not None else None
@@ -316,13 +336,9 @@ def actor_send(
     # The message to the generic actor tells it to first wait on the broker to get the local arguments
     # from the stream, then it will run the actor method, and send the result to response port.

-    actor_msg = PythonMessage(
-        PythonMessageKind.CallMethodIndirect(
-            endpoint._name, broker_id, ident, unflatten_args
-        ),
-        args_kwargs_tuple,
+    broker_id = _cast_call_method_indirect(
+        endpoint, selection, client, ident, args_kwargs_tuple, refs
     )
-    endpoint._actor_mesh.cast(actor_msg, selection)
     worker_msg = SendResultOfActorCall(ident, broker_id, tensors, [], stream_ref)
     client.send(checker.mesh._ndslice, worker_msg)
     # we have to ask for status updates
@@ -330,3 +346,49 @@ def actor_send(
     # enough work to count this future as finished,
     # and all potential errors have been reported
     client._request_status()
+
+
+def actor_rref(endpoint, args_kwargs_tuple: bytes, refs: Sequence[Any]):
+    chosen_stream = stream._active
+    fake_result, dtensors, mutates, mesh = dtensor_check(
+        endpoint._propagate,
+        cast(ResolvableFunction, endpoint._name),
+        refs,
+        {},
+        device_mesh._active,
+        chosen_stream,
+    )
+    assert mesh is not None
+
+    fake_result_dtensors, unflatten_result = flatten(
+        fake_result, lambda x: isinstance(x, torch.Tensor)
+    )
+    result_dtensors = tuple(
+        Tensor(fake, mesh, chosen_stream) for fake in fake_result_dtensors
+    )
+    seq = mesh.client.new_node(result_dtensors + mutates, dtensors)
+    assert all(t.ref is not None for t in result_dtensors)
+    assert all(t.ref is not None for t in mutates)
+    result = result_msg = unflatten_result(result_dtensors)
+    if len(result_dtensors) == 0:
+        result_msg = None
+
+    broker_id = _cast_call_method_indirect(
+        endpoint, "all", mesh.client, seq, args_kwargs_tuple, refs
+    )
+    # note the device mesh has to be defined regardles so the remote functions
+    # can invoke mesh.rank("...")
+
+    mesh.define_remotely()
+
+    mesh._send(
+        messages.CallActorMethod(
+            seq,
+            result_msg,
+            broker_id,
+            refs,
+            cast("List[Ref]", mutates),
+            stream._active._to_ref(mesh.client),
+        )
+    )
+    return result
monarch/monarch_controller
CHANGED
Binary file
monarch/tools/cli.py
CHANGED
@@ -86,9 +86,9 @@ class CreateCmd:
             else defaults.component_fn(config.scheduler)
         )
         component_args = component_args_from_cli(component_fn, args.component_args)
-        appdef = component_fn(**component_args)
+        config.appdef = component_fn(**component_args)

-        handle = create(config, appdef)
+        handle = create(config)
         print(handle)

monarch/tools/commands.py
CHANGED
@@ -7,18 +7,19 @@
 # pyre-strict

 import argparse
+import asyncio
 import inspect
 import logging
 import os
-import
-from datetime import timedelta
+from datetime import datetime, timedelta
 from typing import Any, Callable, Mapping, Optional, Union

+from monarch.tools.components.hyperactor import DEFAULT_NAME
+
 from monarch.tools.config import ( # @manual=//monarch/python/monarch/tools/config/meta:defaults
     Config,
     defaults,
 )
-
 from monarch.tools.mesh_spec import mesh_spec_from_metadata, ServerSpec
 from torchx.runner import Runner # @manual=//torchx/runner:lib_core
 from torchx.specs import AppDef, AppDryRunInfo, AppState, CfgVal, parse_app_handle
@@ -83,7 +84,7 @@ def component_args_from_cli(

 def create(
     config: Config,
-    appdef: AppDef,
+    name: str = DEFAULT_NAME,
 ) -> Union[str, AppDryRunInfo]:
     """Creates a monarch server by submitting it as a job to the target scheduler.

@@ -94,7 +95,7 @@ def create(
         from monarch.tools.config import defaults

         config = defaults.config(scheduler="slurm")
-        appdef = defaults.component_fn(scheduler=config.scheduler)()
+        config.appdef = defaults.component_fn(scheduler=config.scheduler)()

         config.scheduler_args.update(
             {
@@ -105,7 +106,7 @@ def create(
         )
         config.dryrun = True

-        create(config, appdef)
+        create(config)


     Args:
@@ -114,6 +115,7 @@ def create(
         component_fn: a function that returns the AppDef (job def).
             If not provided, defaults to the configured default for the scheduler
             (in most cases ``monarch.tools.components.hyperactor.proc_mesh``)
+        name: the name of the job. If none, a default job name will be created.
     """
     scheduler: str = config.scheduler
     cfg: Mapping[str, CfgVal] = config.scheduler_args
@@ -122,6 +124,8 @@ def create(
     os.environ["TORCHX_CONTEXT_NAME"] = os.getenv("TORCHX_CONTEXT_NAME", "monarch")

     with torchx_runner() as runner:
+        appdef: AppDef = AppDef(name, config.appdef.roles, config.appdef.metadata)
+
         info = runner.dryrun(appdef, scheduler, cfg, config.workspace)

         info_json_fmt = AppDryRunInfo(
@@ -170,6 +174,8 @@ def info(server_handle: str) -> Optional[ServerSpec]:
         # null-guard since some schedulers do not fill replica_status
         if host_status := replica_status.get(role.name):
             spec.hostnames = [h.hostname for h in host_status]
+            # the mesh status is based on the "least progressive" replica status
+            spec.state = min(h.state for h in host_status)

         mesh_specs.append(spec)

@@ -211,6 +217,8 @@ async def server_ready(

     """

+    check_interval_seconds = check_interval.total_seconds()
+    start = datetime.now()
     while True:
         server_spec = info(server_handle)

@@ -220,42 +228,56 @@
         if server_spec.state <= AppState.PENDING: # UNSUBMITTED or SUBMITTED or PENDING
             # NOTE: TorchX currently does not have async APIs so need to loop-on-interval
             # TODO maybe inverse exponential backoff instead of constant interval?
-
-
-                "
-
-
-                server_spec.state,
-                check_interval_seconds,
+            print(
+                f"Waiting for {server_handle} to be {AppState.RUNNING} (current: {server_spec.state}); "
+                f"will check again in {check_interval_seconds} seconds. "
+                f"Total wait time: {datetime.now() - start}",
+                end="\r",
             )
-
+            await asyncio.sleep(check_interval_seconds)
             continue
-        else:
-            return server_spec
-

+        # check if hosts are allocated for all the meshes
+        if server_spec.state == AppState.RUNNING:
+            running = True
+            for mesh_spec in server_spec.meshes:
+                if mesh_spec.state <= AppState.PENDING:
+                    print(
+                        f"Job {server_handle} is running but waiting for mesh {mesh_spec.name} "
+                        f"to be {AppState.RUNNING} (current: {mesh_spec.state}); "
+                        f"will check again in {check_interval_seconds} seconds. "
+                        f"Total wait time: {datetime.now() - start}",
+                        end="\r",
+                    )
+                    running = False
+                    break
+            if not running:
+                await asyncio.sleep(check_interval_seconds)
+                continue
+
+        return server_spec
+
+
+# TODO: this API is overloaded. Ideally, we do not need config to get or an handle to create.
 async def get_or_create(
     name: str,
     config: Config,
-    appdef: AppDef,
     check_interval: timedelta = _5_SECONDS,
 ) -> ServerSpec:
-    """Waits for the server
+    """Waits for the server based on identity `name` in the scheduler specified in the `config`
     to be ready (e.g. RUNNING). If the server is not found then this function creates one
-    per the `
+    per the `config` spec, and waits for the server to be ready before returning.

     Usage:

     .. code-block:: python

-        import getpass
         from monarch.tools.config import defaults

-        USER = getpass.getuser()
         config = defaults.config(scheduler)
-        appdef = defaults.component_fn(config.scheduler)()
+        config.appdef = defaults.component_fn(config.scheduler)()

-        server_handle = get_or_create(
+        server_handle = get_or_create(name="my_job_name", config)
         server_info = info(server_handle)

     Returns: A `ServerSpec` containing information about either the existing or the newly
@@ -273,7 +295,7 @@ async def get_or_create(
     )

     # no dryrun (see assertion above) support so will always be a handle (str)
-    new_server_handle = str(create(config, appdef))
+    new_server_handle = str(create(config, name))

     logger.info(f"created new `{new_server_handle}` waiting for it to be ready...")

@@ -289,10 +311,10 @@ async def get_or_create(
             f"the new server `{new_server_handle}` has {server_info.state}"
         )

-
+        print(f"\x1b[36mNew job `{new_server_handle}` is ready to serve. \x1b[0m")
         return server_info
     else:
-
+        print(f"\x1b[36mFound existing job `{server_handle}` ready to serve. \x1b[0m")
         return server_info

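Taken together with the `cli.py` change above, the `AppDef` now rides on the `Config` as `config.appdef`, and the job name travels as a separate argument (defaulting to `monarch-$USER` via the new `DEFAULT_NAME`). A sketch of the updated flow, assembled from the docstrings in this diff and assuming a working Slurm scheduler (the job name is illustrative):

```python
import asyncio
from monarch.tools.commands import get_or_create
from monarch.tools.config import defaults

config = defaults.config(scheduler="slurm")
# component_fn now returns an UnnamedAppDef that rides on the config;
# create() attaches the job name when it builds the final AppDef.
config.appdef = defaults.component_fn(config.scheduler)()

# Blocks until the job *and* every mesh in it report RUNNING, per the
# new per-mesh polling in server_ready().
server_info = asyncio.run(get_or_create("my_job_name", config))
print(server_info.state)
```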
monarch/tools/components/hyperactor.py
CHANGED
@@ -9,6 +9,7 @@ import getpass
 from typing import Optional

 from monarch.tools import mesh_spec
+from monarch.tools.config import UnnamedAppDef
 from monarch.tools.mesh_spec import mesh_spec_from_str
 from torchx import specs

@@ -16,17 +17,18 @@ _DEFAULT_MESHES = ["mesh_0:1:gpu.small"]

 _USER: str = getpass.getuser()

+DEFAULT_NAME: str = f"monarch-{_USER}"
+
 __version__ = "latest" # TODO get version from monarch.__version_


 def proc_mesh(
-    name: str = f"monarch-{_USER}",
     image: str = f"ghcr.io/pytorch-labs/monarch:{__version__}", # TODO docker needs to be built and pushed to ghcr
     meshes: list[str] = _DEFAULT_MESHES,
     env: Optional[dict[str, str]] = None,
     port: int = mesh_spec.DEFAULT_REMOTE_ALLOCATOR_PORT,
     program: str = "monarch_bootstrap", # installed with monarch wheel (as console script)
-) ->
+) -> UnnamedAppDef:
     """
     Args:
         name: the name of the monarch server job
@@ -37,7 +39,7 @@ def proc_mesh(
         program: path to the binary that the remote process allocator spawns on an allocation request
     """

-    appdef =
+    appdef = UnnamedAppDef()

     for mesh in [mesh_spec_from_str(mesh) for mesh in meshes]:
         mesh_role = specs.Role(
monarch/tools/config/__init__.py
CHANGED
@@ -6,15 +6,32 @@

 # pyre-strict
 from dataclasses import dataclass, field
-from typing import Any, Optional
+from typing import Any, Dict, List, Optional
+
+from torchx.specs import Role


 NOT_SET: str = "__NOT_SET__"


+@dataclass
+class UnnamedAppDef:
+    """
+    A TorchX AppDef without a name.
+    """
+
+    roles: List[Role] = field(default_factory=list)
+    metadata: Dict[str, str] = field(default_factory=dict)
+
+
 @dataclass
 class Config:
+    """
+    All configs needed to schedule a mesh of allocators.
+    """
+
     scheduler: str = NOT_SET
     scheduler_args: dict[str, Any] = field(default_factory=dict)
     workspace: Optional[str] = None
     dryrun: bool = False
+    appdef: UnnamedAppDef = UnnamedAppDef()
monarch/tools/config/defaults.py
CHANGED
@@ -11,7 +11,7 @@
 from typing import Callable, Optional

 from monarch.tools.components import hyperactor
-from monarch.tools.config import Config
+from monarch.tools.config import Config, UnnamedAppDef

 from torchx import specs
 from torchx.schedulers import (
@@ -23,7 +23,7 @@ from torchx.schedulers import (
 )


-def component_fn(scheduler: str) -> Callable[...,
+def component_fn(scheduler: str) -> Callable[..., UnnamedAppDef]:
     """The default TorchX component function for the scheduler"""
     return hyperactor.proc_mesh

monarch/tools/mesh_spec.py
CHANGED
@@ -9,6 +9,8 @@ import string
 from dataclasses import dataclass, field
 from typing import Any, Optional

+from monarch.tools.config import UnnamedAppDef
+
 from monarch.tools.network import get_sockaddr
 from torchx import specs
 from torchx.specs.api import is_terminal
@@ -39,6 +41,7 @@ class MeshSpec:
     transport: str = "tcp"
     port: int = DEFAULT_REMOTE_ALLOCATOR_PORT
     hostnames: list[str] = field(default_factory=list)
+    state: specs.AppState = specs.AppState.UNSUBMITTED

     def server_addrs(
         self, transport: Optional[str] = None, port: Optional[int] = None
@@ -69,7 +72,7 @@ def _tag(mesh_name: str, tag_template: str) -> str:
     return string.Template(tag_template).substitute(mesh_name=mesh_name)


-def tag_as_metadata(mesh_spec: MeshSpec, appdef:
+def tag_as_metadata(mesh_spec: MeshSpec, appdef: UnnamedAppDef) -> None:
     appdef.metadata[_tag(mesh_spec.name, _TAG_HOST_TYPE)] = mesh_spec.host_type
     appdef.metadata[_tag(mesh_spec.name, _TAG_GPUS)] = str(mesh_spec.gpus)
     appdef.metadata[_tag(mesh_spec.name, _TAG_TRANSPORT)] = mesh_spec.transport
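The new `MeshSpec.state` field is filled by `commands.info()` as `min(h.state for h in host_status)`. That relies on torchx `AppState` values being ordered by lifecycle progression (UNSUBMITTED through PENDING, RUNNING, and beyond), so `min` selects the least-progressed replica; a small illustration of that assumption:

```python
from torchx.specs import AppState

# Assuming AppState is ordered by lifecycle progression, a mesh only counts
# as RUNNING once its slowest replica is RUNNING.
states = [AppState.RUNNING, AppState.PENDING, AppState.RUNNING]
assert min(states) == AppState.PENDING
```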
tests/test_allocator.py
CHANGED
@@ -33,7 +33,6 @@ from monarch._rust_bindings.monarch_hyperactor.channel import (
     ChannelTransport,
 )

-from monarch._src.actor.actor_mesh import MonarchContext
 from monarch._src.actor.allocator import (
     ALLOC_LABEL_PROC_MESH_NAME,
     LocalAllocator,
@@ -160,7 +159,7 @@ class TestSetupActorInAllocator(unittest.IsolatedAsyncioTestCase):
             "TEST_ENV_VAR_3": "value_3",
         }

-        def setup_multiple_env_vars(
+        def setup_multiple_env_vars() -> None:
             for name, value in env_vars.items():
                 os.environ[name] = value

@@ -184,36 +183,33 @@ class TestSetupActorInAllocator(unittest.IsolatedAsyncioTestCase):
             await proc_mesh.stop()

     async def test_setup_lambda_with_context_info(self) -> None:
-        """Test that the setup lambda can access
-        context_var_name: str = "
+        """Test that the setup lambda can access rank information"""
+        context_var_name: str = "PROC_MESH_RANK_INFO"

-        def
-            context_info = f"
+        def setup_with_rank() -> None:
+            context_info = f"point_rank:{current_rank().rank}"
             os.environ[context_var_name] = context_info

         spec = AllocSpec(AllocConstraints(), gpus=1, hosts=1)
         allocator = LocalAllocator()
         alloc = await allocator.allocate(spec)

-        proc_mesh = await ProcMesh.from_alloc(alloc, setup=
+        proc_mesh = await ProcMesh.from_alloc(alloc, setup=setup_with_rank)

         try:
             actor = await proc_mesh.spawn("env_check", EnvCheckActor)

-
+            rank_info = await actor.get_env_var.call_one(context_var_name)

             self.assertNotEqual(
-
+                rank_info,
                 "NOT_SET",
                 "Context information was not stored in the environment variable",
             )
-            self.assertIn(
-                "proc_id:", context_info, "Context information does not contain proc_id"
-            )
             self.assertIn(
                 "point_rank:0",
-
-                f"Context information {
+                rank_info,
+                f"Context information {rank_info} does not contain point_rank",
             )
         finally:
             await proc_mesh.stop()
@@ -435,7 +431,7 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
         test_var_name: str = "TEST_ENV_VAR_FOR_PROC_MESH"
         test_var_value: str = "test_value_123"

-        def setup_env_vars(
+        def setup_env_vars() -> None:
             os.environ[test_var_name] = test_var_value

         hosts = 2
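These test changes track an API shift: proc-mesh `setup` callables no longer receive a `MonarchContext` argument (hence the dropped import), and per-process information now comes from `current_rank()`. A minimal sketch of the new style, following the `from_alloc` pattern used in the tests (the env var name is made up):

```python
import asyncio
import os

from monarch._rust_bindings.monarch_hyperactor.alloc import AllocConstraints, AllocSpec
from monarch._src.actor.allocator import LocalAllocator
from monarch.actor import current_rank, ProcMesh

def setup() -> None:
    # Runs in each spawned process before actors start; no context parameter.
    os.environ["RANK_INFO"] = f"point_rank:{current_rank().rank}"

async def main() -> None:
    alloc = await LocalAllocator().allocate(
        AllocSpec(AllocConstraints(), gpus=1, hosts=1)
    )
    pm = await ProcMesh.from_alloc(alloc, setup=setup)
    try:
        ...  # spawn actors that can read RANK_INFO from their environment
    finally:
        await pm.stop()

asyncio.run(main())
```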
tests/test_env_before_cuda.py
CHANGED
@@ -15,7 +15,6 @@ import cloudpickle

 import torch
 from monarch._rust_bindings.monarch_hyperactor.alloc import AllocConstraints, AllocSpec
-from monarch._src.actor.actor_mesh import MonarchContext
 from monarch._src.actor.allocator import LocalAllocator
 from monarch._src.actor.proc_mesh import proc_mesh
 from monarch.actor import Actor, endpoint, ProcMesh
@@ -70,7 +69,7 @@ class TestEnvBeforeCuda(unittest.IsolatedAsyncioTestCase):
             "CUDA_LAUNCH_BLOCKING": "1",
         }

-        def setup_cuda_env(
+        def setup_cuda_env() -> None:
             for name, value in cuda_env_vars.items():
                 os.environ[name] = value

@@ -107,7 +106,7 @@ class TestEnvBeforeCuda(unittest.IsolatedAsyncioTestCase):
             "CUDA_DEVICE_MAX_CONNECTIONS": "1",
         }

-        def setup_cuda_env(
+        def setup_cuda_env() -> None:
             for name, value in cuda_env_vars.items():
                 os.environ[name] = value

tests/test_python_actors.py
CHANGED
@@ -586,3 +586,15 @@ class TestActorMeshStop(unittest.IsolatedAsyncioTestCase):

         await am_2.print.call("hello 3")
         await am_2.log.call("hello 4")
+
+
+class PortedActor(Actor):
+    @endpoint(explicit_response_port=True)
+    def add(self, port: "Port[int]", b: int) -> None:
+        port.send(3 + b)
+
+
+def test_ported_actor():
+    proc_mesh = local_proc_mesh(gpus=1).get()
+    a = proc_mesh.spawn("port_actor", PortedActor).get()
+    assert 5 == a.add.call_one(2).get()
tests/test_tensor_engine.py
CHANGED
@@ -8,7 +8,7 @@ import monarch
 import pytest
 import torch
 from monarch import remote
-from monarch.actor import Actor, endpoint, proc_mesh
+from monarch.actor import Actor, as_endpoint, endpoint, proc_mesh
 from monarch.mesh_controller import spawn_tensor_engine


@@ -104,3 +104,29 @@ def test_actor_tensor_ordering() -> None:
         results.append(counter.incr.call(1))

     assert list(range(10)) == [r.get().item(hosts=0, gpus=0) for r in results]
+
+
+class Linear(Actor):
+    def __init__(self, N: int, M: int):
+        self.weight = torch.zeros((N, M))
+
+    def forward(self, x) -> torch.Tensor:
+        return x @ self.weight
+
+    @endpoint(propagate="inspect")
+    def update(self, w: torch.Tensor) -> None:
+        self.weight += w
+
+
+@two_gpu
+def test_rref_actor() -> None:
+    pm = proc_mesh(gpus=1).get()
+    with pm.activate():
+        x = pm.spawn("linear", Linear, 3, 4).get()
+
+        y = torch.ones((4, 3))
+        t = as_endpoint(x.forward, propagate=lambda x: torch.rand(3, 4)).rref(y)
+        assert monarch.inspect(t.sum()).item() == 0
+        x.update.rref(torch.ones((3, 4)))
+        t = as_endpoint(x.forward, propagate=lambda x: torch.rand(3, 4)).rref(y)
+        assert monarch.inspect(t.sum()).item() == 3 * 4 * 4
{torchmonarch_nightly-2025.7.25.dist-info → torchmonarch_nightly-2025.7.27.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: torchmonarch-nightly
-Version: 2025.7.25
+Version: 2025.7.27
 Summary: Monarch: Single controller library
 Author: Meta
 Author-email: oncall+monarch@xmail.facebook.com
@@ -44,6 +44,8 @@ Note: Monarch is currently only supported on Linux systems

 ## Installation

+### On Fedora distributions
+
 `pip install torchmonarch-nightly`

 or manually
@@ -88,6 +90,37 @@ pip install --no-build-isolation -e .
 pytest python/tests/ -v -m "not oss_skip"
 ```

+### On MacOS
+
+You can also build Monarch to run locally on a MacOS system.
+
+Note that this does not support tensor engine, which is tied to CUDA and RDMA (via ibverbs).
+
+
+```sh
+
+# Create and activate the conda environment
+conda create -n monarchenv python=3.10 -y
+conda activate monarchenv
+
+# Install nightly rust toolchain
+curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
+rustup toolchain install nightly
+rustup default nightly
+
+# Install build dependencies
+pip install -r build-requirements.txt
+# Install test dependencies
+pip install -r python/tests/requirements.txt
+
+# Build and install Monarch
+USE_TENSOR_ENGINE=0 pip install --no-build-isolation .
+# or setup for development
+USE_TENSOR_ENGINE=0 pip install --no-build-isolation -e .
+
+```
+
+
 ## Running examples

 Check out the `examples/` directory for demonstrations of how to use Monarch's APIs.