torchmonarch-nightly 2025.7.25__cp312-cp312-manylinux2014_x86_64.whl → 2025.7.27__cp312-cp312-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
monarch/mesh_controller.py CHANGED
@@ -30,6 +30,7 @@ from monarch._rust_bindings.monarch_extension.client import (  # @manual=//monar
     WorldState,
 )
 from monarch._rust_bindings.monarch_extension.mesh_controller import _Controller
+from monarch._rust_bindings.monarch_extension.tensor_worker import Ref
 from monarch._rust_bindings.monarch_hyperactor.actor import (
     PythonMessage,
     PythonMessageKind,
@@ -44,10 +45,12 @@ from monarch._src.actor.endpoint import Selection
 from monarch._src.actor.shape import NDSlice
 from monarch.common import device_mesh, messages, stream
 from monarch.common.controller_api import TController
+from monarch.common.function import ResolvableFunction
 from monarch.common.invocation import Seq
 from monarch.common.messages import Referenceable, SendResultOfActorCall
 from monarch.common.stream import StreamRef
-from monarch.common.tensor import InputChecker, Tensor
+from monarch.common.tensor import dtensor_check, InputChecker, Tensor
+from monarch.common.tree import flatten
 from monarch.tensor_worker_main import _set_trace

 if TYPE_CHECKING:
@@ -265,17 +268,36 @@ class RemoteException(Exception):
         return "<exception formatting RemoteException>"


-def actor_send(
+def _cast_call_method_indirect(
     endpoint: ActorEndpoint,
+    selection: Selection,
+    client: MeshClient,
+    seq: Seq,
     args_kwargs_tuple: bytes,
     refs: Sequence[Any],
-    port: Optional[Port[Any]],
-    selection: Selection,
-):
+) -> Tuple[str, int]:
     unflatten_args = [
         UnflattenArg.PyObject if isinstance(ref, Tensor) else UnflattenArg.Mailbox
         for ref in refs
     ]
+    broker_id: Tuple[str, int] = client._mesh_controller.broker_id
+    actor_msg = PythonMessage(
+        PythonMessageKind.CallMethodIndirect(
+            endpoint._name, broker_id, seq, unflatten_args
+        ),
+        args_kwargs_tuple,
+    )
+    endpoint._actor_mesh.cast(actor_msg, selection)
+    return broker_id
+
+
+def actor_send(
+    endpoint: ActorEndpoint,
+    args_kwargs_tuple: bytes,
+    refs: Sequence[Any],
+    port: Optional[Port[Any]],
+    selection: Selection,
+):
     tensors = [ref for ref in refs if isinstance(ref, Tensor)]
     # we have some monarch references, we need to ensure their
     # proc_mesh matches that of the tensors we sent to it
@@ -284,7 +306,7 @@ def actor_send(
         if hasattr(t, "stream"):
            chosen_stream = t.stream
            break
-    with InputChecker(refs, lambda x: f"actor_call({x})") as checker:
+    with InputChecker(tensors, lambda x: f"actor_call({x})") as checker:
        checker.check_mesh_stream_local(device_mesh._active, chosen_stream)
        # TODO: move propagators into Endpoint abstraction and run the propagator to get the
        # mutates
@@ -300,8 +322,6 @@ def actor_send(

        client = cast(MeshClient, checker.mesh.client)

-        broker_id: Tuple[str, int] = client._mesh_controller.broker_id
-
        stream_ref = chosen_stream._to_ref(client)

        fut = (port, checker.mesh._ndslice) if port is not None else None
@@ -316,13 +336,9 @@ def actor_send(
        # The message to the generic actor tells it to first wait on the broker to get the local arguments
        # from the stream, then it will run the actor method, and send the result to response port.

-        actor_msg = PythonMessage(
-            PythonMessageKind.CallMethodIndirect(
-                endpoint._name, broker_id, ident, unflatten_args
-            ),
-            args_kwargs_tuple,
+        broker_id = _cast_call_method_indirect(
+            endpoint, selection, client, ident, args_kwargs_tuple, refs
        )
-        endpoint._actor_mesh.cast(actor_msg, selection)
        worker_msg = SendResultOfActorCall(ident, broker_id, tensors, [], stream_ref)
        client.send(checker.mesh._ndslice, worker_msg)
        # we have to ask for status updates
@@ -330,3 +346,49 @@ def actor_send(
        # enough work to count this future as finished,
        # and all potential errors have been reported
        client._request_status()
+
+
+def actor_rref(endpoint, args_kwargs_tuple: bytes, refs: Sequence[Any]):
+    chosen_stream = stream._active
+    fake_result, dtensors, mutates, mesh = dtensor_check(
+        endpoint._propagate,
+        cast(ResolvableFunction, endpoint._name),
+        refs,
+        {},
+        device_mesh._active,
+        chosen_stream,
+    )
+    assert mesh is not None
+
+    fake_result_dtensors, unflatten_result = flatten(
+        fake_result, lambda x: isinstance(x, torch.Tensor)
+    )
+    result_dtensors = tuple(
+        Tensor(fake, mesh, chosen_stream) for fake in fake_result_dtensors
+    )
+    seq = mesh.client.new_node(result_dtensors + mutates, dtensors)
+    assert all(t.ref is not None for t in result_dtensors)
+    assert all(t.ref is not None for t in mutates)
+    result = result_msg = unflatten_result(result_dtensors)
+    if len(result_dtensors) == 0:
+        result_msg = None
+
+    broker_id = _cast_call_method_indirect(
+        endpoint, "all", mesh.client, seq, args_kwargs_tuple, refs
+    )
+    # note the device mesh has to be defined regardless so the remote functions
+    # can invoke mesh.rank("...")
+
+    mesh.define_remotely()
+
+    mesh._send(
+        messages.CallActorMethod(
+            seq,
+            result_msg,
+            broker_id,
+            refs,
+            cast("List[Ref]", mutates),
+            stream._active._to_ref(mesh.client),
+        )
+    )
+    return result
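Taken together, these changes split the one-way `actor_send` path from a new value-returning `actor_rref` path, with `_cast_call_method_indirect` handling the cast to the actor mesh that both share. A minimal sketch of what this enables from user code, mirroring the test added later in this diff (a working tensor engine and a spawned proc mesh are assumed):

```python
import torch

import monarch
from monarch.actor import Actor, as_endpoint, endpoint, proc_mesh


class Linear(Actor):
    def __init__(self, N: int, M: int):
        self.weight = torch.zeros((N, M))

    def forward(self, x) -> torch.Tensor:
        return x @ self.weight


pm = proc_mesh(gpus=1).get()
with pm.activate():
    linear = pm.spawn("linear", Linear, 3, 4).get()
    # .rref(...) routes through actor_rref() above: the propagator produces a
    # fake result, a monarch Tensor is minted for it, and CallActorMethod is
    # sent to the workers instead of waiting on a response port.
    out = as_endpoint(linear.forward, propagate=lambda x: torch.rand(3, 4)).rref(
        torch.ones((4, 3))
    )
    print(monarch.inspect(out.sum()).item())  # 0.0, since the weights start at zero
```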
Binary file
monarch/tools/cli.py CHANGED
@@ -86,9 +86,9 @@ class CreateCmd:
            else defaults.component_fn(config.scheduler)
        )
        component_args = component_args_from_cli(component_fn, args.component_args)
-        appdef = component_fn(**component_args)
+        config.appdef = component_fn(**component_args)

-        handle = create(config, appdef)
+        handle = create(config)
        print(handle)


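The CLI now builds the job definition onto `config.appdef` rather than passing a separate `appdef` to `create`. A hedged sketch of the equivalent programmatic flow (the scheduler name is illustrative):

```python
from monarch.tools.commands import create
from monarch.tools.config import defaults

config = defaults.config(scheduler="slurm")
# The component function now returns an UnnamedAppDef carried on the config.
config.appdef = defaults.component_fn(config.scheduler)()
handle = create(config)  # job name defaults to DEFAULT_NAME, i.e. "monarch-<user>"
print(handle)
```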
monarch/tools/commands.py CHANGED
@@ -7,18 +7,19 @@
 # pyre-strict

 import argparse
+import asyncio
 import inspect
 import logging
 import os
-import time
-from datetime import timedelta
+from datetime import datetime, timedelta
 from typing import Any, Callable, Mapping, Optional, Union

+from monarch.tools.components.hyperactor import DEFAULT_NAME
+
 from monarch.tools.config import (  # @manual=//monarch/python/monarch/tools/config/meta:defaults
     Config,
     defaults,
 )
-
 from monarch.tools.mesh_spec import mesh_spec_from_metadata, ServerSpec
 from torchx.runner import Runner  # @manual=//torchx/runner:lib_core
 from torchx.specs import AppDef, AppDryRunInfo, AppState, CfgVal, parse_app_handle
@@ -83,7 +84,7 @@ def component_args_from_cli(

 def create(
     config: Config,
-    appdef: AppDef,
+    name: str = DEFAULT_NAME,
 ) -> Union[str, AppDryRunInfo]:
     """Creates a monarch server by submitting it as a job to the target scheduler.

@@ -94,7 +95,7 @@ def create(
        from monarch.tools.config import defaults

        config = defaults.config(scheduler="slurm")
-        appdef = defaults.component_fn(scheduler=config.scheduler)()
+        config.appdef = defaults.component_fn(scheduler=config.scheduler)()

        config.scheduler_args.update(
            {
@@ -105,7 +106,7 @@ def create(
        )
        config.dryrun = True

-        create(config, appdef)
+        create(config)


    Args:
@@ -114,6 +115,7 @@ def create(
        component_fn: a function that returns the AppDef (job def).
            If not provided, defaults to the configured default for the scheduler
            (in most cases ``monarch.tools.components.hyperactor.proc_mesh``)
+        name: the name of the job. If none, a default job name will be created.
    """
    scheduler: str = config.scheduler
    cfg: Mapping[str, CfgVal] = config.scheduler_args
@@ -122,6 +124,8 @@ def create(
    os.environ["TORCHX_CONTEXT_NAME"] = os.getenv("TORCHX_CONTEXT_NAME", "monarch")

    with torchx_runner() as runner:
+        appdef: AppDef = AppDef(name, config.appdef.roles, config.appdef.metadata)
+
        info = runner.dryrun(appdef, scheduler, cfg, config.workspace)

        info_json_fmt = AppDryRunInfo(
@@ -170,6 +174,8 @@ def info(server_handle: str) -> Optional[ServerSpec]:
        # null-guard since some schedulers do not fill replica_status
        if host_status := replica_status.get(role.name):
            spec.hostnames = [h.hostname for h in host_status]
+            # the mesh status is based on the "least progressive" replica status
+            spec.state = min(h.state for h in host_status)

        mesh_specs.append(spec)

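The "least progressive" rule works because TorchX `AppState` values are ordered by lifecycle progress (the same ordering this diff relies on in `server_ready`'s `<=` comparisons), so `min` selects the furthest-behind replica. A small illustration:

```python
from torchx.specs import AppState

# A mesh with one replica still PENDING reports PENDING overall, even if the
# other replicas are already RUNNING.
replica_states = [AppState.RUNNING, AppState.PENDING, AppState.RUNNING]
assert min(replica_states) == AppState.PENDING
```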
@@ -211,6 +217,8 @@ async def server_ready(

    """

+    check_interval_seconds = check_interval.total_seconds()
+    start = datetime.now()
    while True:
        server_spec = info(server_handle)

@@ -220,42 +228,56 @@ async def server_ready(
        if server_spec.state <= AppState.PENDING:  # UNSUBMITTED or SUBMITTED or PENDING
            # NOTE: TorchX currently does not have async APIs so need to loop-on-interval
            # TODO maybe inverse exponential backoff instead of constant interval?
-            check_interval_seconds = check_interval.total_seconds()
-            logger.info(
-                "waiting for %s to be %s (current: %s), will check again in %g seconds...",
-                server_handle,
-                AppState.RUNNING,
-                server_spec.state,
-                check_interval_seconds,
+            print(
+                f"Waiting for {server_handle} to be {AppState.RUNNING} (current: {server_spec.state}); "
+                f"will check again in {check_interval_seconds} seconds. "
+                f"Total wait time: {datetime.now() - start}",
+                end="\r",
            )
-            time.sleep(check_interval_seconds)
+            await asyncio.sleep(check_interval_seconds)
            continue
-        else:
-            return server_spec
-

+        # check if hosts are allocated for all the meshes
+        if server_spec.state == AppState.RUNNING:
+            running = True
+            for mesh_spec in server_spec.meshes:
+                if mesh_spec.state <= AppState.PENDING:
+                    print(
+                        f"Job {server_handle} is running but waiting for mesh {mesh_spec.name} "
+                        f"to be {AppState.RUNNING} (current: {mesh_spec.state}); "
+                        f"will check again in {check_interval_seconds} seconds. "
+                        f"Total wait time: {datetime.now() - start}",
+                        end="\r",
+                    )
+                    running = False
+                    break
+            if not running:
+                await asyncio.sleep(check_interval_seconds)
+                continue
+
+        return server_spec
+
+
+# TODO: this API is overloaded. Ideally, we do not need config to get or a handle to create.
 async def get_or_create(
    name: str,
    config: Config,
-    appdef: AppDef,
    check_interval: timedelta = _5_SECONDS,
 ) -> ServerSpec:
-    """Waits for the server called `name` in the scheduler specified in the `config`
+    """Waits for the server identified by `name` in the scheduler specified in the `config`
    to be ready (e.g. RUNNING). If the server is not found then this function creates one
-    per the `appdef` spec, and waits for the server to be ready before returning.
+    per the `config` spec, and waits for the server to be ready before returning.

    Usage:

    .. code-block:: python

-        import getpass
        from monarch.tools.config import defaults

-        USER = getpass.getuser()
        config = defaults.config(scheduler)
-        appdef = defaults.component_fn(config.scheduler)()
+        config.appdef = defaults.component_fn(config.scheduler)()

-        server_handle = get_or_create(f"{USER}_monarch", config, appdef)
+        server_handle = get_or_create(name="my_job_name", config=config)
        server_info = info(server_handle)

    Returns: A `ServerSpec` containing information about either the existing or the newly
@@ -273,7 +295,7 @@ async def get_or_create(
    )

    # no dryrun (see assertion above) support so will always be a handle (str)
-    new_server_handle = str(create(config, appdef))
+    new_server_handle = str(create(config, name))

    logger.info(f"created new `{new_server_handle}` waiting for it to be ready...")

@@ -289,10 +311,10 @@ async def get_or_create(
                f"the new server `{new_server_handle}` has {server_info.state}"
            )

-        logger.info(f"server `{new_server_handle}` is: {server_info.state}")
+        print(f"\x1b[36mNew job `{new_server_handle}` is ready to serve. \x1b[0m")
        return server_info
    else:
-        logger.info("found existing RUNNING server `%s`", server_handle)
+        print(f"\x1b[36mFound existing job `{server_handle}` ready to serve. \x1b[0m")
        return server_info


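A hedged usage sketch tying the `commands.py` changes together: the job name moves out of the `AppDef` and into `create`/`get_or_create`, and the readiness wait is now fully async, polling with `asyncio.sleep` until the job and every one of its meshes reach RUNNING (scheduler and job name are illustrative):

```python
import asyncio

from monarch.tools.commands import get_or_create
from monarch.tools.config import defaults


async def main() -> None:
    config = defaults.config(scheduler="slurm")
    config.appdef = defaults.component_fn(config.scheduler)()
    # Reuses a RUNNING server named "my_job_name" if one exists; otherwise
    # creates it and waits until all of its meshes have hosts allocated.
    server_info = await get_or_create("my_job_name", config)
    print(server_info.state)


asyncio.run(main())
```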
monarch/tools/components/hyperactor.py CHANGED
@@ -9,6 +9,7 @@ import getpass
 from typing import Optional

 from monarch.tools import mesh_spec
+from monarch.tools.config import UnnamedAppDef
 from monarch.tools.mesh_spec import mesh_spec_from_str
 from torchx import specs

@@ -16,17 +17,18 @@ _DEFAULT_MESHES = ["mesh_0:1:gpu.small"]

 _USER: str = getpass.getuser()

+DEFAULT_NAME: str = f"monarch-{_USER}"
+
 __version__ = "latest"  # TODO get version from monarch.__version_


 def proc_mesh(
-    name: str = f"monarch-{_USER}",
     image: str = f"ghcr.io/pytorch-labs/monarch:{__version__}",  # TODO docker needs to be built and pushed to ghcr
     meshes: list[str] = _DEFAULT_MESHES,
     env: Optional[dict[str, str]] = None,
     port: int = mesh_spec.DEFAULT_REMOTE_ALLOCATOR_PORT,
     program: str = "monarch_bootstrap",  # installed with monarch wheel (as console script)
-) -> specs.AppDef:
+) -> UnnamedAppDef:
     """
     Args:
         name: the name of the monarch server job
@@ -37,7 +39,7 @@ def proc_mesh(
         program: path to the binary that the remote process allocator spawns on an allocation request
     """

-    appdef = specs.AppDef(name)
+    appdef = UnnamedAppDef()

     for mesh in [mesh_spec_from_str(mesh) for mesh in meshes]:
         mesh_role = specs.Role(
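With `proc_mesh` now returning an `UnnamedAppDef`, naming happens once, at submission time. A sketch under that assumption (mesh string and job name are illustrative):

```python
from monarch.tools.commands import create
from monarch.tools.components import hyperactor
from monarch.tools.config import defaults

config = defaults.config(scheduler="slurm")
# "name:num_hosts:host_type", per the _DEFAULT_MESHES format above.
config.appdef = hyperactor.proc_mesh(meshes=["mesh_0:2:gpu.small"])
# create() assembles the final AppDef(name, roles, metadata); omitting name
# falls back to DEFAULT_NAME ("monarch-<user>").
handle = create(config, name="my-monarch-job")
```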
monarch/tools/config/__init__.py CHANGED
@@ -6,15 +6,32 @@

 # pyre-strict
 from dataclasses import dataclass, field
-from typing import Any, Optional
+from typing import Any, Dict, List, Optional
+
+from torchx.specs import Role


 NOT_SET: str = "__NOT_SET__"


+@dataclass
+class UnnamedAppDef:
+    """
+    A TorchX AppDef without a name.
+    """
+
+    roles: List[Role] = field(default_factory=list)
+    metadata: Dict[str, str] = field(default_factory=dict)
+
+
 @dataclass
 class Config:
+    """
+    All configs needed to schedule a mesh of allocators.
+    """
+
     scheduler: str = NOT_SET
     scheduler_args: dict[str, Any] = field(default_factory=dict)
     workspace: Optional[str] = None
     dryrun: bool = False
+    appdef: UnnamedAppDef = UnnamedAppDef()
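A minimal sketch of the new config shape; the role fields and tag key are illustrative, not the tags the library actually writes:

```python
from monarch.tools.config import Config, UnnamedAppDef
from torchx import specs

appdef = UnnamedAppDef()
appdef.roles.append(specs.Role(name="mesh_0", image="example/image:latest"))
appdef.metadata["example-tag"] = "example-value"

# The job definition now travels on the Config itself.
config = Config(scheduler="slurm", appdef=appdef)
```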
monarch/tools/config/defaults.py CHANGED
@@ -11,7 +11,7 @@
 from typing import Callable, Optional

 from monarch.tools.components import hyperactor
-from monarch.tools.config import Config
+from monarch.tools.config import Config, UnnamedAppDef

 from torchx import specs
 from torchx.schedulers import (
@@ -23,7 +23,7 @@ from torchx.schedulers import (
 )


-def component_fn(scheduler: str) -> Callable[..., specs.AppDef]:
+def component_fn(scheduler: str) -> Callable[..., UnnamedAppDef]:
     """The default TorchX component function for the scheduler"""
     return hyperactor.proc_mesh

monarch/tools/mesh_spec.py CHANGED
@@ -9,6 +9,8 @@ import string
 from dataclasses import dataclass, field
 from typing import Any, Optional

+from monarch.tools.config import UnnamedAppDef
+
 from monarch.tools.network import get_sockaddr
 from torchx import specs
 from torchx.specs.api import is_terminal
@@ -39,6 +41,7 @@ class MeshSpec:
     transport: str = "tcp"
     port: int = DEFAULT_REMOTE_ALLOCATOR_PORT
     hostnames: list[str] = field(default_factory=list)
+    state: specs.AppState = specs.AppState.UNSUBMITTED

     def server_addrs(
         self, transport: Optional[str] = None, port: Optional[int] = None
@@ -69,7 +72,7 @@ def _tag(mesh_name: str, tag_template: str) -> str:
     return string.Template(tag_template).substitute(mesh_name=mesh_name)


-def tag_as_metadata(mesh_spec: MeshSpec, appdef: specs.AppDef) -> None:
+def tag_as_metadata(mesh_spec: MeshSpec, appdef: UnnamedAppDef) -> None:
     appdef.metadata[_tag(mesh_spec.name, _TAG_HOST_TYPE)] = mesh_spec.host_type
     appdef.metadata[_tag(mesh_spec.name, _TAG_GPUS)] = str(mesh_spec.gpus)
     appdef.metadata[_tag(mesh_spec.name, _TAG_TRANSPORT)] = mesh_spec.transport
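Since `tag_as_metadata` now targets `UnnamedAppDef`, mesh tags can be stamped onto an app definition before it is named. A short sketch, assuming the `name:num_hosts:host_type` mesh-string format used by `_DEFAULT_MESHES`:

```python
from monarch.tools.config import UnnamedAppDef
from monarch.tools.mesh_spec import mesh_spec_from_str, tag_as_metadata

spec = mesh_spec_from_str("mesh_0:2:gpu.small")
appdef = UnnamedAppDef()
tag_as_metadata(spec, appdef)  # writes per-mesh host_type/gpus/transport tags

print(appdef.metadata)
print(spec.state)  # AppState.UNSUBMITTED until info() observes replica status
```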
tests/test_allocator.py CHANGED
@@ -33,7 +33,6 @@ from monarch._rust_bindings.monarch_hyperactor.channel import (
     ChannelTransport,
 )

-from monarch._src.actor.actor_mesh import MonarchContext
 from monarch._src.actor.allocator import (
     ALLOC_LABEL_PROC_MESH_NAME,
     LocalAllocator,
@@ -160,7 +159,7 @@ class TestSetupActorInAllocator(unittest.IsolatedAsyncioTestCase):
            "TEST_ENV_VAR_3": "value_3",
        }

-        def setup_multiple_env_vars(ctx: MonarchContext) -> None:
+        def setup_multiple_env_vars() -> None:
            for name, value in env_vars.items():
                os.environ[name] = value

@@ -184,36 +183,33 @@ class TestSetupActorInAllocator(unittest.IsolatedAsyncioTestCase):
            await proc_mesh.stop()

    async def test_setup_lambda_with_context_info(self) -> None:
-        """Test that the setup lambda can access context information"""
-        context_var_name: str = "PROC_MESH_CONTEXT_INFO"
+        """Test that the setup lambda can access rank information"""
+        context_var_name: str = "PROC_MESH_RANK_INFO"

-        def setup_with_context(ctx: MonarchContext) -> None:
-            context_info = f"proc_id:{ctx.proc_id},point_rank:{ctx.point.rank}"
+        def setup_with_rank() -> None:
+            context_info = f"point_rank:{current_rank().rank}"
            os.environ[context_var_name] = context_info

        spec = AllocSpec(AllocConstraints(), gpus=1, hosts=1)
        allocator = LocalAllocator()
        alloc = await allocator.allocate(spec)

-        proc_mesh = await ProcMesh.from_alloc(alloc, setup=setup_with_context)
+        proc_mesh = await ProcMesh.from_alloc(alloc, setup=setup_with_rank)

        try:
            actor = await proc_mesh.spawn("env_check", EnvCheckActor)

-            context_info = await actor.get_env_var.call_one(context_var_name)
+            rank_info = await actor.get_env_var.call_one(context_var_name)

            self.assertNotEqual(
-                context_info,
+                rank_info,
                "NOT_SET",
                "Context information was not stored in the environment variable",
            )
-            self.assertIn(
-                "proc_id:", context_info, "Context information does not contain proc_id"
-            )
            self.assertIn(
                "point_rank:0",
-                context_info,
-                f"Context information {context_info} does not contain point_rank",
+                rank_info,
+                f"Context information {rank_info} does not contain point_rank",
            )
        finally:
            await proc_mesh.stop()
@@ -435,7 +431,7 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
        test_var_name: str = "TEST_ENV_VAR_FOR_PROC_MESH"
        test_var_value: str = "test_value_123"

-        def setup_env_vars(ctx: MonarchContext) -> None:
+        def setup_env_vars() -> None:
            os.environ[test_var_name] = test_var_value

        hosts = 2
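The setup-callback contract changed from `setup(ctx: MonarchContext)` to a zero-argument callable; rank information is read from the ambient context instead. A hedged sketch of the new form (assuming `current_rank` is importable from `monarch.actor`, as the updated test uses it):

```python
import os

from monarch.actor import current_rank


def setup_env() -> None:
    # Runs on each proc in the mesh before user actors are spawned.
    os.environ["RANK_INFO"] = f"point_rank:{current_rank().rank}"

# Passed the same way as before: ProcMesh.from_alloc(alloc, setup=setup_env)
```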
tests/test_env_before_cuda.py CHANGED
@@ -15,7 +15,6 @@ import cloudpickle

 import torch
 from monarch._rust_bindings.monarch_hyperactor.alloc import AllocConstraints, AllocSpec
-from monarch._src.actor.actor_mesh import MonarchContext
 from monarch._src.actor.allocator import LocalAllocator
 from monarch._src.actor.proc_mesh import proc_mesh
 from monarch.actor import Actor, endpoint, ProcMesh
@@ -70,7 +69,7 @@ class TestEnvBeforeCuda(unittest.IsolatedAsyncioTestCase):
            "CUDA_LAUNCH_BLOCKING": "1",
        }

-        def setup_cuda_env(_: MonarchContext) -> None:
+        def setup_cuda_env() -> None:
            for name, value in cuda_env_vars.items():
                os.environ[name] = value

@@ -107,7 +106,7 @@ class TestEnvBeforeCuda(unittest.IsolatedAsyncioTestCase):
            "CUDA_DEVICE_MAX_CONNECTIONS": "1",
        }

-        def setup_cuda_env(_: MonarchContext) -> None:
+        def setup_cuda_env() -> None:
            for name, value in cuda_env_vars.items():
                os.environ[name] = value

tests/test_python_actors.py CHANGED
@@ -586,3 +586,15 @@ class TestActorMeshStop(unittest.IsolatedAsyncioTestCase):

        await am_2.print.call("hello 3")
        await am_2.log.call("hello 4")
+
+
+class PortedActor(Actor):
+    @endpoint(explicit_response_port=True)
+    def add(self, port: "Port[int]", b: int) -> None:
+        port.send(3 + b)
+
+
+def test_ported_actor():
+    proc_mesh = local_proc_mesh(gpus=1).get()
+    a = proc_mesh.spawn("port_actor", PortedActor).get()
+    assert 5 == a.add.call_one(2).get()
tests/test_tensor_engine.py CHANGED
@@ -8,7 +8,7 @@ import monarch
 import pytest
 import torch
 from monarch import remote
-from monarch.actor import Actor, endpoint, proc_mesh
+from monarch.actor import Actor, as_endpoint, endpoint, proc_mesh
 from monarch.mesh_controller import spawn_tensor_engine


@@ -104,3 +104,29 @@ def test_actor_tensor_ordering() -> None:
        results.append(counter.incr.call(1))

    assert list(range(10)) == [r.get().item(hosts=0, gpus=0) for r in results]
+
+
+class Linear(Actor):
+    def __init__(self, N: int, M: int):
+        self.weight = torch.zeros((N, M))
+
+    def forward(self, x) -> torch.Tensor:
+        return x @ self.weight
+
+    @endpoint(propagate="inspect")
+    def update(self, w: torch.Tensor) -> None:
+        self.weight += w
+
+
+@two_gpu
+def test_rref_actor() -> None:
+    pm = proc_mesh(gpus=1).get()
+    with pm.activate():
+        x = pm.spawn("linear", Linear, 3, 4).get()
+
+        y = torch.ones((4, 3))
+        t = as_endpoint(x.forward, propagate=lambda x: torch.rand(3, 4)).rref(y)
+        assert monarch.inspect(t.sum()).item() == 0
+        x.update.rref(torch.ones((3, 4)))
+        t = as_endpoint(x.forward, propagate=lambda x: torch.rand(3, 4)).rref(y)
+        assert monarch.inspect(t.sum()).item() == 3 * 4 * 4
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: torchmonarch-nightly
-Version: 2025.7.25
+Version: 2025.7.27
 Summary: Monarch: Single controller library
 Author: Meta
 Author-email: oncall+monarch@xmail.facebook.com
@@ -44,6 +44,8 @@ Note: Monarch is currently only supported on Linux systems

 ## Installation

+### On Fedora distributions
+
 `pip install torchmonarch-nightly`

 or manually
@@ -88,6 +90,37 @@ pip install --no-build-isolation -e .
 pytest python/tests/ -v -m "not oss_skip"
 ```

+### On MacOS
+
+You can also build Monarch to run locally on a MacOS system.
+
+Note that this does not support tensor engine, which is tied to CUDA and RDMA (via ibverbs).
+
+
+```sh
+
+# Create and activate the conda environment
+conda create -n monarchenv python=3.10 -y
+conda activate monarchenv
+
+# Install nightly rust toolchain
+curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
+rustup toolchain install nightly
+rustup default nightly
+
+# Install build dependencies
+pip install -r build-requirements.txt
+# Install test dependencies
+pip install -r python/tests/requirements.txt
+
+# Build and install Monarch
+USE_TENSOR_ENGINE=0 pip install --no-build-isolation .
+# or setup for development
+USE_TENSOR_ENGINE=0 pip install --no-build-isolation -e .
+
+```
+
+
 ## Running examples

 Check out the `examples/` directory for demonstrations of how to use Monarch's APIs.