torchmonarch-nightly 2025.7.25__cp313-cp313-manylinux2014_x86_64.whl → 2025.7.26__cp313-cp313-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
monarch/_rust_bindings.so CHANGED
Binary file
@@ -65,6 +65,7 @@ from monarch._src.actor.endpoint import (
65
65
  Endpoint,
66
66
  EndpointProperty,
67
67
  Extent,
68
+ NotAnEndpoint,
68
69
  Propagator,
69
70
  Selection,
70
71
  )
@@ -76,7 +77,7 @@ from monarch._src.actor.pickle import flatten, unflatten
76
77
  from monarch._src.actor.shape import MeshTrait, NDSlice
77
78
  from monarch._src.actor.sync_state import fake_sync_state
78
79
 
79
- from monarch._src.actor.tensor_engine_shim import actor_send
80
+ from monarch._src.actor.tensor_engine_shim import actor_rref, actor_send
80
81
 
81
82
  if TYPE_CHECKING:
82
83
  from monarch._src.actor.proc_mesh import ProcMesh
@@ -313,8 +314,7 @@ class ActorEndpoint(Endpoint[P, R]):
313
314
  """
314
315
  self._signature.bind(None, *args, **kwargs)
315
316
  objects, bytes = flatten((args, kwargs), _is_ref_or_mailbox)
316
- refs = [obj for obj in objects if hasattr(obj, "__monarch_ref__")]
317
- if not refs:
317
+ if all(not hasattr(obj, "__monarch_ref__") for obj in objects):
318
318
  message = PythonMessage(
319
319
  PythonMessageKind.CallMethod(
320
320
  self._name, None if port is None else port._port_ref
@@ -323,7 +323,7 @@ class ActorEndpoint(Endpoint[P, R]):
323
323
  )
324
324
  self._actor_mesh.cast(message, selection)
325
325
  else:
326
- actor_send(self, bytes, refs, port, selection)
326
+ actor_send(self, bytes, objects, port, selection)
327
327
  shape = self._actor_mesh._shape
328
328
  return Extent(shape.labels, shape.ndslice.sizes)
329
329
 
@@ -335,6 +335,26 @@ class ActorEndpoint(Endpoint[P, R]):
335
335
  ), "unexpected receiver type"
336
336
  return PortTuple(p, PortReceiver(self._mailbox, self._supervise(r._receiver)))
337
337
 
338
+ def _rref(self, args, kwargs):
339
+ self._signature.bind(None, *args, **kwargs)
340
+ refs, bytes = flatten((args, kwargs), _is_ref_or_mailbox)
341
+
342
+ return actor_rref(self, bytes, refs)
343
+
344
+
345
+ def as_endpoint(
346
+ not_an_endpoint: Callable[P, R], *, propagate: Propagator = None
347
+ ) -> Endpoint[P, R]:
348
+ if not isinstance(not_an_endpoint, NotAnEndpoint):
349
+ raise ValueError("expected an method of a spawned actor")
350
+ return ActorEndpoint(
351
+ not_an_endpoint._ref._actor_mesh_ref,
352
+ not_an_endpoint._name,
353
+ getattr(not_an_endpoint._ref, not_an_endpoint._name),
354
+ not_an_endpoint._ref._mailbox,
355
+ propagate,
356
+ )
357
+
338
358
 
339
359
  class Accumulator(Generic[P, R, A]):
340
360
  def __init__(
@@ -625,18 +645,23 @@ class _Actor:
625
645
  f" This is likely due to an earlier error: {self._saved_error}"
626
646
  )
627
647
  raise AssertionError(error_message)
628
- the_method = getattr(self.instance, method)._method
648
+ the_method = getattr(self.instance, method)
649
+ if isinstance(the_method, EndpointProperty):
650
+ module = the_method._method.__module__
651
+ the_method = functools.partial(the_method._method, self.instance)
652
+ else:
653
+ module = the_method.__module__
629
654
 
630
655
  if inspect.iscoroutinefunction(the_method):
631
656
 
632
657
  async def instrumented():
633
658
  enter_span(
634
- the_method.__module__,
659
+ module,
635
660
  method,
636
661
  str(ctx.mailbox.actor_id),
637
662
  )
638
663
  try:
639
- result = await the_method(self.instance, *args, **kwargs)
664
+ result = await the_method(*args, **kwargs)
640
665
  self._maybe_exit_debugger()
641
666
  except Exception as e:
642
667
  logging.critical(
@@ -649,9 +674,9 @@ class _Actor:
649
674
 
650
675
  result = await instrumented()
651
676
  else:
652
- enter_span(the_method.__module__, method, str(ctx.mailbox.actor_id))
677
+ enter_span(module, method, str(ctx.mailbox.actor_id))
653
678
  with fake_sync_state():
654
- result = the_method(self.instance, *args, **kwargs)
679
+ result = the_method(*args, **kwargs)
655
680
  self._maybe_exit_debugger()
656
681
  exit_span()
657
682
 
@@ -758,35 +783,14 @@ class ActorMeshRef(MeshTrait):
758
783
  attr_name,
759
784
  attr_value._method,
760
785
  self._mailbox,
786
+ attr_value._propagator,
761
787
  ),
762
788
  )
763
789
 
764
- def __getattr__(self, name: str) -> Any:
765
- # This method is called when an attribute is not found
766
- # For linting purposes, we need to tell the type checker that any attribute
767
- # could be an endpoint that's dynamically added at runtime
768
- # At runtime, we still want to raise AttributeError for truly missing attributes
769
-
770
- # Check if this is a method on the underlying class
771
- if hasattr(self._class, name):
772
- attr = getattr(self._class, name)
773
- if isinstance(attr, EndpointProperty):
774
- # Dynamically create the endpoint
775
- endpoint = ActorEndpoint(
776
- self._actor_mesh_ref,
777
- name,
778
- attr._method,
779
- self._mailbox,
780
- propagator=attr._propagator,
781
- )
782
- # Cache it for future use
783
- setattr(self, name, endpoint)
784
- return endpoint
785
-
786
- # If we get here, it's truly not found
787
- raise AttributeError(
788
- f"'{self.__class__.__name__}' object has no attribute '{name}'"
789
- )
790
+ def __getattr__(self, attr: str) -> NotAnEndpoint:
791
+ if attr in dir(self._class):
792
+ return NotAnEndpoint(self, attr)
793
+ raise AttributeError(attr)
790
794
 
791
795
  def _create(
792
796
  self,
@@ -34,6 +34,7 @@ from monarch._src.actor.tensor_engine_shim import _cached_propagation, fake_call
34
34
 
35
35
  if TYPE_CHECKING:
36
36
  from monarch._src.actor.actor_mesh import (
37
+ ActorMeshRef,
37
38
  HyPortReceiver,
38
39
  OncePortReceiver,
39
40
  Port,
@@ -182,11 +183,22 @@ class Endpoint(ABC, Generic[P, R]):
182
183
  # pyre-ignore
183
184
  send(self, args, kwargs)
184
185
 
186
+ @abstractmethod
187
+ def _rref(self, args, kwargs) -> Any: ...
188
+
189
+ def rref(self, *args: P.args, **kwargs: P.kwargs) -> R:
190
+ return self._rref(args, kwargs)
191
+
185
192
  def _propagate(self, args, kwargs, fake_args, fake_kwargs):
186
193
  if self._propagator_arg is None or self._propagator_arg == "cached":
187
194
  if self._cache is None:
188
195
  self._cache = {}
189
- return _cached_propagation(self._cache, self._resolvable, args, kwargs)
196
+ resolvable = getattr(self, "_resolvable", None)
197
+ if resolvable is None:
198
+ raise NotImplementedError(
199
+ "Cached propagation is not implemented for actor endpoints."
200
+ )
201
+ return _cached_propagation(self._cache, resolvable, args, kwargs)
190
202
  elif self._propagator_arg == "inspect":
191
203
  return None
192
204
  elif self._propagator_arg == "mocked":
@@ -229,13 +241,34 @@ class EndpointProperty(Generic[P, R]):
229
241
  return cast(Endpoint[P, R], self)
230
242
 
231
243
 
244
+ class NotAnEndpoint:
245
+ """
246
+ Used as the dynamic value of functions on an ActorMeshRef that were not marked as endpoints.
247
+ This is used both to give a better error message (since we cannot prevent the type system from thinking they are methods),
248
+ and to provide the opportunity for someone to do as_endpoint(x.foo) on something that wasn't marked as an endpoint.
249
+ """
250
+
251
+ def __init__(self, ref: "ActorMeshRef", name: str):
252
+ self._ref = ref
253
+ self._name = name
254
+
255
+ def __call__(self, *args, **kwargs) -> None:
256
+ raise RuntimeError(
257
+ f"Actor {self._ref._class}.{self._name} is not annotated as an endpoint. To call it as one, add a @endpoint decorator to it, or directly wrap it in one as_endpoint(obj.method).call(...)"
258
+ )
259
+
260
+
232
261
  # This can't just be Callable because otherwise we are not
233
262
  # allowed to use type arguments in the return value.
234
263
  class EndpointIfy:
235
264
  @overload
236
- def __call__(self, function: Callable[P, Awaitable[R]]) -> Endpoint[P, R]: ...
265
+ def __call__(
266
+ self, function: Callable[Concatenate[Any, P], Awaitable[R]]
267
+ ) -> Endpoint[P, R]: ...
237
268
  @overload
238
- def __call__(self, function: Callable[P, R]) -> Endpoint[P, R]: ...
269
+ def __call__(
270
+ self, function: Callable[Concatenate[Any, P], R]
271
+ ) -> Endpoint[P, R]: ...
239
272
 
240
273
  def __call__(self, function: Any):
241
274
  pass
@@ -14,7 +14,7 @@ import logging
14
14
  import threading
15
15
  from typing import Optional
16
16
 
17
- from libfb.py.pyre import none_throws
17
+ from pyre_extensions import none_throws
18
18
 
19
19
  logger = logging.getLogger(__name__)
20
20
 
@@ -43,7 +43,6 @@ from monarch._src.actor.actor_mesh import (
43
43
  Actor,
44
44
  ActorMeshRef,
45
45
  fake_sync_state,
46
- MonarchContext,
47
46
  )
48
47
 
49
48
  from monarch._src.actor.allocator import LocalAllocator, ProcessAllocator, SimAllocator
@@ -89,7 +88,7 @@ class SetupActor(Actor):
89
88
  Typically used to setup the environment variables.
90
89
  """
91
90
 
92
- def __init__(self, env: Callable[[MonarchContext], None]) -> None:
91
+ def __init__(self, env: Callable[[], None]) -> None:
93
92
  """
94
93
  Initialize the setup actor with the user defined setup method.
95
94
  """
@@ -100,8 +99,7 @@ class SetupActor(Actor):
100
99
  """
101
100
  Call the user defined setup method with the monarch context.
102
101
  """
103
- ctx = MonarchContext.get()
104
- self._setup_method(ctx)
102
+ self._setup_method()
105
103
 
106
104
 
107
105
  T = TypeVar("T")
@@ -114,7 +112,7 @@ except ImportError:
114
112
 
115
113
 
116
114
  async def _allocate_nonblocking(
117
- alloc: Alloc, setup: Callable[[MonarchContext], None] | None = None
115
+ alloc: Alloc, setup: Callable[[], None] | None = None
118
116
  ) -> "ProcMesh":
119
117
  _proc_mesh = await HyProcMesh.allocate_nonblocking(alloc)
120
118
  if setup is None:
@@ -211,7 +209,7 @@ class ProcMesh(MeshTrait):
211
209
 
212
210
  @classmethod
213
211
  def from_alloc(
214
- self, alloc: Alloc, setup: Callable[[MonarchContext], None] | None = None
212
+ self, alloc: Alloc, setup: Callable[[], None] | None = None
215
213
  ) -> Future["ProcMesh"]:
216
214
  """
217
215
  Allocate a process mesh according to the provided alloc.
@@ -219,7 +217,17 @@ class ProcMesh(MeshTrait):
219
217
 
220
218
  Arguments:
221
219
  - `alloc`: The alloc to allocate according to.
222
- - `setup`: A lambda taking MonarchContext as param, can be used to setup env vars on the allocated mesh
220
+ - `setup`: An optional lambda function to configure environment variables on the allocated mesh.
221
+ Use the `current_rank()` method within the lambda to obtain the rank.
222
+
223
+ Example of a setup method to initialize torch distributed environment variables:
224
+ ```
225
+ def setup():
226
+ rank = current_rank()
227
+ os.environ["RANK"] = str(rank)
228
+ os.environ["WORLD_SIZE"] = str(len(rank.shape))
229
+ os.environ["LOCAL_RANK"] = str(rank["gpus"])
230
+ ```
223
231
  """
224
232
  return Future(
225
233
  impl=lambda: _allocate_nonblocking(alloc, setup),
@@ -428,7 +436,7 @@ async def proc_mesh_nonblocking(
428
436
  gpus: Optional[int] = None,
429
437
  hosts: int = 1,
430
438
  env: dict[str, str] | None = None,
431
- setup: Callable[[MonarchContext], None] | None = None,
439
+ setup: Callable[[], None] | None = None,
432
440
  ) -> ProcMesh:
433
441
  if gpus is None:
434
442
  gpus = _local_device_count()
@@ -457,7 +465,7 @@ def proc_mesh(
457
465
  gpus: Optional[int] = None,
458
466
  hosts: int = 1,
459
467
  env: dict[str, str] | None = None,
460
- setup: Callable[[MonarchContext], None] | None = None,
468
+ setup: Callable[[], None] | None = None,
461
469
  ) -> Future[ProcMesh]:
462
470
  return Future(
463
471
  impl=lambda: proc_mesh_nonblocking(
@@ -19,7 +19,6 @@ time it is used.
19
19
 
20
20
  if TYPE_CHECKING:
21
21
  from monarch._src.actor.actor_mesh import ActorEndpoint, Port, Selection
22
- from monarch._src.actor.endpoint import Endpoint
23
22
 
24
23
 
25
24
  def shim(fn=None, *, module=None):
@@ -48,8 +47,12 @@ def actor_send(
48
47
  ) -> None: ...
49
48
 
50
49
 
50
+ @shim(module="monarch.mesh_controller")
51
+ def actor_rref(endpoint, args_kwargs_tuple: bytes, refs: Sequence[Any]): ...
52
+
53
+
51
54
  @shim(module="monarch.common.remote")
52
- def _cached_propagation(_cache, rfunction: "Endpoint", args, kwargs) -> Any: ...
55
+ def _cached_propagation(_cache, rfunction, args, kwargs) -> Any: ...
53
56
 
54
57
 
55
58
  @shim(module="monarch.common.fake")
monarch/actor/__init__.py CHANGED
@@ -12,6 +12,7 @@ from monarch._src.actor.actor_mesh import (
12
12
  Accumulator,
13
13
  Actor,
14
14
  ActorError,
15
+ as_endpoint,
15
16
  current_actor_name,
16
17
  current_rank,
17
18
  current_size,
@@ -35,6 +36,7 @@ __all__ = [
35
36
  "Actor",
36
37
  "ActorError",
37
38
  "current_actor_name",
39
+ "as_endpoint",
38
40
  "current_rank",
39
41
  "current_size",
40
42
  "endpoint",
@@ -435,6 +435,15 @@ class SendResultOfActorCall(NamedTuple):
435
435
  stream: tensor_worker.StreamRef
436
436
 
437
437
 
438
+ class CallActorMethod(NamedTuple):
439
+ seq: int
440
+ result: object
441
+ broker_id: Tuple[str, int]
442
+ local_state: Sequence[Tensor | tensor_worker.Ref]
443
+ mutates: List[tensor_worker.Ref]
444
+ stream: tensor_worker.StreamRef
445
+
446
+
438
447
  class SplitComm(NamedTuple):
439
448
  dims: Dims
440
449
  device_mesh: DeviceMesh
monarch/common/remote.py CHANGED
@@ -157,7 +157,7 @@ class Remote(Generic[P, R], Endpoint[P, R]):
157
157
  def _maybe_resolvable(self):
158
158
  return None if self._remote_impl is None else self._resolvable
159
159
 
160
- def rref(self, *args: P.args, **kwargs: P.kwargs) -> R:
160
+ def _rref(self, args, kwargs):
161
161
  return dtensor_dispatch(
162
162
  self._resolvable,
163
163
  self._propagate,
@@ -352,7 +352,7 @@ _miss = 0
352
352
  _hit = 0
353
353
 
354
354
 
355
- def _cached_propagation(_cache, rfunction: Endpoint, args, kwargs):
355
+ def _cached_propagation(_cache, rfunction: ResolvableFunction, args, kwargs):
356
356
  tensors, shape_key = hashable_tensor_flatten(args, kwargs)
357
357
  # pyre-ignore
358
358
  inputs_group = TensorGroup([t._fake for t in tensors])
Binary file
@@ -30,6 +30,7 @@ from monarch._rust_bindings.monarch_extension.client import ( # @manual=//monar
30
30
  WorldState,
31
31
  )
32
32
  from monarch._rust_bindings.monarch_extension.mesh_controller import _Controller
33
+ from monarch._rust_bindings.monarch_extension.tensor_worker import Ref
33
34
  from monarch._rust_bindings.monarch_hyperactor.actor import (
34
35
  PythonMessage,
35
36
  PythonMessageKind,
@@ -44,10 +45,12 @@ from monarch._src.actor.endpoint import Selection
44
45
  from monarch._src.actor.shape import NDSlice
45
46
  from monarch.common import device_mesh, messages, stream
46
47
  from monarch.common.controller_api import TController
48
+ from monarch.common.function import ResolvableFunction
47
49
  from monarch.common.invocation import Seq
48
50
  from monarch.common.messages import Referenceable, SendResultOfActorCall
49
51
  from monarch.common.stream import StreamRef
50
- from monarch.common.tensor import InputChecker, Tensor
52
+ from monarch.common.tensor import dtensor_check, InputChecker, Tensor
53
+ from monarch.common.tree import flatten
51
54
  from monarch.tensor_worker_main import _set_trace
52
55
 
53
56
  if TYPE_CHECKING:
@@ -265,17 +268,36 @@ class RemoteException(Exception):
265
268
  return "<exception formatting RemoteException>"
266
269
 
267
270
 
268
- def actor_send(
271
+ def _cast_call_method_indirect(
269
272
  endpoint: ActorEndpoint,
273
+ selection: Selection,
274
+ client: MeshClient,
275
+ seq: Seq,
270
276
  args_kwargs_tuple: bytes,
271
277
  refs: Sequence[Any],
272
- port: Optional[Port[Any]],
273
- selection: Selection,
274
- ):
278
+ ) -> Tuple[str, int]:
275
279
  unflatten_args = [
276
280
  UnflattenArg.PyObject if isinstance(ref, Tensor) else UnflattenArg.Mailbox
277
281
  for ref in refs
278
282
  ]
283
+ broker_id: Tuple[str, int] = client._mesh_controller.broker_id
284
+ actor_msg = PythonMessage(
285
+ PythonMessageKind.CallMethodIndirect(
286
+ endpoint._name, broker_id, seq, unflatten_args
287
+ ),
288
+ args_kwargs_tuple,
289
+ )
290
+ endpoint._actor_mesh.cast(actor_msg, selection)
291
+ return broker_id
292
+
293
+
294
+ def actor_send(
295
+ endpoint: ActorEndpoint,
296
+ args_kwargs_tuple: bytes,
297
+ refs: Sequence[Any],
298
+ port: Optional[Port[Any]],
299
+ selection: Selection,
300
+ ):
279
301
  tensors = [ref for ref in refs if isinstance(ref, Tensor)]
280
302
  # we have some monarch references, we need to ensure their
281
303
  # proc_mesh matches that of the tensors we sent to it
@@ -284,7 +306,7 @@ def actor_send(
284
306
  if hasattr(t, "stream"):
285
307
  chosen_stream = t.stream
286
308
  break
287
- with InputChecker(refs, lambda x: f"actor_call({x})") as checker:
309
+ with InputChecker(tensors, lambda x: f"actor_call({x})") as checker:
288
310
  checker.check_mesh_stream_local(device_mesh._active, chosen_stream)
289
311
  # TODO: move propagators into Endpoint abstraction and run the propagator to get the
290
312
  # mutates
@@ -300,8 +322,6 @@ def actor_send(
300
322
 
301
323
  client = cast(MeshClient, checker.mesh.client)
302
324
 
303
- broker_id: Tuple[str, int] = client._mesh_controller.broker_id
304
-
305
325
  stream_ref = chosen_stream._to_ref(client)
306
326
 
307
327
  fut = (port, checker.mesh._ndslice) if port is not None else None
@@ -316,13 +336,9 @@ def actor_send(
316
336
  # The message to the generic actor tells it to first wait on the broker to get the local arguments
317
337
  # from the stream, then it will run the actor method, and send the result to response port.
318
338
 
319
- actor_msg = PythonMessage(
320
- PythonMessageKind.CallMethodIndirect(
321
- endpoint._name, broker_id, ident, unflatten_args
322
- ),
323
- args_kwargs_tuple,
339
+ broker_id = _cast_call_method_indirect(
340
+ endpoint, selection, client, ident, args_kwargs_tuple, refs
324
341
  )
325
- endpoint._actor_mesh.cast(actor_msg, selection)
326
342
  worker_msg = SendResultOfActorCall(ident, broker_id, tensors, [], stream_ref)
327
343
  client.send(checker.mesh._ndslice, worker_msg)
328
344
  # we have to ask for status updates
@@ -330,3 +346,49 @@ def actor_send(
330
346
  # enough work to count this future as finished,
331
347
  # and all potential errors have been reported
332
348
  client._request_status()
349
+
350
+
351
+ def actor_rref(endpoint, args_kwargs_tuple: bytes, refs: Sequence[Any]):
352
+ chosen_stream = stream._active
353
+ fake_result, dtensors, mutates, mesh = dtensor_check(
354
+ endpoint._propagate,
355
+ cast(ResolvableFunction, endpoint._name),
356
+ refs,
357
+ {},
358
+ device_mesh._active,
359
+ chosen_stream,
360
+ )
361
+ assert mesh is not None
362
+
363
+ fake_result_dtensors, unflatten_result = flatten(
364
+ fake_result, lambda x: isinstance(x, torch.Tensor)
365
+ )
366
+ result_dtensors = tuple(
367
+ Tensor(fake, mesh, chosen_stream) for fake in fake_result_dtensors
368
+ )
369
+ seq = mesh.client.new_node(result_dtensors + mutates, dtensors)
370
+ assert all(t.ref is not None for t in result_dtensors)
371
+ assert all(t.ref is not None for t in mutates)
372
+ result = result_msg = unflatten_result(result_dtensors)
373
+ if len(result_dtensors) == 0:
374
+ result_msg = None
375
+
376
+ broker_id = _cast_call_method_indirect(
377
+ endpoint, "all", mesh.client, seq, args_kwargs_tuple, refs
378
+ )
379
+ # note the device mesh has to be defined regardless so the remote functions
380
+ # can invoke mesh.rank("...")
381
+
382
+ mesh.define_remotely()
383
+
384
+ mesh._send(
385
+ messages.CallActorMethod(
386
+ seq,
387
+ result_msg,
388
+ broker_id,
389
+ refs,
390
+ cast("List[Ref]", mutates),
391
+ stream._active._to_ref(mesh.client),
392
+ )
393
+ )
394
+ return result
Binary file
monarch/tools/cli.py CHANGED
@@ -86,9 +86,9 @@ class CreateCmd:
86
86
  else defaults.component_fn(config.scheduler)
87
87
  )
88
88
  component_args = component_args_from_cli(component_fn, args.component_args)
89
- appdef = component_fn(**component_args)
89
+ config.appdef = component_fn(**component_args)
90
90
 
91
- handle = create(config, appdef)
91
+ handle = create(config)
92
92
  print(handle)
93
93
 
94
94
 
monarch/tools/commands.py CHANGED
@@ -7,18 +7,19 @@
7
7
  # pyre-strict
8
8
 
9
9
  import argparse
10
+ import asyncio
10
11
  import inspect
11
12
  import logging
12
13
  import os
13
- import time
14
- from datetime import timedelta
14
+ from datetime import datetime, timedelta
15
15
  from typing import Any, Callable, Mapping, Optional, Union
16
16
 
17
+ from monarch.tools.components.hyperactor import DEFAULT_NAME
18
+
17
19
  from monarch.tools.config import ( # @manual=//monarch/python/monarch/tools/config/meta:defaults
18
20
  Config,
19
21
  defaults,
20
22
  )
21
-
22
23
  from monarch.tools.mesh_spec import mesh_spec_from_metadata, ServerSpec
23
24
  from torchx.runner import Runner # @manual=//torchx/runner:lib_core
24
25
  from torchx.specs import AppDef, AppDryRunInfo, AppState, CfgVal, parse_app_handle
@@ -83,7 +84,7 @@ def component_args_from_cli(
83
84
 
84
85
  def create(
85
86
  config: Config,
86
- appdef: AppDef,
87
+ name: str = DEFAULT_NAME,
87
88
  ) -> Union[str, AppDryRunInfo]:
88
89
  """Creates a monarch server by submitting it as a job to the target scheduler.
89
90
 
@@ -94,7 +95,7 @@ def create(
94
95
  from monarch.tools.config import defaults
95
96
 
96
97
  config = defaults.config(scheduler="slurm")
97
- appdef = defaults.component_fn(scheduler=config.scheduler)()
98
+ config.appdef = defaults.component_fn(scheduler=config.scheduler)()
98
99
 
99
100
  config.scheduler_args.update(
100
101
  {
@@ -105,7 +106,7 @@ def create(
105
106
  )
106
107
  config.dryrun = True
107
108
 
108
- create(config, appdef)
109
+ create(config)
109
110
 
110
111
 
111
112
  Args:
@@ -114,6 +115,7 @@ def create(
114
115
  component_fn: a function that returns the AppDef (job def).
115
116
  If not provided, defaults to the configured default for the scheduler
116
117
  (in most cases ``monarch.tools.components.hyperactor.proc_mesh``)
118
+ name: the name of the job. If none, a default job name will be created.
117
119
  """
118
120
  scheduler: str = config.scheduler
119
121
  cfg: Mapping[str, CfgVal] = config.scheduler_args
@@ -122,6 +124,8 @@ def create(
122
124
  os.environ["TORCHX_CONTEXT_NAME"] = os.getenv("TORCHX_CONTEXT_NAME", "monarch")
123
125
 
124
126
  with torchx_runner() as runner:
127
+ appdef: AppDef = AppDef(name, config.appdef.roles, config.appdef.metadata)
128
+
125
129
  info = runner.dryrun(appdef, scheduler, cfg, config.workspace)
126
130
 
127
131
  info_json_fmt = AppDryRunInfo(
@@ -170,6 +174,8 @@ def info(server_handle: str) -> Optional[ServerSpec]:
170
174
  # null-guard since some schedulers do not fill replica_status
171
175
  if host_status := replica_status.get(role.name):
172
176
  spec.hostnames = [h.hostname for h in host_status]
177
+ # the mesh status is based on the "least progressive" replica status
178
+ spec.state = min(h.state for h in host_status)
173
179
 
174
180
  mesh_specs.append(spec)
175
181
 
@@ -211,6 +217,8 @@ async def server_ready(
211
217
 
212
218
  """
213
219
 
220
+ check_interval_seconds = check_interval.total_seconds()
221
+ start = datetime.now()
214
222
  while True:
215
223
  server_spec = info(server_handle)
216
224
 
@@ -220,42 +228,56 @@ async def server_ready(
220
228
  if server_spec.state <= AppState.PENDING: # UNSUBMITTED or SUBMITTED or PENDING
221
229
  # NOTE: TorchX currently does not have async APIs so need to loop-on-interval
222
230
  # TODO maybe inverse exponential backoff instead of constant interval?
223
- check_interval_seconds = check_interval.total_seconds()
224
- logger.info(
225
- "waiting for %s to be %s (current: %s), will check again in %g seconds...",
226
- server_handle,
227
- AppState.RUNNING,
228
- server_spec.state,
229
- check_interval_seconds,
231
+ print(
232
+ f"Waiting for {server_handle} to be {AppState.RUNNING} (current: {server_spec.state}); "
233
+ f"will check again in {check_interval_seconds} seconds. "
234
+ f"Total wait time: {datetime.now() - start}",
235
+ end="\r",
230
236
  )
231
- time.sleep(check_interval_seconds)
237
+ await asyncio.sleep(check_interval_seconds)
232
238
  continue
233
- else:
234
- return server_spec
235
-
236
239
 
240
+ # check if hosts are allocated for all the meshes
241
+ if server_spec.state == AppState.RUNNING:
242
+ running = True
243
+ for mesh_spec in server_spec.meshes:
244
+ if mesh_spec.state <= AppState.PENDING:
245
+ print(
246
+ f"Job {server_handle} is running but waiting for mesh {mesh_spec.name} "
247
+ f"to be {AppState.RUNNING} (current: {mesh_spec.state}); "
248
+ f"will check again in {check_interval_seconds} seconds. "
249
+ f"Total wait time: {datetime.now() - start}",
250
+ end="\r",
251
+ )
252
+ running = False
253
+ break
254
+ if not running:
255
+ await asyncio.sleep(check_interval_seconds)
256
+ continue
257
+
258
+ return server_spec
259
+
260
+
261
+ # TODO: this API is overloaded. Ideally, we would not need a config to get, or a handle to create.
237
262
  async def get_or_create(
238
263
  name: str,
239
264
  config: Config,
240
- appdef: AppDef,
241
265
  check_interval: timedelta = _5_SECONDS,
242
266
  ) -> ServerSpec:
243
- """Waits for the server called `name` in the scheduler specified in the `config`
267
+ """Waits for the server based on identity `name` in the scheduler specified in the `config`
244
268
  to be ready (e.g. RUNNING). If the server is not found then this function creates one
245
- per the `appdef` spec, and waits for the server to be ready before returning.
269
+ per the `config` spec, and waits for the server to be ready before returning.
246
270
 
247
271
  Usage:
248
272
 
249
273
  .. code-block:: python
250
274
 
251
- import getpass
252
275
  from monarch.tools.config import defaults
253
276
 
254
- USER = getpass.getuser()
255
277
  config = defaults.config(scheduler)
256
- appdef = defaults.component_fn(config.scheduler)()
278
+ config.appdef = defaults.component_fn(config.scheduler)()
257
279
 
258
- server_handle = get_or_create(f"{USER}_monarch", config, appdef)
280
+ server_handle = get_or_create(name="my_job_name", config=config)
259
281
  server_info = info(server_handle)
260
282
 
261
283
  Returns: A `ServerSpec` containing information about either the existing or the newly
@@ -273,7 +295,7 @@ async def get_or_create(
273
295
  )
274
296
 
275
297
  # no dryrun (see assertion above) support so will always be a handle (str)
276
- new_server_handle = str(create(config, appdef))
298
+ new_server_handle = str(create(config, name))
277
299
 
278
300
  logger.info(f"created new `{new_server_handle}` waiting for it to be ready...")
279
301
 
@@ -289,10 +311,10 @@ async def get_or_create(
289
311
  f"the new server `{new_server_handle}` has {server_info.state}"
290
312
  )
291
313
 
292
- logger.info(f"server `{new_server_handle}` is: {server_info.state}")
314
+ print(f"\x1b[36mNew job `{new_server_handle}` is ready to serve. \x1b[0m")
293
315
  return server_info
294
316
  else:
295
- logger.info("found existing RUNNING server `%s`", server_handle)
317
+ print(f"\x1b[36mFound existing job `{server_handle}` ready to serve. \x1b[0m")
296
318
  return server_info
297
319
 
298
320
 
@@ -9,6 +9,7 @@ import getpass
9
9
  from typing import Optional
10
10
 
11
11
  from monarch.tools import mesh_spec
12
+ from monarch.tools.config import UnnamedAppDef
12
13
  from monarch.tools.mesh_spec import mesh_spec_from_str
13
14
  from torchx import specs
14
15
 
@@ -16,17 +17,18 @@ _DEFAULT_MESHES = ["mesh_0:1:gpu.small"]
16
17
 
17
18
  _USER: str = getpass.getuser()
18
19
 
20
+ DEFAULT_NAME: str = f"monarch-{_USER}"
21
+
19
22
  __version__ = "latest" # TODO get version from monarch.__version_
20
23
 
21
24
 
22
25
  def proc_mesh(
23
- name: str = f"monarch-{_USER}",
24
26
  image: str = f"ghcr.io/pytorch-labs/monarch:{__version__}", # TODO docker needs to be built and pushed to ghcr
25
27
  meshes: list[str] = _DEFAULT_MESHES,
26
28
  env: Optional[dict[str, str]] = None,
27
29
  port: int = mesh_spec.DEFAULT_REMOTE_ALLOCATOR_PORT,
28
30
  program: str = "monarch_bootstrap", # installed with monarch wheel (as console script)
29
- ) -> specs.AppDef:
31
+ ) -> UnnamedAppDef:
30
32
  """
31
33
  Args:
32
34
  name: the name of the monarch server job
@@ -37,7 +39,7 @@ def proc_mesh(
37
39
  program: path to the binary that the remote process allocator spawns on an allocation request
38
40
  """
39
41
 
40
- appdef = specs.AppDef(name)
42
+ appdef = UnnamedAppDef()
41
43
 
42
44
  for mesh in [mesh_spec_from_str(mesh) for mesh in meshes]:
43
45
  mesh_role = specs.Role(
@@ -6,15 +6,32 @@
6
6
 
7
7
  # pyre-strict
8
8
  from dataclasses import dataclass, field
9
- from typing import Any, Optional
9
+ from typing import Any, Dict, List, Optional
10
+
11
+ from torchx.specs import Role
10
12
 
11
13
 
12
14
  NOT_SET: str = "__NOT_SET__"
13
15
 
14
16
 
17
+ @dataclass
18
+ class UnnamedAppDef:
19
+ """
20
+ A TorchX AppDef without a name.
21
+ """
22
+
23
+ roles: List[Role] = field(default_factory=list)
24
+ metadata: Dict[str, str] = field(default_factory=dict)
25
+
26
+
15
27
  @dataclass
16
28
  class Config:
29
+ """
30
+ All configs needed to schedule a mesh of allocators.
31
+ """
32
+
17
33
  scheduler: str = NOT_SET
18
34
  scheduler_args: dict[str, Any] = field(default_factory=dict)
19
35
  workspace: Optional[str] = None
20
36
  dryrun: bool = False
37
+ appdef: UnnamedAppDef = UnnamedAppDef()
@@ -11,7 +11,7 @@
11
11
  from typing import Callable, Optional
12
12
 
13
13
  from monarch.tools.components import hyperactor
14
- from monarch.tools.config import Config
14
+ from monarch.tools.config import Config, UnnamedAppDef
15
15
 
16
16
  from torchx import specs
17
17
  from torchx.schedulers import (
@@ -23,7 +23,7 @@ from torchx.schedulers import (
23
23
  )
24
24
 
25
25
 
26
- def component_fn(scheduler: str) -> Callable[..., specs.AppDef]:
26
+ def component_fn(scheduler: str) -> Callable[..., UnnamedAppDef]:
27
27
  """The default TorchX component function for the scheduler"""
28
28
  return hyperactor.proc_mesh
29
29
 
@@ -9,6 +9,8 @@ import string
9
9
  from dataclasses import dataclass, field
10
10
  from typing import Any, Optional
11
11
 
12
+ from monarch.tools.config import UnnamedAppDef
13
+
12
14
  from monarch.tools.network import get_sockaddr
13
15
  from torchx import specs
14
16
  from torchx.specs.api import is_terminal
@@ -39,6 +41,7 @@ class MeshSpec:
39
41
  transport: str = "tcp"
40
42
  port: int = DEFAULT_REMOTE_ALLOCATOR_PORT
41
43
  hostnames: list[str] = field(default_factory=list)
44
+ state: specs.AppState = specs.AppState.UNSUBMITTED
42
45
 
43
46
  def server_addrs(
44
47
  self, transport: Optional[str] = None, port: Optional[int] = None
@@ -69,7 +72,7 @@ def _tag(mesh_name: str, tag_template: str) -> str:
69
72
  return string.Template(tag_template).substitute(mesh_name=mesh_name)
70
73
 
71
74
 
72
- def tag_as_metadata(mesh_spec: MeshSpec, appdef: specs.AppDef) -> None:
75
+ def tag_as_metadata(mesh_spec: MeshSpec, appdef: UnnamedAppDef) -> None:
73
76
  appdef.metadata[_tag(mesh_spec.name, _TAG_HOST_TYPE)] = mesh_spec.host_type
74
77
  appdef.metadata[_tag(mesh_spec.name, _TAG_GPUS)] = str(mesh_spec.gpus)
75
78
  appdef.metadata[_tag(mesh_spec.name, _TAG_TRANSPORT)] = mesh_spec.transport
tests/test_allocator.py CHANGED
@@ -33,7 +33,6 @@ from monarch._rust_bindings.monarch_hyperactor.channel import (
33
33
  ChannelTransport,
34
34
  )
35
35
 
36
- from monarch._src.actor.actor_mesh import MonarchContext
37
36
  from monarch._src.actor.allocator import (
38
37
  ALLOC_LABEL_PROC_MESH_NAME,
39
38
  LocalAllocator,
@@ -160,7 +159,7 @@ class TestSetupActorInAllocator(unittest.IsolatedAsyncioTestCase):
160
159
  "TEST_ENV_VAR_3": "value_3",
161
160
  }
162
161
 
163
- def setup_multiple_env_vars(ctx: MonarchContext) -> None:
162
+ def setup_multiple_env_vars() -> None:
164
163
  for name, value in env_vars.items():
165
164
  os.environ[name] = value
166
165
 
@@ -184,36 +183,33 @@ class TestSetupActorInAllocator(unittest.IsolatedAsyncioTestCase):
184
183
  await proc_mesh.stop()
185
184
 
186
185
  async def test_setup_lambda_with_context_info(self) -> None:
187
- """Test that the setup lambda can access context information"""
188
- context_var_name: str = "PROC_MESH_CONTEXT_INFO"
186
+ """Test that the setup lambda can access rank information"""
187
+ context_var_name: str = "PROC_MESH_RANK_INFO"
189
188
 
190
- def setup_with_context(ctx: MonarchContext) -> None:
191
- context_info = f"proc_id:{ctx.proc_id},point_rank:{ctx.point.rank}"
189
+ def setup_with_rank() -> None:
190
+ context_info = f"point_rank:{current_rank().rank}"
192
191
  os.environ[context_var_name] = context_info
193
192
 
194
193
  spec = AllocSpec(AllocConstraints(), gpus=1, hosts=1)
195
194
  allocator = LocalAllocator()
196
195
  alloc = await allocator.allocate(spec)
197
196
 
198
- proc_mesh = await ProcMesh.from_alloc(alloc, setup=setup_with_context)
197
+ proc_mesh = await ProcMesh.from_alloc(alloc, setup=setup_with_rank)
199
198
 
200
199
  try:
201
200
  actor = await proc_mesh.spawn("env_check", EnvCheckActor)
202
201
 
203
- context_info = await actor.get_env_var.call_one(context_var_name)
202
+ rank_info = await actor.get_env_var.call_one(context_var_name)
204
203
 
205
204
  self.assertNotEqual(
206
- context_info,
205
+ rank_info,
207
206
  "NOT_SET",
208
207
  "Context information was not stored in the environment variable",
209
208
  )
210
- self.assertIn(
211
- "proc_id:", context_info, "Context information does not contain proc_id"
212
- )
213
209
  self.assertIn(
214
210
  "point_rank:0",
215
- context_info,
216
- f"Context information {context_info} does not contain point_rank",
211
+ rank_info,
212
+ f"Context information {rank_info} does not contain point_rank",
217
213
  )
218
214
  finally:
219
215
  await proc_mesh.stop()
@@ -435,7 +431,7 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
435
431
  test_var_name: str = "TEST_ENV_VAR_FOR_PROC_MESH"
436
432
  test_var_value: str = "test_value_123"
437
433
 
438
- def setup_env_vars(ctx: MonarchContext) -> None:
434
+ def setup_env_vars() -> None:
439
435
  os.environ[test_var_name] = test_var_value
440
436
 
441
437
  hosts = 2
@@ -15,7 +15,6 @@ import cloudpickle
15
15
 
16
16
  import torch
17
17
  from monarch._rust_bindings.monarch_hyperactor.alloc import AllocConstraints, AllocSpec
18
- from monarch._src.actor.actor_mesh import MonarchContext
19
18
  from monarch._src.actor.allocator import LocalAllocator
20
19
  from monarch._src.actor.proc_mesh import proc_mesh
21
20
  from monarch.actor import Actor, endpoint, ProcMesh
@@ -70,7 +69,7 @@ class TestEnvBeforeCuda(unittest.IsolatedAsyncioTestCase):
70
69
  "CUDA_LAUNCH_BLOCKING": "1",
71
70
  }
72
71
 
73
- def setup_cuda_env(_: MonarchContext) -> None:
72
+ def setup_cuda_env() -> None:
74
73
  for name, value in cuda_env_vars.items():
75
74
  os.environ[name] = value
76
75
 
@@ -107,7 +106,7 @@ class TestEnvBeforeCuda(unittest.IsolatedAsyncioTestCase):
107
106
  "CUDA_DEVICE_MAX_CONNECTIONS": "1",
108
107
  }
109
108
 
110
- def setup_cuda_env(_: MonarchContext) -> None:
109
+ def setup_cuda_env() -> None:
111
110
  for name, value in cuda_env_vars.items():
112
111
  os.environ[name] = value
113
112
 
@@ -8,7 +8,7 @@ import monarch
8
8
  import pytest
9
9
  import torch
10
10
  from monarch import remote
11
- from monarch.actor import Actor, endpoint, proc_mesh
11
+ from monarch.actor import Actor, as_endpoint, endpoint, proc_mesh
12
12
  from monarch.mesh_controller import spawn_tensor_engine
13
13
 
14
14
 
@@ -104,3 +104,29 @@ def test_actor_tensor_ordering() -> None:
104
104
  results.append(counter.incr.call(1))
105
105
 
106
106
  assert list(range(10)) == [r.get().item(hosts=0, gpus=0) for r in results]
107
+
108
+
109
+ class Linear(Actor):
110
+ def __init__(self, N: int, M: int):
111
+ self.weight = torch.zeros((N, M))
112
+
113
+ def forward(self, x) -> torch.Tensor:
114
+ return x @ self.weight
115
+
116
+ @endpoint(propagate="inspect")
117
+ def update(self, w: torch.Tensor) -> None:
118
+ self.weight += w
119
+
120
+
121
+ @two_gpu
122
+ def test_rref_actor() -> None:
123
+ pm = proc_mesh(gpus=1).get()
124
+ with pm.activate():
125
+ x = pm.spawn("linear", Linear, 3, 4).get()
126
+
127
+ y = torch.ones((4, 3))
128
+ t = as_endpoint(x.forward, propagate=lambda x: torch.rand(3, 4)).rref(y)
129
+ assert monarch.inspect(t.sum()).item() == 0
130
+ x.update.rref(torch.ones((3, 4)))
131
+ t = as_endpoint(x.forward, propagate=lambda x: torch.rand(3, 4)).rref(y)
132
+ assert monarch.inspect(t.sum()).item() == 3 * 4 * 4
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: torchmonarch-nightly
3
- Version: 2025.7.25
3
+ Version: 2025.7.26
4
4
  Summary: Monarch: Single controller library
5
5
  Author: Meta
6
6
  Author-email: oncall+monarch@xmail.facebook.com
@@ -44,6 +44,8 @@ Note: Monarch is currently only supported on Linux systems
44
44
 
45
45
  ## Installation
46
46
 
47
+ ### On Fedora distributions
48
+
47
49
  `pip install torchmonarch-nightly`
48
50
 
49
51
  or manually
@@ -88,6 +90,37 @@ pip install --no-build-isolation -e .
88
90
  pytest python/tests/ -v -m "not oss_skip"
89
91
  ```
90
92
 
93
+ ### On MacOS
94
+
95
+ You can also build Monarch to run locally on a MacOS system.
96
+
97
+ Note that this does not support tensor engine, which is tied to CUDA and RDMA (via ibverbs).
98
+
99
+
100
+ ```sh
101
+
102
+ # Create and activate the conda environment
103
+ conda create -n monarchenv python=3.10 -y
104
+ conda activate monarchenv
105
+
106
+ # Install nightly rust toolchain
107
+ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
108
+ rustup toolchain install nightly
109
+ rustup default nightly
110
+
111
+ # Install build dependencies
112
+ pip install -r build-requirements.txt
113
+ # Install test dependencies
114
+ pip install -r python/tests/requirements.txt
115
+
116
+ # Build and install Monarch
117
+ USE_TENSOR_ENGINE=0 pip install --no-build-isolation .
118
+ # or setup for development
119
+ USE_TENSOR_ENGINE=0 pip install --no-build-isolation -e .
120
+
121
+ ```
122
+
123
+
91
124
  ## Running examples
92
125
 
93
126
  Check out the `examples/` directory for demonstrations of how to use Monarch's APIs.
@@ -1,5 +1,5 @@
1
1
  monarch/__init__.py,sha256=mgKiyD1kxky-1pvhMlNfF4VmxWnhi-FSYZNFzkW1BEM,7052
2
- monarch/_rust_bindings.so,sha256=WY-nbm2FEKDjmEtBkutsLWHrfUdm-tBlCOT5fezUibI,47609504
2
+ monarch/_rust_bindings.so,sha256=KOHytC_5b5wSnqlZT1gxMJYTwasEteUAtGScllx_taQ,47706680
3
3
  monarch/_testing.py,sha256=_3MYNMq-_0T1qXCj2vxrW13GlWGdUuVFMskQF2Gsw_o,7877
4
4
  monarch/actor_mesh.py,sha256=VtPU9syi_vUdwDSJJ639Z4Y_EcWZUScyoj0lQ88RQPs,421
5
5
  monarch/bootstrap_main.py,sha256=39OZpNMrfvvNJf-iwuNzgslzYA_ItaRPHfXGn_V74N0,524
@@ -7,8 +7,8 @@ monarch/cached_remote_function.py,sha256=kYdB6r4OHx_T_uX4q3tCNcp1t2DJwF8tPTIahUi
7
7
  monarch/fetch.py,sha256=JMxC8HdWMvpik0T4E6e-gfxvmNnOkA0ul4eo4R3Jg_o,1712
8
8
  monarch/gradient_generator.py,sha256=Rl3dmXGceTdCc1mYBg2JciR88ywGPnW7TVkL86KwqEA,6366
9
9
  monarch/memory.py,sha256=ol86dBhFAJqg78iF25-BuK0wuwj1onR8FIioZ_B0gjw,1377
10
- monarch/mesh_controller.py,sha256=uFAExVNzMGoeQI0kmuv4-kMbKHn420oeaG1rTYFi2sg,11884
11
- monarch/monarch_controller,sha256=j2oEc0o6s6E7YZJejYRdx3ud0KXEiO6WJf_64aJA1L4,21222144
10
+ monarch/mesh_controller.py,sha256=mOk2misobJun2AgR_ALjFoopAEcOPYQVrrAJXa18ZTs,13810
11
+ monarch/monarch_controller,sha256=nN4OheB1gPNUBr-l9Oj0p5uNBob1nFg5SI5fBtfnM3o,21248048
12
12
  monarch/notebook.py,sha256=zu9MKDFKf1-rCM2TqFSRJjMBeiWuKcJSyUFLvoZRQzs,25949
13
13
  monarch/opaque_module.py,sha256=jCcg0DjbcEVXA9WNG0NhUzGteLHOJLTZEBvrIYJIAns,10436
14
14
  monarch/opaque_object.py,sha256=x1LoX6RIMGh4ux52xIfhPgoh6PhZHdkf9bMccHW3DW0,2808
@@ -25,27 +25,27 @@ monarch/tensorboard.py,sha256=MnLgH5lbqeUJauEuirEgR6L_qYl2NGdtwZOWIAuOZao,2587
25
25
  monarch/world_mesh.py,sha256=ob5dJWaC49Uw0xqClHBm8CQLvL4xKnjd4TGzk7k8NxI,980
26
26
  monarch/_src/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
27
27
  monarch/_src/actor/__init__.py,sha256=4iK3nzQZmEPe0HGNY70fABBenw3lCVVaaF0xddF5Fa0,235
28
- monarch/_src/actor/actor_mesh.py,sha256=nG56lE6RwcNHeF4SUutf1NPmO8GQMaIjCjphCxf_pRU,29233
28
+ monarch/_src/actor/actor_mesh.py,sha256=guYD9nZHguLGJAvTisc3Q664ASkupcNC6z9iheeGFUQ,29188
29
29
  monarch/_src/actor/allocator.py,sha256=WpHEK1SvjgF3GdIWIYUkonXli2-gQVKJVZPInl2RFQo,8212
30
30
  monarch/_src/actor/bootstrap_main.py,sha256=e5eU3GvX60MWWmCty7VcZrAmukD29iJdWBysNgQ2o3A,2342
31
31
  monarch/_src/actor/debugger.py,sha256=t2iAAxz03b2KZ89T3VjRc__7GzSf83R8gM81SDyX3-o,19532
32
32
  monarch/_src/actor/device_utils.py,sha256=gBpl23wMjppVAEzzj8U9HyX-B7Bs2_3ftiMAkzUS4j4,577
33
- monarch/_src/actor/endpoint.py,sha256=V8J4LVTnTFeo4HhkdhISmepzJOxdqbHe6cxwYkj9Qyg,8462
34
- monarch/_src/actor/event_loop.py,sha256=gpfbPoOhrY8W2Z5cDcujIfGI0NTdLrRycs7FmbsVatE,2830
33
+ monarch/_src/actor/endpoint.py,sha256=jM3XYWze6gB6ajE4AMojNFSN4IaaxgioNAErJkkywjE,9721
34
+ monarch/_src/actor/event_loop.py,sha256=2i4fKIkemBzua_t47BqVa2roZ6fWB6sbmMFPNx2zKN0,2832
35
35
  monarch/_src/actor/future.py,sha256=jOGh1wfwKyGJxhl9t1P8eapXYywf8KwQldZCCbupmb8,4042
36
36
  monarch/_src/actor/pdb_wrapper.py,sha256=-QxRktntdEO2LdHixBGKLboYtADyh8bEIAoa3gFwIEo,4161
37
37
  monarch/_src/actor/pickle.py,sha256=jD_3E07OJmMIlcMOOrNFnIuRKZU2F_Q_BP-njDFXUNM,2044
38
- monarch/_src/actor/proc_mesh.py,sha256=mbXgoUAnajKqe54LQSV3QfWii2b28-NxL4YmbpS4hxM,16368
38
+ monarch/_src/actor/proc_mesh.py,sha256=amF4fbO-33qHFudlS9WabYXIVh0Y_D_0nhCTxvOhpGg,16640
39
39
  monarch/_src/actor/shape.py,sha256=B-7DI768ZhT8ECUNCJcI7DfCB7iDFGFH0r-HmXaAfcM,8296
40
40
  monarch/_src/actor/sync_state.py,sha256=GB6bTAGztkcN8fZ9K7zXklOzjYzv6cvkJeBje20xFkE,471
41
- monarch/_src/actor/tensor_engine_shim.py,sha256=r9ZK0ELVvn-cbs4LdP7IxO9KZaLV9p6q36ERbthjEd0,1568
41
+ monarch/_src/actor/tensor_engine_shim.py,sha256=hupavQ2rjPwECaTlDAhY-eeiEY18Wyyx59MZHcSEcYM,1622
42
42
  monarch/_src/actor/code_sync/__init__.py,sha256=qzWoFNJEJvEbqab0QuHbkvhdz6FHi7BOTw6-2B3p0A4,378
43
43
  monarch/_src/actor/code_sync/auto_reload.py,sha256=kqXCQuSzjxMw8bcDLsUZiL_NImo4j2EScfNklwpltmU,6685
44
44
  monarch/_src/actor/telemetry/__init__.py,sha256=sHA5fmFdWU9jcUJVszNFhbXbjRSIBmuDXDMwJrrE0hw,523
45
45
  monarch/_src/actor/telemetry/rust_span_tracing.py,sha256=UvkywuwjQX7tIyLdKZbF-fcmI_aHporAejsTRTyJNNg,4445
46
46
  monarch/_src/tensor_engine/__init__.py,sha256=Md3cCHD7Ano9kV15PqGbicgUO-RMdh4aVy1yKiDt_xE,208
47
47
  monarch/_src/tensor_engine/rdma.py,sha256=KbhJXMuuHruYXnmxzB3BpkpcGsZ4hSu_7C6wF-EPhDk,6331
48
- monarch/actor/__init__.py,sha256=oQY131pVHg9iBnyZIi-zHvDturO6VuSlfnWktLwI3cE,1014
48
+ monarch/actor/__init__.py,sha256=F87BC7owDdH_yRjLvMu6pbICbajndsEbtWG2W53Rapo,1050
49
49
  monarch/builtins/__init__.py,sha256=QcfnHZGbc2qktBg7DyZt2ruE6VahnIt4S8lEZLHdJqU,443
50
50
  monarch/builtins/log.py,sha256=H1QkuVzwxyi36Zyv-XR0VN0QsNimBWwxE1__fjs0_2o,554
51
51
  monarch/builtins/random.py,sha256=wPbvscg7u53EXpMFo885fO2XOlsyjrNAJ4rBxLzfxdg,1839
@@ -67,14 +67,14 @@ monarch/common/function_caching.py,sha256=HVdbWtv6Eea7ENMWi8iv36w1G1TaVuUJhkUX_J
67
67
  monarch/common/future.py,sha256=D1UJ_8Rvb8-VG9vNE-z7xz2m2otMd2HgB0rnA02nlvA,4681
68
68
  monarch/common/invocation.py,sha256=L4mSmzqlHMxo1Tb71hBU_M8aBZCRCOcb6vvPhvvewec,4195
69
69
  monarch/common/mast.py,sha256=XTzYljGR0aZ7GjmNMPgU2HyuL4HWSAy4IwE3kEDqdOw,7735
70
- monarch/common/messages.py,sha256=Xp2TdVhCN52qpSnDGzi1o813okYZ9-vY9mBxw7ZFGVg,18606
70
+ monarch/common/messages.py,sha256=jwwJMVVx3gKd39AXcnRxjMr7lPJRLimHtZYel3zjq4o,18833
71
71
  monarch/common/mock_cuda.py,sha256=x6ho1Ton6BbKjBZ5ZxnFOUaQM032X70wnpoUNB7Ci2w,1039
72
72
  monarch/common/opaque_ref.py,sha256=tWNvOC6CsjNPKD1JDx-8PSaeXqZC3eermgBExUPKML4,2871
73
73
  monarch/common/pipe.py,sha256=9pTf8--3yOv4HpnJEhgcmc_JM6Az4uL1y72TSQA55dw,5013
74
74
  monarch/common/process_group.py,sha256=FbJ_AJRZYFkvQ68L2naRq64J_aNuAKe5kO0MWdn_x74,1662
75
75
  monarch/common/recording.py,sha256=Q39Zhb3kT52NCPf4VVMox2WXjtXju5eTuvPMZ_QGW7o,4660
76
76
  monarch/common/reference.py,sha256=O26lkzEeVwj0S1xEy-OLqdHVnACmmlbQCUmXRrW4n1Q,938
77
- monarch/common/remote.py,sha256=h0niT9wDHuRuIPwQYBa4QVM803BxA91ANjsnKH2ZyXc,12144
77
+ monarch/common/remote.py,sha256=Q2YpAo_fsdh22ElUNruxYyn-cNTecZr8POcHCGtuKyg,12129
78
78
  monarch/common/selection.py,sha256=lpWFbZs3ArYy29e-53eoAVAjQFksf1RvZz9NvM0CUW4,308
79
79
  monarch/common/stream.py,sha256=_ejoxafHtdD10lLzznRCXKwrkZ_ZH9k_VTgiA5yfBrI,3583
80
80
  monarch/common/tensor.py,sha256=ysT51NClNF4FxV0DFLJJUNmCRaVy8uQuYWpLViyPLdY,29292
@@ -89,7 +89,7 @@ monarch/controller/rust_backend/__init__.py,sha256=J8qjUOysmcMAek2KFN13mViOXZxTY
89
89
  monarch/controller/rust_backend/controller.py,sha256=8IYnVUiqEVKO9rGL3vKqcCSAhWJG1bYYQ0MoaMqsp78,9521
90
90
  monarch/gradient/__init__.py,sha256=kqmzwt16mMpk0M3GhpgP_f7da4DGnaV9chDzbt66k4Q,308
91
91
  monarch/gradient/_gradient_generator.pyi,sha256=6cX0UxaDt9NAlwgIhTgnweqGOf6qRhHiGnUzSWNCxdU,630
92
- monarch/gradient/_gradient_generator.so,sha256=G9zScMPjAMiR5mUtgYJVhkJAmfMzRjJ8-Jfd9Rg1Qm8,11534720
92
+ monarch/gradient/_gradient_generator.so,sha256=Jw4MAcg1zcjYfm51N6EZWxTA8bedXaTcGMj3g45o9jQ,11534720
93
93
  monarch/parallel/__init__.py,sha256=6920kIkhiX7AiyjYvyc1ad8ccP-bStJJ1sS5KkeN2P0,352
94
94
  monarch/parallel/pipelining/__init__.py,sha256=J8qjUOysmcMAek2KFN13mViOXZxTYc5vCrF02t3VuFU,223
95
95
  monarch/parallel/pipelining/runtime.py,sha256=KK8TG1gUYEzSsquiZoPTWGSIC74mlncD7cYknKxfb3c,32470
@@ -115,15 +115,15 @@ monarch/timer/example_spmd.py,sha256=p8i3_tO1AmpwSkZryiSjgkh7qaEZ6QXp2Fy1qtPpECA
115
115
  monarch/timer/execution_timer.py,sha256=1YsrLIZirdohKOeFAU2H4UcONhQXHuctJbYcoX8I6gY,6985
116
116
  monarch/timer/execution_timer_test.py,sha256=CSxTv44fFZQURJlCBmYvysQI1aS_zEGZs_uxl9SOHak,4486
117
117
  monarch/tools/__init__.py,sha256=J8qjUOysmcMAek2KFN13mViOXZxTYc5vCrF02t3VuFU,223
118
- monarch/tools/cli.py,sha256=rni8Et8_uMZLTKwwdqeFTia84pDwMh6Ne-IJx36LYsc,5002
119
- monarch/tools/commands.py,sha256=fU4EPnNx0M2iH4eLJsMqPDzIGl6t9ALSJc1szLHQP6Y,10821
120
- monarch/tools/mesh_spec.py,sha256=kAo_GcU0LOm6cBkbeBGBtU9WKKS0kiDG-M7Uf9Wrp7Y,7831
118
+ monarch/tools/cli.py,sha256=b3mKZnK-MwP7JwskTxHI0KcJXxSU6498jEb2ntVr_VM,5001
119
+ monarch/tools/commands.py,sha256=3xuvHcMwl0t6cWTVUxI_r8EqrJZnay0bkKxOijhlKrw,12126
120
+ monarch/tools/mesh_spec.py,sha256=in6txNRmA-UvveVSMHCjX6mGpofd3K8vl2Plz1eD6rg,7935
121
121
  monarch/tools/network.py,sha256=mN8Fx9mervxM3VdFHRn4ZXt4z7yWxZp52BTxx2tfpus,2455
122
122
  monarch/tools/utils.py,sha256=2GGUQQE0dLtzoKy40_tAsOfbSxE6krnL0WvwMgUBgmw,1213
123
123
  monarch/tools/components/__init__.py,sha256=J8qjUOysmcMAek2KFN13mViOXZxTYc5vCrF02t3VuFU,223
124
- monarch/tools/components/hyperactor.py,sha256=Ryi1X07VLcaQVlpc4af65JNBbZtOb9IAlKxSKMZ1AW4,2120
125
- monarch/tools/config/__init__.py,sha256=OPSflEmJB2zxAaRVzzWSWXV5M5vlknLgpulGdW1ze5U,510
126
- monarch/tools/config/defaults.py,sha256=34a3HQhyXqt9qR2SYMVCROoNsnwk37rIwLXXiKwqtog,1894
124
+ monarch/tools/components/hyperactor.py,sha256=gYZS8AcmoTuq48mRrZWWnyxQqaiwTNHv8YqHhHi799U,2169
125
+ monarch/tools/config/__init__.py,sha256=MLa6uvVJssN_zTciCvCMeCURWglchCuqE3zdqA-gh4U,869
126
+ monarch/tools/config/defaults.py,sha256=ZymyKr9fNnBIgsV-xz-KrtrXRLkJo3hymTqxjXXnBzs,1910
127
127
  monarch/worker/__init__.py,sha256=J8qjUOysmcMAek2KFN13mViOXZxTYc5vCrF02t3VuFU,223
128
128
  monarch/worker/_testing_function.py,sha256=A81cVMKgdlO66XvoYcBCDrxIQIm3o3GgvcH_c8M9OmI,13480
129
129
  monarch/worker/compiled_block.py,sha256=hYx1F6PAu0_BnpKAprP_nV9qJtk5XWO7mcwH3JPDioU,10114
@@ -146,12 +146,12 @@ tests/error_test_binary.py,sha256=cgdrnVI3SIzAFSRXTvASfiR8eKSMrZ7N3tSCLVkJo44,78
146
146
  tests/sleep_binary.py,sha256=XfLYaAfwm9xgzM-svs8fhAeFhwYIg6SyVEnx4e6wbUw,1009
147
147
  tests/test_actor_error.py,sha256=kEfj1XW_WPk2mplucFBuzCWU3UrvzwkKoHSLIZfyQr0,20945
148
148
  tests/test_alloc.py,sha256=IW7yJSaKxhOYc8SJtFyREakDUwiKWq9M0CGgYyBYHoc,743
149
- tests/test_allocator.py,sha256=gETCLy7kMNVo17gxR3rnMq8kdH5IwcF2tVher-fAOxU,29047
149
+ tests/test_allocator.py,sha256=4LcUB4QRNGDp0qBWAyLM6ektmoxpO922f-NcHZziJ_w,28762
150
150
  tests/test_coalescing.py,sha256=JZ4YgQNlWWs7N-Z8KCCXQPANcuyyXEKjeHIXYbPnQhk,15606
151
151
  tests/test_controller.py,sha256=CIMb-ApmBcBj1eCqccDUAbVyyJWMGooAha5gQk0AoeY,31452
152
152
  tests/test_debugger.py,sha256=mtd_no7dAooBePoQ_TZOxtgzwd1-x6xkpzAFK1_Y8B8,13703
153
153
  tests/test_device_mesh.py,sha256=DrbezYOM0thfP9MgLXb5-F0VoLOmSz5GR0GwjR_3bE4,5290
154
- tests/test_env_before_cuda.py,sha256=w00oi9aP0tFuZtUFggzA9h6qWXXgLo1rN1GoLJZbG10,5531
154
+ tests/test_env_before_cuda.py,sha256=K5zdpXNRZB8hXQJaTN_CftcGHb3vzzdKasu8KFUoiCg,5440
155
155
  tests/test_fault_tolerance.py,sha256=u4wmG1z5MZ6PY6us5zUZHJh2pUC3L7i0wsUfRDNHmxA,14144
156
156
  tests/test_future.py,sha256=cXzaNi2YDwVyjR541ScXmgktX1YFsKzbl8wep0DMVbk,3032
157
157
  tests/test_grad_generator.py,sha256=p4Pm4kMEeGldt2jUVAkGKCB0mLccKI28pltH6OTGbQA,3412
@@ -163,15 +163,15 @@ tests/test_remote_functions.py,sha256=VT65W7htp1jCsP9-AsiO1dofhx-4OebWEOssDEgi3G
163
163
  tests/test_rust_backend.py,sha256=3TLu8dSVEqyLhjHED2DoAEQHTpbBQcr3WI6K2eGZLZw,7861
164
164
  tests/test_signal_safe_block_on.py,sha256=bmal0XgzJowZXJV6T1Blow5a-vZluYWusCThLMGxyTE,3336
165
165
  tests/test_sim_backend.py,sha256=kT7MnPo5O9xxX8f7uZOpR9Tkuz5brjaOyK1g1NqHRlo,1398
166
- tests/test_tensor_engine.py,sha256=LIJOb6hPVCpgLJjjPlcH2MgLIyM1JG7d-qMFpIUvFuQ,2793
166
+ tests/test_tensor_engine.py,sha256=_F70SQiUCRVZcbq5JcP5XkGJFnul57pqBpu1rF9kipE,3591
167
167
  tests/simulator/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
168
168
  tests/simulator/test_profiling.py,sha256=TGYCfzTLdkpIwnOuO6KApprmrgPIRQe60KRX3wkB0sg,4565
169
169
  tests/simulator/test_simulator.py,sha256=LO8lA0ssY-OGEBL5ipEu74f97Y765TEwfUOv-DtIptM,14568
170
170
  tests/simulator/test_task.py,sha256=ipqBDuDAysuo1xOB9S5psaFvwe6VATD43IovCTSs0t4,2327
171
171
  tests/simulator/test_worker.py,sha256=QrWWIJ3HDgDLkBPRc2mwYPlOQoXQcj1qRfc0WUfKkFY,3507
172
- torchmonarch_nightly-2025.7.25.dist-info/licenses/LICENSE,sha256=e0Eotbf_rHOYPuEUlppIbvwy4SN98CZnl_hqwvbDA4Q,1530
173
- torchmonarch_nightly-2025.7.25.dist-info/METADATA,sha256=sumslJxhBqMCVPa_3AoGQh7fY7dJHR6DhInBfTc6skE,3063
174
- torchmonarch_nightly-2025.7.25.dist-info/WHEEL,sha256=OlISbtpDcfagPrLwG7WtpcZbPTUnoKPnwphA_26fNqE,104
175
- torchmonarch_nightly-2025.7.25.dist-info/entry_points.txt,sha256=60QVSpYVzkzS4iDOiLp0fsLxVp47X3J2l3v7W-59LMo,117
176
- torchmonarch_nightly-2025.7.25.dist-info/top_level.txt,sha256=E-ZssZzyM17glpVrh-S9--qJ-w9p2EjuYOuNw9tQ4Eg,33
177
- torchmonarch_nightly-2025.7.25.dist-info/RECORD,,
172
+ torchmonarch_nightly-2025.7.26.dist-info/licenses/LICENSE,sha256=e0Eotbf_rHOYPuEUlppIbvwy4SN98CZnl_hqwvbDA4Q,1530
173
+ torchmonarch_nightly-2025.7.26.dist-info/METADATA,sha256=rJuOVCi7kVf2R9tHhtMSlaWv80ybWS5g9MvMzmso5M8,3852
174
+ torchmonarch_nightly-2025.7.26.dist-info/WHEEL,sha256=OlISbtpDcfagPrLwG7WtpcZbPTUnoKPnwphA_26fNqE,104
175
+ torchmonarch_nightly-2025.7.26.dist-info/entry_points.txt,sha256=60QVSpYVzkzS4iDOiLp0fsLxVp47X3J2l3v7W-59LMo,117
176
+ torchmonarch_nightly-2025.7.26.dist-info/top_level.txt,sha256=E-ZssZzyM17glpVrh-S9--qJ-w9p2EjuYOuNw9tQ4Eg,33
177
+ torchmonarch_nightly-2025.7.26.dist-info/RECORD,,