torchmonarch-nightly 2025.6.10__cp310-cp310-manylinux2014_x86_64.whl → 2025.6.12__cp310-cp310-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,7 +5,6 @@
5
5
  # LICENSE file in the root directory of this source tree.
6
6
 
7
7
  # pyre-strict
8
- import abc
9
8
 
10
9
  from monarch._rust_bindings.monarch_hyperactor.actor import PythonMessage
11
10
 
@@ -29,21 +28,6 @@ from monarch._rust_bindings.monarch_hyperactor.shape import ( # @manual=//monar
29
28
  Shape,
30
29
  )
31
30
 
32
-
33
- class Actor(abc.ABC):
34
- @abc.abstractmethod
35
- async def handle(self, mailbox: Mailbox, message: PythonMessage) -> None: ...
36
-
37
- async def handle_cast(
38
- self,
39
- mailbox: Mailbox,
40
- rank: int,
41
- coordinates: list[tuple[str, int]],
42
- message: PythonMessage,
43
- ) -> None:
44
- await self.handle(mailbox, message)
45
-
46
-
47
31
  __all__ = [
48
32
  "init_proc",
49
33
  "Actor",
monarch/_rust_bindings.so CHANGED
Binary file
monarch/actor_mesh.py CHANGED
@@ -4,9 +4,12 @@
4
4
  # This source code is licensed under the BSD-style license found in the
5
5
  # LICENSE file in the root directory of this source tree.
6
6
 
7
+ # pyre-unsafe
8
+
7
9
  import asyncio
8
10
  import collections
9
11
  import contextvars
12
+ import functools
10
13
  import inspect
11
14
 
12
15
  import itertools
@@ -19,6 +22,7 @@ from traceback import extract_tb, StackSummary
19
22
  from typing import (
20
23
  Any,
21
24
  AsyncGenerator,
25
+ Awaitable,
22
26
  Callable,
23
27
  cast,
24
28
  Concatenate,
@@ -38,6 +42,7 @@ from typing import (
38
42
 
39
43
  import monarch
40
44
  from monarch import ActorFuture as Future
45
+ from monarch._rust_bindings.hyperactor_extension.telemetry import enter_span, exit_span
41
46
 
42
47
  from monarch._rust_bindings.monarch_hyperactor.actor import PanicFlag, PythonMessage
43
48
  from monarch._rust_bindings.monarch_hyperactor.actor_mesh import PythonActorMesh
@@ -49,10 +54,11 @@ from monarch._rust_bindings.monarch_hyperactor.mailbox import (
49
54
  )
50
55
  from monarch._rust_bindings.monarch_hyperactor.proc import ActorId
51
56
  from monarch._rust_bindings.monarch_hyperactor.shape import Point as HyPoint, Shape
57
+
52
58
  from monarch.common.pickle_flatten import flatten, unflatten
53
59
  from monarch.common.shape import MeshTrait, NDSlice
54
60
 
55
- logger = logging.getLogger(__name__)
61
+ logger: logging.Logger = logging.getLogger(__name__)
56
62
 
57
63
  Allocator = monarch.ProcessAllocator | monarch.LocalAllocator
58
64
 
@@ -89,7 +95,7 @@ _context: contextvars.ContextVar[MonarchContext] = contextvars.ContextVar(
89
95
 
90
96
  # this was implemented in python 3.12 as an argument to task
91
97
  # but I have to backport to 3.10/3.11.
92
- def create_eager_task(coro: Coroutine[Any, None, Any]) -> asyncio.Future:
98
+ def create_eager_task(coro: Awaitable[None]) -> asyncio.Future:
93
99
  iter = coro.__await__()
94
100
  try:
95
101
  first_yield = next(iter)
@@ -232,7 +238,7 @@ class Endpoint(Generic[P, R]):
232
238
  self,
233
239
  actor_mesh_ref: _ActorMeshRefImpl,
234
240
  name: str,
235
- impl: Callable[Concatenate[Any, P], Coroutine[Any, Any, R]],
241
+ impl: Callable[Concatenate[Any, P], Awaitable[R]],
236
242
  mailbox: Mailbox,
237
243
  ) -> None:
238
244
  self._actor_mesh = actor_mesh_ref
@@ -264,14 +270,16 @@ class Endpoint(Generic[P, R]):
264
270
  return self.choose(*args, **kwargs)
265
271
 
266
272
  def call(self, *args: P.args, **kwargs: P.kwargs) -> "Future[ValueMesh[R]]":
273
+ p: PortId
274
+ r: PortReceiver[R]
267
275
  p, r = port(self)
268
276
  # pyre-ignore
269
277
  send(self, args, kwargs, port=p, rank_in_response=True)
270
278
 
271
- async def process():
272
- results = [None] * len(self._actor_mesh)
279
+ async def process() -> ValueMesh[R]:
280
+ results: List[R] = [None] * len(self._actor_mesh) # pyre-fixme[9]
273
281
  for _ in range(len(self._actor_mesh)):
274
- rank, value = await r.recv()
282
+ rank, value = await r.recv() # pyre-fixme[23]
275
283
  results[rank] = value
276
284
  call_shape = Shape(
277
285
  self._actor_mesh._shape.labels,
@@ -309,15 +317,15 @@ class Endpoint(Generic[P, R]):
309
317
  class Accumulator(Generic[P, R, A]):
310
318
  def __init__(
311
319
  self, endpoint: Endpoint[P, R], identity: A, combine: Callable[[A, R], A]
312
- ):
313
- self._endpoint = endpoint
314
- self._identity = identity
315
- self._combine = combine
320
+ ) -> None:
321
+ self._endpoint: Endpoint[P, R] = endpoint
322
+ self._identity: A = identity
323
+ self._combine: Callable[[A, R], A] = combine
316
324
 
317
325
  def accumulate(self, *args: P.args, **kwargs: P.kwargs) -> "Future[A]":
318
- gen = self._endpoint.stream(*args, **kwargs)
326
+ gen: AsyncGenerator[R, R] = self._endpoint.stream(*args, **kwargs)
319
327
 
320
- async def impl():
328
+ async def impl() -> A:
321
329
  value = self._identity
322
330
  async for x in gen:
323
331
  value = self._combine(value, x)
@@ -334,7 +342,7 @@ class ValueMesh(MeshTrait, Generic[R]):
334
342
  def _new_with_shape(self, shape: Shape) -> "ValueMesh[R]":
335
343
  return ValueMesh(shape, self._values)
336
344
 
337
- def item(self, **kwargs):
345
+ def item(self, **kwargs) -> R:
338
346
  coordinates = [kwargs.pop(label) for label in self._labels]
339
347
  if kwargs:
340
348
  raise KeyError(f"item has extra dimensions: {list(kwargs.keys())}")
@@ -345,7 +353,7 @@ class ValueMesh(MeshTrait, Generic[R]):
345
353
  for rank in self._shape.ranks():
346
354
  yield Point(rank, self._shape), self._values[rank]
347
355
 
348
- def __len__(self):
356
+ def __len__(self) -> int:
349
357
  return len(self._shape)
350
358
 
351
359
  @property
@@ -378,7 +386,7 @@ def send(
378
386
 
379
387
 
380
388
  class EndpointProperty(Generic[P, R]):
381
- def __init__(self, method: Callable[Concatenate[Any, P], Coroutine[Any, Any, R]]):
389
+ def __init__(self, method: Callable[Concatenate[Any, P], Awaitable[R]]) -> None:
382
390
  self._method = method
383
391
 
384
392
  def __get__(self, instance, owner) -> Endpoint[P, R]:
@@ -389,7 +397,7 @@ class EndpointProperty(Generic[P, R]):
389
397
 
390
398
 
391
399
  def endpoint(
392
- method: Callable[Concatenate[Any, P], Coroutine[Any, Any, R]],
400
+ method: Callable[Concatenate[Any, P], Awaitable[R]],
393
401
  ) -> EndpointProperty[P, R]:
394
402
  return EndpointProperty(method)
395
403
 
@@ -412,7 +420,9 @@ class Port:
412
420
  # advance lower-level API for sending messages. This is intentially
413
421
  # not part of the Endpoint API because they way it accepts arguments
414
422
  # and handles concerns is different.
415
- def port(endpoint: Endpoint[P, R], once=False) -> Tuple["PortId", "PortReceiver[R]"]:
423
+ def port(
424
+ endpoint: Endpoint[P, R], once: bool = False
425
+ ) -> Tuple["PortId", "PortReceiver[R]"]:
416
426
  handle, receiver = (
417
427
  endpoint._mailbox.open_once_port() if once else endpoint._mailbox.open_port()
418
428
  )
@@ -425,9 +435,9 @@ class PortReceiver(Generic[R]):
425
435
  self,
426
436
  mailbox: Mailbox,
427
437
  receiver: HyPortReceiver | OncePortReceiver,
428
- ):
429
- self._mailbox = mailbox
430
- self._receiver = receiver
438
+ ) -> None:
439
+ self._mailbox: Mailbox = mailbox
440
+ self._receiver: HyPortReceiver | OncePortReceiver = receiver
431
441
 
432
442
  async def _recv(self) -> R:
433
443
  return self._process(await self._receiver.recv())
@@ -435,7 +445,7 @@ class PortReceiver(Generic[R]):
435
445
  def _blocking_recv(self) -> R:
436
446
  return self._process(self._receiver.blocking_recv())
437
447
 
438
- def _process(self, msg: PythonMessage):
448
+ def _process(self, msg: PythonMessage) -> R:
439
449
  # TODO: Try to do something more structured than a cast here
440
450
  payload = cast(R, _unpickle(msg.message, self._mailbox))
441
451
  if msg.method == "result":
@@ -482,7 +492,9 @@ class _Actor:
482
492
  else None
483
493
  )
484
494
  try:
485
- ctx = MonarchContext(mailbox, mailbox.actor_id.proc_id, Point(rank, shape))
495
+ ctx: MonarchContext = MonarchContext(
496
+ mailbox, mailbox.actor_id.proc_id, Point(rank, shape)
497
+ )
486
498
  _context.set(ctx)
487
499
 
488
500
  args, kwargs = _unpickle(message.message, mailbox)
@@ -492,13 +504,29 @@ class _Actor:
492
504
  return None
493
505
  else:
494
506
  the_method = getattr(self.instance, message.method)._method
495
- result = the_method(self.instance, *args, **kwargs)
507
+
496
508
  if not inspect.iscoroutinefunction(the_method):
509
+ enter_span(
510
+ the_method.__module__, message.method, str(ctx.mailbox.actor_id)
511
+ )
512
+ result = the_method(self.instance, *args, **kwargs)
513
+ exit_span()
497
514
  if port is not None:
498
515
  port.send("result", result)
499
516
  return None
500
517
 
501
- return self.run_async(ctx, self.run_task(port, result, panic_flag))
518
+ async def instrumented():
519
+ enter_span(
520
+ the_method.__module__, message.method, str(ctx.mailbox.actor_id)
521
+ )
522
+ result = await the_method(self.instance, *args, **kwargs)
523
+ exit_span()
524
+ return result
525
+
526
+ return self.run_async(
527
+ ctx,
528
+ self.run_task(port, instrumented(), panic_flag),
529
+ )
502
530
  except Exception as e:
503
531
  traceback.print_exc()
504
532
  s = ActorError(e)
@@ -510,13 +538,22 @@ class _Actor:
510
538
  else:
511
539
  raise s from None
512
540
 
513
- async def run_async(self, ctx, coroutine):
541
+ async def run_async(
542
+ self,
543
+ ctx: MonarchContext,
544
+ coroutine: Awaitable[None],
545
+ ) -> None:
514
546
  _context.set(ctx)
515
547
  if self.complete_task is None:
516
548
  self.complete_task = asyncio.create_task(self._complete())
517
549
  await self.active_requests.put(create_eager_task(coroutine))
518
550
 
519
- async def run_task(self, port, coroutine, panic_flag):
551
+ async def run_task(
552
+ self,
553
+ port: Port | None,
554
+ coroutine: Awaitable[Any],
555
+ panic_flag: PanicFlag,
556
+ ) -> None:
520
557
  try:
521
558
  result = await coroutine
522
559
  if port is not None:
@@ -564,6 +601,12 @@ def _unpickle(data: bytes, mailbox: Mailbox) -> Any:
564
601
 
565
602
 
566
603
  class Actor(MeshTrait):
604
+ @functools.cached_property
605
+ def logger(cls) -> logging.Logger:
606
+ lgr = logging.getLogger(cls.__class__.__name__)
607
+ lgr.setLevel(logging.DEBUG)
608
+ return lgr
609
+
567
610
  @property
568
611
  def _ndslice(self) -> NDSlice:
569
612
  raise NotImplementedError(
@@ -586,10 +629,10 @@ class ActorMeshRef(MeshTrait):
586
629
  def __init__(
587
630
  self, Class: Type[T], actor_mesh_ref: _ActorMeshRefImpl, mailbox: Mailbox
588
631
  ) -> None:
589
- self.__name__ = Class.__name__
590
- self._class = Class
591
- self._actor_mesh_ref = actor_mesh_ref
592
- self._mailbox = mailbox
632
+ self.__name__: str = Class.__name__
633
+ self._class: Type[T] = Class
634
+ self._actor_mesh_ref: _ActorMeshRefImpl = actor_mesh_ref
635
+ self._mailbox: Mailbox = mailbox
593
636
  for attr_name in dir(self._class):
594
637
  attr_value = getattr(self._class, attr_name, None)
595
638
  if isinstance(attr_value, EndpointProperty):
@@ -630,7 +673,11 @@ class ActorMeshRef(MeshTrait):
630
673
  f"'{self.__class__.__name__}' object has no attribute '{name}'"
631
674
  )
632
675
 
633
- def _create(self, args: Iterable[Any], kwargs: Dict[str, Any]) -> None:
676
+ def _create(
677
+ self,
678
+ args: Iterable[Any],
679
+ kwargs: Dict[str, Any],
680
+ ) -> None:
634
681
  async def null_func(*_args: Iterable[Any], **_kwargs: Dict[str, Any]) -> None:
635
682
  return None
636
683
 
monarch/allocator.py CHANGED
@@ -4,6 +4,9 @@
4
4
  # This source code is licensed under the BSD-style license found in the
5
5
  # LICENSE file in the root directory of this source tree.
6
6
 
7
+ # pyre-strict
8
+
9
+ import abc
7
10
  from typing import final
8
11
 
9
12
  from monarch import ActorFuture as Future
@@ -15,6 +18,7 @@ from monarch._rust_bindings.hyperactor_extension.alloc import ( # @manual=//mon
15
18
  from monarch._rust_bindings.monarch_hyperactor.alloc import ( # @manual=//monarch/monarch_extension:monarch_extension
16
19
  LocalAllocatorBase,
17
20
  ProcessAllocatorBase,
21
+ RemoteAllocatorBase,
18
22
  )
19
23
 
20
24
 
@@ -60,3 +64,66 @@ class LocalAllocator(LocalAllocatorBase):
60
64
  lambda: self.allocate_nonblocking(spec),
61
65
  lambda: self.allocate_blocking(spec),
62
66
  )
67
+
68
+
69
+ class RemoteAllocInitializer(abc.ABC):
70
+ """Subclass-able Python interface for `hyperactor_mesh::alloc::remoteprocess:RemoteProcessAllocInitializer`.
71
+
72
+ NOTE: changes to method signatures of this class must be made to the call-site at
73
+ `PyRemoteProcessAllocInitializer.py_initialize_alloc()` in `monarch/monarch_hyperactor/src/alloc.rs`
74
+ """
75
+
76
+ @abc.abstractmethod
77
+ async def initialize_alloc(self) -> list[str]:
78
+ """
79
+ Return the addresses of the servers that should be used to allocate processes
80
+ for the proc mesh. The addresses should be running hyperactor's RemoteProcessAllocator.
81
+
82
+ Each address is of the form `{transport}!{addr}(:{port})`.
83
+ This is the string form of `hyperactor::channel::ChannelAddr` (Rust).
84
+ For example, `tcp!127.0.0.1:1234`.
85
+
86
+ NOTE: Currently, all the addresses must have the same transport type and port
87
+ NOTE: Although this method is currently called once at the initialization of the Allocator,
88
+ in the future this method can be called multiple times and should return the current set of
89
+ addresses that are eligible to handle allocation requests.
90
+
91
+ """
92
+ ...
93
+
94
+
95
+ class StaticRemoteAllocInitializer(RemoteAllocInitializer):
96
+ """
97
+ Returns the static list of server addresses that this initializer
98
+ was constructed with on each `initialize_alloc()` call.
99
+ """
100
+
101
+ def __init__(self, *addrs: str) -> None:
102
+ super().__init__()
103
+ self.addrs: list[str] = list(addrs)
104
+
105
+ async def initialize_alloc(self) -> list[str]:
106
+ return list(self.addrs)
107
+
108
+
109
+ @final
110
+ class RemoteAllocator(RemoteAllocatorBase):
111
+ """
112
+ An allocator that allocates by spawning actors on a remote host.
113
+ The remote host must be running hyperactor's remote-process-allocator.
114
+ """
115
+
116
+ def allocate(self, spec: AllocSpec) -> Future[Alloc]:
117
+ """
118
+ Allocate a process according to the provided spec.
119
+
120
+ Arguments:
121
+ - `spec`: The spec to allocate according to.
122
+
123
+ Returns:
124
+ - A future that will be fulfilled when the requested allocation is fulfilled.
125
+ """
126
+ return Future(
127
+ lambda: self.allocate_nonblocking(spec),
128
+ lambda: self.allocate_blocking(spec),
129
+ )
monarch/bootstrap_main.py CHANGED
@@ -58,7 +58,7 @@ def invoke_main():
58
58
 
59
59
  # forward logs to rust tracing. Defaults to on.
60
60
  if os.environ.get("MONARCH_PYTHON_LOG_TRACING", "1") == "1":
61
- logging.root.addHandler(TracingForwarder())
61
+ logging.root.addHandler(TracingForwarder(level=logging.DEBUG))
62
62
 
63
63
  try:
64
64
  with (
monarch/common/client.py CHANGED
@@ -302,7 +302,7 @@ class Client:
302
302
  self.last_processed_seq = max(self.last_processed_seq, seq)
303
303
 
304
304
  if error is not None:
305
- logging.error("Received error for seq %s: %s", seq, error)
305
+ logging.info("Received error for seq %s: %s", seq, error)
306
306
  # We should not have set result if we have an error.
307
307
  assert result is None
308
308
  if not isinstance(error, RemoteException):
@@ -332,9 +332,7 @@ class Client:
332
332
  elif error is not None:
333
333
  # errors get reported as results even if they
334
334
  # do not have futures attached.
335
- logger.warning(
336
- f"Error encountered for this instruction {seq}. Proceeding forward because error is unused and unhandled. Error details:\n{error}."
337
- )
335
+ pass
338
336
 
339
337
  # We can safely delete the seq as tracebacks have been saved to the remote failure itself.
340
338
  del self.pending_results[seq]
monarch/common/stream.py CHANGED
@@ -82,6 +82,9 @@ class StreamRef(Referenceable):
82
82
  messages.CreateStream(self, self.default),
83
83
  )
84
84
 
85
+ def __repr__(self):
86
+ return f"<StreamRef {repr(self.name)} {self.ref}>"
87
+
85
88
  def delete_ref(self, ref):
86
89
  client = self.client()
87
90
  if client is not None and not client._shutdown:
@@ -158,7 +158,6 @@ def _worker_response_to_result(result: client.WorkerResponse) -> MessageResult:
158
158
  traceback.FrameSummary("<unknown>", None, frame)
159
159
  for frame in exc.backtrace.split("\\n")
160
160
  ]
161
- logger.error(f"Worker {exc.actor_id} failed")
162
161
  return MessageResult(
163
162
  seq=result.seq,
164
163
  result=None,
@@ -169,7 +168,7 @@ def _worker_response_to_result(result: client.WorkerResponse) -> MessageResult:
169
168
  controller_frames=None,
170
169
  worker_frames=worker_frames,
171
170
  source_actor_id=exc.actor_id,
172
- message=f"Worker {exc.actor_id} failed",
171
+ message=f"Remote function in {exc.actor_id} errored.",
173
172
  ),
174
173
  )
175
174
  elif isinstance(exc, client.Failure):
Binary file
monarch/proc_mesh.py CHANGED
@@ -4,9 +4,11 @@
4
4
  # This source code is licensed under the BSD-style license found in the
5
5
  # LICENSE file in the root directory of this source tree.
6
6
 
7
+ # pyre-strict
8
+
7
9
  import sys
8
10
 
9
- from typing import Any, cast, Optional, Type, TypeVar
11
+ from typing import Any, cast, List, Optional, Type, TypeVar
10
12
 
11
13
  import monarch
12
14
  from monarch import ActorFuture as Future
@@ -18,7 +20,7 @@ from monarch._rust_bindings.hyperactor_extension.alloc import ( # @manual=//mon
18
20
  )
19
21
  from monarch._rust_bindings.monarch_hyperactor.mailbox import Mailbox
20
22
  from monarch._rust_bindings.monarch_hyperactor.proc_mesh import ProcMesh as HyProcMesh
21
- from monarch._rust_bindings.monarch_hyperactor.shape import Shape
23
+ from monarch._rust_bindings.monarch_hyperactor.shape import Shape, Slice
22
24
  from monarch.actor_mesh import _Actor, _ActorMeshRefImpl, Actor, ActorMeshRef
23
25
 
24
26
  from monarch.common._device_utils import _local_device_count
@@ -46,14 +48,16 @@ class ProcMesh(MeshTrait):
46
48
  def __init__(self, hy_proc_mesh: HyProcMesh) -> None:
47
49
  self._proc_mesh = hy_proc_mesh
48
50
  self._mailbox: Mailbox = self._proc_mesh.client
49
- self._rdma_manager = self._spawn_blocking("rdma_manager", RDMAManager)
51
+ self._rdma_manager: RDMAManager = self._spawn_blocking(
52
+ "rdma_manager", RDMAManager
53
+ )
50
54
 
51
55
  @property
52
- def _ndslice(self):
56
+ def _ndslice(self) -> Slice:
53
57
  return self._proc_mesh.shape.ndslice
54
58
 
55
59
  @property
56
- def _labels(self):
60
+ def _labels(self) -> List[str]:
57
61
  return self._proc_mesh.shape.labels
58
62
 
59
63
  def _new_with_shape(self, shape: Shape) -> "ProcMesh":
@@ -0,0 +1,216 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # pyre-strict
8
+
9
+ import contextlib
10
+ import importlib.resources
11
+ import math
12
+ import os
13
+ import subprocess
14
+ import sys
15
+ import unittest
16
+ from datetime import timedelta
17
+ from typing import Generator
18
+
19
+ import cloudpickle
20
+
21
+ import torch
22
+ import torch.distributed as dist
23
+ import torch.nn.functional as F
24
+
25
+ from monarch._rust_bindings.hyperactor_extension.alloc import (
26
+ AllocConstraints,
27
+ AllocSpec,
28
+ )
29
+
30
+ from monarch._rust_bindings.monarch_hyperactor.channel import (
31
+ ChannelAddr,
32
+ ChannelTransport,
33
+ )
34
+ from monarch.actor_mesh import Actor, current_rank, current_size, endpoint, ValueMesh
35
+
36
+ from monarch.allocator import RemoteAllocator, StaticRemoteAllocInitializer
37
+ from monarch.proc_mesh import ProcMesh
38
+
39
+ from torch.distributed.elastic.utils.distributed import get_free_port
40
+
41
+ _100_MILLISECONDS = timedelta(milliseconds=100)
42
+
43
+
44
+ class TestActor(Actor):
45
+ """Silly actor that computes the world size by all-reducing rank-hot tensors"""
46
+
47
+ def __init__(self) -> None:
48
+ self.rank: int = current_rank().rank
49
+ self.world_size: int = math.prod(current_size().values())
50
+
51
+ @endpoint
52
+ async def compute_world_size(self, master_addr: str, master_port: int) -> int:
53
+ os.environ["MASTER_ADDR"] = master_addr
54
+ os.environ["MASTER_PORT"] = str(master_port)
55
+ dist.init_process_group("gloo", rank=self.rank, world_size=self.world_size)
56
+
57
+ try:
58
+ t = F.one_hot(torch.tensor(self.rank), num_classes=dist.get_world_size())
59
+ dist.all_reduce(t)
60
+ return int(torch.sum(t).item())
61
+ finally:
62
+ dist.destroy_process_group()
63
+
64
+
65
+ @contextlib.contextmanager
66
+ def remote_process_allocator() -> Generator[str, None, None]:
67
+ with importlib.resources.path(__package__, "") as package_path:
68
+ addr = ChannelAddr.any(ChannelTransport.Unix)
69
+
70
+ process_allocator = subprocess.Popen(
71
+ args=[
72
+ "process_allocator",
73
+ f"--addr={addr}",
74
+ ],
75
+ env={
76
+ # prefix PATH with this test module's directory to
77
+ # give 'process_allocator' and 'monarch_bootstrap' binary resources
78
+ # in this test module's directory precedence over the installed ones
79
+ # useful in BUCK where these binaries are added as 'resources' of this test target
80
+ "PATH": f"{package_path}:{os.getenv('PATH', '')}",
81
+ "RUST_LOG": "debug",
82
+ },
83
+ )
84
+ try:
85
+ yield addr
86
+ finally:
87
+ process_allocator.terminate()
88
+ try:
89
+ five_seconds = 5
90
+ process_allocator.wait(timeout=five_seconds)
91
+ except subprocess.TimeoutExpired:
92
+ process_allocator.kill()
93
+
94
+
95
+ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
96
+ @classmethod
97
+ def setUpClass(cls) -> None:
98
+ cloudpickle.register_pickle_by_value(sys.modules[TestActor.__module__])
99
+
100
+ @classmethod
101
+ def tearDownClass(cls) -> None:
102
+ cloudpickle.unregister_pickle_by_value(sys.modules[TestActor.__module__])
103
+
104
+ def assert_computed_world_size(
105
+ self, computed: ValueMesh[int], expected_world_size: int
106
+ ) -> None:
107
+ expected_world_sizes = {
108
+ rank: expected_world_size for rank in range(0, expected_world_size)
109
+ }
110
+ computed_world_sizes = {p.rank: v for p, v in list(computed.flatten("rank"))}
111
+ self.assertDictEqual(expected_world_sizes, computed_world_sizes)
112
+
113
+ async def test_call_allocate_twice(self) -> None:
114
+ class DeletingAllocInitializer(StaticRemoteAllocInitializer):
115
+ """test initializer that removes the last address from the list each time initialize_alloc() is called
116
+ used to test that the state of the initializer is preserved across calls to allocate()
117
+ """
118
+
119
+ async def initialize_alloc(self) -> list[str]:
120
+ alloc = await super().initialize_alloc()
121
+ self.addrs.pop(-1)
122
+ return alloc
123
+
124
+ with remote_process_allocator() as host1, remote_process_allocator() as host2:
125
+ initializer = DeletingAllocInitializer(host1, host2)
126
+
127
+ allocator = RemoteAllocator(
128
+ world_id="test_remote_allocator",
129
+ initializer=initializer,
130
+ heartbeat_interval=_100_MILLISECONDS,
131
+ )
132
+
133
+ spec = AllocSpec(AllocConstraints(), host=1, gpu=1)
134
+
135
+ await allocator.allocate(spec)
136
+ self.assertEqual([host1], initializer.addrs)
137
+
138
+ await allocator.allocate(spec)
139
+ self.assertEqual([], initializer.addrs)
140
+
141
+ async def test_throws_when_initializer_returns_empty_addrs(self) -> None:
142
+ class EmptyAllocInitializer(StaticRemoteAllocInitializer):
143
+ """test initializer that returns an empty list of addresses"""
144
+
145
+ async def initialize_alloc(self) -> list[str]:
146
+ return []
147
+
148
+ empty_initializer = EmptyAllocInitializer()
149
+ with self.assertRaisesRegex(
150
+ RuntimeError, r"initializer must return non-empty list of addresses"
151
+ ):
152
+ allocator = RemoteAllocator(
153
+ world_id="test_remote_allocator",
154
+ initializer=empty_initializer,
155
+ heartbeat_interval=_100_MILLISECONDS,
156
+ )
157
+ await allocator.allocate(AllocSpec(AllocConstraints(), host=1, gpu=1))
158
+
159
+ async def test_allocate_2d_mesh(self) -> None:
160
+ hosts = 2
161
+ gpus = 4
162
+ world_size = hosts * gpus
163
+ spec = AllocSpec(AllocConstraints(), host=hosts, gpu=gpus)
164
+
165
+ # create 2x process-allocators (on their own bind addresses) to simulate 2 hosts
166
+ with remote_process_allocator() as host1, remote_process_allocator() as host2:
167
+ allocator = RemoteAllocator(
168
+ world_id="test_remote_allocator",
169
+ initializer=StaticRemoteAllocInitializer(host1, host2),
170
+ heartbeat_interval=_100_MILLISECONDS,
171
+ )
172
+ alloc = await allocator.allocate(spec)
173
+ proc_mesh = await ProcMesh.from_alloc(alloc)
174
+ actor = await proc_mesh.spawn("test_actor", TestActor)
175
+
176
+ values = await actor.compute_world_size.call(
177
+ master_addr="0.0.0.0",
178
+ master_port=get_free_port(),
179
+ )
180
+
181
+ self.assert_computed_world_size(values, world_size)
182
+
183
+ async def test_stacked_1d_meshes(self) -> None:
184
+ # create two stacked actor meshes on the same host
185
+ # each actor mesh running on separate process-allocators
186
+
187
+ with remote_process_allocator() as host1_a, remote_process_allocator() as host1_b:
188
+ allocator_a = RemoteAllocator(
189
+ world_id="a",
190
+ initializer=StaticRemoteAllocInitializer(host1_a),
191
+ heartbeat_interval=_100_MILLISECONDS,
192
+ )
193
+ allocator_b = RemoteAllocator(
194
+ world_id="b",
195
+ initializer=StaticRemoteAllocInitializer(host1_b),
196
+ heartbeat_interval=_100_MILLISECONDS,
197
+ )
198
+
199
+ spec_a = AllocSpec(AllocConstraints(), host=1, gpu=2)
200
+ spec_b = AllocSpec(AllocConstraints(), host=1, gpu=6)
201
+
202
+ proc_mesh_a = await ProcMesh.from_alloc(await allocator_a.allocate(spec_a))
203
+ proc_mesh_b = await ProcMesh.from_alloc(await allocator_b.allocate(spec_b))
204
+
205
+ actor_a = await proc_mesh_a.spawn("actor_a", TestActor)
206
+ actor_b = await proc_mesh_b.spawn("actor_b", TestActor)
207
+
208
+ results_a = await actor_a.compute_world_size.call(
209
+ master_addr="0.0.0.0", master_port=get_free_port()
210
+ )
211
+ results_b = await actor_b.compute_world_size.call(
212
+ master_addr="0.0.0.0", master_port=get_free_port()
213
+ )
214
+
215
+ self.assert_computed_world_size(results_a, 2) # a is a 1x2 mesh
216
+ self.assert_computed_world_size(results_b, 6) # b is a 1x6 mesh
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: torchmonarch-nightly
3
- Version: 2025.6.10
3
+ Version: 2025.6.12
4
4
  Summary: Monarch: Single controller library
5
5
  Author: Meta
6
6
  Author-email: oncall+monarch@xmail.facebook.com
@@ -1,20 +1,20 @@
1
1
  monarch/__init__.py,sha256=iUvWHc0-7Q2tovRoRxOIiA3TsefMXCbWl-jEfQ2djew,6897
2
- monarch/_rust_bindings.so,sha256=0-svsKnUJboaOBd5i-LOfpHiRRAgVLX_1Hq_YYREQi8,39756680
2
+ monarch/_rust_bindings.so,sha256=VPU8MhCnz10umRwSqv99QvwFkr2q0N0DiOTpZ37Ecl0,40645344
3
3
  monarch/_testing.py,sha256=MN8DK1e-wzV0-R_nFW1b_7-O5oKfWvZ12BMGD4Z7PQk,6755
4
- monarch/actor_mesh.py,sha256=AKdjPg3FM6Yt35uFPBnP7fNVEu6busu5BXVWLwjU2A4,23000
5
- monarch/allocator.py,sha256=_2DKFP9pSD33zDgH7xZJC8Tq7BQrCeQEUmMB7_xCT0Y,1784
6
- monarch/bootstrap_main.py,sha256=SYTOz-pTXiJNk78PPD5HAOJDSb8t2JfitRWdmWB3ogo,2559
4
+ monarch/actor_mesh.py,sha256=ovi5RBxobGEcg7zKkzhRc83n82KOD6ermhuloHKbuFs,24420
5
+ monarch/allocator.py,sha256=ylvYTf31o-PT385cYJPhi17uNbC4yl_RAraqD0fVe4g,4112
6
+ monarch/bootstrap_main.py,sha256=EYaTMA1lxy2213L_04drTKlJvZQjzNdD3jeUHiqSBJc,2578
7
7
  monarch/cached_remote_function.py,sha256=kYdB6r4OHx_T_uX4q3tCNcp1t2DJwF8tPTIahUiT2pU,8785
8
8
  monarch/fetch.py,sha256=61jxo7sx4QNUTkc0_rF5NaJROen4tKbAaiIjrXWLOvg,1705
9
9
  monarch/future.py,sha256=lcdFEe7m1shYPPuvZ1RkS6JUIChEKGBWe3v7x_nu4Hg,731
10
10
  monarch/gradient_generator.py,sha256=Rl3dmXGceTdCc1mYBg2JciR88ywGPnW7TVkL86KwqEA,6366
11
11
  monarch/memory.py,sha256=ol86dBhFAJqg78iF25-BuK0wuwj1onR8FIioZ_B0gjw,1377
12
- monarch/mesh_controller.py,sha256=A3G8Z5S0w3mjCVI2r6YGM6K3BUs3ZHU8PFo6kCaYTU4,8615
13
- monarch/monarch_controller,sha256=Q1eR_EVJqDQLrJZ_6p1ldxVDAU1OmN5lSSuctDcaAFY,20396832
12
+ monarch/mesh_controller.py,sha256=Rr4VNUNN0pJdThbPmbCoaPWid4QpTNHya9xYpmjTkW0,8575
13
+ monarch/monarch_controller,sha256=MECcriPRnSdI_NpAG6y-GiK2-DqnDsLBfyOHVdqewRU,20397992
14
14
  monarch/notebook.py,sha256=zu9MKDFKf1-rCM2TqFSRJjMBeiWuKcJSyUFLvoZRQzs,25949
15
15
  monarch/opaque_module.py,sha256=oajOu_WD1hD4hxE8HDdO-tvWY7KDHWd7VaAhJEa5L2I,10446
16
16
  monarch/opaque_object.py,sha256=IVpll4pyuKZMo_EnPh4s0qnx8RlAcJrJ1yoLX6E75wQ,2782
17
- monarch/proc_mesh.py,sha256=pVN0BLnjGaty6-UGn1U81rNdmfiDvD4gO1c4bISHtqs,6807
17
+ monarch/proc_mesh.py,sha256=xoaReM9Ab9TWkesxedWSyyk4TMD0HLV88dQ8CQcbqTI,6892
18
18
  monarch/profiler.py,sha256=TQ9fnVM8H7smBWtYdB_6Irtzz8DBOmcp7U1T3wlUmco,4911
19
19
  monarch/python_local_mesh.py,sha256=YsureIzR9uGlNVrKd4vRghxOXBeYabkt9lICRErfRAI,3536
20
20
  monarch/random.py,sha256=f9QR7Esu4Vxqxs-KCf5QYyVqlWvXJ3-UtG90L_h4j40,1527
@@ -27,7 +27,7 @@ monarch/tensor_worker_main.py,sha256=Nbarl2sJKIddLeaRFsaUnqOerLHjzggUr9SqCr2_GYI
27
27
  monarch/tensorboard.py,sha256=MnLgH5lbqeUJauEuirEgR6L_qYl2NGdtwZOWIAuOZao,2587
28
28
  monarch/world_mesh.py,sha256=GqZpFoVNJPxYa70rLYgv0vu8Vg1nXqx_GYERRb1E9Pc,975
29
29
  monarch/_monarch/__init__.py,sha256=Md3cCHD7Ano9kV15PqGbicgUO-RMdh4aVy1yKiDt_xE,208
30
- monarch/_monarch/hyperactor/__init__.py,sha256=H-9w80ejck1lBVfpqOLikT-mPLMLpi7ZZfqrmprMxL0,1748
30
+ monarch/_monarch/hyperactor/__init__.py,sha256=JLnB2_-bKHLqAcZwehKvPkbwbxF-gCq5LODJiWGU_b8,1384
31
31
  monarch/_monarch/selection/__init__.py,sha256=47arOElvlK0uYcTNrd__1BwXSfsMosnVw4_tgu2hA-I,381
32
32
  monarch/_monarch/worker/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
33
33
  monarch/_monarch/worker/debugger.py,sha256=JJZwRPTgQO2emz-hrMelkOSxJFIR3dV4ZA6e7ftYUKA,3614
@@ -43,7 +43,7 @@ monarch/common/_device_utils.py,sha256=gBpl23wMjppVAEzzj8U9HyX-B7Bs2_3ftiMAkzUS4
43
43
  monarch/common/_tensor_to_table.py,sha256=yRjCNwvtl188Z1Dwkx3ZU-Bh2mwYnQ0Lnue2RAztwvc,5753
44
44
  monarch/common/base_tensor.py,sha256=ujRzR6lWaeCdPv2JX0vCR-VsCWn-3SHaJIkZH1Sw9FQ,1159
45
45
  monarch/common/borrows.py,sha256=7KR62xoUat1T6FyADsdHsxVAVIJDvfJWUnPO-xx277U,5307
46
- monarch/common/client.py,sha256=wOAnoaLmabrcv7mK_z_HVnk_ivGe5igPy3iWZI4LVZc,24517
46
+ monarch/common/client.py,sha256=BaBhOzQaNsqTa-BGy7_IknQxpnpK0j4C5QsNyFHZHW4,24343
47
47
  monarch/common/constants.py,sha256=ohvsVYMpfeWopv3KXDAeHWDFLukwc-OY37VRxpKNBE8,300
48
48
  monarch/common/context_manager.py,sha256=GOeyaFbyCqvQmkJ0oI7q6IxRd8_0mVyYKZRccI8iaug,1067
49
49
  monarch/common/controller_api.py,sha256=djGkK5aSd-V6pBkr3uBCXbfJv3OKf2o2VbBXJgFF2WI,3202
@@ -65,7 +65,7 @@ monarch/common/reference.py,sha256=O26lkzEeVwj0S1xEy-OLqdHVnACmmlbQCUmXRrW4n1Q,9
65
65
  monarch/common/remote.py,sha256=qZWXkShX20l07TseQSpVECh2yXZaVKYUvQXkeEM-zvY,9220
66
66
  monarch/common/selection.py,sha256=lpWFbZs3ArYy29e-53eoAVAjQFksf1RvZz9NvM0CUW4,308
67
67
  monarch/common/shape.py,sha256=k6-0S0U19PmrfP62SMb9Ihx6_I4QQFUGErloZn8GcZ0,8144
68
- monarch/common/stream.py,sha256=J9UCqhSXSbKYFGtbKaqAq1Vgmg6DJcLzsXXm-tsBQ-w,3499
68
+ monarch/common/stream.py,sha256=_ejoxafHtdD10lLzznRCXKwrkZ_ZH9k_VTgiA5yfBrI,3583
69
69
  monarch/common/tensor.py,sha256=mSXiHoD0Up4m2RLdQcsbesaz2N4QCFS34UNNX3Dbldk,28842
70
70
  monarch/common/tensor_factory.py,sha256=qm8NZx-5ezMAFjNLiXQvb66okm5XgdboB_GRarGOdN0,801
71
71
  monarch/common/tree.py,sha256=1DG3siiE7ixBV6v5cwN8RT_17aJhYZTE-L3i7wZe2_c,2282
@@ -132,6 +132,7 @@ tests/error_test_binary.py,sha256=64H-ucdkQ2i7GD8sidStl227cOy7gyeqvO4kTm1y7Ic,48
132
132
  tests/sleep_binary.py,sha256=XfLYaAfwm9xgzM-svs8fhAeFhwYIg6SyVEnx4e6wbUw,1009
133
133
  tests/test_actor_error.py,sha256=z3Sf4lteUggTryPLOhRKJ55v0MwVK3a7QN7-U2U9iJg,7484
134
134
  tests/test_alloc.py,sha256=D6DdQbtOZEvvnnc7LV-WyWFMk0Xb77eblH6Oz90zJTA,745
135
+ tests/test_allocator.py,sha256=P11sQ95ADjzC_-CfPs3CEP80nP8sn7wW8vVPsmpSVoM,8164
135
136
  tests/test_coalescing.py,sha256=-KtAWzTaeXbyzltplfojavx0iFeeZnvej-tFTlu2p5k,15616
136
137
  tests/test_controller.py,sha256=yxuVp2DG3TDKJlwuE3cFm9dbWMlbrYtG1uHfvVWRYbw,30935
137
138
  tests/test_device_mesh.py,sha256=DrbezYOM0thfP9MgLXb5-F0VoLOmSz5GR0GwjR_3bE4,5290
@@ -150,9 +151,9 @@ tests/simulator/test_profiling.py,sha256=TGYCfzTLdkpIwnOuO6KApprmrgPIRQe60KRX3wk
150
151
  tests/simulator/test_simulator.py,sha256=LO8lA0ssY-OGEBL5ipEu74f97Y765TEwfUOv-DtIptM,14568
151
152
  tests/simulator/test_task.py,sha256=ipqBDuDAysuo1xOB9S5psaFvwe6VATD43IovCTSs0t4,2327
152
153
  tests/simulator/test_worker.py,sha256=QrWWIJ3HDgDLkBPRc2mwYPlOQoXQcj1qRfc0WUfKkFY,3507
153
- torchmonarch_nightly-2025.6.10.dist-info/licenses/LICENSE,sha256=e0Eotbf_rHOYPuEUlppIbvwy4SN98CZnl_hqwvbDA4Q,1530
154
- torchmonarch_nightly-2025.6.10.dist-info/METADATA,sha256=DR1GtSFqtqsjhKWi38uGcvhw2p3ycHYSOwDmsErwLj0,2772
155
- torchmonarch_nightly-2025.6.10.dist-info/WHEEL,sha256=_wZSFk0d90K9wOBp8Q-UGxshyiJ987JoPiyUBNC6VLk,104
156
- torchmonarch_nightly-2025.6.10.dist-info/entry_points.txt,sha256=sqfQ16oZqjEvttUI-uj9BBXIIE6jt05bYFSmy-2hyXI,106
157
- torchmonarch_nightly-2025.6.10.dist-info/top_level.txt,sha256=E-ZssZzyM17glpVrh-S9--qJ-w9p2EjuYOuNw9tQ4Eg,33
158
- torchmonarch_nightly-2025.6.10.dist-info/RECORD,,
154
+ torchmonarch_nightly-2025.6.12.dist-info/licenses/LICENSE,sha256=e0Eotbf_rHOYPuEUlppIbvwy4SN98CZnl_hqwvbDA4Q,1530
155
+ torchmonarch_nightly-2025.6.12.dist-info/METADATA,sha256=mBsDu66W3vkM2SdaxX7hw8_B6kl_XgQZT7nQKZhVkMk,2772
156
+ torchmonarch_nightly-2025.6.12.dist-info/WHEEL,sha256=_wZSFk0d90K9wOBp8Q-UGxshyiJ987JoPiyUBNC6VLk,104
157
+ torchmonarch_nightly-2025.6.12.dist-info/entry_points.txt,sha256=sqfQ16oZqjEvttUI-uj9BBXIIE6jt05bYFSmy-2hyXI,106
158
+ torchmonarch_nightly-2025.6.12.dist-info/top_level.txt,sha256=E-ZssZzyM17glpVrh-S9--qJ-w9p2EjuYOuNw9tQ4Eg,33
159
+ torchmonarch_nightly-2025.6.12.dist-info/RECORD,,