torchmonarch-nightly 2025.6.9__cp310-cp310-manylinux2014_x86_64.whl → 2025.6.11__cp310-cp310-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
monarch/_rust_bindings.so CHANGED
Binary file
monarch/actor_mesh.py CHANGED
@@ -7,6 +7,7 @@
7
7
  import asyncio
8
8
  import collections
9
9
  import contextvars
10
+ import functools
10
11
  import inspect
11
12
 
12
13
  import itertools
@@ -38,6 +39,7 @@ from typing import (
38
39
 
39
40
  import monarch
40
41
  from monarch import ActorFuture as Future
42
+ from monarch._rust_bindings.hyperactor_extension.telemetry import enter_span, exit_span
41
43
 
42
44
  from monarch._rust_bindings.monarch_hyperactor.actor import PanicFlag, PythonMessage
43
45
  from monarch._rust_bindings.monarch_hyperactor.actor_mesh import PythonActorMesh
@@ -49,6 +51,7 @@ from monarch._rust_bindings.monarch_hyperactor.mailbox import (
49
51
  )
50
52
  from monarch._rust_bindings.monarch_hyperactor.proc import ActorId
51
53
  from monarch._rust_bindings.monarch_hyperactor.shape import Point as HyPoint, Shape
54
+
52
55
  from monarch.common.pickle_flatten import flatten, unflatten
53
56
  from monarch.common.shape import MeshTrait, NDSlice
54
57
 
@@ -83,7 +86,7 @@ class MonarchContext:
83
86
 
84
87
 
85
88
  _context: contextvars.ContextVar[MonarchContext] = contextvars.ContextVar(
86
- "monarch.service._context"
89
+ "monarch.actor_mesh._context"
87
90
  )
88
91
 
89
92
 
@@ -492,13 +495,29 @@ class _Actor:
492
495
  return None
493
496
  else:
494
497
  the_method = getattr(self.instance, message.method)._method
495
- result = the_method(self.instance, *args, **kwargs)
498
+
496
499
  if not inspect.iscoroutinefunction(the_method):
500
+ enter_span(
501
+ the_method.__module__, message.method, str(ctx.mailbox.actor_id)
502
+ )
503
+ result = the_method(self.instance, *args, **kwargs)
504
+ exit_span()
497
505
  if port is not None:
498
506
  port.send("result", result)
499
507
  return None
500
508
 
501
- return self.run_async(ctx, self.run_task(port, result, panic_flag))
509
+ async def instrumented():
510
+ enter_span(
511
+ the_method.__module__, message.method, str(ctx.mailbox.actor_id)
512
+ )
513
+ result = await the_method(self.instance, *args, **kwargs)
514
+ exit_span()
515
+ return result
516
+
517
+ return self.run_async(
518
+ ctx,
519
+ self.run_task(port, instrumented(), panic_flag),
520
+ )
502
521
  except Exception as e:
503
522
  traceback.print_exc()
504
523
  s = ActorError(e)
@@ -510,7 +529,11 @@ class _Actor:
510
529
  else:
511
530
  raise s from None
512
531
 
513
- async def run_async(self, ctx, coroutine):
532
+ async def run_async(
533
+ self,
534
+ ctx: MonarchContext,
535
+ coroutine: Coroutine[Any, None, Any],
536
+ ) -> None:
514
537
  _context.set(ctx)
515
538
  if self.complete_task is None:
516
539
  self.complete_task = asyncio.create_task(self._complete())
@@ -564,6 +587,12 @@ def _unpickle(data: bytes, mailbox: Mailbox) -> Any:
564
587
 
565
588
 
566
589
  class Actor(MeshTrait):
590
+ @functools.cached_property
591
+ def logger(cls) -> logging.Logger:
592
+ lgr = logging.getLogger(cls.__class__.__name__)
593
+ lgr.setLevel(logging.DEBUG)
594
+ return lgr
595
+
567
596
  @property
568
597
  def _ndslice(self) -> NDSlice:
569
598
  raise NotImplementedError(
@@ -677,7 +706,7 @@ class ActorError(Exception):
677
706
  def __init__(
678
707
  self,
679
708
  exception: Exception,
680
- message: str = "A remote service call has failed asynchronously.",
709
+ message: str = "A remote actor call has failed asynchronously.",
681
710
  ) -> None:
682
711
  self.exception = exception
683
712
  self.actor_mesh_ref_frames: StackSummary = extract_tb(exception.__traceback__)
@@ -688,7 +717,7 @@ class ActorError(Exception):
688
717
  actor_mesh_ref_tb = "".join(traceback.format_list(self.actor_mesh_ref_frames))
689
718
  return (
690
719
  f"{self.message}\n"
691
- f"Traceback of where the service call failed (most recent call last):\n{actor_mesh_ref_tb}{type(self.exception).__name__}: {exe}"
720
+ f"Traceback of where the remote call failed (most recent call last):\n{actor_mesh_ref_tb}{type(self.exception).__name__}: {exe}"
692
721
  )
693
722
 
694
723
 
monarch/allocator.py CHANGED
@@ -4,6 +4,9 @@
4
4
  # This source code is licensed under the BSD-style license found in the
5
5
  # LICENSE file in the root directory of this source tree.
6
6
 
7
+ # pyre-strict
8
+
9
+ import abc
7
10
  from typing import final
8
11
 
9
12
  from monarch import ActorFuture as Future
@@ -15,6 +18,7 @@ from monarch._rust_bindings.hyperactor_extension.alloc import ( # @manual=//mon
15
18
  from monarch._rust_bindings.monarch_hyperactor.alloc import ( # @manual=//monarch/monarch_extension:monarch_extension
16
19
  LocalAllocatorBase,
17
20
  ProcessAllocatorBase,
21
+ RemoteAllocatorBase,
18
22
  )
19
23
 
20
24
 
@@ -60,3 +64,66 @@ class LocalAllocator(LocalAllocatorBase):
60
64
  lambda: self.allocate_nonblocking(spec),
61
65
  lambda: self.allocate_blocking(spec),
62
66
  )
67
+
68
+
69
+ class RemoteAllocInitializer(abc.ABC):
70
+ """Subclass-able Python interface for `hyperactor_mesh::alloc::remoteprocess:RemoteProcessAllocInitializer`.
71
+
72
+ NOTE: changes to method signatures of this class must be made to the call-site at
73
+ `PyRemoteProcessAllocInitializer.py_initialize_alloc()` in `monarch/monarch_hyperactor/src/alloc.rs`
74
+ """
75
+
76
+ @abc.abstractmethod
77
+ async def initialize_alloc(self) -> list[str]:
78
+ """
79
+ Return the addresses of the servers that should be used to allocate processes
80
+ for the proc mesh. The addresses should be running hyperactor's RemoteProcessAllocator.
81
+
82
+ Each address is of the form `{transport}!{addr}(:{port})`.
83
+ This is the string form of `hyperactor::channel::ChannelAddr` (Rust).
84
+ For example, `tcp!127.0.0.1:1234`.
85
+
86
+ NOTE: Currently, all the addresses must have the same transport type and port
87
+ NOTE: Although this method is currently called once at the initialization of the Allocator,
88
+ in the future this method can be called multiple times and should return the current set of
89
+ addresses that are eligible to handle allocation requests.
90
+
91
+ """
92
+ ...
93
+
94
+
95
+ class StaticRemoteAllocInitializer(RemoteAllocInitializer):
96
+ """
97
+ Returns the static list of server addresses that this initializer
98
+ was constructed with on each `initialize_alloc()` call.
99
+ """
100
+
101
+ def __init__(self, *addrs: str) -> None:
102
+ super().__init__()
103
+ self.addrs: list[str] = list(addrs)
104
+
105
+ async def initialize_alloc(self) -> list[str]:
106
+ return list(self.addrs)
107
+
108
+
109
+ @final
110
+ class RemoteAllocator(RemoteAllocatorBase):
111
+ """
112
+ An allocator that allocates by spawning actors on a remote host.
113
+ The remote host must be running hyperactor's remote-process-allocator.
114
+ """
115
+
116
+ def allocate(self, spec: AllocSpec) -> Future[Alloc]:
117
+ """
118
+ Allocate a process according to the provided spec.
119
+
120
+ Arguments:
121
+ - `spec`: The spec to allocate according to.
122
+
123
+ Returns:
124
+ - A future that will be fulfilled when the requested allocation is fulfilled.
125
+ """
126
+ return Future(
127
+ lambda: self.allocate_nonblocking(spec),
128
+ lambda: self.allocate_blocking(spec),
129
+ )
monarch/bootstrap_main.py CHANGED
@@ -58,7 +58,7 @@ def invoke_main():
58
58
 
59
59
  # forward logs to rust tracing. Defaults to on.
60
60
  if os.environ.get("MONARCH_PYTHON_LOG_TRACING", "1") == "1":
61
- logging.root.addHandler(TracingForwarder())
61
+ logging.root.addHandler(TracingForwarder(level=logging.DEBUG))
62
62
 
63
63
  try:
64
64
  with (
@@ -0,0 +1,209 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import logging
8
+ import traceback
9
+ from collections import deque
10
+ from logging import Logger
11
+ from typing import List, NamedTuple, Optional, Union
12
+
13
+ import torch.utils._python_dispatch
14
+
15
+ from monarch import NDSlice
16
+ from monarch._rust_bindings.monarch_extension import client, debugger
17
+ from monarch._rust_bindings.monarch_extension.client import ( # @manual=//monarch/monarch_extension:monarch_extension
18
+ WorldState,
19
+ )
20
+ from monarch._rust_bindings.monarch_extension.mesh_controller import _Controller
21
+ from monarch._rust_bindings.monarch_hyperactor.proc import ( # @manual=//monarch/monarch_extension:monarch_extension
22
+ ActorId,
23
+ )
24
+ from monarch._rust_bindings.monarch_hyperactor.proc_mesh import ProcMesh as HyProcMesh
25
+ from monarch._rust_bindings.monarch_messages.debugger import DebuggerAction
26
+ from monarch.common.client import Client
27
+ from monarch.common.controller_api import LogMessage, MessageResult
28
+ from monarch.common.device_mesh import DeviceMesh, no_mesh
29
+ from monarch.common.invocation import DeviceException, RemoteException
30
+ from monarch.controller.debugger import read as debugger_read, write as debugger_write
31
+ from monarch.proc_mesh import ProcMesh
32
+ from pyre_extensions import none_throws
33
+
34
+ logger: Logger = logging.getLogger(__name__)
35
+
36
+
37
+ class Controller(_Controller):
38
+ def __init__(self, workers: HyProcMesh) -> None:
39
+ super().__init__()
40
+ # Buffer for messages unrelated to debugging that are received while a
41
+ # debugger session is active.
42
+ self._non_debugger_pending_messages: deque[
43
+ Optional[client.LogMessage | client.WorkerResponse]
44
+ ] = deque()
45
+ self._pending_debugger_sessions: deque[ActorId] = deque()
46
+
47
+ def next_message(
48
+ self, timeout: Optional[float]
49
+ ) -> Optional[LogMessage | MessageResult]:
50
+ if self._non_debugger_pending_messages:
51
+ msg = self._non_debugger_pending_messages.popleft()
52
+ else:
53
+ msg = self._get_next_message(timeout_msec=int((timeout or 0.0) * 1000.0))
54
+ if msg is None:
55
+ return None
56
+
57
+ if isinstance(msg, client.WorkerResponse):
58
+ return _worker_response_to_result(msg)
59
+ elif isinstance(msg, client.LogMessage):
60
+ return LogMessage(msg.level, msg.message)
61
+ elif isinstance(msg, client.DebuggerMessage):
62
+ self._run_debugger_loop(msg)
63
+
64
+ def send(
65
+ self,
66
+ ranks: Union[NDSlice, List[NDSlice]],
67
+ msg: NamedTuple,
68
+ ) -> None:
69
+ with torch.utils._python_dispatch._disable_current_modes():
70
+ return super().send(ranks, msg)
71
+
72
+ def drain_and_stop(
73
+ self,
74
+ ) -> List[LogMessage | MessageResult | client.DebuggerMessage]:
75
+ logger.info("rust controller shutting down")
76
+ results = []
77
+ for msg in self._drain_and_stop():
78
+ if isinstance(msg, client.WorkerResponse):
79
+ results.append(_worker_response_to_result(msg))
80
+ elif isinstance(msg, client.LogMessage):
81
+ results.append(LogMessage(msg.level, msg.message))
82
+ elif isinstance(msg, client.DebuggerMessage):
83
+ results.append(msg)
84
+ else:
85
+ raise RuntimeError(f"Unexpected message type {type(msg)}")
86
+ return results
87
+
88
+ def _run_debugger_loop(self, message: client.DebuggerMessage) -> None:
89
+ if not isinstance(message.action, DebuggerAction.Paused):
90
+ raise RuntimeError(
91
+ f"Unexpected debugger message {message} when no debugger session is running"
92
+ )
93
+
94
+ self._pending_debugger_sessions.append(message.debugger_actor_id)
95
+ while self._pending_debugger_sessions:
96
+ debugger_actor_id = self._pending_debugger_sessions.popleft()
97
+ rank = debugger_actor_id.rank
98
+ proc_id = debugger_actor_id.proc_id
99
+ debugger_write(
100
+ f"pdb attached to proc {proc_id} with rank {rank}, debugger actor {debugger_actor_id} \n"
101
+ )
102
+
103
+ self._debugger_attach(debugger_actor_id)
104
+ while True:
105
+ # TODO: Add appropriate timeout.
106
+ msg = self._get_next_message(timeout_msec=None)
107
+
108
+ if not isinstance(msg, client.DebuggerMessage):
109
+ self._non_debugger_pending_messages.append(msg)
110
+ continue
111
+
112
+ if msg.debugger_actor_id != debugger_actor_id:
113
+ if isinstance(msg.action, DebuggerAction.Paused):
114
+ self._pending_debugger_sessions.append(msg.debugger_actor_id)
115
+ continue
116
+ else:
117
+ raise RuntimeError(
118
+ f"unexpected debugger message {msg} from rank {msg.debugger_actor_id.rank} "
119
+ f"when debugging rank {debugger_actor_id.rank}"
120
+ )
121
+
122
+ action = msg.action
123
+ if isinstance(action, DebuggerAction.Detach):
124
+ break
125
+ elif isinstance(action, DebuggerAction.Read):
126
+ self._debugger_write(
127
+ debugger_actor_id, debugger_read(action.requested_size)
128
+ )
129
+ elif isinstance(action, DebuggerAction.Write):
130
+ debugger_write(
131
+ debugger.get_bytes_from_write_action(action).decode()
132
+ )
133
+ else:
134
+ raise RuntimeError(
135
+ f"unexpected debugger message {msg} when debugging rank {debugger_actor_id.rank}"
136
+ )
137
+
138
+ def worker_world_state(self) -> WorldState:
139
+ raise NotImplementedError("worker world state")
140
+
141
+ def stop_mesh(self):
142
+ # I think this is a noop?
143
+
144
+ pass
145
+
146
+
147
+ # TODO: Handling conversion of the response can move to a separate module over time
148
+ # especially as we have structured error messages.
149
+ def _worker_response_to_result(result: client.WorkerResponse) -> MessageResult:
150
+ if not result.is_exception():
151
+ # The result of the message needs to be unwrapped on a real device.
152
+ # Staying as a fake tensor will fail the tensor deserialization.
153
+ with no_mesh.activate():
154
+ return MessageResult(result.seq, result.result(), None)
155
+ exc = none_throws(result.exception())
156
+ if isinstance(exc, client.Error):
157
+ worker_frames = [
158
+ traceback.FrameSummary("<unknown>", None, frame)
159
+ for frame in exc.backtrace.split("\\n")
160
+ ]
161
+ logger.error(f"Worker {exc.actor_id} failed")
162
+ return MessageResult(
163
+ seq=result.seq,
164
+ result=None,
165
+ error=RemoteException(
166
+ seq=exc.caused_by_seq,
167
+ exception=RuntimeError(exc.backtrace),
168
+ controller_frame_index=0, # TODO: T225205291 fix this once we have recording support in rust
169
+ controller_frames=None,
170
+ worker_frames=worker_frames,
171
+ source_actor_id=exc.actor_id,
172
+ message=f"Worker {exc.actor_id} failed",
173
+ ),
174
+ )
175
+ elif isinstance(exc, client.Failure):
176
+ frames = [
177
+ traceback.FrameSummary("<unknown>", None, frame)
178
+ for frame in exc.backtrace.split("\n")
179
+ ]
180
+ reason = f"Actor {exc.actor_id} crashed on {exc.address}, check the host log for details"
181
+ logger.error(reason)
182
+ return MessageResult(
183
+ seq=0, # seq is not consumed for DeviceException; it will be directly thrown by the client
184
+ result=None,
185
+ error=DeviceException(
186
+ exception=RuntimeError(reason),
187
+ frames=frames,
188
+ source_actor_id=exc.actor_id,
189
+ message=reason,
190
+ ),
191
+ )
192
+ else:
193
+ raise RuntimeError(f"Unknown exception type: {type(exc)}")
194
+
195
+
196
+ def spawn_tensor_engine(proc_mesh: ProcMesh) -> DeviceMesh:
197
+ # This argument to Controller
198
+ # is currently only used for debug printing. It should be fixed to
199
+ # report the proc ID instead of the rank it currently does.
200
+ gpus = proc_mesh.sizes.get("gpus", 1)
201
+ backend_ctrl = Controller(proc_mesh._proc_mesh)
202
+ client = Client(backend_ctrl, proc_mesh.size(), gpus)
203
+ dm = DeviceMesh(
204
+ client,
205
+ NDSlice.new_row_major(list(proc_mesh.sizes.values())),
206
+ tuple(proc_mesh.sizes.keys()),
207
+ )
208
+ dm.exit = lambda: client.shutdown()
209
+ return dm
Binary file
@@ -0,0 +1,216 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # pyre-strict
8
+
9
+ import contextlib
10
+ import importlib.resources
11
+ import math
12
+ import os
13
+ import subprocess
14
+ import sys
15
+ import unittest
16
+ from datetime import timedelta
17
+ from typing import Generator
18
+
19
+ import cloudpickle
20
+
21
+ import torch
22
+ import torch.distributed as dist
23
+ import torch.nn.functional as F
24
+
25
+ from monarch._rust_bindings.hyperactor_extension.alloc import (
26
+ AllocConstraints,
27
+ AllocSpec,
28
+ )
29
+
30
+ from monarch._rust_bindings.monarch_hyperactor.channel import (
31
+ ChannelAddr,
32
+ ChannelTransport,
33
+ )
34
+ from monarch.actor_mesh import Actor, current_rank, current_size, endpoint, ValueMesh
35
+
36
+ from monarch.allocator import RemoteAllocator, StaticRemoteAllocInitializer
37
+ from monarch.proc_mesh import ProcMesh
38
+
39
+ from torch.distributed.elastic.utils.distributed import get_free_port
40
+
41
+ _100_MILLISECONDS = timedelta(milliseconds=100)
42
+
43
+
44
+ class TestActor(Actor):
45
+ """Silly actor that computes the world size by all-reducing rank-hot tensors"""
46
+
47
+ def __init__(self) -> None:
48
+ self.rank: int = current_rank().rank
49
+ self.world_size: int = math.prod(current_size().values())
50
+
51
+ @endpoint
52
+ async def compute_world_size(self, master_addr: str, master_port: int) -> int:
53
+ os.environ["MASTER_ADDR"] = master_addr
54
+ os.environ["MASTER_PORT"] = str(master_port)
55
+ dist.init_process_group("gloo", rank=self.rank, world_size=self.world_size)
56
+
57
+ try:
58
+ t = F.one_hot(torch.tensor(self.rank), num_classes=dist.get_world_size())
59
+ dist.all_reduce(t)
60
+ return int(torch.sum(t).item())
61
+ finally:
62
+ dist.destroy_process_group()
63
+
64
+
65
+ @contextlib.contextmanager
66
+ def remote_process_allocator() -> Generator[str, None, None]:
67
+ with importlib.resources.path(__package__, "") as package_path:
68
+ addr = ChannelAddr.any(ChannelTransport.Unix)
69
+
70
+ process_allocator = subprocess.Popen(
71
+ args=[
72
+ "process_allocator",
73
+ f"--addr={addr}",
74
+ ],
75
+ env={
76
+ # prefix PATH with this test module's directory to
77
+ # give 'process_allocator' and 'monarch_bootstrap' binary resources
78
+ # in this test module's directory precedence over the installed ones
79
+ # useful in BUCK where these binaries are added as 'resources' of this test target
80
+ "PATH": f"{package_path}:{os.getenv('PATH', '')}",
81
+ "RUST_LOG": "debug",
82
+ },
83
+ )
84
+ try:
85
+ yield addr
86
+ finally:
87
+ process_allocator.terminate()
88
+ try:
89
+ five_seconds = 5
90
+ process_allocator.wait(timeout=five_seconds)
91
+ except subprocess.TimeoutExpired:
92
+ process_allocator.kill()
93
+
94
+
95
+ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
96
+ @classmethod
97
+ def setUpClass(cls) -> None:
98
+ cloudpickle.register_pickle_by_value(sys.modules[TestActor.__module__])
99
+
100
+ @classmethod
101
+ def tearDownClass(cls) -> None:
102
+ cloudpickle.unregister_pickle_by_value(sys.modules[TestActor.__module__])
103
+
104
+ def assert_computed_world_size(
105
+ self, computed: ValueMesh[int], expected_world_size: int
106
+ ) -> None:
107
+ expected_world_sizes = {
108
+ rank: expected_world_size for rank in range(0, expected_world_size)
109
+ }
110
+ computed_world_sizes = {p.rank: v for p, v in list(computed.flatten("rank"))}
111
+ self.assertDictEqual(expected_world_sizes, computed_world_sizes)
112
+
113
+ async def test_call_allocate_twice(self) -> None:
114
+ class DeletingAllocInitializer(StaticRemoteAllocInitializer):
115
+ """test initializer that removes the last address from the list each time initialize_alloc() is called
116
+ used to test that the state of the initializer is preserved across calls to allocate()
117
+ """
118
+
119
+ async def initialize_alloc(self) -> list[str]:
120
+ alloc = await super().initialize_alloc()
121
+ self.addrs.pop(-1)
122
+ return alloc
123
+
124
+ with remote_process_allocator() as host1, remote_process_allocator() as host2:
125
+ initializer = DeletingAllocInitializer(host1, host2)
126
+
127
+ allocator = RemoteAllocator(
128
+ world_id="test_remote_allocator",
129
+ initializer=initializer,
130
+ heartbeat_interval=_100_MILLISECONDS,
131
+ )
132
+
133
+ spec = AllocSpec(AllocConstraints(), host=1, gpu=1)
134
+
135
+ await allocator.allocate(spec)
136
+ self.assertEqual([host1], initializer.addrs)
137
+
138
+ await allocator.allocate(spec)
139
+ self.assertEqual([], initializer.addrs)
140
+
141
+ async def test_throws_when_initializer_returns_empty_addrs(self) -> None:
142
+ class EmptyAllocInitializer(StaticRemoteAllocInitializer):
143
+ """test initializer that returns an empty list of addresses"""
144
+
145
+ async def initialize_alloc(self) -> list[str]:
146
+ return []
147
+
148
+ empty_initializer = EmptyAllocInitializer()
149
+ with self.assertRaisesRegex(
150
+ RuntimeError, r"initializer must return non-empty list of addresses"
151
+ ):
152
+ allocator = RemoteAllocator(
153
+ world_id="test_remote_allocator",
154
+ initializer=empty_initializer,
155
+ heartbeat_interval=_100_MILLISECONDS,
156
+ )
157
+ await allocator.allocate(AllocSpec(AllocConstraints(), host=1, gpu=1))
158
+
159
+ async def test_allocate_2d_mesh(self) -> None:
160
+ hosts = 2
161
+ gpus = 4
162
+ world_size = hosts * gpus
163
+ spec = AllocSpec(AllocConstraints(), host=hosts, gpu=gpus)
164
+
165
+ # create 2x process-allocators (on their own bind addresses) to simulate 2 hosts
166
+ with remote_process_allocator() as host1, remote_process_allocator() as host2:
167
+ allocator = RemoteAllocator(
168
+ world_id="test_remote_allocator",
169
+ initializer=StaticRemoteAllocInitializer(host1, host2),
170
+ heartbeat_interval=_100_MILLISECONDS,
171
+ )
172
+ alloc = await allocator.allocate(spec)
173
+ proc_mesh = await ProcMesh.from_alloc(alloc)
174
+ actor = await proc_mesh.spawn("test_actor", TestActor)
175
+
176
+ values = await actor.compute_world_size.call(
177
+ master_addr="::",
178
+ master_port=get_free_port(),
179
+ )
180
+
181
+ self.assert_computed_world_size(values, world_size)
182
+
183
+ async def test_stacked_1d_meshes(self) -> None:
184
+ # create two stacked actor meshes on the same host
185
+ # each actor mesh running on separate process-allocators
186
+
187
+ with remote_process_allocator() as host1_a, remote_process_allocator() as host1_b:
188
+ allocator_a = RemoteAllocator(
189
+ world_id="a",
190
+ initializer=StaticRemoteAllocInitializer(host1_a),
191
+ heartbeat_interval=_100_MILLISECONDS,
192
+ )
193
+ allocator_b = RemoteAllocator(
194
+ world_id="b",
195
+ initializer=StaticRemoteAllocInitializer(host1_b),
196
+ heartbeat_interval=_100_MILLISECONDS,
197
+ )
198
+
199
+ spec_a = AllocSpec(AllocConstraints(), host=1, gpu=2)
200
+ spec_b = AllocSpec(AllocConstraints(), host=1, gpu=6)
201
+
202
+ proc_mesh_a = await ProcMesh.from_alloc(await allocator_a.allocate(spec_a))
203
+ proc_mesh_b = await ProcMesh.from_alloc(await allocator_b.allocate(spec_b))
204
+
205
+ actor_a = await proc_mesh_a.spawn("actor_a", TestActor)
206
+ actor_b = await proc_mesh_b.spawn("actor_b", TestActor)
207
+
208
+ results_a = await actor_a.compute_world_size.call(
209
+ master_addr="::", master_port=get_free_port()
210
+ )
211
+ results_b = await actor_b.compute_world_size.call(
212
+ master_addr="::", master_port=get_free_port()
213
+ )
214
+
215
+ self.assert_computed_world_size(results_a, 2) # a is a 1x2 mesh
216
+ self.assert_computed_world_size(results_b, 6) # b is a 1x6 mesh
@@ -7,7 +7,12 @@
7
7
  import operator
8
8
  from types import ModuleType
9
9
 
10
+ import monarch
11
+
12
+ import pytest
13
+
10
14
  import torch
15
+
11
16
  from monarch.actor_mesh import (
12
17
  Accumulator,
13
18
  Actor,
@@ -17,6 +22,8 @@ from monarch.actor_mesh import (
17
22
  endpoint,
18
23
  )
19
24
 
25
+ from monarch.mesh_controller import spawn_tensor_engine
26
+
20
27
  from monarch.proc_mesh import local_proc_mesh, proc_mesh
21
28
  from monarch.rdma import RDMABuffer
22
29
 
@@ -375,3 +382,20 @@ def test_rust_binding_modules_correct() -> None:
375
382
  assert value.__module__ == path
376
383
 
377
384
  check(bindings, "monarch._rust_bindings")
385
+
386
+
387
+ def test_tensor_engine() -> None:
388
+ pm = proc_mesh(gpus=2).get()
389
+
390
+ dm = spawn_tensor_engine(pm)
391
+ with dm.activate():
392
+ r = monarch.inspect(2 * torch.zeros(3, 4))
393
+
394
+ fm = dm.flatten("all")
395
+ with fm.activate():
396
+ f = monarch.inspect(2 * torch.zeros(3, 4), all=1)
397
+
398
+ assert torch.allclose(torch.zeros(3, 4), r)
399
+ assert torch.allclose(torch.zeros(3, 4), f)
400
+
401
+ dm.exit()
@@ -14,6 +14,7 @@ import monarch
14
14
 
15
15
  import pytest
16
16
  import torch
17
+ import torch.utils._python_dispatch
17
18
  from monarch import fetch_shard, no_mesh, remote, Stream
18
19
  from monarch.common.device_mesh import DeviceMesh
19
20
  from monarch.rust_local_mesh import local_meshes, LoggingLocation, SocketType
@@ -180,3 +181,37 @@ class TestRustBackend(TestCase):
180
181
 
181
182
  self.assertIsNotNone(mesh_info.mesh_labels)
182
183
  self.assertEqual(len(mesh_info.devices_labels), 2)
184
+
185
+ def test_ivalue_problems(self) -> None:
186
+ with local_mesh(hosts=1, gpu_per_host=1):
187
+ from typing import cast
188
+
189
+ from monarch.common.messages import CallFunction, CommandGroup
190
+
191
+ a = cast(monarch.Tensor, torch.rand(3, 4))
192
+ result = monarch.Tensor(a._fake, a.mesh, a.stream)
193
+ msg = CallFunction(
194
+ 0,
195
+ result,
196
+ (),
197
+ monarch.common.function.ResolvableFunctionFromPath(
198
+ "torch.ops.aten.mul.Tensor"
199
+ ),
200
+ (2, a),
201
+ {},
202
+ a.stream._to_ref(a.mesh.client),
203
+ a.mesh,
204
+ [],
205
+ )
206
+ # Internally, this will call CallFunction(...).to_rust_message().
207
+ # The 2 arg will be converted to an IValue tensor via rust + C++.
208
+ # Then when the CommandGroup message gets converted to rust, it
209
+ # will attempt to clone the rust CallFunction message, which will
210
+ # attempt to clone the IValue tensor, which will cause a crash.
211
+ # Upon attempting to clone the IValue tensor, our custom __torch_dispatch__
212
+ # intercepts the following two calls:
213
+ # aten._to_copy.default () (2,) {'dtype': torch.float64, 'device': device(type='cpu')}
214
+ # aten.clone.default () (2,) {}
215
+
216
+ with torch.utils._python_dispatch._disable_current_modes():
217
+ CommandGroup([msg]).to_rust_message()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: torchmonarch-nightly
3
- Version: 2025.6.9
3
+ Version: 2025.6.11
4
4
  Summary: Monarch: Single controller library
5
5
  Author: Meta
6
6
  Author-email: oncall+monarch@xmail.facebook.com
@@ -1,15 +1,16 @@
1
1
  monarch/__init__.py,sha256=iUvWHc0-7Q2tovRoRxOIiA3TsefMXCbWl-jEfQ2djew,6897
2
- monarch/_rust_bindings.so,sha256=hpE8smD7nqpmTQZsRLg08OMLryvUehI9_0aDbdcsVLQ,39166496
2
+ monarch/_rust_bindings.so,sha256=g2tlum6iqfdR4KRkVhp_BwUmlz0tYUSITNVaJjSNitE,40645720
3
3
  monarch/_testing.py,sha256=MN8DK1e-wzV0-R_nFW1b_7-O5oKfWvZ12BMGD4Z7PQk,6755
4
- monarch/actor_mesh.py,sha256=5DbU9OrmNk5I9yasmE-rkTgHyO07oiLlAG0jbJBOXgI,23000
5
- monarch/allocator.py,sha256=_2DKFP9pSD33zDgH7xZJC8Tq7BQrCeQEUmMB7_xCT0Y,1784
6
- monarch/bootstrap_main.py,sha256=SYTOz-pTXiJNk78PPD5HAOJDSb8t2JfitRWdmWB3ogo,2559
4
+ monarch/actor_mesh.py,sha256=4I8xp_XIM6KZJY_jXVjJ8tPW2l1J4a6ZhrknU7zKbAk,23947
5
+ monarch/allocator.py,sha256=ylvYTf31o-PT385cYJPhi17uNbC4yl_RAraqD0fVe4g,4112
6
+ monarch/bootstrap_main.py,sha256=EYaTMA1lxy2213L_04drTKlJvZQjzNdD3jeUHiqSBJc,2578
7
7
  monarch/cached_remote_function.py,sha256=kYdB6r4OHx_T_uX4q3tCNcp1t2DJwF8tPTIahUiT2pU,8785
8
8
  monarch/fetch.py,sha256=61jxo7sx4QNUTkc0_rF5NaJROen4tKbAaiIjrXWLOvg,1705
9
9
  monarch/future.py,sha256=lcdFEe7m1shYPPuvZ1RkS6JUIChEKGBWe3v7x_nu4Hg,731
10
10
  monarch/gradient_generator.py,sha256=Rl3dmXGceTdCc1mYBg2JciR88ywGPnW7TVkL86KwqEA,6366
11
11
  monarch/memory.py,sha256=ol86dBhFAJqg78iF25-BuK0wuwj1onR8FIioZ_B0gjw,1377
12
- monarch/monarch_controller,sha256=TvAJzOeJIiFdC9QPnzrsw5ziCFA9balBWzEStq3O8u8,20395288
12
+ monarch/mesh_controller.py,sha256=A3G8Z5S0w3mjCVI2r6YGM6K3BUs3ZHU8PFo6kCaYTU4,8615
13
+ monarch/monarch_controller,sha256=41B7zLv7M7_CSmChN5bfvVrygi2VeBhMDcNQXlnbVZU,20394376
13
14
  monarch/notebook.py,sha256=zu9MKDFKf1-rCM2TqFSRJjMBeiWuKcJSyUFLvoZRQzs,25949
14
15
  monarch/opaque_module.py,sha256=oajOu_WD1hD4hxE8HDdO-tvWY7KDHWd7VaAhJEa5L2I,10446
15
16
  monarch/opaque_object.py,sha256=IVpll4pyuKZMo_EnPh4s0qnx8RlAcJrJ1yoLX6E75wQ,2782
@@ -131,6 +132,7 @@ tests/error_test_binary.py,sha256=64H-ucdkQ2i7GD8sidStl227cOy7gyeqvO4kTm1y7Ic,48
131
132
  tests/sleep_binary.py,sha256=XfLYaAfwm9xgzM-svs8fhAeFhwYIg6SyVEnx4e6wbUw,1009
132
133
  tests/test_actor_error.py,sha256=z3Sf4lteUggTryPLOhRKJ55v0MwVK3a7QN7-U2U9iJg,7484
133
134
  tests/test_alloc.py,sha256=D6DdQbtOZEvvnnc7LV-WyWFMk0Xb77eblH6Oz90zJTA,745
135
+ tests/test_allocator.py,sha256=dqQbQyOjOX3JgnHIPT0iawT0wMeFztbLCYjK2tl8GcI,8149
134
136
  tests/test_coalescing.py,sha256=-KtAWzTaeXbyzltplfojavx0iFeeZnvej-tFTlu2p5k,15616
135
137
  tests/test_controller.py,sha256=yxuVp2DG3TDKJlwuE3cFm9dbWMlbrYtG1uHfvVWRYbw,30935
136
138
  tests/test_device_mesh.py,sha256=DrbezYOM0thfP9MgLXb5-F0VoLOmSz5GR0GwjR_3bE4,5290
@@ -139,9 +141,9 @@ tests/test_future.py,sha256=cXzaNi2YDwVyjR541ScXmgktX1YFsKzbl8wep0DMVbk,3032
139
141
  tests/test_grad_generator.py,sha256=p4Pm4kMEeGldt2jUVAkGKCB0mLccKI28pltH6OTGbQA,3412
140
142
  tests/test_mock_cuda.py,sha256=5hisElxeLJ5MHw3KM9gwxBiXiMaG-Rm382u3AsQcDOI,3068
141
143
  tests/test_pdb_actor.py,sha256=5KJhuhcZDPWMdjC6eAtDdwnz1W7jNFXvIrMSFaCWaPw,3858
142
- tests/test_python_actors.py,sha256=fDvHUIWNZeL3CWnTJMbdh98i1tnH1-LJEG1pIFkGYF8,10898
144
+ tests/test_python_actors.py,sha256=gP6MDN2BL282qInUGP9untlpsqqB2uy1Iq5gUXnXcUo,11387
143
145
  tests/test_remote_functions.py,sha256=ExqYlRQWRabpGBuKvNIOa8Hwj-iXuP87Jfb9i5RhaGs,50066
144
- tests/test_rust_backend.py,sha256=nXSa0ZQ0NniZm4PzvKhrWvVLD-RKvIWYkPXm1BEBXq8,6235
146
+ tests/test_rust_backend.py,sha256=94S3R995ZkyIhEiBsM5flcjf5X7bscEAHBtInbTRFe8,7776
145
147
  tests/test_signal_safe_block_on.py,sha256=bmal0XgzJowZXJV6T1Blow5a-vZluYWusCThLMGxyTE,3336
146
148
  tests/test_sim_backend.py,sha256=RckCkHO3DxKsAGdZMcIzRnd6YJXwDim1D5-xbBbgKio,1473
147
149
  tests/simulator/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -149,9 +151,9 @@ tests/simulator/test_profiling.py,sha256=TGYCfzTLdkpIwnOuO6KApprmrgPIRQe60KRX3wk
149
151
  tests/simulator/test_simulator.py,sha256=LO8lA0ssY-OGEBL5ipEu74f97Y765TEwfUOv-DtIptM,14568
150
152
  tests/simulator/test_task.py,sha256=ipqBDuDAysuo1xOB9S5psaFvwe6VATD43IovCTSs0t4,2327
151
153
  tests/simulator/test_worker.py,sha256=QrWWIJ3HDgDLkBPRc2mwYPlOQoXQcj1qRfc0WUfKkFY,3507
152
- torchmonarch_nightly-2025.6.9.dist-info/licenses/LICENSE,sha256=e0Eotbf_rHOYPuEUlppIbvwy4SN98CZnl_hqwvbDA4Q,1530
153
- torchmonarch_nightly-2025.6.9.dist-info/METADATA,sha256=SFAiEIRUzlpHy2_j-bRjx22U-753WotqxjEp0uwud-w,2771
154
- torchmonarch_nightly-2025.6.9.dist-info/WHEEL,sha256=_wZSFk0d90K9wOBp8Q-UGxshyiJ987JoPiyUBNC6VLk,104
155
- torchmonarch_nightly-2025.6.9.dist-info/entry_points.txt,sha256=sqfQ16oZqjEvttUI-uj9BBXIIE6jt05bYFSmy-2hyXI,106
156
- torchmonarch_nightly-2025.6.9.dist-info/top_level.txt,sha256=E-ZssZzyM17glpVrh-S9--qJ-w9p2EjuYOuNw9tQ4Eg,33
157
- torchmonarch_nightly-2025.6.9.dist-info/RECORD,,
154
+ torchmonarch_nightly-2025.6.11.dist-info/licenses/LICENSE,sha256=e0Eotbf_rHOYPuEUlppIbvwy4SN98CZnl_hqwvbDA4Q,1530
155
+ torchmonarch_nightly-2025.6.11.dist-info/METADATA,sha256=SCdAxETtVZ5ESzbLepOp6mf1L4G-HSYVkjdRFT7D0kg,2772
156
+ torchmonarch_nightly-2025.6.11.dist-info/WHEEL,sha256=_wZSFk0d90K9wOBp8Q-UGxshyiJ987JoPiyUBNC6VLk,104
157
+ torchmonarch_nightly-2025.6.11.dist-info/entry_points.txt,sha256=sqfQ16oZqjEvttUI-uj9BBXIIE6jt05bYFSmy-2hyXI,106
158
+ torchmonarch_nightly-2025.6.11.dist-info/top_level.txt,sha256=E-ZssZzyM17glpVrh-S9--qJ-w9p2EjuYOuNw9tQ4Eg,33
159
+ torchmonarch_nightly-2025.6.11.dist-info/RECORD,,