torchmonarch-nightly 2025.6.10__cp310-cp310-manylinux2014_x86_64.whl → 2025.6.11__cp310-cp310-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
monarch/_rust_bindings.so CHANGED
Binary file
monarch/actor_mesh.py CHANGED
@@ -7,6 +7,7 @@
7
7
  import asyncio
8
8
  import collections
9
9
  import contextvars
10
+ import functools
10
11
  import inspect
11
12
 
12
13
  import itertools
@@ -38,6 +39,7 @@ from typing import (
38
39
 
39
40
  import monarch
40
41
  from monarch import ActorFuture as Future
42
+ from monarch._rust_bindings.hyperactor_extension.telemetry import enter_span, exit_span
41
43
 
42
44
  from monarch._rust_bindings.monarch_hyperactor.actor import PanicFlag, PythonMessage
43
45
  from monarch._rust_bindings.monarch_hyperactor.actor_mesh import PythonActorMesh
@@ -49,6 +51,7 @@ from monarch._rust_bindings.monarch_hyperactor.mailbox import (
49
51
  )
50
52
  from monarch._rust_bindings.monarch_hyperactor.proc import ActorId
51
53
  from monarch._rust_bindings.monarch_hyperactor.shape import Point as HyPoint, Shape
54
+
52
55
  from monarch.common.pickle_flatten import flatten, unflatten
53
56
  from monarch.common.shape import MeshTrait, NDSlice
54
57
 
@@ -492,13 +495,29 @@ class _Actor:
492
495
  return None
493
496
  else:
494
497
  the_method = getattr(self.instance, message.method)._method
495
- result = the_method(self.instance, *args, **kwargs)
498
+
496
499
  if not inspect.iscoroutinefunction(the_method):
500
+ enter_span(
501
+ the_method.__module__, message.method, str(ctx.mailbox.actor_id)
502
+ )
503
+ result = the_method(self.instance, *args, **kwargs)
504
+ exit_span()
497
505
  if port is not None:
498
506
  port.send("result", result)
499
507
  return None
500
508
 
501
- return self.run_async(ctx, self.run_task(port, result, panic_flag))
509
+ async def instrumented():
510
+ enter_span(
511
+ the_method.__module__, message.method, str(ctx.mailbox.actor_id)
512
+ )
513
+ result = await the_method(self.instance, *args, **kwargs)
514
+ exit_span()
515
+ return result
516
+
517
+ return self.run_async(
518
+ ctx,
519
+ self.run_task(port, instrumented(), panic_flag),
520
+ )
502
521
  except Exception as e:
503
522
  traceback.print_exc()
504
523
  s = ActorError(e)
@@ -510,7 +529,11 @@ class _Actor:
510
529
  else:
511
530
  raise s from None
512
531
 
513
- async def run_async(self, ctx, coroutine):
532
+ async def run_async(
533
+ self,
534
+ ctx: MonarchContext,
535
+ coroutine: Coroutine[Any, None, Any],
536
+ ) -> None:
514
537
  _context.set(ctx)
515
538
  if self.complete_task is None:
516
539
  self.complete_task = asyncio.create_task(self._complete())
@@ -564,6 +587,12 @@ def _unpickle(data: bytes, mailbox: Mailbox) -> Any:
564
587
 
565
588
 
566
589
  class Actor(MeshTrait):
590
+ @functools.cached_property
591
+ def logger(cls) -> logging.Logger:
592
+ lgr = logging.getLogger(cls.__class__.__name__)
593
+ lgr.setLevel(logging.DEBUG)
594
+ return lgr
595
+
567
596
  @property
568
597
  def _ndslice(self) -> NDSlice:
569
598
  raise NotImplementedError(
monarch/allocator.py CHANGED
@@ -4,6 +4,9 @@
4
4
  # This source code is licensed under the BSD-style license found in the
5
5
  # LICENSE file in the root directory of this source tree.
6
6
 
7
+ # pyre-strict
8
+
9
+ import abc
7
10
  from typing import final
8
11
 
9
12
  from monarch import ActorFuture as Future
@@ -15,6 +18,7 @@ from monarch._rust_bindings.hyperactor_extension.alloc import ( # @manual=//mon
15
18
  from monarch._rust_bindings.monarch_hyperactor.alloc import ( # @manual=//monarch/monarch_extension:monarch_extension
16
19
  LocalAllocatorBase,
17
20
  ProcessAllocatorBase,
21
+ RemoteAllocatorBase,
18
22
  )
19
23
 
20
24
 
@@ -60,3 +64,66 @@ class LocalAllocator(LocalAllocatorBase):
60
64
  lambda: self.allocate_nonblocking(spec),
61
65
  lambda: self.allocate_blocking(spec),
62
66
  )
67
+
68
+
69
+ class RemoteAllocInitializer(abc.ABC):
70
+ """Subclass-able Python interface for `hyperactor_mesh::alloc::remoteprocess:RemoteProcessAllocInitializer`.
71
+
72
+ NOTE: changes to method signatures of this class must be made to the call-site at
73
+ `PyRemoteProcessAllocInitializer.py_initialize_alloc()` in `monarch/monarch_hyperactor/src/alloc.rs`
74
+ """
75
+
76
+ @abc.abstractmethod
77
+ async def initialize_alloc(self) -> list[str]:
78
+ """
79
+ Return the addresses of the servers that should be used to allocate processes
80
+ for the proc mesh. The addresses should be running hyperactor's RemoteProcessAllocator.
81
+
82
+ Each address is of the form `{transport}!{addr}(:{port})`.
83
+ This is the string form of `hyperactor::channel::ChannelAddr` (Rust).
84
+ For example, `tcp!127.0.0.1:1234`.
85
+
86
+ NOTE: Currently, all the addresses must have the same transport type and port
87
+ NOTE: Although this method is currently called once at the initialization of the Allocator,
88
+ in the future this method can be called multiple times and should return the current set of
89
+ addresses that are eligible to handle allocation requests.
90
+
91
+ """
92
+ ...
93
+
94
+
95
+ class StaticRemoteAllocInitializer(RemoteAllocInitializer):
96
+ """
97
+ Returns the static list of server addresses that this initializer
98
+ was constructed with on each `initialize_alloc()` call.
99
+ """
100
+
101
+ def __init__(self, *addrs: str) -> None:
102
+ super().__init__()
103
+ self.addrs: list[str] = list(addrs)
104
+
105
+ async def initialize_alloc(self) -> list[str]:
106
+ return list(self.addrs)
107
+
108
+
109
+ @final
110
+ class RemoteAllocator(RemoteAllocatorBase):
111
+ """
112
+ An allocator that allocates by spawning actors on a remote host.
113
+ The remote host must be running hyperactor's remote-process-allocator.
114
+ """
115
+
116
+ def allocate(self, spec: AllocSpec) -> Future[Alloc]:
117
+ """
118
+ Allocate a process according to the provided spec.
119
+
120
+ Arguments:
121
+ - `spec`: The spec to allocate according to.
122
+
123
+ Returns:
124
+ - A future that will be fulfilled when the requested allocation is fulfilled.
125
+ """
126
+ return Future(
127
+ lambda: self.allocate_nonblocking(spec),
128
+ lambda: self.allocate_blocking(spec),
129
+ )
monarch/bootstrap_main.py CHANGED
@@ -58,7 +58,7 @@ def invoke_main():
58
58
 
59
59
  # forward logs to rust tracing. Defaults to on.
60
60
  if os.environ.get("MONARCH_PYTHON_LOG_TRACING", "1") == "1":
61
- logging.root.addHandler(TracingForwarder())
61
+ logging.root.addHandler(TracingForwarder(level=logging.DEBUG))
62
62
 
63
63
  try:
64
64
  with (
Binary file
@@ -0,0 +1,216 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # pyre-strict
8
+
9
+ import contextlib
10
+ import importlib.resources
11
+ import math
12
+ import os
13
+ import subprocess
14
+ import sys
15
+ import unittest
16
+ from datetime import timedelta
17
+ from typing import Generator
18
+
19
+ import cloudpickle
20
+
21
+ import torch
22
+ import torch.distributed as dist
23
+ import torch.nn.functional as F
24
+
25
+ from monarch._rust_bindings.hyperactor_extension.alloc import (
26
+ AllocConstraints,
27
+ AllocSpec,
28
+ )
29
+
30
+ from monarch._rust_bindings.monarch_hyperactor.channel import (
31
+ ChannelAddr,
32
+ ChannelTransport,
33
+ )
34
+ from monarch.actor_mesh import Actor, current_rank, current_size, endpoint, ValueMesh
35
+
36
+ from monarch.allocator import RemoteAllocator, StaticRemoteAllocInitializer
37
+ from monarch.proc_mesh import ProcMesh
38
+
39
+ from torch.distributed.elastic.utils.distributed import get_free_port
40
+
41
+ _100_MILLISECONDS = timedelta(milliseconds=100)
42
+
43
+
44
+ class TestActor(Actor):
45
+ """Silly actor that computes the world size by all-reducing rank-hot tensors"""
46
+
47
+ def __init__(self) -> None:
48
+ self.rank: int = current_rank().rank
49
+ self.world_size: int = math.prod(current_size().values())
50
+
51
+ @endpoint
52
+ async def compute_world_size(self, master_addr: str, master_port: int) -> int:
53
+ os.environ["MASTER_ADDR"] = master_addr
54
+ os.environ["MASTER_PORT"] = str(master_port)
55
+ dist.init_process_group("gloo", rank=self.rank, world_size=self.world_size)
56
+
57
+ try:
58
+ t = F.one_hot(torch.tensor(self.rank), num_classes=dist.get_world_size())
59
+ dist.all_reduce(t)
60
+ return int(torch.sum(t).item())
61
+ finally:
62
+ dist.destroy_process_group()
63
+
64
+
65
+ @contextlib.contextmanager
66
+ def remote_process_allocator() -> Generator[str, None, None]:
67
+ with importlib.resources.path(__package__, "") as package_path:
68
+ addr = ChannelAddr.any(ChannelTransport.Unix)
69
+
70
+ process_allocator = subprocess.Popen(
71
+ args=[
72
+ "process_allocator",
73
+ f"--addr={addr}",
74
+ ],
75
+ env={
76
+ # prefix PATH with this test module's directory to
77
+ # give 'process_allocator' and 'monarch_bootstrap' binary resources
78
+ # in this test module's directory precedence over the installed ones
79
+ # useful in BUCK where these binaries are added as 'resources' of this test target
80
+ "PATH": f"{package_path}:{os.getenv('PATH', '')}",
81
+ "RUST_LOG": "debug",
82
+ },
83
+ )
84
+ try:
85
+ yield addr
86
+ finally:
87
+ process_allocator.terminate()
88
+ try:
89
+ five_seconds = 5
90
+ process_allocator.wait(timeout=five_seconds)
91
+ except subprocess.TimeoutExpired:
92
+ process_allocator.kill()
93
+
94
+
95
+ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
96
+ @classmethod
97
+ def setUpClass(cls) -> None:
98
+ cloudpickle.register_pickle_by_value(sys.modules[TestActor.__module__])
99
+
100
+ @classmethod
101
+ def tearDownClass(cls) -> None:
102
+ cloudpickle.unregister_pickle_by_value(sys.modules[TestActor.__module__])
103
+
104
+ def assert_computed_world_size(
105
+ self, computed: ValueMesh[int], expected_world_size: int
106
+ ) -> None:
107
+ expected_world_sizes = {
108
+ rank: expected_world_size for rank in range(0, expected_world_size)
109
+ }
110
+ computed_world_sizes = {p.rank: v for p, v in list(computed.flatten("rank"))}
111
+ self.assertDictEqual(expected_world_sizes, computed_world_sizes)
112
+
113
+ async def test_call_allocate_twice(self) -> None:
114
+ class DeletingAllocInitializer(StaticRemoteAllocInitializer):
115
+ """test initializer that removes the last address from the list each time initialize_alloc() is called
116
+ used to test that the state of the initializer is preserved across calls to allocate()
117
+ """
118
+
119
+ async def initialize_alloc(self) -> list[str]:
120
+ alloc = await super().initialize_alloc()
121
+ self.addrs.pop(-1)
122
+ return alloc
123
+
124
+ with remote_process_allocator() as host1, remote_process_allocator() as host2:
125
+ initializer = DeletingAllocInitializer(host1, host2)
126
+
127
+ allocator = RemoteAllocator(
128
+ world_id="test_remote_allocator",
129
+ initializer=initializer,
130
+ heartbeat_interval=_100_MILLISECONDS,
131
+ )
132
+
133
+ spec = AllocSpec(AllocConstraints(), host=1, gpu=1)
134
+
135
+ await allocator.allocate(spec)
136
+ self.assertEqual([host1], initializer.addrs)
137
+
138
+ await allocator.allocate(spec)
139
+ self.assertEqual([], initializer.addrs)
140
+
141
+ async def test_throws_when_initializer_returns_empty_addrs(self) -> None:
142
+ class EmptyAllocInitializer(StaticRemoteAllocInitializer):
143
+ """test initializer that returns an empty list of addresses"""
144
+
145
+ async def initialize_alloc(self) -> list[str]:
146
+ return []
147
+
148
+ empty_initializer = EmptyAllocInitializer()
149
+ with self.assertRaisesRegex(
150
+ RuntimeError, r"initializer must return non-empty list of addresses"
151
+ ):
152
+ allocator = RemoteAllocator(
153
+ world_id="test_remote_allocator",
154
+ initializer=empty_initializer,
155
+ heartbeat_interval=_100_MILLISECONDS,
156
+ )
157
+ await allocator.allocate(AllocSpec(AllocConstraints(), host=1, gpu=1))
158
+
159
+ async def test_allocate_2d_mesh(self) -> None:
160
+ hosts = 2
161
+ gpus = 4
162
+ world_size = hosts * gpus
163
+ spec = AllocSpec(AllocConstraints(), host=hosts, gpu=gpus)
164
+
165
+ # create 2x process-allocators (on their own bind addresses) to simulate 2 hosts
166
+ with remote_process_allocator() as host1, remote_process_allocator() as host2:
167
+ allocator = RemoteAllocator(
168
+ world_id="test_remote_allocator",
169
+ initializer=StaticRemoteAllocInitializer(host1, host2),
170
+ heartbeat_interval=_100_MILLISECONDS,
171
+ )
172
+ alloc = await allocator.allocate(spec)
173
+ proc_mesh = await ProcMesh.from_alloc(alloc)
174
+ actor = await proc_mesh.spawn("test_actor", TestActor)
175
+
176
+ values = await actor.compute_world_size.call(
177
+ master_addr="::",
178
+ master_port=get_free_port(),
179
+ )
180
+
181
+ self.assert_computed_world_size(values, world_size)
182
+
183
+ async def test_stacked_1d_meshes(self) -> None:
184
+ # create two stacked actor meshes on the same host
185
+ # each actor mesh running on separate process-allocators
186
+
187
+ with remote_process_allocator() as host1_a, remote_process_allocator() as host1_b:
188
+ allocator_a = RemoteAllocator(
189
+ world_id="a",
190
+ initializer=StaticRemoteAllocInitializer(host1_a),
191
+ heartbeat_interval=_100_MILLISECONDS,
192
+ )
193
+ allocator_b = RemoteAllocator(
194
+ world_id="b",
195
+ initializer=StaticRemoteAllocInitializer(host1_b),
196
+ heartbeat_interval=_100_MILLISECONDS,
197
+ )
198
+
199
+ spec_a = AllocSpec(AllocConstraints(), host=1, gpu=2)
200
+ spec_b = AllocSpec(AllocConstraints(), host=1, gpu=6)
201
+
202
+ proc_mesh_a = await ProcMesh.from_alloc(await allocator_a.allocate(spec_a))
203
+ proc_mesh_b = await ProcMesh.from_alloc(await allocator_b.allocate(spec_b))
204
+
205
+ actor_a = await proc_mesh_a.spawn("actor_a", TestActor)
206
+ actor_b = await proc_mesh_b.spawn("actor_b", TestActor)
207
+
208
+ results_a = await actor_a.compute_world_size.call(
209
+ master_addr="::", master_port=get_free_port()
210
+ )
211
+ results_b = await actor_b.compute_world_size.call(
212
+ master_addr="::", master_port=get_free_port()
213
+ )
214
+
215
+ self.assert_computed_world_size(results_a, 2) # a is a 1x2 mesh
216
+ self.assert_computed_world_size(results_b, 6) # b is a 1x6 mesh
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: torchmonarch-nightly
3
- Version: 2025.6.10
3
+ Version: 2025.6.11
4
4
  Summary: Monarch: Single controller library
5
5
  Author: Meta
6
6
  Author-email: oncall+monarch@xmail.facebook.com
@@ -1,16 +1,16 @@
1
1
  monarch/__init__.py,sha256=iUvWHc0-7Q2tovRoRxOIiA3TsefMXCbWl-jEfQ2djew,6897
2
- monarch/_rust_bindings.so,sha256=0-svsKnUJboaOBd5i-LOfpHiRRAgVLX_1Hq_YYREQi8,39756680
2
+ monarch/_rust_bindings.so,sha256=g2tlum6iqfdR4KRkVhp_BwUmlz0tYUSITNVaJjSNitE,40645720
3
3
  monarch/_testing.py,sha256=MN8DK1e-wzV0-R_nFW1b_7-O5oKfWvZ12BMGD4Z7PQk,6755
4
- monarch/actor_mesh.py,sha256=AKdjPg3FM6Yt35uFPBnP7fNVEu6busu5BXVWLwjU2A4,23000
5
- monarch/allocator.py,sha256=_2DKFP9pSD33zDgH7xZJC8Tq7BQrCeQEUmMB7_xCT0Y,1784
6
- monarch/bootstrap_main.py,sha256=SYTOz-pTXiJNk78PPD5HAOJDSb8t2JfitRWdmWB3ogo,2559
4
+ monarch/actor_mesh.py,sha256=4I8xp_XIM6KZJY_jXVjJ8tPW2l1J4a6ZhrknU7zKbAk,23947
5
+ monarch/allocator.py,sha256=ylvYTf31o-PT385cYJPhi17uNbC4yl_RAraqD0fVe4g,4112
6
+ monarch/bootstrap_main.py,sha256=EYaTMA1lxy2213L_04drTKlJvZQjzNdD3jeUHiqSBJc,2578
7
7
  monarch/cached_remote_function.py,sha256=kYdB6r4OHx_T_uX4q3tCNcp1t2DJwF8tPTIahUiT2pU,8785
8
8
  monarch/fetch.py,sha256=61jxo7sx4QNUTkc0_rF5NaJROen4tKbAaiIjrXWLOvg,1705
9
9
  monarch/future.py,sha256=lcdFEe7m1shYPPuvZ1RkS6JUIChEKGBWe3v7x_nu4Hg,731
10
10
  monarch/gradient_generator.py,sha256=Rl3dmXGceTdCc1mYBg2JciR88ywGPnW7TVkL86KwqEA,6366
11
11
  monarch/memory.py,sha256=ol86dBhFAJqg78iF25-BuK0wuwj1onR8FIioZ_B0gjw,1377
12
12
  monarch/mesh_controller.py,sha256=A3G8Z5S0w3mjCVI2r6YGM6K3BUs3ZHU8PFo6kCaYTU4,8615
13
- monarch/monarch_controller,sha256=Q1eR_EVJqDQLrJZ_6p1ldxVDAU1OmN5lSSuctDcaAFY,20396832
13
+ monarch/monarch_controller,sha256=41B7zLv7M7_CSmChN5bfvVrygi2VeBhMDcNQXlnbVZU,20394376
14
14
  monarch/notebook.py,sha256=zu9MKDFKf1-rCM2TqFSRJjMBeiWuKcJSyUFLvoZRQzs,25949
15
15
  monarch/opaque_module.py,sha256=oajOu_WD1hD4hxE8HDdO-tvWY7KDHWd7VaAhJEa5L2I,10446
16
16
  monarch/opaque_object.py,sha256=IVpll4pyuKZMo_EnPh4s0qnx8RlAcJrJ1yoLX6E75wQ,2782
@@ -132,6 +132,7 @@ tests/error_test_binary.py,sha256=64H-ucdkQ2i7GD8sidStl227cOy7gyeqvO4kTm1y7Ic,48
132
132
  tests/sleep_binary.py,sha256=XfLYaAfwm9xgzM-svs8fhAeFhwYIg6SyVEnx4e6wbUw,1009
133
133
  tests/test_actor_error.py,sha256=z3Sf4lteUggTryPLOhRKJ55v0MwVK3a7QN7-U2U9iJg,7484
134
134
  tests/test_alloc.py,sha256=D6DdQbtOZEvvnnc7LV-WyWFMk0Xb77eblH6Oz90zJTA,745
135
+ tests/test_allocator.py,sha256=dqQbQyOjOX3JgnHIPT0iawT0wMeFztbLCYjK2tl8GcI,8149
135
136
  tests/test_coalescing.py,sha256=-KtAWzTaeXbyzltplfojavx0iFeeZnvej-tFTlu2p5k,15616
136
137
  tests/test_controller.py,sha256=yxuVp2DG3TDKJlwuE3cFm9dbWMlbrYtG1uHfvVWRYbw,30935
137
138
  tests/test_device_mesh.py,sha256=DrbezYOM0thfP9MgLXb5-F0VoLOmSz5GR0GwjR_3bE4,5290
@@ -150,9 +151,9 @@ tests/simulator/test_profiling.py,sha256=TGYCfzTLdkpIwnOuO6KApprmrgPIRQe60KRX3wk
150
151
  tests/simulator/test_simulator.py,sha256=LO8lA0ssY-OGEBL5ipEu74f97Y765TEwfUOv-DtIptM,14568
151
152
  tests/simulator/test_task.py,sha256=ipqBDuDAysuo1xOB9S5psaFvwe6VATD43IovCTSs0t4,2327
152
153
  tests/simulator/test_worker.py,sha256=QrWWIJ3HDgDLkBPRc2mwYPlOQoXQcj1qRfc0WUfKkFY,3507
153
- torchmonarch_nightly-2025.6.10.dist-info/licenses/LICENSE,sha256=e0Eotbf_rHOYPuEUlppIbvwy4SN98CZnl_hqwvbDA4Q,1530
154
- torchmonarch_nightly-2025.6.10.dist-info/METADATA,sha256=DR1GtSFqtqsjhKWi38uGcvhw2p3ycHYSOwDmsErwLj0,2772
155
- torchmonarch_nightly-2025.6.10.dist-info/WHEEL,sha256=_wZSFk0d90K9wOBp8Q-UGxshyiJ987JoPiyUBNC6VLk,104
156
- torchmonarch_nightly-2025.6.10.dist-info/entry_points.txt,sha256=sqfQ16oZqjEvttUI-uj9BBXIIE6jt05bYFSmy-2hyXI,106
157
- torchmonarch_nightly-2025.6.10.dist-info/top_level.txt,sha256=E-ZssZzyM17glpVrh-S9--qJ-w9p2EjuYOuNw9tQ4Eg,33
158
- torchmonarch_nightly-2025.6.10.dist-info/RECORD,,
154
+ torchmonarch_nightly-2025.6.11.dist-info/licenses/LICENSE,sha256=e0Eotbf_rHOYPuEUlppIbvwy4SN98CZnl_hqwvbDA4Q,1530
155
+ torchmonarch_nightly-2025.6.11.dist-info/METADATA,sha256=SCdAxETtVZ5ESzbLepOp6mf1L4G-HSYVkjdRFT7D0kg,2772
156
+ torchmonarch_nightly-2025.6.11.dist-info/WHEEL,sha256=_wZSFk0d90K9wOBp8Q-UGxshyiJ987JoPiyUBNC6VLk,104
157
+ torchmonarch_nightly-2025.6.11.dist-info/entry_points.txt,sha256=sqfQ16oZqjEvttUI-uj9BBXIIE6jt05bYFSmy-2hyXI,106
158
+ torchmonarch_nightly-2025.6.11.dist-info/top_level.txt,sha256=E-ZssZzyM17glpVrh-S9--qJ-w9p2EjuYOuNw9tQ4Eg,33
159
+ torchmonarch_nightly-2025.6.11.dist-info/RECORD,,