torchmonarch-nightly 2025.6.30__cp310-cp310-manylinux2014_x86_64.whl → 2025.7.25__cp310-cp310-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. monarch/__init__.py +13 -9
  2. monarch/_rust_bindings.so +0 -0
  3. monarch/{_monarch/selection → _src/actor}/__init__.py +3 -7
  4. monarch/_src/actor/actor_mesh.py +874 -0
  5. monarch/{allocator.py → _src/actor/allocator.py} +26 -17
  6. monarch/_src/actor/bootstrap_main.py +73 -0
  7. monarch/{code_sync.py → _src/actor/code_sync/__init__.py} +3 -1
  8. monarch/_src/actor/code_sync/auto_reload.py +223 -0
  9. monarch/_src/actor/debugger.py +565 -0
  10. monarch/_src/actor/endpoint.py +270 -0
  11. monarch/_src/actor/event_loop.py +97 -0
  12. monarch/_src/actor/future.py +100 -0
  13. monarch/{pdb_wrapper.py → _src/actor/pdb_wrapper.py} +47 -46
  14. monarch/{common/pickle_flatten.py → _src/actor/pickle.py} +26 -2
  15. monarch/_src/actor/proc_mesh.py +500 -0
  16. monarch/_src/actor/sync_state.py +18 -0
  17. monarch/{telemetry.py → _src/actor/telemetry/__init__.py} +1 -1
  18. monarch/_src/actor/telemetry/rust_span_tracing.py +159 -0
  19. monarch/_src/actor/tensor_engine_shim.py +56 -0
  20. monarch/_src/tensor_engine/rdma.py +180 -0
  21. monarch/_testing.py +3 -2
  22. monarch/actor/__init__.py +51 -0
  23. monarch/actor_mesh.py +6 -752
  24. monarch/bootstrap_main.py +8 -47
  25. monarch/common/client.py +1 -1
  26. monarch/common/controller_api.py +2 -1
  27. monarch/common/device_mesh.py +12 -2
  28. monarch/common/messages.py +12 -1
  29. monarch/common/recording.py +4 -3
  30. monarch/common/remote.py +135 -52
  31. monarch/common/tensor.py +2 -1
  32. monarch/controller/backend.py +2 -2
  33. monarch/controller/controller.py +2 -1
  34. monarch/controller/rust_backend/controller.py +2 -1
  35. monarch/fetch.py +3 -5
  36. monarch/mesh_controller.py +201 -139
  37. monarch/monarch_controller +0 -0
  38. monarch/opaque_module.py +4 -6
  39. monarch/opaque_object.py +3 -3
  40. monarch/proc_mesh.py +6 -309
  41. monarch/python_local_mesh.py +1 -1
  42. monarch/rust_backend_mesh.py +2 -1
  43. monarch/rust_local_mesh.py +4 -2
  44. monarch/sim_mesh.py +10 -19
  45. monarch/simulator/command_history.py +1 -1
  46. monarch/simulator/interface.py +2 -1
  47. monarch/simulator/mock_controller.py +1 -1
  48. monarch/simulator/simulator.py +1 -1
  49. monarch/tensor_engine/__init__.py +23 -0
  50. monarch/tensor_worker_main.py +3 -1
  51. monarch/tools/cli.py +3 -1
  52. monarch/tools/commands.py +95 -35
  53. monarch/tools/mesh_spec.py +55 -0
  54. monarch/tools/utils.py +38 -0
  55. monarch/worker/worker.py +1 -1
  56. monarch/world_mesh.py +2 -1
  57. monarch_supervisor/python_executable.py +6 -3
  58. tests/error_test_binary.py +75 -9
  59. tests/test_actor_error.py +370 -21
  60. tests/test_alloc.py +1 -1
  61. tests/test_allocator.py +373 -17
  62. tests/test_controller.py +2 -0
  63. tests/test_debugger.py +416 -0
  64. tests/test_env_before_cuda.py +162 -0
  65. tests/test_python_actors.py +184 -332
  66. tests/test_rdma.py +198 -0
  67. tests/test_remote_functions.py +40 -12
  68. tests/test_rust_backend.py +7 -5
  69. tests/test_sim_backend.py +1 -4
  70. tests/test_tensor_engine.py +55 -1
  71. {torchmonarch_nightly-2025.6.30.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/METADATA +6 -1
  72. {torchmonarch_nightly-2025.6.30.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/RECORD +80 -68
  73. torchmonarch_nightly-2025.7.25.dist-info/entry_points.txt +3 -0
  74. monarch/_monarch/hyperactor/__init__.py +0 -58
  75. monarch/_monarch/worker/debugger.py +0 -117
  76. monarch/_monarch/worker/logging.py +0 -107
  77. monarch/debugger.py +0 -379
  78. monarch/future.py +0 -76
  79. monarch/rdma.py +0 -162
  80. torchmonarch_nightly-2025.6.30.dist-info/entry_points.txt +0 -3
  81. /monarch/{_monarch/worker → _src}/__init__.py +0 -0
  82. /monarch/{common/_device_utils.py → _src/actor/device_utils.py} +0 -0
  83. /monarch/{common → _src/actor}/shape.py +0 -0
  84. /monarch/{_monarch → _src/tensor_engine}/__init__.py +0 -0
  85. {torchmonarch_nightly-2025.6.30.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/WHEEL +0 -0
  86. {torchmonarch_nightly-2025.6.30.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/licenses/LICENSE +0 -0
  87. {torchmonarch_nightly-2025.6.30.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/top_level.txt +0 -0
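
The bulk of the moves above relocate the actor runtime into monarch/_src/actor/ and expose it through the new monarch/actor package (entry 22), while the old top-level modules (actor_mesh.py, proc_mesh.py, rdma.py, debugger.py, future.py) shrink to thin shims or disappear. A minimal sketch of the new import surface, assuming only the API exercised by the added tests below (the Echo actor and its method are illustrative, not part of the package):

from monarch.actor import Actor, endpoint, proc_mesh


class Echo(Actor):
    # illustrative actor: any subclass of Actor with @endpoint methods works
    @endpoint
    async def echo(self, x):
        return x


async def main():
    procs = await proc_mesh(gpus=1)           # allocate a process mesh
    echo = await procs.spawn("echo", Echo)    # spawn the actor on the mesh
    assert await echo.echo.call_one("hi") == "hi"   # invoke a single instance
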
tests/test_rdma.py ADDED
@@ -0,0 +1,198 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import pytest
+
+ import torch
+ from monarch.actor import Actor, current_rank, endpoint, proc_mesh
+ from monarch.tensor_engine import is_available as rdma_available, RDMABuffer
+
+
+ needs_cuda = pytest.mark.skipif(
+     not torch.cuda.is_available(),
+     reason="CUDA not available",
+ )
+ needs_rdma = pytest.mark.skipif(
+     not rdma_available(),
+     reason="RDMA not available",
+ )
+
+
+ class ParameterServer(Actor):
+     def __init__(self):
+         self.params = torch.rand(10, 10)
+         self.grad_buffer = torch.rand(10, 10)
+
+     @endpoint
+     async def grad_handle(self) -> RDMABuffer:
+         byte_tensor = self.grad_buffer.view(torch.uint8).flatten()
+         buffer = RDMABuffer(byte_tensor)
+         return buffer
+
+     @endpoint
+     async def update(self):
+         self.params += 0.01 * self.grad_buffer
+
+     @endpoint
+     async def get_grad_buffer(self) -> torch.Tensor:
+         # just used for testing
+         return self.grad_buffer
+
+
+ class ParameterClient(Actor):
+     def __init__(self, server, buffer):
+         self.server = server
+         byte_tensor = buffer.view(torch.uint8).flatten()
+         self.buffer = byte_tensor
+
+     @endpoint
+     async def upload(self, tensor):
+         gh = await self.server.grad_handle.call_one()
+         await gh.write_from(tensor)
+
+     @endpoint
+     async def download(self):
+         gh = await self.server.grad_handle.call_one()
+         await gh.read_into(self.buffer)
+
+     @endpoint
+     async def get_buffer(self):
+         return self.buffer
+
+
+ @needs_rdma
+ @needs_cuda
+ async def test_proc_mesh_rdma():
+     proc = await proc_mesh(gpus=1)
+     server = await proc.spawn("server", ParameterServer)
+
+     # --- CPU TESTS ---
+     client_cpu = await proc.spawn(
+         "client_cpu", ParameterClient, server, torch.ones(10, 10)
+     )
+     x = await client_cpu.get_buffer.call_one()
+     assert torch.sum(x.view(torch.float32).view(10, 10)) == 100
+     zeros = torch.zeros(10, 10)
+     await client_cpu.upload.call_one(zeros.view(torch.uint8).flatten())
+     await client_cpu.download.call_one()
+     x = await client_cpu.get_buffer.call_one()
+     assert torch.sum(x.view(torch.float32).view(10, 10)) == 0
+
+     # --- Modify server's backing buffer directly ---
+     await server.update.call_one()
+
+     # Should reflect updated values
+     await client_cpu.download.call_one()
+
+     buffer = await client_cpu.get_buffer.call_one()
+     remote_grad = await server.get_grad_buffer.call_one()
+     assert torch.allclose(buffer.view(torch.float32).view(10, 10), remote_grad)
+
+     # --- GPU TESTS ---
+     client_gpu = await proc.spawn(
+         "client_gpu", ParameterClient, server, torch.ones(10, 10, device="cuda")
+     )
+     x = await client_gpu.get_buffer.call_one()
+     buffer = x.view(torch.float32).view(10, 10)
+     assert torch.sum(buffer) == 100
+     zeros = torch.zeros(10, 10, device="cuda")
+     await client_gpu.upload.call_one(zeros.view(torch.uint8).flatten())
+     await client_gpu.download.call_one()
+     x = await client_gpu.get_buffer.call_one()
+     buffer_gpu = x.view(torch.float32).view(10, 10)
+     assert torch.sum(buffer_gpu) == 0
+     # copying a tensor across hosts moves it to CPU
+     assert buffer_gpu.device.type == "cpu"
+
+     # Modify server state again
+     await server.update.call_one()
+     await client_gpu.download.call_one()
+     x = await client_gpu.get_buffer.call_one()
+     buffer_gpu = x.view(torch.float32).view(10, 10)
+     remote_grad = await server.get_grad_buffer.call_one()
+     assert torch.allclose(buffer_gpu.cpu(), remote_grad)
+
+
+ class TrainerActor(Actor):
+     def __init__(self):
+         super().__init__()
+         # TODO - switch to CUDA once GPU support is added
+         self.trainer = torch.nn.Linear(10, 10).to("cpu")
+         self.trainer.weight.data.zero_()
+
+     @endpoint
+     async def init(self, gen):
+         ranks = current_rank()
+         self.gen = gen.slice(**ranks)
+
+     @endpoint
+     async def exchange_metadata(self):
+         byte_tensor = self.trainer.weight.data.view(torch.uint8).flatten()
+         self.handle = RDMABuffer(byte_tensor)
+         await self.gen.attach_weight_buffer.call(self.handle)
+
+     @endpoint
+     async def weights_ready(self):
+         self.trainer.weight.data.add_(1.0)
+
+
+ class GeneratorActor(Actor):
+     def __init__(self):
+         super().__init__()
+         self.generator = torch.nn.Linear(10, 10).to("cuda")
+         self.step = 0
+
+     @endpoint
+     async def init(self, trainer):
+         ranks = current_rank()
+         self.trainer = trainer.slice(**ranks)
+
+     @endpoint
+     async def attach_weight_buffer(self, handle):
+         self.handle = handle
+
+     @endpoint
+     async def update_weights(self):
+         self.step += 1
+         byte_tensor = self.generator.weight.data.view(torch.uint8).flatten()
+         await self.handle.read_into(byte_tensor)
+         assert (
+             torch.sum(self.generator.weight.data) == self.step * 100
+         ), f"{torch.sum(self.generator.weight.data)=}, {self.step=}"
+
+
+ @needs_rdma
+ @needs_cuda
+ async def test_gpu_trainer_generator():
+     trainer_proc = await proc_mesh(gpus=1)
+     gen_proc = await proc_mesh(gpus=1)
+     trainer = await trainer_proc.spawn("trainer", TrainerActor)
+     generator = await gen_proc.spawn("gen", GeneratorActor)
+
+     await generator.init.call(trainer)
+     await trainer.init.call(generator)
+     await trainer.exchange_metadata.call()
+
+     for _ in range(3):
+         await trainer.weights_ready.call()
+         await generator.update_weights.call()
+
+
+ @needs_rdma
+ @needs_cuda
+ def test_gpu_trainer_generator_sync() -> None:
+     trainer_proc = proc_mesh(gpus=1).get()
+     gen_proc = proc_mesh(gpus=1).get()
+     trainer = trainer_proc.spawn("trainer", TrainerActor).get()
+     generator = gen_proc.spawn("gen", GeneratorActor).get()
+
+     generator.init.call(trainer).get()
+     trainer.init.call(generator).get()
+     trainer.exchange_metadata.call().get()
+
+     for _ in range(1):
+         trainer.weights_ready.call().get()
+         generator.update_weights.call().get()
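
The pattern the new test exercises: the owning actor wraps a flattened uint8 view of its tensor in an RDMABuffer, hands the handle to a peer over an endpoint call, and the peer moves bytes with read_into / write_from. A condensed sketch of that handshake, assuming the same API as the test (variable names are illustrative):

# owner side: register the gradient bytes for RDMA and return the handle
grad_buffer = torch.rand(10, 10)
handle = RDMABuffer(grad_buffer.view(torch.uint8).flatten())

# peer side: given `handle` received from an endpoint call
local = torch.zeros(10, 10).view(torch.uint8).flatten()
await handle.read_into(local)    # pull the owner's current bytes
await handle.write_from(local)   # push bytes back into the owner's buffer
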
tests/test_remote_functions.py CHANGED
@@ -9,7 +9,6 @@ import itertools
  import math
  import sys
  import traceback
- from enum import Enum
  from typing import Callable, ContextManager, Tuple
  from unittest.mock import patch

@@ -25,16 +24,18 @@ from monarch import (
      Pipe,
      remote,
      remote_generator,
-     RemoteException,
+     RemoteException as OldRemoteException,
      Stream,
  )
+
  from monarch._testing import BackendType, TestingContext
  from monarch.builtins.log import log_remote
  from monarch.builtins.random import set_manual_seed_remote
  from monarch.cached_remote_function import remote_autograd_function
  from monarch.common import remote as remote_module
  from monarch.common.device_mesh import DeviceMesh
- from monarch.common.remote import Remote
+ from monarch.common.remote import call_on_shard_and_fetch, Remote
+ from monarch.mesh_controller import RemoteException as NewRemoteException

  from monarch.opaque_module import OpaqueModule
  from monarch.opaque_object import opaque_method, OpaqueObject
@@ -57,6 +58,8 @@ from monarch.worker._testing_function import (
  from monarch_supervisor.logging import fix_exception_lines
  from torch.distributed import ReduceOp

+ RemoteException = (NewRemoteException, OldRemoteException)
+

  def custom_excepthook(exc_type, exc_value, exc_traceback):
      tb_lines = fix_exception_lines(
@@ -181,7 +184,9 @@ class RemoteFunctionsTestBase:
  # out is not counted as a failure, so we set a more restrictive timeout to
  # ensure we see a hard failure in CI.
  @pytest.mark.timeout(120)
- @pytest.mark.parametrize("backend_type", [BackendType.PY, BackendType.RS])
+ @pytest.mark.parametrize(
+     "backend_type", [BackendType.PY, BackendType.RS, BackendType.MESH]
+ )
  class TestRemoteFunctions(RemoteFunctionsTestBase):
      @classmethod
      def do_test_reduce_scatter_tensor(cls, backend_type, reduce_op, expected_tensor):
@@ -326,7 +331,7 @@ class TestRemoteFunctions(RemoteFunctionsTestBase):
              _ = fetch_shard(a).result(timeout=40)

      def test_set_device_inside_udf_fails_with_explanation(self, backend_type):
-         if backend_type == BackendType.PY:
+         if backend_type != BackendType.RS:
              pytest.skip("Python support not planned for this test")
          with self.local_device_mesh(2, 2, backend_type):
              t = set_device_udf(2)
@@ -628,11 +633,10 @@ class TestRemoteFunctions(RemoteFunctionsTestBase):
          with self.local_device_mesh(2, 2, backend_type):
              assert (
                  "an argument processed"
-                 == remote("monarch.worker._testing_function.do_some_processing")
-                 .call_on_shard_and_fetch(
+                 == call_on_shard_and_fetch(
+                     remote("monarch.worker._testing_function.do_some_processing"),
                      "an argument",
-                 )
-                 .result()
+                 ).result()
              )

      def test_cached_remote_function(self, backend_type):
@@ -727,7 +731,7 @@ class TestRemoteFunctions(RemoteFunctionsTestBase):

          with self.local_device_mesh(2, 2, backend_type):
              a = torch.ones(())
-             assert check.call_on_shard_and_fetch(bar(a, a)).result()
+             assert call_on_shard_and_fetch(check, bar(a, a)).result()
              # ensure we do not attempt to pickle closures
              close()

@@ -770,7 +774,7 @@ class TestRemoteFunctions(RemoteFunctionsTestBase):

          with self.local_device_mesh(1, 1, backend_type):
              # This should be a valid return than an exception to raise
-             simple.call_on_shard_and_fetch().result()
+             call_on_shard_and_fetch(simple).result()

      def test_opaque_object(self, backend_type):
          with self.local_device_mesh(2, 2, backend_type):
@@ -948,10 +952,13 @@ class TestRemoteFunctions(RemoteFunctionsTestBase):
              x = outer_remote_function_that_calls_inner()
              try:
                  inspect(x)
-             except RemoteException as e:
+             except OldRemoteException as e:
                  backtrace = "\n".join([frame.name for frame in e.worker_frames])
                  assert "outer_remote_function" in backtrace
                  assert "inner_remote_function" in backtrace
+             except NewRemoteException as e:
+                 assert "outer_remote_function" in e.worker_error_string
+                 assert "inner_remote_function" in e.worker_error_string

      def test_remote_function_broadcast(self, backend_type):
          with self.local_device_mesh(2, 2, backend_type) as device_mesh:
@@ -1269,3 +1276,24 @@ def a_function_called_by_a_live_function(x):

  def a_live_function_call_by_a_live_function(x):
      return 3 * x
+
+
+ @remote
+ def return_them(x: torch.Tensor, y: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+     return (x, y)
+
+
+ @pytest.mark.skipif(
+     torch.cuda.device_count() < 2,
+     reason="Not enough GPUs, this test requires at least 2 GPUs",
+ )
+ class TestMeshSpecific(RemoteFunctionsTestBase):
+     def test_value_mesh(self):
+         with self.local_device_mesh(2, 2, "mesh") as device_mesh:
+             x = device_mesh.rank("host")
+             y = device_mesh.rank("gpu")
+             r = return_them.call(x, y).get()
+
+             for p, (h, g) in r:
+                 assert p["host"] == h.item()
+                 assert p["gpu"] == g.item()
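
The recurring edit in the hunks above is an API move: call_on_shard_and_fetch is no longer a method on a remote function but a free function imported from monarch.common.remote that takes the remote as its first argument. A before/after sketch, assuming only the signatures visible in this diff:

from monarch import remote
from monarch.common.remote import call_on_shard_and_fetch

do_it = remote("monarch.worker._testing_function.do_some_processing")

# 2025.6.30 style (removed):
#   result = do_it.call_on_shard_and_fetch("an argument").result()
# 2025.7.25 style:
result = call_on_shard_and_fetch(do_it, "an argument").result()
# keyword arguments such as shard={"host": 0, "gpu": 0} pass through unchanged
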
tests/test_rust_backend.py CHANGED
@@ -17,6 +17,7 @@ import torch
  import torch.utils._python_dispatch
  from monarch import fetch_shard, no_mesh, remote, Stream
  from monarch.common.device_mesh import DeviceMesh
+ from monarch.common.remote import call_on_shard_and_fetch
  from monarch.rust_local_mesh import local_meshes, LoggingLocation, SocketType
  from torch.nn.attention import sdpa_kernel, SDPBackend
  from torch.nn.functional import scaled_dot_product_attention
@@ -111,9 +112,10 @@ class TestRustBackend(TestCase):
          with local_mesh():
              assert (
                  "an argument processed"
-                 == remote("monarch.worker._testing_function.do_some_processing")
-                 .call_on_shard_and_fetch("an argument")
-                 .result()
+                 == call_on_shard_and_fetch(
+                     remote("monarch.worker._testing_function.do_some_processing"),
+                     "an argument",
+                 ).result()
              )

      def test_brutal_shutdown(self):
@@ -143,8 +145,8 @@ class TestRustBackend(TestCase):
              return torch.isnan(t).any().item()

          t = torch.rand(3, 4)
-         res = has_nan.call_on_shard_and_fetch(
-             t, shard={"host": 0, "gpu": 0}
+         res = call_on_shard_and_fetch(
+             has_nan, t, shard={"host": 0, "gpu": 0}
          ).result()

          self.assertFalse(res)
tests/test_sim_backend.py CHANGED
@@ -24,11 +24,8 @@ def local_sim_mesh(
      # TODO: support multiple gpus in a mesh.
      gpu_per_host: int = 1,
      activate: bool = True,
-     proxy_addr: Optional[str] = None,
  ) -> Generator[DeviceMesh, None, None]:
-     dms = sim_mesh(
-         n_meshes=1, hosts=hosts, gpus_per_host=gpu_per_host, proxy_addr=proxy_addr
-     )
+     dms = sim_mesh(n_meshes=1, hosts=hosts, gpus_per_host=gpu_per_host)
      dm = dms[0]
      try:
          if activate:
tests/test_tensor_engine.py CHANGED
@@ -7,8 +7,9 @@
  import monarch
  import pytest
  import torch
+ from monarch import remote
+ from monarch.actor import Actor, endpoint, proc_mesh
  from monarch.mesh_controller import spawn_tensor_engine
- from monarch.proc_mesh import proc_mesh


  two_gpu = pytest.mark.skipif(
@@ -32,6 +33,14 @@ def test_tensor_engine() -> None:
      assert torch.allclose(torch.zeros(3, 4), r)
      assert torch.allclose(torch.zeros(3, 4), f)

+     @remote(propagate=lambda x: x)
+     def nope(x):
+         raise ValueError("nope")
+
+     with pytest.raises(monarch.mesh_controller.RemoteException):
+         with dm.activate():
+             monarch.inspect(nope(torch.zeros(3, 4)))
+
      dm.exit()


@@ -50,3 +59,48 @@ def test_proc_mesh_tensor_engine() -> None:
      assert a == 0
      assert b == 10
      assert c == 100
+
+
+ class AddWithState(Actor):
+     def __init__(self, state: torch.Tensor):
+         super().__init__()
+         self.state = state
+
+     @endpoint
+     def forward(self, x) -> torch.Tensor:
+         return x + self.state
+
+
+ @two_gpu
+ def test_actor_with_tensors() -> None:
+     pm = proc_mesh(gpus=1).get()
+     with pm.activate():
+         x = pm.spawn("adder", AddWithState, torch.ones(())).get()
+         y = torch.ones(())
+         assert x.forward.call(y).get(timeout=5).item(hosts=0, gpus=0).item() == 2
+
+
+ class Counter(Actor):
+     def __init__(self):
+         super().__init__()
+         self.c = 0
+
+     @endpoint
+     def incr(self, x) -> int:
+         self.c += 1
+         return self.c - 1
+
+
+ @two_gpu
+ def test_actor_tensor_ordering() -> None:
+     pm = proc_mesh(gpus=1).get()
+     with pm.activate():
+         counter = pm.spawn("a", Counter).get()
+         results = []
+         for _ in range(0, 10, 2):
+             # tensor engine call
+             results.append(counter.incr.call(torch.ones(())))
+             # non-tensor engine call
+             results.append(counter.incr.call(1))
+
+         assert list(range(10)) == [r.get().item(hosts=0, gpus=0) for r in results]
{torchmonarch_nightly-2025.6.30.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: torchmonarch-nightly
- Version: 2025.6.30
+ Version: 2025.7.25
  Summary: Monarch: Single controller library
  Author: Meta
  Author-email: oncall+monarch@xmail.facebook.com
@@ -15,6 +15,8 @@ Requires-Dist: numpy
  Requires-Dist: pyre-extensions
  Requires-Dist: cloudpickle
  Requires-Dist: torchx-nightly
+ Requires-Dist: lark
+ Requires-Dist: tabulate
  Dynamic: author
  Dynamic: author-email
  Dynamic: description
@@ -69,6 +71,9 @@ sudo dnf install clang-devel libnccl-devel
  conda install -c conda-forge clangdev nccl
  conda update -n monarchenv --all -c conda-forge -y

+ # If you are building with RDMA support, build monarch with `USE_TENSOR_ENGINE=1 pip install --no-build-isolation .` and dnf install the following packages
+ sudo dnf install -y libibverbs rdma-core libmlx5 libibverbs-devel rdma-core-devel
+
  # Install build dependencies
  pip install -r build-requirements.txt
  # Install test dependencies