torchmonarch-nightly 2025.7.29__cp311-cp311-manylinux2014_x86_64.whl → 2025.7.31__cp311-cp311-manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- monarch/_rust_bindings.so +0 -0
- monarch/_src/actor/actor_mesh.py +9 -5
- monarch/_src/actor/allocator.py +5 -6
- monarch/_src/actor/debugger.py +159 -98
- monarch/_src/actor/endpoint.py +15 -4
- monarch/_src/actor/future.py +79 -32
- monarch/_src/actor/pdb_wrapper.py +10 -4
- monarch/_src/actor/proc_mesh.py +82 -114
- monarch/_src/actor/shape.py +32 -38
- monarch/_src/tensor_engine/rdma.py +12 -6
- monarch/gradient/_gradient_generator.so +0 -0
- monarch/mesh_controller.py +37 -4
- monarch/monarch_controller +0 -0
- tests/test_actor_error.py +3 -4
- tests/test_actor_shape.py +114 -0
- tests/test_allocator.py +34 -9
- tests/test_debugger.py +406 -178
- tests/test_python_actors.py +67 -67
- {torchmonarch_nightly-2025.7.29.dist-info → torchmonarch_nightly-2025.7.31.dist-info}/METADATA +1 -1
- {torchmonarch_nightly-2025.7.29.dist-info → torchmonarch_nightly-2025.7.31.dist-info}/RECORD +24 -23
- {torchmonarch_nightly-2025.7.29.dist-info → torchmonarch_nightly-2025.7.31.dist-info}/WHEEL +0 -0
- {torchmonarch_nightly-2025.7.29.dist-info → torchmonarch_nightly-2025.7.31.dist-info}/entry_points.txt +0 -0
- {torchmonarch_nightly-2025.7.29.dist-info → torchmonarch_nightly-2025.7.31.dist-info}/licenses/LICENSE +0 -0
- {torchmonarch_nightly-2025.7.29.dist-info → torchmonarch_nightly-2025.7.31.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,114 @@
|
|
1
|
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
2
|
+
# All rights reserved.
|
3
|
+
#
|
4
|
+
# This source code is licensed under the BSD-style license found in the
|
5
|
+
# LICENSE file in the root directory of this source tree.
|
6
|
+
|
7
|
+
# pyre-strict
|
8
|
+
|
9
|
+
from unittest import TestCase
|
10
|
+
|
11
|
+
from monarch._rust_bindings.monarch_hyperactor.shape import Shape, Slice
|
12
|
+
from monarch._src.actor.shape import ShapeExt
|
13
|
+
|
14
|
+
|
15
|
+
class TestShapeSlicing(TestCase):
|
16
|
+
def test_shape_at_removes_dimension(self) -> None:
|
17
|
+
"""Test that at() removes dimensions and updates offset
|
18
|
+
correctly."""
|
19
|
+
|
20
|
+
slice_obj = Slice(offset=0, sizes=[2, 3, 4], strides=[12, 4, 1])
|
21
|
+
shape = Shape(["batch", "height", "width"], slice_obj)
|
22
|
+
|
23
|
+
# Test removing first dimension
|
24
|
+
result = shape.at("batch", 1)
|
25
|
+
self.assertEqual(result.labels, ["height", "width"])
|
26
|
+
self.assertEqual(result.ndslice.sizes, [3, 4])
|
27
|
+
self.assertEqual(result.ndslice.strides, [4, 1])
|
28
|
+
self.assertEqual(result.ndslice.offset, 12) # 1 * 12
|
29
|
+
|
30
|
+
# Test removing middle dimension
|
31
|
+
result = shape.at("height", 2)
|
32
|
+
self.assertEqual(result.labels, ["batch", "width"])
|
33
|
+
self.assertEqual(result.ndslice.sizes, [2, 4])
|
34
|
+
self.assertEqual(result.ndslice.strides, [12, 1])
|
35
|
+
self.assertEqual(result.ndslice.offset, 8) # 2 * 4
|
36
|
+
|
37
|
+
# Test removing last dimension
|
38
|
+
result = shape.at("width", 3)
|
39
|
+
self.assertEqual(result.labels, ["batch", "height"])
|
40
|
+
self.assertEqual(result.ndslice.sizes, [2, 3])
|
41
|
+
self.assertEqual(result.ndslice.strides, [12, 4])
|
42
|
+
self.assertEqual(result.ndslice.offset, 3) # 3 * 1
|
43
|
+
|
44
|
+
def test_shape_select_keeps_dimension(self) -> None:
|
45
|
+
"""Test that select() keeps dimensions but changes sizes."""
|
46
|
+
|
47
|
+
slice_obj = Slice.new_row_major([4, 6])
|
48
|
+
shape = Shape(["rows", "cols"], slice_obj)
|
49
|
+
|
50
|
+
# Test range selection
|
51
|
+
result = shape.select("rows", slice(1, 3))
|
52
|
+
self.assertEqual(result.labels, ["rows", "cols"])
|
53
|
+
self.assertEqual(result.ndslice.sizes, [2, 6]) # 3-1=2 rows
|
54
|
+
self.assertEqual(result.ndslice.offset, 6) # 1 * 6
|
55
|
+
|
56
|
+
# Test step selection
|
57
|
+
result = shape.select("cols", slice(0, 6, 2))
|
58
|
+
self.assertEqual(result.labels, ["rows", "cols"])
|
59
|
+
self.assertEqual(result.ndslice.sizes, [4, 3]) # every 2nd col = 3 cols
|
60
|
+
self.assertEqual(result.ndslice.strides, [6, 2]) # stride becomes 2
|
61
|
+
|
62
|
+
def test_shape_slice_mixed_operations(self) -> None:
|
63
|
+
"""Test mixing at() and select() operations."""
|
64
|
+
|
65
|
+
slice_obj = Slice.new_row_major([2, 3, 4])
|
66
|
+
shape = Shape(["batch", "height", "width"], slice_obj)
|
67
|
+
|
68
|
+
# Chain operations: select then at
|
69
|
+
result = shape.select("width", slice(1, 4)).at("batch", 0)
|
70
|
+
self.assertEqual(result.labels, ["height", "width"])
|
71
|
+
self.assertEqual(result.ndslice.sizes, [3, 3])
|
72
|
+
|
73
|
+
# Chain operations: at then select
|
74
|
+
result = shape.at("height", 1).select("width", slice(2, 4))
|
75
|
+
self.assertEqual(result.labels, ["batch", "width"])
|
76
|
+
self.assertEqual(result.ndslice.sizes, [2, 2])
|
77
|
+
|
78
|
+
def test_shape_slice_errors(self) -> None:
|
79
|
+
"""Test error conditions."""
|
80
|
+
slice_obj = Slice.new_row_major([2, 3])
|
81
|
+
shape = Shape(["rows", "cols"], slice_obj)
|
82
|
+
|
83
|
+
# Test invalid label
|
84
|
+
with self.assertRaises(ValueError):
|
85
|
+
shape.at("nonexistent", 0)
|
86
|
+
|
87
|
+
# Test index out of range
|
88
|
+
with self.assertRaises(ValueError):
|
89
|
+
shape.at("rows", 5)
|
90
|
+
|
91
|
+
# Test negative index (Python-Rust boundary issue)
|
92
|
+
with self.assertRaises(OverflowError): # Changed from ValueError
|
93
|
+
shape.at("rows", -1)
|
94
|
+
|
95
|
+
def test_shape_slice_comprehensive(self) -> None:
|
96
|
+
"""Comprehensive test of slice() method."""
|
97
|
+
|
98
|
+
slice_obj = Slice.new_row_major([4, 5, 6])
|
99
|
+
shape = Shape(["a", "b", "c"], slice_obj)
|
100
|
+
|
101
|
+
# Test integer selection (removes dimensions)
|
102
|
+
result = ShapeExt.slice(shape, a=1, c=2)
|
103
|
+
self.assertEqual(result.labels, ["b"])
|
104
|
+
self.assertEqual(result.ndslice.sizes, [5])
|
105
|
+
|
106
|
+
# Test slice selection (keeps dimensions)
|
107
|
+
result = ShapeExt.slice(shape, b=slice(1, 4), c=slice(0, 6, 2))
|
108
|
+
self.assertEqual(result.labels, ["a", "b", "c"])
|
109
|
+
self.assertEqual(result.ndslice.sizes, [4, 3, 3])
|
110
|
+
|
111
|
+
# Test mixed selection
|
112
|
+
result = ShapeExt.slice(shape, a=2, b=slice(1, 3))
|
113
|
+
self.assertEqual(result.labels, ["b", "c"])
|
114
|
+
self.assertEqual(result.ndslice.sizes, [2, 6])
|
tests/test_allocator.py
CHANGED
@@ -104,7 +104,9 @@ class TestActor(Actor):
|
|
104
104
|
|
105
105
|
@contextlib.contextmanager
|
106
106
|
def remote_process_allocator(
|
107
|
-
addr: Optional[str] = None,
|
107
|
+
addr: Optional[str] = None,
|
108
|
+
timeout: Optional[int] = None,
|
109
|
+
envs: Optional[dict[str, str]] = None,
|
108
110
|
) -> Generator[str, None, None]:
|
109
111
|
"""Start a remote process allocator on addr. If timeout is not None, have it
|
110
112
|
timeout after that many seconds if no messages come in"""
|
@@ -120,16 +122,19 @@ def remote_process_allocator(
|
|
120
122
|
if timeout is not None:
|
121
123
|
args.append(f"--timeout-sec={timeout}")
|
122
124
|
|
125
|
+
env = {
|
126
|
+
# prefix PATH with this test module's directory to
|
127
|
+
# give 'process_allocator' and 'monarch_bootstrap' binary resources
|
128
|
+
# in this test module's directory precedence over the installed ones
|
129
|
+
# useful in BUCK where these binaries are added as 'resources' of this test target
|
130
|
+
"PATH": f"{package_path}:{os.getenv('PATH', '')}",
|
131
|
+
"RUST_LOG": "debug",
|
132
|
+
}
|
133
|
+
if envs:
|
134
|
+
env.update(envs)
|
123
135
|
process_allocator = subprocess.Popen(
|
124
136
|
args=args,
|
125
|
-
env=
|
126
|
-
# prefix PATH with this test module's directory to
|
127
|
-
# give 'process_allocator' and 'monarch_bootstrap' binary resources
|
128
|
-
# in this test module's directory precedence over the installed ones
|
129
|
-
# useful in BUCK where these binaries are added as 'resources' of this test target
|
130
|
-
"PATH": f"{package_path}:{os.getenv('PATH', '')}",
|
131
|
-
"RUST_LOG": "debug",
|
132
|
-
},
|
137
|
+
env=env,
|
133
138
|
)
|
134
139
|
try:
|
135
140
|
yield addr
|
@@ -233,6 +238,26 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
|
|
233
238
|
computed_world_sizes = {p.rank: v for p, v in list(computed.flatten("rank"))}
|
234
239
|
self.assertDictEqual(expected_world_sizes, computed_world_sizes)
|
235
240
|
|
241
|
+
async def test_allocate_failure_message(self) -> None:
|
242
|
+
spec = AllocSpec(AllocConstraints(), host=2, gpu=4)
|
243
|
+
|
244
|
+
with self.assertRaisesRegex(
|
245
|
+
Exception,
|
246
|
+
r"exited with code 1: Traceback \(most recent call last\).*",
|
247
|
+
):
|
248
|
+
with remote_process_allocator(
|
249
|
+
envs={"MONARCH_ERROR_DURING_BOOTSTRAP_FOR_TESTING": "1"}
|
250
|
+
) as host1, remote_process_allocator(
|
251
|
+
envs={"MONARCH_ERROR_DURING_BOOTSTRAP_FOR_TESTING": "1"}
|
252
|
+
) as host2:
|
253
|
+
allocator = RemoteAllocator(
|
254
|
+
world_id="test_remote_allocator",
|
255
|
+
initializer=StaticRemoteAllocInitializer(host1, host2),
|
256
|
+
heartbeat_interval=_100_MILLISECONDS,
|
257
|
+
)
|
258
|
+
alloc = await allocator.allocate(spec)
|
259
|
+
await ProcMesh.from_alloc(alloc)
|
260
|
+
|
236
261
|
async def test_call_allocate_twice(self) -> None:
|
237
262
|
class DeletingAllocInitializer(StaticRemoteAllocInitializer):
|
238
263
|
"""test initializer that removes the last address from the list each time initialize_alloc() is called
|