PyPI - torchmonarch-nightly - Versions diffs - 2025.7.29__cp311-cp311-manylinux2014_x86_64.whl → 2025.7.31__cp311-cp311-manylinux2014_x86_64.whl - Mend

torchmonarch-nightly 2025.7.29__cp311-cp311-manylinux2014_x86_64.whl → 2025.7.31__cp311-cp311-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24)

monarch/_rust_bindings.so +0 -0
monarch/_src/actor/actor_mesh.py +9 -5
monarch/_src/actor/allocator.py +5 -6
monarch/_src/actor/debugger.py +159 -98
monarch/_src/actor/endpoint.py +15 -4
monarch/_src/actor/future.py +79 -32
monarch/_src/actor/pdb_wrapper.py +10 -4
monarch/_src/actor/proc_mesh.py +82 -114
monarch/_src/actor/shape.py +32 -38
monarch/_src/tensor_engine/rdma.py +12 -6
monarch/gradient/_gradient_generator.so +0 -0
monarch/mesh_controller.py +37 -4
monarch/monarch_controller +0 -0
tests/test_actor_error.py +3 -4
tests/test_actor_shape.py +114 -0
tests/test_allocator.py +34 -9
tests/test_debugger.py +406 -178
tests/test_python_actors.py +67 -67
{torchmonarch_nightly-2025.7.29.dist-info → torchmonarch_nightly-2025.7.31.dist-info}/METADATA +1 -1
{torchmonarch_nightly-2025.7.29.dist-info → torchmonarch_nightly-2025.7.31.dist-info}/RECORD +24 -23
{torchmonarch_nightly-2025.7.29.dist-info → torchmonarch_nightly-2025.7.31.dist-info}/WHEEL +0 -0
{torchmonarch_nightly-2025.7.29.dist-info → torchmonarch_nightly-2025.7.31.dist-info}/entry_points.txt +0 -0
{torchmonarch_nightly-2025.7.29.dist-info → torchmonarch_nightly-2025.7.31.dist-info}/licenses/LICENSE +0 -0
{torchmonarch_nightly-2025.7.29.dist-info → torchmonarch_nightly-2025.7.31.dist-info}/top_level.txt +0 -0

tests/test_actor_shape.py ADDED

@@ -0,0 +1,114 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+# pyre-strict
+from unittest import TestCase
+from monarch._rust_bindings.monarch_hyperactor.shape import Shape, Slice
+from monarch._src.actor.shape import ShapeExt
+class TestShapeSlicing(TestCase):
+    def test_shape_at_removes_dimension(self) -> None:
+        """Test that at() removes dimensions and updates offset
+        correctly."""
+        slice_obj = Slice(offset=0, sizes=[2, 3, 4], strides=[12, 4, 1])
+        shape = Shape(["batch", "height", "width"], slice_obj)
+        # Test removing first dimension
+        result = shape.at("batch", 1)
+        self.assertEqual(result.labels, ["height", "width"])
+        self.assertEqual(result.ndslice.sizes, [3, 4])
+        self.assertEqual(result.ndslice.strides, [4, 1])
+        self.assertEqual(result.ndslice.offset, 12)  # 1 * 12
+        # Test removing middle dimension
+        result = shape.at("height", 2)
+        self.assertEqual(result.labels, ["batch", "width"])
+        self.assertEqual(result.ndslice.sizes, [2, 4])
+        self.assertEqual(result.ndslice.strides, [12, 1])
+        self.assertEqual(result.ndslice.offset, 8)  # 2 * 4
+        # Test removing last dimension
+        result = shape.at("width", 3)
+        self.assertEqual(result.labels, ["batch", "height"])
+        self.assertEqual(result.ndslice.sizes, [2, 3])
+        self.assertEqual(result.ndslice.strides, [12, 4])
+        self.assertEqual(result.ndslice.offset, 3)  # 3  * 1
+    def test_shape_select_keeps_dimension(self) -> None:
+        """Test that select() keeps dimensions but changes sizes."""
+        slice_obj = Slice.new_row_major([4, 6])
+        shape = Shape(["rows", "cols"], slice_obj)
+        # Test range selection
+        result = shape.select("rows", slice(1, 3))
+        self.assertEqual(result.labels, ["rows", "cols"])
+        self.assertEqual(result.ndslice.sizes, [2, 6])  # 3-1=2 rows
+        self.assertEqual(result.ndslice.offset, 6)  # 1 * 6
+        # Test step selection
+        result = shape.select("cols", slice(0, 6, 2))
+        self.assertEqual(result.labels, ["rows", "cols"])
+        self.assertEqual(result.ndslice.sizes, [4, 3])  # every 2nd col = 3 cols
+        self.assertEqual(result.ndslice.strides, [6, 2])  # stride becomes 2
+    def test_shape_slice_mixed_operations(self) -> None:
+        """Test mixing at() and select() operations."""
+        slice_obj = Slice.new_row_major([2, 3, 4])
+        shape = Shape(["batch", "height", "width"], slice_obj)
+        # Chain operations: select then at
+        result = shape.select("width", slice(1, 4)).at("batch", 0)
+        self.assertEqual(result.labels, ["height", "width"])
+        self.assertEqual(result.ndslice.sizes, [3, 3])
+        # Chain operations: at then select
+        result = shape.at("height", 1).select("width", slice(2, 4))
+        self.assertEqual(result.labels, ["batch", "width"])
+        self.assertEqual(result.ndslice.sizes, [2, 2])
+    def test_shape_slice_errors(self) -> None:
+        """Test error conditions."""
+        slice_obj = Slice.new_row_major([2, 3])
+        shape = Shape(["rows", "cols"], slice_obj)
+        # Test invalid label
+        with self.assertRaises(ValueError):
+            shape.at("nonexistent", 0)
+        # Test index out of range
+        with self.assertRaises(ValueError):
+            shape.at("rows", 5)
+        # Test negative index (Python-Rust boundary issue)
+        with self.assertRaises(OverflowError):  # Changed from ValueError
+            shape.at("rows", -1)
+    def test_shape_slice_comprehensive(self) -> None:
+        """Comprehensive test of slice() method."""
+        slice_obj = Slice.new_row_major([4, 5, 6])
+        shape = Shape(["a", "b", "c"], slice_obj)
+        # Test integer selection (removes dimensions)
+        result = ShapeExt.slice(shape, a=1, c=2)
+        self.assertEqual(result.labels, ["b"])
+        self.assertEqual(result.ndslice.sizes, [5])
+        # Test slice selection (keeps dimensions)
+        result = ShapeExt.slice(shape, b=slice(1, 4), c=slice(0, 6, 2))
+        self.assertEqual(result.labels, ["a", "b", "c"])
+        self.assertEqual(result.ndslice.sizes, [4, 3, 3])
+        # Test mixed selection
+        result = ShapeExt.slice(shape, a=2, b=slice(1, 3))
+        self.assertEqual(result.labels, ["b", "c"])
+        self.assertEqual(result.ndslice.sizes, [2, 6])

tests/test_allocator.py CHANGED

@@ -104,7 +104,9 @@ class TestActor(Actor):
 @contextlib.contextmanager
 def remote_process_allocator(
-    addr: Optional[str] = None, timeout: Optional[int] = None
+    addr: Optional[str] = None,
+    timeout: Optional[int] = None,
+    envs: Optional[dict[str, str]] = None,
 ) -> Generator[str, None, None]:
     """Start a remote process allocator on addr. If timeout is not None, have it
     timeout after that many seconds if no messages come in"""
@@ -120,16 +122,19 @@ def remote_process_allocator(
         if timeout is not None:
             args.append(f"--timeout-sec={timeout}")
+        env = {
+            # prefix PATH with this test module's directory to
+            # give 'process_allocator' and 'monarch_bootstrap' binary resources
+            # in this test module's directory precedence over the installed ones
+            # useful in BUCK where these binaries are added as 'resources' of this test target
+            "PATH": f"{package_path}:{os.getenv('PATH', '')}",
+            "RUST_LOG": "debug",
+        }
+        if envs:
+            env.update(envs)
         process_allocator = subprocess.Popen(
             args=args,
-            env={
-                # prefix PATH with this test module's directory to
-                # give 'process_allocator' and 'monarch_bootstrap' binary resources
-                # in this test module's directory precedence over the installed ones
-                # useful in BUCK where these binaries are added as 'resources' of this test target
-                "PATH": f"{package_path}:{os.getenv('PATH', '')}",
-                "RUST_LOG": "debug",
-            },
+            env=env,
         )
         try:
             yield addr
@@ -233,6 +238,26 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
         computed_world_sizes = {p.rank: v for p, v in list(computed.flatten("rank"))}
         self.assertDictEqual(expected_world_sizes, computed_world_sizes)
+    async def test_allocate_failure_message(self) -> None:
+        spec = AllocSpec(AllocConstraints(), host=2, gpu=4)
+        with self.assertRaisesRegex(
+            Exception,
+            r"exited with code 1: Traceback \(most recent call last\).*",
+        ):
+            with remote_process_allocator(
+                envs={"MONARCH_ERROR_DURING_BOOTSTRAP_FOR_TESTING": "1"}
+            ) as host1, remote_process_allocator(
+                envs={"MONARCH_ERROR_DURING_BOOTSTRAP_FOR_TESTING": "1"}
+            ) as host2:
+                allocator = RemoteAllocator(
+                    world_id="test_remote_allocator",
+                    initializer=StaticRemoteAllocInitializer(host1, host2),
+                    heartbeat_interval=_100_MILLISECONDS,
+                )
+                alloc = await allocator.allocate(spec)
+                await ProcMesh.from_alloc(alloc)
     async def test_call_allocate_twice(self) -> None:
         class DeletingAllocInitializer(StaticRemoteAllocInitializer):
             """test initializer that removes the last address from the list each time initialize_alloc() is called