torchmonarch-nightly 2025.7.28__cp311-cp311-manylinux2014_x86_64.whl → 2025.7.30__cp311-cp311-manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- monarch/_rust_bindings.so +0 -0
- monarch/_src/actor/actor_mesh.py +9 -5
- monarch/_src/actor/allocator.py +18 -47
- monarch/_src/actor/debugger.py +159 -98
- monarch/_src/actor/endpoint.py +15 -4
- monarch/_src/actor/future.py +79 -32
- monarch/_src/actor/pdb_wrapper.py +10 -4
- monarch/_src/actor/proc_mesh.py +82 -114
- monarch/_src/actor/shape.py +32 -33
- monarch/_src/tensor_engine/rdma.py +12 -6
- monarch/mesh_controller.py +37 -4
- monarch/monarch_controller +0 -0
- monarch/tools/components/hyperactor.py +1 -1
- monarch/tools/config/__init__.py +1 -1
- monarch/tools/config/defaults.py +1 -1
- monarch/tools/utils.py +27 -0
- tests/test_actor_error.py +3 -4
- tests/test_actor_shape.py +114 -0
- tests/test_debugger.py +406 -178
- tests/test_python_actors.py +67 -67
- {torchmonarch_nightly-2025.7.28.dist-info → torchmonarch_nightly-2025.7.30.dist-info}/METADATA +1 -1
- {torchmonarch_nightly-2025.7.28.dist-info → torchmonarch_nightly-2025.7.30.dist-info}/RECORD +26 -25
- {torchmonarch_nightly-2025.7.28.dist-info → torchmonarch_nightly-2025.7.30.dist-info}/WHEEL +0 -0
- {torchmonarch_nightly-2025.7.28.dist-info → torchmonarch_nightly-2025.7.30.dist-info}/entry_points.txt +0 -0
- {torchmonarch_nightly-2025.7.28.dist-info → torchmonarch_nightly-2025.7.30.dist-info}/licenses/LICENSE +0 -0
- {torchmonarch_nightly-2025.7.28.dist-info → torchmonarch_nightly-2025.7.30.dist-info}/top_level.txt +0 -0
@@ -22,7 +22,7 @@ DEFAULT_NAME: str = f"monarch-{_USER}"
|
|
22
22
|
__version__ = "latest" # TODO get version from monarch.__version__
|
23
23
|
|
24
24
|
|
25
|
-
def
|
25
|
+
def host_mesh(
|
26
26
|
image: str = f"ghcr.io/pytorch-labs/monarch:{__version__}", # TODO docker needs to be built and pushed to ghcr
|
27
27
|
meshes: list[str] = _DEFAULT_MESHES,
|
28
28
|
env: Optional[dict[str, str]] = None,
|
monarch/tools/config/__init__.py
CHANGED
monarch/tools/config/defaults.py
CHANGED
@@ -25,7 +25,7 @@ from torchx.schedulers import (
|
|
25
25
|
|
26
26
|
def component_fn(scheduler: str) -> Callable[..., UnnamedAppDef]:
|
27
27
|
"""The default TorchX component function for the scheduler"""
|
28
|
-
return hyperactor.
|
28
|
+
return hyperactor.host_mesh
|
29
29
|
|
30
30
|
|
31
31
|
def scheduler_factories() -> dict[str, SchedulerFactory]:
|
monarch/tools/utils.py
CHANGED
@@ -6,9 +6,36 @@
|
|
6
6
|
|
7
7
|
# pyre-strict
|
8
8
|
import os
|
9
|
+
import pathlib
|
9
10
|
from typing import Optional
|
10
11
|
|
11
12
|
|
13
|
+
def MONARCH_HOME(*subdir_paths: str) -> pathlib.Path:
    """
    Path to the "dot-directory" for monarch.

    Defaults to `~/.monarch` and is overridable via the `MONARCH_HOME` environment
    variable. A leading `~` in the override is expanded to the user's home directory.

    The resolved directory (including any ``subdir_paths``) is created, together
    with missing parents, before it is returned.

    Args:
        subdir_paths: optional path components appended below the monarch home.

    Returns:
        The path of the directory that was resolved and created.

    Usage:

    .. doc-test::

        from pathlib import Path
        from monarch.tools.utils import MONARCH_HOME

        # holds when the `MONARCH_HOME` environment variable is not set
        assert MONARCH_HOME() == Path.home() / ".monarch"
        assert MONARCH_HOME("conda-pack-out") == Path.home() / ".monarch" / "conda-pack-out"
    """

    default_dir = str(pathlib.Path.home() / ".monarch")
    # expanduser(): without it an override such as "~/monarch" would create a
    # literal "~" directory relative to the current working directory.
    monarch_home = pathlib.Path(os.getenv("MONARCH_HOME", default_dir)).expanduser()

    # With no components the joined string is empty and the home itself is used.
    monarch_home_subdir = monarch_home / os.path.sep.join(subdir_paths)
    monarch_home_subdir.mkdir(parents=True, exist_ok=True)

    return monarch_home_subdir
|
37
|
+
|
38
|
+
|
12
39
|
class conda:
|
13
40
|
"""Conda related util functions."""
|
14
41
|
|
tests/test_actor_error.py
CHANGED
@@ -598,8 +598,7 @@ async def test_supervision_with_proc_mesh_stopped(mesh):
|
|
598
598
|
# TODO - re-enable after resolving T232206970
|
599
599
|
@pytest.mark.oss_skip
|
600
600
|
async def test_supervision_with_sending_error():
|
601
|
-
os.environ["HYPERACTOR_CODEC_MAX_FRAME_LENGTH"] = "
|
602
|
-
os.environ["HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT_SECS"] = "1"
|
601
|
+
os.environ["HYPERACTOR_CODEC_MAX_FRAME_LENGTH"] = "50000000"
|
603
602
|
|
604
603
|
proc = await proc_mesh(gpus=1)
|
605
604
|
actor_mesh = await proc.spawn("healthy", HealthyActor)
|
@@ -611,9 +610,9 @@ async def test_supervision_with_sending_error():
|
|
611
610
|
|
612
611
|
# send a large payload to trigger send timeout error
|
613
612
|
with pytest.raises(
|
614
|
-
SupervisionError, match="supervision error:.*
|
613
|
+
SupervisionError, match="supervision error:.*actor mesh is stopped"
|
615
614
|
):
|
616
|
-
await actor_mesh.check_with_payload.call(payload="a" *
|
615
|
+
await actor_mesh.check_with_payload.call(payload="a" * 55000000)
|
617
616
|
|
618
617
|
# new call should fail with check of health state of actor mesh
|
619
618
|
with pytest.raises(SupervisionError, match="actor mesh is not in a healthy state"):
|
@@ -0,0 +1,114 @@
|
|
1
|
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
2
|
+
# All rights reserved.
|
3
|
+
#
|
4
|
+
# This source code is licensed under the BSD-style license found in the
|
5
|
+
# LICENSE file in the root directory of this source tree.
|
6
|
+
|
7
|
+
# pyre-strict
|
8
|
+
|
9
|
+
from unittest import TestCase
|
10
|
+
|
11
|
+
from monarch._rust_bindings.monarch_hyperactor.shape import Shape, Slice
|
12
|
+
from monarch._src.actor.shape import ShapeExt
|
13
|
+
|
14
|
+
|
15
|
+
class TestShapeSlicing(TestCase):
    """Checks for ``Shape.at``, ``Shape.select`` and ``ShapeExt.slice``."""

    def _expect(
        self,
        actual: "Shape",
        labels: list[str],
        sizes: list[int],
        strides: list[int] | None = None,
        offset: int | None = None,
    ) -> None:
        """Assert labels and sizes of ``actual``; strides/offset only when given."""
        self.assertEqual(actual.labels, labels)
        self.assertEqual(actual.ndslice.sizes, sizes)
        if strides is not None:
            self.assertEqual(actual.ndslice.strides, strides)
        if offset is not None:
            self.assertEqual(actual.ndslice.offset, offset)

    def test_shape_at_removes_dimension(self) -> None:
        """at() drops the addressed dimension and moves the offset accordingly."""
        nd = Slice(offset=0, sizes=[2, 3, 4], strides=[12, 4, 1])
        base = Shape(["batch", "height", "width"], nd)

        # leading dimension: offset = 1 * 12
        self._expect(
            base.at("batch", 1),
            ["height", "width"],
            [3, 4],
            strides=[4, 1],
            offset=12,
        )

        # middle dimension: offset = 2 * 4
        self._expect(
            base.at("height", 2),
            ["batch", "width"],
            [2, 4],
            strides=[12, 1],
            offset=8,
        )

        # trailing dimension: offset = 3 * 1
        self._expect(
            base.at("width", 3),
            ["batch", "height"],
            [2, 3],
            strides=[12, 4],
            offset=3,
        )

    def test_shape_select_keeps_dimension(self) -> None:
        """select() leaves the labels alone and only narrows the sizes."""
        grid = Shape(["rows", "cols"], Slice.new_row_major([4, 6]))

        # rows 1..2 -> 2 rows, offset = 1 * 6
        self._expect(
            grid.select("rows", slice(1, 3)),
            ["rows", "cols"],
            [2, 6],
            offset=6,
        )

        # every 2nd column -> 3 columns, column stride becomes 2
        self._expect(
            grid.select("cols", slice(0, 6, 2)),
            ["rows", "cols"],
            [4, 3],
            strides=[6, 2],
        )

    def test_shape_slice_mixed_operations(self) -> None:
        """at() and select() can be chained in either order."""
        base = Shape(["batch", "height", "width"], Slice.new_row_major([2, 3, 4]))

        # select first, then at
        narrowed_then_fixed = base.select("width", slice(1, 4)).at("batch", 0)
        self._expect(narrowed_then_fixed, ["height", "width"], [3, 3])

        # at first, then select
        fixed_then_narrowed = base.at("height", 1).select("width", slice(2, 4))
        self._expect(fixed_then_narrowed, ["batch", "width"], [2, 2])

    def test_shape_slice_errors(self) -> None:
        """Invalid labels and indices are rejected."""
        grid = Shape(["rows", "cols"], Slice.new_row_major([2, 3]))

        # unknown label
        self.assertRaises(ValueError, grid.at, "nonexistent", 0)

        # index beyond the dimension's size
        self.assertRaises(ValueError, grid.at, "rows", 5)

        # A negative index surfaces as OverflowError rather than ValueError;
        # presumably it fails while crossing the Python-Rust boundary -- TODO confirm.
        self.assertRaises(OverflowError, grid.at, "rows", -1)

    def test_shape_slice_comprehensive(self) -> None:
        """ShapeExt.slice() with integers, slices, and a mix of both."""
        cube = Shape(["a", "b", "c"], Slice.new_row_major([4, 5, 6]))

        # integers remove their dimensions
        self._expect(ShapeExt.slice(cube, a=1, c=2), ["b"], [5])

        # slices keep their dimensions
        self._expect(
            ShapeExt.slice(cube, b=slice(1, 4), c=slice(0, 6, 2)),
            ["a", "b", "c"],
            [4, 3, 3],
        )

        # integer and slice combined
        self._expect(
            ShapeExt.slice(cube, a=2, b=slice(1, 3)),
            ["b", "c"],
            [2, 6],
        )
|