PyPI - torchmonarch-nightly - Versions diffs - 2025.7.25__cp311-cp311-manylinux2014_x86_64.whl → 2025.7.27__cp311-cp311-manylinux2014_x86_64.whl - Mend

torchmonarch-nightly 2025.7.25__cp311-cp311-manylinux2014_x86_64.whl → 2025.7.27__cp311-cp311-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28) hide show

monarch/_rust_bindings.so +0 -0
monarch/_src/actor/actor_mesh.py +109 -52
monarch/_src/actor/endpoint.py +99 -8
monarch/_src/actor/event_loop.py +1 -1
monarch/_src/actor/proc_mesh.py +17 -9
monarch/_src/actor/tensor_engine_shim.py +5 -2
monarch/actor/__init__.py +2 -0
monarch/common/messages.py +9 -0
monarch/common/remote.py +2 -2
monarch/gradient/_gradient_generator.so +0 -0
monarch/mesh_controller.py +76 -14
monarch/monarch_controller +0 -0
monarch/tools/cli.py +2 -2
monarch/tools/commands.py +49 -27
monarch/tools/components/hyperactor.py +5 -3
monarch/tools/config/__init__.py +18 -1
monarch/tools/config/defaults.py +2 -2
monarch/tools/mesh_spec.py +4 -1
tests/test_allocator.py +11 -15
tests/test_env_before_cuda.py +2 -3
tests/test_python_actors.py +12 -0
tests/test_tensor_engine.py +27 -1
{torchmonarch_nightly-2025.7.25.dist-info → torchmonarch_nightly-2025.7.27.dist-info}/METADATA +34 -1
{torchmonarch_nightly-2025.7.25.dist-info → torchmonarch_nightly-2025.7.27.dist-info}/RECORD +28 -28
{torchmonarch_nightly-2025.7.25.dist-info → torchmonarch_nightly-2025.7.27.dist-info}/WHEEL +0 -0
{torchmonarch_nightly-2025.7.25.dist-info → torchmonarch_nightly-2025.7.27.dist-info}/entry_points.txt +0 -0
{torchmonarch_nightly-2025.7.25.dist-info → torchmonarch_nightly-2025.7.27.dist-info}/licenses/LICENSE +0 -0
{torchmonarch_nightly-2025.7.25.dist-info → torchmonarch_nightly-2025.7.27.dist-info}/top_level.txt +0 -0

monarch/_rust_bindings.so CHANGED Viewed

Binary file

monarch/_src/actor/actor_mesh.py CHANGED Viewed

@@ -29,8 +29,10 @@ from typing import (
     Iterable,
     Iterator,
     List,
+    Literal,
     NamedTuple,
     Optional,
+    overload,
     ParamSpec,
     Tuple,
     Type,
@@ -39,6 +41,7 @@ from typing import (
 )
 from monarch._rust_bindings.monarch_hyperactor.actor import (
+    MethodSpecifier,
     PanicFlag,
     PythonMessage,
     PythonMessageKind,
@@ -65,6 +68,7 @@ from monarch._src.actor.endpoint import (
     Endpoint,
     EndpointProperty,
     Extent,
+    NotAnEndpoint,
     Propagator,
     Selection,
 )
@@ -76,7 +80,7 @@ from monarch._src.actor.pickle import flatten, unflatten
 from monarch._src.actor.shape import MeshTrait, NDSlice
 from monarch._src.actor.sync_state import fake_sync_state
-from monarch._src.actor.tensor_engine_shim import actor_send
+from monarch._src.actor.tensor_engine_shim import actor_rref, actor_send
 if TYPE_CHECKING:
     from monarch._src.actor.proc_mesh import ProcMesh
@@ -281,16 +285,18 @@ class ActorEndpoint(Endpoint[P, R]):
     def __init__(
         self,
         actor_mesh_ref: _ActorMeshRefImpl,
-        name: str,
+        name: MethodSpecifier,
         impl: Callable[Concatenate[Any, P], Awaitable[R]],
         mailbox: Mailbox,
-        propagator: Propagator = None,
+        propagator: Propagator,
+        explicit_response_port: bool,
     ) -> None:
         super().__init__(propagator)
         self._actor_mesh = actor_mesh_ref
         self._name = name
         self._signature: inspect.Signature = inspect.signature(impl)
         self._mailbox = mailbox
+        self._explicit_response_port = explicit_response_port
     def _supervise(self, r: HyPortReceiver | OncePortReceiver) -> Any:
         mesh = self._actor_mesh._actor_mesh
@@ -299,6 +305,12 @@ class ActorEndpoint(Endpoint[P, R]):
     def _call_name(self) -> Any:
         return self._name
+    def _check_arguments(self, args, kwargs):
+        if self._explicit_response_port:
+            self._signature.bind(None, None, *args, **kwargs)
+        else:
+            self._signature.bind(None, *args, **kwargs)
     def _send(
         self,
         args: Tuple[Any, ...],
@@ -311,10 +323,9 @@ class ActorEndpoint(Endpoint[P, R]):
         This sends the message to all actors but does not wait for any result.
         """
-        self._signature.bind(None, *args, **kwargs)
+        self._check_arguments(args, kwargs)
         objects, bytes = flatten((args, kwargs), _is_ref_or_mailbox)
-        refs = [obj for obj in objects if hasattr(obj, "__monarch_ref__")]
-        if not refs:
+        if all(not hasattr(obj, "__monarch_ref__") for obj in objects):
             message = PythonMessage(
                 PythonMessageKind.CallMethod(
                     self._name, None if port is None else port._port_ref
@@ -323,7 +334,7 @@ class ActorEndpoint(Endpoint[P, R]):
             )
             self._actor_mesh.cast(message, selection)
         else:
-            actor_send(self, bytes, refs, port, selection)
+            actor_send(self, bytes, objects, port, selection)
         shape = self._actor_mesh._shape
         return Extent(shape.labels, shape.ndslice.sizes)
@@ -335,6 +346,53 @@ class ActorEndpoint(Endpoint[P, R]):
             ), "unexpected receiver type"
         return PortTuple(p, PortReceiver(self._mailbox, self._supervise(r._receiver)))
+    def _rref(self, args, kwargs):
+        self._check_arguments(args, kwargs)
+        refs, bytes = flatten((args, kwargs), _is_ref_or_mailbox)
+        return actor_rref(self, bytes, refs)
+@overload
+def as_endpoint(
+    not_an_endpoint: Callable[P, R],
+    *,
+    propagate: Propagator = None,
+    explicit_response_port: Literal[False] = False,
+) -> Endpoint[P, R]: ...
+@overload
+def as_endpoint(
+    not_an_endpoint: Callable[Concatenate["PortProtocol[R]", P], None],
+    *,
+    propagate: Propagator = None,
+    explicit_response_port: Literal[True],
+) -> Endpoint[P, R]: ...
+def as_endpoint(
+    not_an_endpoint: Any,
+    *,
+    propagate: Propagator = None,
+    explicit_response_port: bool = False,
+):
+    if not isinstance(not_an_endpoint, NotAnEndpoint):
+        raise ValueError("expected an method of a spawned actor")
+    kind = (
+        MethodSpecifier.ExplicitPort
+        if explicit_response_port
+        else MethodSpecifier.ReturnsResponse
+    )
+    return ActorEndpoint(
+        not_an_endpoint._ref._actor_mesh_ref,
+        kind(not_an_endpoint._name),
+        getattr(not_an_endpoint._ref, not_an_endpoint._name),
+        not_an_endpoint._ref._mailbox,
+        propagate,
+        explicit_response_port,
+    )
 class Accumulator(Generic[P, R, A]):
     def __init__(
@@ -578,7 +636,7 @@ class _Actor:
         mailbox: Mailbox,
         rank: int,
         shape: Shape,
-        method: str,
+        method_spec: MethodSpecifier,
         message: bytes,
         panic_flag: PanicFlag,
         local_state: Iterable[Any],
@@ -596,17 +654,23 @@ class _Actor:
             args, kwargs = unflatten(message, local_state)
-            if method == "__init__":
-                Class, *args = args
-                try:
-                    self.instance = Class(*args, **kwargs)
-                except Exception as e:
-                    self._saved_error = ActorError(
-                        e, f"Remote actor {Class}.__init__ call failed."
-                    )
-                    raise e
-                port.send(None)
-                return None
+            match method_spec:
+                case MethodSpecifier.Init():
+                    Class, *args = args
+                    try:
+                        self.instance = Class(*args, **kwargs)
+                    except Exception as e:
+                        self._saved_error = ActorError(
+                            e, f"Remote actor {Class}.__init__ call failed."
+                        )
+                        raise e
+                    port.send(None)
+                    return None
+                case MethodSpecifier.ReturnsResponse(name=method):
+                    pass
+                case MethodSpecifier.ExplicitPort(name=method):
+                    args = (port, *args)
+                    port = DroppingPort()
             if self.instance is None:
                 # This could happen because of the following reasons. Both
@@ -625,18 +689,23 @@ class _Actor:
                         f" This is likely due to an earlier error: {self._saved_error}"
                     )
                 raise AssertionError(error_message)
-            the_method = getattr(self.instance, method)._method
+            the_method = getattr(self.instance, method)
+            if isinstance(the_method, EndpointProperty):
+                module = the_method._method.__module__
+                the_method = functools.partial(the_method._method, self.instance)
+            else:
+                module = the_method.__module__
             if inspect.iscoroutinefunction(the_method):
                 async def instrumented():
                     enter_span(
-                        the_method.__module__,
+                        module,
                         method,
                         str(ctx.mailbox.actor_id),
                     )
                     try:
-                        result = await the_method(self.instance, *args, **kwargs)
+                        result = await the_method(*args, **kwargs)
                         self._maybe_exit_debugger()
                     except Exception as e:
                         logging.critical(
@@ -649,9 +718,9 @@ class _Actor:
                 result = await instrumented()
             else:
-                enter_span(the_method.__module__, method, str(ctx.mailbox.actor_id))
+                enter_span(module, method, str(ctx.mailbox.actor_id))
                 with fake_sync_state():
-                    result = the_method(self.instance, *args, **kwargs)
+                    result = the_method(*args, **kwargs)
                 self._maybe_exit_debugger()
                 exit_span()
@@ -750,43 +819,29 @@ class ActorMeshRef(MeshTrait):
         for attr_name in dir(self._class):
             attr_value = getattr(self._class, attr_name, None)
             if isinstance(attr_value, EndpointProperty):
+                # Convert string method name to appropriate MethodSpecifier
+                kind = (
+                    MethodSpecifier.ExplicitPort
+                    if attr_value._explicit_response_port
+                    else MethodSpecifier.ReturnsResponse
+                )
                 setattr(
                     self,
                     attr_name,
                     ActorEndpoint(
                         self._actor_mesh_ref,
-                        attr_name,
+                        kind(attr_name),
                         attr_value._method,
                         self._mailbox,
+                        attr_value._propagator,
+                        attr_value._explicit_response_port,
                     ),
                 )
-    def __getattr__(self, name: str) -> Any:
-        # This method is called when an attribute is not found
-        # For linting purposes, we need to tell the type checker that any attribute
-        # could be an endpoint that's dynamically added at runtime
-        # At runtime, we still want to raise AttributeError for truly missing attributes
-        # Check if this is a method on the underlying class
-        if hasattr(self._class, name):
-            attr = getattr(self._class, name)
-            if isinstance(attr, EndpointProperty):
-                # Dynamically create the endpoint
-                endpoint = ActorEndpoint(
-                    self._actor_mesh_ref,
-                    name,
-                    attr._method,
-                    self._mailbox,
-                    propagator=attr._propagator,
-                )
-                # Cache it for future use
-                setattr(self, name, endpoint)
-                return endpoint
-        # If we get here, it's truly not found
-        raise AttributeError(
-            f"'{self.__class__.__name__}' object has no attribute '{name}'"
-        )
+    def __getattr__(self, attr: str) -> NotAnEndpoint:
+        if attr in dir(self._class):
+            return NotAnEndpoint(self, attr)
+        raise AttributeError(attr)
     def _create(
         self,
@@ -798,9 +853,11 @@ class ActorMeshRef(MeshTrait):
         ep = ActorEndpoint(
             self._actor_mesh_ref,
-            "__init__",
+            MethodSpecifier.Init(),
             null_func,
             self._mailbox,
+            None,
+            False,
         )
         send(ep, (self._class, *args), kwargs)

monarch/_src/actor/endpoint.py CHANGED Viewed

@@ -34,6 +34,7 @@ from monarch._src.actor.tensor_engine_shim import _cached_propagation, fake_call
 if TYPE_CHECKING:
     from monarch._src.actor.actor_mesh import (
+        ActorMeshRef,
         HyPortReceiver,
         OncePortReceiver,
         Port,
@@ -182,11 +183,22 @@ class Endpoint(ABC, Generic[P, R]):
         # pyre-ignore
         send(self, args, kwargs)
+    @abstractmethod
+    def _rref(self, args, kwargs) -> Any: ...
+    def rref(self, *args: P.args, **kwargs: P.kwargs) -> R:
+        return self._rref(args, kwargs)
     def _propagate(self, args, kwargs, fake_args, fake_kwargs):
         if self._propagator_arg is None or self._propagator_arg == "cached":
             if self._cache is None:
                 self._cache = {}
-            return _cached_propagation(self._cache, self._resolvable, args, kwargs)
+            resolvable = getattr(self, "_resolvable", None)
+            if resolvable is None:
+                raise NotImplementedError(
+                    "Cached propagation is not implemented for actor endpoints."
+                )
+            return _cached_propagation(self._cache, resolvable, args, kwargs)
         elif self._propagator_arg == "inspect":
             return None
         elif self._propagator_arg == "mocked":
@@ -211,16 +223,23 @@ class EndpointProperty(Generic[P, R]):
         self,
         method: Callable[Concatenate[Any, P], Awaitable[R]],
         propagator: Propagator,
+        explicit_response_port: bool,
     ) -> None: ...
     @overload
     def __init__(
-        self, method: Callable[Concatenate[Any, P], R], propagator: Propagator
+        self,
+        method: Callable[Concatenate[Any, P], R],
+        propagator: Propagator,
+        explicit_response_port: bool,
     ) -> None: ...
-    def __init__(self, method: Any, propagator: Propagator) -> None:
+    def __init__(
+        self, method: Any, propagator: Propagator, explicit_response_port: bool
+    ) -> None:
         self._method = method
         self._propagator = propagator
+        self._explicit_response_port = explicit_response_port
     def __get__(self, instance, owner) -> Endpoint[P, R]:
         # this is a total lie, but we have to actually
@@ -229,13 +248,50 @@ class EndpointProperty(Generic[P, R]):
         return cast(Endpoint[P, R], self)
+class NotAnEndpoint:
+    """
+    Used as the dynamic value of functions on an ActorMeshRef that were not marked as endpoints.
+    This is used both to give a better error message (since we cannot prevent the type system from thinking they are methods),
+    and to provide the opportunity for someone to do as_endpoint(x.foo) on something that wasn't marked as an endpoint.
+    """
+    def __init__(self, ref: "ActorMeshRef", name: str):
+        self._ref = ref
+        self._name = name
+    def __call__(self, *args, **kwargs) -> None:
+        raise RuntimeError(
+            f"Actor {self._ref._class}.{self._name} is not annotated as an endpoint. To call it as one, add a @endpoint decorator to it, or directly wrap it in one as_endpoint(obj.method).call(...)"
+        )
 # This can't just be Callable because otherwise we are not
 # allowed to use type arguments in the return value.
 class EndpointIfy:
     @overload
-    def __call__(self, function: Callable[P, Awaitable[R]]) -> Endpoint[P, R]: ...
+    def __call__(
+        self, function: Callable[Concatenate[Any, P], Awaitable[R]]
+    ) -> Endpoint[P, R]: ...
     @overload
-    def __call__(self, function: Callable[P, R]) -> Endpoint[P, R]: ...
+    def __call__(
+        self, function: Callable[Concatenate[Any, P], R]
+    ) -> Endpoint[P, R]: ...
+    def __call__(self, function: Any):
+        pass
+class PortedEndpointIfy:
+    @overload
+    def __call__(
+        self,
+        function: Callable[Concatenate[Any, "Port[R]", P], Awaitable[None]],
+    ) -> Endpoint[P, R]: ...
+    @overload
+    def __call__(
+        self, function: Callable[Concatenate[Any, "Port[R]", P], None]
+    ) -> Endpoint[P, R]: ...
     def __call__(self, function: Any):
         pass
@@ -246,6 +302,7 @@ def endpoint(
     method: Callable[Concatenate[Any, P], Awaitable[R]],
     *,
     propagate: Propagator = None,
+    explicit_response_port: Literal[False] = False,
 ) -> EndpointProperty[P, R]: ...
@@ -254,6 +311,7 @@ def endpoint(
     method: Callable[Concatenate[Any, P], R],
     *,
     propagate: Propagator = None,
+    explicit_response_port: Literal[False] = False,
 ) -> EndpointProperty[P, R]: ...
@@ -261,10 +319,43 @@ def endpoint(
 def endpoint(
     *,
     propagate: Propagator = None,
+    explicit_response_port: Literal[False] = False,
 ) -> EndpointIfy: ...
-def endpoint(method=None, *, propagate=None):
+@overload
+def endpoint(
+    method: Callable[Concatenate[Any, "Port[R]", P], Awaitable[None]],
+    *,
+    propagate: Propagator = None,
+    explicit_response_port: Literal[True],
+) -> EndpointProperty[P, R]: ...
+@overload
+def endpoint(
+    method: Callable[Concatenate[Any, "Port[R]", P], None],
+    *,
+    propagate: Propagator = None,
+    explicit_response_port: Literal[True],
+) -> EndpointProperty[P, R]: ...
+@overload
+def endpoint(
+    *,
+    propagate: Propagator = None,
+    explicit_response_port: Literal[True],
+) -> PortedEndpointIfy: ...
+def endpoint(method=None, *, propagate=None, explicit_response_port: bool = False):
     if method is None:
-        return functools.partial(endpoint, propagate=propagate)
-    return EndpointProperty(method, propagator=propagate)
+        return functools.partial(
+            endpoint,
+            propagate=propagate,
+            explicit_response_port=explicit_response_port,
+        )
+    return EndpointProperty(
+        method, propagator=propagate, explicit_response_port=explicit_response_port
+    )

monarch/_src/actor/event_loop.py CHANGED Viewed

@@ -14,7 +14,7 @@ import logging
 import threading
 from typing import Optional
-from libfb.py.pyre import none_throws
+from pyre_extensions import none_throws
 logger = logging.getLogger(__name__)

monarch/_src/actor/proc_mesh.py CHANGED Viewed

@@ -43,7 +43,6 @@ from monarch._src.actor.actor_mesh import (
     Actor,
     ActorMeshRef,
     fake_sync_state,
-    MonarchContext,
 )
 from monarch._src.actor.allocator import LocalAllocator, ProcessAllocator, SimAllocator
@@ -89,7 +88,7 @@ class SetupActor(Actor):
     Typically used to setup the environment variables.
     """
-    def __init__(self, env: Callable[[MonarchContext], None]) -> None:
+    def __init__(self, env: Callable[[], None]) -> None:
         """
         Initialize the setup actor with the user defined setup method.
         """
@@ -100,8 +99,7 @@ class SetupActor(Actor):
         """
         Call the user defined setup method.
         """
-        ctx = MonarchContext.get()
-        self._setup_method(ctx)
+        self._setup_method()
 T = TypeVar("T")
@@ -114,7 +112,7 @@ except ImportError:
 async def _allocate_nonblocking(
-    alloc: Alloc, setup: Callable[[MonarchContext], None] | None = None
+    alloc: Alloc, setup: Callable[[], None] | None = None
 ) -> "ProcMesh":
     _proc_mesh = await HyProcMesh.allocate_nonblocking(alloc)
     if setup is None:
@@ -211,7 +209,7 @@ class ProcMesh(MeshTrait):
     @classmethod
     def from_alloc(
-        self, alloc: Alloc, setup: Callable[[MonarchContext], None] | None = None
+        self, alloc: Alloc, setup: Callable[[], None] | None = None
     ) -> Future["ProcMesh"]:
         """
         Allocate a process mesh according to the provided alloc.
@@ -219,7 +217,17 @@ class ProcMesh(MeshTrait):
         Arguments:
         - `alloc`: The alloc to allocate according to.
-        - `setup`: A lambda taking MonarchContext as param, can be used to setup env vars on the allocated mesh
+        - `setup`: An optional lambda function to configure environment variables on the allocated mesh.
+        Use the `current_rank()` method within the lambda to obtain the rank.
+        Example of a setup method to initialize torch distributed environment variables:
+        ```
+        def setup():
+            rank = current_rank()
+            os.environ["RANK"] = str(rank)
+            os.environ["WORLD_SIZE"] = str(len(rank.shape))
+            os.environ["LOCAL_RANK"] = str(rank["gpus"])
+        ```
         """
         return Future(
             impl=lambda: _allocate_nonblocking(alloc, setup),
@@ -428,7 +436,7 @@ async def proc_mesh_nonblocking(
     gpus: Optional[int] = None,
     hosts: int = 1,
     env: dict[str, str] | None = None,
-    setup: Callable[[MonarchContext], None] | None = None,
+    setup: Callable[[], None] | None = None,
 ) -> ProcMesh:
     if gpus is None:
         gpus = _local_device_count()
@@ -457,7 +465,7 @@ def proc_mesh(
     gpus: Optional[int] = None,
     hosts: int = 1,
     env: dict[str, str] | None = None,
-    setup: Callable[[MonarchContext], None] | None = None,
+    setup: Callable[[], None] | None = None,
 ) -> Future[ProcMesh]:
     return Future(
         impl=lambda: proc_mesh_nonblocking(

monarch/_src/actor/tensor_engine_shim.py CHANGED Viewed

@@ -19,7 +19,6 @@ time it is used.
 if TYPE_CHECKING:
     from monarch._src.actor.actor_mesh import ActorEndpoint, Port, Selection
-    from monarch._src.actor.endpoint import Endpoint
 def shim(fn=None, *, module=None):
@@ -48,8 +47,12 @@ def actor_send(
 ) -> None: ...
+@shim(module="monarch.mesh_controller")
+def actor_rref(endpoint, args_kwargs_tuple: bytes, refs: Sequence[Any]): ...
 @shim(module="monarch.common.remote")
-def _cached_propagation(_cache, rfunction: "Endpoint", args, kwargs) -> Any: ...
+def _cached_propagation(_cache, rfunction, args, kwargs) -> Any: ...
 @shim(module="monarch.common.fake")

monarch/actor/__init__.py CHANGED Viewed

@@ -12,6 +12,7 @@ from monarch._src.actor.actor_mesh import (
     Accumulator,
     Actor,
     ActorError,
+    as_endpoint,
     current_actor_name,
     current_rank,
     current_size,
@@ -35,6 +36,7 @@ __all__ = [
     "Actor",
     "ActorError",
     "current_actor_name",
+    "as_endpoint",
     "current_rank",
     "current_size",
     "endpoint",

monarch/common/messages.py CHANGED Viewed

@@ -435,6 +435,15 @@ class SendResultOfActorCall(NamedTuple):
     stream: tensor_worker.StreamRef
+class CallActorMethod(NamedTuple):
+    seq: int
+    result: object
+    broker_id: Tuple[str, int]
+    local_state: Sequence[Tensor | tensor_worker.Ref]
+    mutates: List[tensor_worker.Ref]
+    stream: tensor_worker.StreamRef
 class SplitComm(NamedTuple):
     dims: Dims
     device_mesh: DeviceMesh

monarch/common/remote.py CHANGED Viewed

@@ -157,7 +157,7 @@ class Remote(Generic[P, R], Endpoint[P, R]):
     def _maybe_resolvable(self):
         return None if self._remote_impl is None else self._resolvable
-    def rref(self, *args: P.args, **kwargs: P.kwargs) -> R:
+    def _rref(self, args, kwargs):
         return dtensor_dispatch(
             self._resolvable,
             self._propagate,
@@ -352,7 +352,7 @@ _miss = 0
 _hit = 0
-def _cached_propagation(_cache, rfunction: Endpoint, args, kwargs):
+def _cached_propagation(_cache, rfunction: ResolvableFunction, args, kwargs):
     tensors, shape_key = hashable_tensor_flatten(args, kwargs)
     # pyre-ignore
     inputs_group = TensorGroup([t._fake for t in tensors])

monarch/gradient/_gradient_generator.so CHANGED Viewed

Binary file