torchmonarch-nightly 2025.6.20__cp310-cp310-manylinux2014_x86_64.whl → 2025.6.27__cp310-cp310-manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- monarch/_rust_bindings.so +0 -0
- monarch/actor_mesh.py +13 -5
- monarch/allocator.py +87 -1
- monarch/code_sync.py +10 -0
- monarch/debugger.py +4 -2
- monarch/monarch_controller +0 -0
- monarch/proc_mesh.py +43 -3
- monarch/tools/mesh_spec.py +42 -4
- monarch/tools/network.py +34 -27
- tests/test_allocator.py +154 -6
- tests/test_python_actors.py +8 -44
- tests/test_tensor_engine.py +52 -0
- {torchmonarch_nightly-2025.6.20.dist-info → torchmonarch_nightly-2025.6.27.dist-info}/METADATA +2 -2
- {torchmonarch_nightly-2025.6.20.dist-info → torchmonarch_nightly-2025.6.27.dist-info}/RECORD +18 -16
- {torchmonarch_nightly-2025.6.20.dist-info → torchmonarch_nightly-2025.6.27.dist-info}/WHEEL +0 -0
- {torchmonarch_nightly-2025.6.20.dist-info → torchmonarch_nightly-2025.6.27.dist-info}/entry_points.txt +0 -0
- {torchmonarch_nightly-2025.6.20.dist-info → torchmonarch_nightly-2025.6.27.dist-info}/licenses/LICENSE +0 -0
- {torchmonarch_nightly-2025.6.20.dist-info → torchmonarch_nightly-2025.6.27.dist-info}/top_level.txt +0 -0
monarch/_rust_bindings.so
CHANGED
Binary file
monarch/actor_mesh.py
CHANGED
@@ -288,11 +288,11 @@ class Endpoint(Generic[P, R]):
 
     def broadcast(self, *args: P.args, **kwargs: P.kwargs) -> None:
         """
-
+        Fire-and-forget broadcast to all actors without waiting for actors to
+        acknowledge receipt.
 
-
-
-        return any results.
+        In other words, the return of this method does not guarrantee the
+        delivery of the message.
         """
         # pyre-ignore
         send(self, args, kwargs)
@@ -319,6 +319,10 @@ class Accumulator(Generic[P, R, A]):
 
 
 class ValueMesh(MeshTrait, Generic[R]):
+    """
+    Container of return values, indexed by rank.
+    """
+
     def __init__(self, shape: Shape, values: List[R]) -> None:
         self._shape = shape
         self._values = values
@@ -516,6 +520,10 @@ class _Actor:
             self.instance = Class(*args, **kwargs)
             return None
 
+        if self.instance is None:
+            raise AssertionError(
+                "__init__ failed earlier and no Actor object is available"
+            )
         the_method = getattr(self.instance, message.method)._method
 
         if inspect.iscoroutinefunction(the_method):
@@ -622,7 +630,7 @@ class Actor(MeshTrait):
         )
 
 
-class ActorMeshRef(MeshTrait):
+class ActorMeshRef(MeshTrait, Generic[T]):
     def __init__(
         self, Class: Type[T], actor_mesh_ref: _ActorMeshRefImpl, mailbox: Mailbox
     ) -> None:
monarch/allocator.py
CHANGED
@@ -7,7 +7,8 @@
 # pyre-strict
 
 import abc
-
+import logging
+from typing import final, Optional
 
 from monarch import ActorFuture as Future
 from monarch._rust_bindings.hyperactor_extension.alloc import (  # @manual=//monarch/monarch_extension:monarch_extension
@@ -21,6 +22,10 @@ from monarch._rust_bindings.monarch_hyperactor.alloc import (  # @manual=//monar
     RemoteAllocatorBase,
 )
 
+ALLOC_LABEL_PROC_MESH_NAME = "procmesh.monarch.meta.com/name"
+
+logger: logging.Logger = logging.getLogger(__name__)
+
 
 @final
 class ProcessAllocator(ProcessAllocatorBase):
@@ -111,6 +116,87 @@ class StaticRemoteAllocInitializer(RemoteAllocInitializer):
         return list(self.addrs)
 
 
+class TorchXRemoteAllocInitializer(RemoteAllocInitializer):
+    """
+    For monarch runtimes running as a job on a supported scheduler.
+    Such runtimes are typically launched using the monarch CLI (e.g `monarch create --scheduler slurm ...`).
+
+    Returns the server addresses of a specific monarch runtime by using TorchX's status API
+    to get the hostnames of the nodes.
+    """
+
+    def __init__(
+        self,
+        server_handle: str,
+        /,
+        transport: Optional[str] = None,
+        port: Optional[int] = None,
+    ) -> None:
+        """
+        NOTE: If `transport` and `port` specified, they are used over the `transport` and `port`
+        information that is tagged as metadata on the server's job. This is useful in two specific
+        situations:
+            1) The job was NOT created wit monarch CLI (hence no metadata tags exist)
+            2) The scheduler does not support job metadata tagging
+
+        Arguments:
+        - `server_handle`: points to a monarch runtime. Of the form `{scheduler}://{namespace}/{job_id}`.
+            the `{namespace}` can be empty if not configured (e.g. `slurm:///1234` - notice the triple slashes).
+        - `transport`: the channel transport that should be used to connect to the remote process allocator address
+        - `port`: the port that the remote process allocator is running on
+
+        """
+        self.server_handle = server_handle
+        self.transport = transport
+        self.port = port
+
+    async def initialize_alloc(self, match_labels: dict[str, str]) -> list[str]:
+        # lazy import since torchx-fb is not included in `fbcode//monarch/python/monarch:monarch.whl`
+        # nor any of the base conda environments
+        from monarch.tools.commands import server_ready
+
+        mesh_name = match_labels.get(ALLOC_LABEL_PROC_MESH_NAME)
+
+        server = await server_ready(self.server_handle)
+
+        # job does not exist or it is in a terminal state (SUCCEEDED, FAILED, CANCELLED)
+        if not (server and server.is_running):
+            raise ValueError(
+                f"{self.server_handle} does not exist or is in a terminal state"
+            )
+
+        if not mesh_name:
+            logger.info(
+                "no match label `%s` specified in alloc constraints",
+                ALLOC_LABEL_PROC_MESH_NAME,
+            )
+
+            num_meshes = len(server.meshes)
+
+            if num_meshes == 1:
+                logger.info(
+                    "found a single proc mesh `%s` in %s, will allocate on it",
+                    server.meshes[0].name,
+                    self.server_handle,
+                )
+            else:
+                raise RuntimeError(
+                    f"{num_meshes} proc meshes in {self.server_handle},"
+                    f" please specify the mesh name as a match label `{ALLOC_LABEL_PROC_MESH_NAME}`"
+                    f" in allocation constraints of the alloc spec"
+                )
+            mesh = server.meshes[0]
+        else:
+            mesh = server.get_mesh_spec(mesh_name)
+
+        server_addrs = mesh.server_addrs(self.transport, self.port)
+
+        logger.info(
+            "initializing alloc on remote allocator addresses: %s", server_addrs
+        )
+        return server_addrs
+
+
 @final
 class RemoteAllocator(RemoteAllocatorBase):
     """
monarch/code_sync.py
ADDED
@@ -0,0 +1,10 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from monarch._rust_bindings.monarch_extension.code_sync import (  # noqa: F401
+    RemoteWorkspace,
+    RsyncMeshClient,
+)
monarch/debugger.py
CHANGED
@@ -11,7 +11,7 @@ from dataclasses import dataclass
 from typing import Dict, List, Tuple, Union
 
 from monarch._rust_bindings.monarch_hyperactor.proc import ActorId
-from monarch.actor_mesh import Actor, endpoint
+from monarch.actor_mesh import Actor, ActorMeshRef, endpoint
 
 from monarch.pdb_wrapper import DebuggerWrite
 
@@ -370,7 +370,9 @@ class DebugClient(Actor):
         await session.debugger_write(write)
 
 
-async def init_debugging(
+async def init_debugging(
+    actor_mesh: ActorMeshRef,
+) -> ActorMeshRef[DebugClient]:
    debugger_proc_mesh = await local_proc_mesh(gpus=1, hosts=1)
    debug_client_mesh = await debugger_proc_mesh.spawn("debug_client", DebugClient)
    await actor_mesh._set_debug_client.call(debug_client_mesh)
monarch/monarch_controller
CHANGED
Binary file
monarch/proc_mesh.py
CHANGED
@@ -6,6 +6,7 @@
 
 # pyre-strict
 
+import os
 import sys
 from contextlib import AbstractContextManager
 
@@ -27,6 +28,10 @@ if TYPE_CHECKING:
 import monarch
 from monarch import ActorFuture as Future
 
+# Conditionally import DeviceMesh and spawn_tensor_engine only if tensor_engine is available
+# pyre-ignore[21]
+from monarch._rust_bindings import has_tensor_engine
+
 from monarch._rust_bindings.hyperactor_extension.alloc import (  # @manual=//monarch/monarch_extension:monarch_extension
     Alloc,
     AllocConstraints,
@@ -37,12 +42,18 @@ from monarch._rust_bindings.monarch_hyperactor.proc_mesh import ProcMesh as HyPr
 from monarch._rust_bindings.monarch_hyperactor.shape import Shape, Slice
 from monarch.actor_mesh import _Actor, _ActorMeshRefImpl, Actor, ActorMeshRef
 
+from monarch.code_sync import RemoteWorkspace, RsyncMeshClient
 from monarch.common._device_utils import _local_device_count
-from monarch.common.device_mesh import DeviceMesh
 from monarch.common.shape import MeshTrait
-from monarch.mesh_controller import spawn_tensor_engine
 from monarch.rdma import RDMAManager
 
+if has_tensor_engine():
+    from monarch.common.device_mesh import DeviceMesh
+    from monarch.mesh_controller import spawn_tensor_engine
+else:
+    DeviceMesh = None
+    spawn_tensor_engine = None
+
 T = TypeVar("T")
 try:
     from __manifest__ import fbmake  # noqa
@@ -71,6 +82,7 @@ class ProcMesh(MeshTrait):
         self._mock_shape: Optional[Shape] = _mock_shape
         self._mailbox: Mailbox = self._proc_mesh.client
         self._rdma_manager: Optional[RDMAManager] = None
+        self._rsync_mesh_client: Optional[RsyncMeshClient] = None
         self._maybe_device_mesh: Optional[DeviceMesh] = _device_mesh
         if _mock_shape is None:
             self._rdma_manager = self._spawn_blocking("rdma_manager", RDMAManager)
@@ -95,7 +107,9 @@ class ProcMesh(MeshTrait):
         )
         return ProcMesh(self._proc_mesh, _mock_shape=shape, _device_mesh=device_mesh)
 
-    def spawn(
+    def spawn(
+        self, name: str, Class: Type[T], *args: Any, **kwargs: Any
+    ) -> Future[ActorMeshRef[T]]:
         if self._mock_shape is not None:
             raise NotImplementedError("NYI: spawn on slice of a proc mesh.")
         return Future(
@@ -156,6 +170,10 @@ class ProcMesh(MeshTrait):
 
     @property
     def _device_mesh(self) -> "DeviceMesh":
+        if spawn_tensor_engine is None:
+            raise RuntimeError(
+                "DeviceMesh is not available because tensor_engine was not compiled (USE_TENSOR_ENGINE=0)"
+            )
         if self._maybe_device_mesh is None:
             if self._mock_shape is not None:
                 raise NotImplementedError(
@@ -174,6 +192,28 @@ class ProcMesh(MeshTrait):
     def rank_tensors(self) -> Dict[str, "torch.Tensor"]:
         return self._device_mesh.ranks
 
+    async def sync_workspace(self) -> None:
+        if self._rsync_mesh_client is None:
+            # TODO(agallagher): We need some way to configure and pass this
+            # in -- right now we're assuming the `gpu` dimension, which isn't
+            # correct.
+            assert set(self._proc_mesh.shape.labels).issubset({"gpus", "hosts"})
+            # The workspace shape (i.e. only perform one rsync per host).
+            workspace_shape = self.slice(gpus=slice(0, 1, 1))._mock_shape
+            assert workspace_shape is not None
+            # TODO(agallagher): We should probably hide this behind something
+            # like a `Workspace` class and support abstracting/configuring
+            # different sync methods.
+            self._rsync_mesh_client = RsyncMeshClient.spawn_blocking(
+                proc_mesh=self._proc_mesh,
+                shape=workspace_shape,
+                # TODO(agallagher): Is there a better way to infer/set the local
+                # workspace dir, rather than use PWD?
+                local_workspace=os.getcwd(),
+                remote_workspace=RemoteWorkspace.FromEnvVar("WORKSPACE_DIR"),
+            )
+        await self._rsync_mesh_client.sync_workspace()
+
 
 async def local_proc_mesh_nonblocking(
     *, gpus: Optional[int] = None, hosts: int = 1
monarch/tools/mesh_spec.py
CHANGED
@@ -9,6 +9,7 @@ import string
 from dataclasses import dataclass, field
 from typing import Any, Optional
 
+from monarch.tools.network import get_sockaddr
 from torchx import specs
 
 DEFAULT_REMOTE_ALLOCATOR_PORT = 26600
@@ -16,6 +17,10 @@ DEFAULT_REMOTE_ALLOCATOR_PORT = 26600
 _TAG_MESHES_PREFIX = "monarch/meshes/${mesh_name}/"
 _TAG_HOST_TYPE: str = _TAG_MESHES_PREFIX + "host_type"
 _TAG_GPUS: str = _TAG_MESHES_PREFIX + "gpus"
+_TAG_TRANSPORT: str = _TAG_MESHES_PREFIX + "transport"
+
+_UNSET_INT = -1
+_UNSET_STR = "__UNSET__"
 
 
 @dataclass
@@ -26,11 +31,38 @@ class MeshSpec:
 
     name: str
     num_hosts: int
-    host_type: str
-    gpus: int
+    host_type: str = _UNSET_STR
+    gpus: int = _UNSET_INT
+    # NOTE: using str over monarch._rust_bindings.monarch_hyperactor.channel.ChannelTransport enum
+    # b/c the rust binding doesn't have Python enum semantics, hence doesn't serialize well
+    transport: str = "tcp"
     port: int = DEFAULT_REMOTE_ALLOCATOR_PORT
     hostnames: list[str] = field(default_factory=list)
 
+    def server_addrs(
+        self, transport: Optional[str] = None, port: Optional[int] = None
+    ) -> list[str]:
+        """
+        Returns the hostnames (servers) in channel address format.
+        `transport` and `port` is typically taken from this mesh spec's fields, but
+        the caller can override them when calling this function.
+        """
+
+        transport = transport or self.transport
+        port = port or self.port
+
+        if transport == "tcp":
+            # need to resolve hostnames to ip address for TCP
+            return [
+                f"tcp!{get_sockaddr(hostname, port)}" for hostname in self.hostnames
+            ]
+        elif transport == "metatls":
+            return [f"metatls!{hostname}:{port}" for hostname in self.hostnames]
+        else:
+            raise ValueError(
+                f"Unsupported transport: {transport}. Must be one of: 'tcp' or 'metatls'"
+            )
+
 
 def _tag(mesh_name: str, tag_template: str) -> str:
     return string.Template(tag_template).substitute(mesh_name=mesh_name)
@@ -39,6 +71,7 @@ def _tag(mesh_name: str, tag_template: str) -> str:
 def tag_as_metadata(mesh_spec: MeshSpec, appdef: specs.AppDef) -> None:
     appdef.metadata[_tag(mesh_spec.name, _TAG_HOST_TYPE)] = mesh_spec.host_type
     appdef.metadata[_tag(mesh_spec.name, _TAG_GPUS)] = str(mesh_spec.gpus)
+    appdef.metadata[_tag(mesh_spec.name, _TAG_TRANSPORT)] = mesh_spec.transport
 
 
 def mesh_spec_from_metadata(appdef: specs.AppDef, mesh_name: str) -> Optional[MeshSpec]:
@@ -47,8 +80,13 @@ def mesh_spec_from_metadata(appdef: specs.AppDef, mesh_name: str) -> Optional[Me
     return MeshSpec(
         name=mesh_name,
         num_hosts=role.num_replicas,
-        host_type=appdef.metadata.get(
-
+        host_type=appdef.metadata.get(
+            _tag(mesh_name, _TAG_HOST_TYPE), _UNSET_STR
+        ),
+        gpus=int(
+            appdef.metadata.get(_tag(mesh_name, _TAG_GPUS), str(_UNSET_INT))
+        ),
+        transport=appdef.metadata.get(_tag(mesh_name, _TAG_TRANSPORT), "tcp"),
         port=role.port_map.get("mesh", DEFAULT_REMOTE_ALLOCATOR_PORT),
     )
 
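A small illustration of the new MeshSpec.server_addrs() helper and how each transport is formatted; the hostname list is hypothetical.

from monarch.tools.mesh_spec import MeshSpec

spec = MeshSpec(name="trainer", num_hosts=1, hostnames=["localhost"])

# Default transport is "tcp": each hostname is resolved to an IP via
# get_sockaddr, e.g. ["tcp![::1]:26600"] or ["tcp!127.0.0.1:26600"].
print(spec.server_addrs())

# "metatls" keeps the hostname and only appends the port:
# ["metatls!localhost:26600"]
print(spec.server_addrs(transport="metatls"))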
monarch/tools/network.py
CHANGED
@@ -12,51 +12,58 @@ from typing import Optional
 logger: logging.Logger = logging.getLogger(__name__)
 
 
-def
-    """
+def get_sockaddr(hostname: str, port: int) -> str:
+    """Returns either an IPv6 or IPv4 socket address (that supports TCP) of the given hostname and port.
+    The socket address is of the form:
+    1. `{ipv4.address}:{port}` (e.g. `127.0.0.1:8080`)
+    2. `[{ipv6:address}]:{port}` (e.g. `[::1]:8080`)
 
-
-    `SOCK_STREAM` (TCP)
-    to resolving an ipv4 `SOCK_STREAM` address.
+    The hostname is resolved to an IPv6 (or IPv4 if IPv6 is not available on the host) address that
+    supports `SOCK_STREAM` (TCP).
 
     Raises a `RuntimeError` if neither ipv6 or ipv4 ip can be resolved from hostname.
     """
 
-    def
+    def resolve_sockaddr(family: socket.AddressFamily) -> Optional[str]:
         try:
             # patternlint-disable-next-line python-dns-deps (only used for oss)
-            addrs = socket.getaddrinfo(
-                hostname, port=None, family=family, type=socket.SOCK_STREAM
-            )  # tcp
+            addrs = socket.getaddrinfo(hostname, port, family, type=socket.SOCK_STREAM)
             if addrs:
-
-                _, _, _, _, sockaddr = addrs[0]  # use the first address
+                family, _, _, _, sockaddr = addrs[0]  # use the first address
 
-                # sockaddr is a tuple (ipv4) or a 4-tuple (ipv6)
+                # sockaddr is a tuple (ipv4) or a 4-tuple (ipv6)
+                # in both cases the first element is the ip addr
                 ipaddr = str(sockaddr[0])
 
+                if family == socket.AF_INET6:
+                    socket_address = f"[{ipaddr}]:{port}"
+                else:  # socket.AF_INET
+                    socket_address = f"{ipaddr}:{port}"
+
                 logger.info(
-                    "
+                    "resolved %s address `%s` for `%s:%d`",
                     family.name,
-
+                    socket_address,
                     hostname,
+                    port,
                 )
-
-
-                return None
+
+                return socket_address
         except socket.gaierror as e:
             logger.info(
-                "
+                "no %s address that can bind TCP sockets for `%s:%d` (error: %s)",
                 family.name,
                 hostname,
+                port,
                 e,
             )
-
-
-
-
-
-
-
-
-
+            return None
+
+    for family in [socket.AF_INET6, socket.AF_INET]:
+        if ipaddr := resolve_sockaddr(family):
+            return ipaddr
+
+    raise RuntimeError(
+        f"Unable to resolve `{hostname}` to ipv6 or ipv4 address that can bind TCP socket."
+        " Check the network configuration on the host."
+    )
tests/test_allocator.py
CHANGED
@@ -14,9 +14,11 @@ import subprocess
 import sys
 import unittest
 from datetime import timedelta
-from typing import Generator
+from typing import Generator, Optional
+from unittest import mock
 
 import cloudpickle
+import pytest
 
 import torch
 import torch.distributed as dist
@@ -26,20 +28,28 @@ from monarch._rust_bindings.hyperactor_extension.alloc import (
     AllocConstraints,
     AllocSpec,
 )
-
 from monarch._rust_bindings.monarch_hyperactor.channel import (
     ChannelAddr,
     ChannelTransport,
 )
 from monarch.actor_mesh import Actor, current_rank, current_size, endpoint, ValueMesh
-
-
+from monarch.allocator import (
+    ALLOC_LABEL_PROC_MESH_NAME,
+    RemoteAllocator,
+    StaticRemoteAllocInitializer,
+    TorchXRemoteAllocInitializer,
+)
 from monarch.proc_mesh import ProcMesh
+from monarch.tools.mesh_spec import MeshSpec, ServerSpec
+from monarch.tools.network import get_sockaddr
 
 from torch.distributed.elastic.utils.distributed import get_free_port
+from torchx.specs import AppState
 
 _100_MILLISECONDS = timedelta(milliseconds=100)
 
+SERVER_READY = "monarch.tools.commands.server_ready"
+
 
 class TestActor(Actor):
     """Silly actor that computes the world size by all-reducing rank-hot tensors"""
@@ -63,9 +73,9 @@ class TestActor(Actor):
 
 
 @contextlib.contextmanager
-def remote_process_allocator() -> Generator[str, None, None]:
+def remote_process_allocator(addr: Optional[str] = None) -> Generator[str, None, None]:
     with importlib.resources.path(__package__, "") as package_path:
-        addr = ChannelAddr.any(ChannelTransport.Unix)
+        addr = addr or ChannelAddr.any(ChannelTransport.Unix)
 
         process_allocator = subprocess.Popen(
             args=[
@@ -215,3 +225,141 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
 
         self.assert_computed_world_size(results_a, 2)  # a is a 1x2 mesh
         self.assert_computed_world_size(results_b, 6)  # b is a 1x6 mesh
+
+    async def test_torchx_remote_alloc_initializer_no_server(self) -> None:
+        with mock.patch(SERVER_READY, return_value=None):
+            initializer = TorchXRemoteAllocInitializer("slurm:///123")
+            allocator = RemoteAllocator(world_id="test", initializer=initializer)
+
+            with self.assertRaisesRegex(
+                RuntimeError,
+                r"slurm:///123 does not exist or is in a terminal state",
+            ):
+                await allocator.allocate(AllocSpec(AllocConstraints(), host=1, gpu=1))
+
+    async def test_torchx_remote_alloc_initializer_no_match_label_gt_1_meshes(
+        self,
+    ) -> None:
+        # asserts that an exception is raised if no match label is specified in alloc constraints
+        # but there are more than 1 mesh (hence ambiguous which mesh to allocate on)
+
+        server = ServerSpec(
+            name="__UNUSED__",
+            state=AppState.RUNNING,
+            meshes=[MeshSpec(name="x", num_hosts=1), MeshSpec(name="y", num_hosts=1)],
+        )
+
+        with mock.patch(SERVER_READY, return_value=server):
+            initializer = TorchXRemoteAllocInitializer("slurm:///123")
+            allocator = RemoteAllocator(world_id="test", initializer=initializer)
+
+            with self.assertRaisesRegex(
+                RuntimeError,
+                r"2 proc meshes in slurm:///123, please specify the mesh name as a match label `procmesh.monarch.meta.com/name`",
+            ):
+                await allocator.allocate(AllocSpec(AllocConstraints(), host=1, gpu=1))
+
+    @pytest.mark.oss_skip  # pyre-ignore[56] TODO T228752279
+    async def test_torchx_remote_alloc_initializer_no_match_label_1_mesh(self) -> None:
+        server = ServerSpec(
+            name="__UNUSED__",
+            state=AppState.RUNNING,
+            meshes=[
+                MeshSpec(
+                    name="x",
+                    num_hosts=1,
+                    transport="tcp",
+                    hostnames=["localhost"],
+                )
+            ],
+        )
+        port = get_free_port()
+        with remote_process_allocator(addr=f"tcp!{get_sockaddr('localhost', port)}"):
+            with mock.patch(SERVER_READY, return_value=server):
+                initializer = TorchXRemoteAllocInitializer("local:///test", port=port)
+                allocator = RemoteAllocator(
+                    world_id="test",
+                    initializer=initializer,
+                    heartbeat_interval=_100_MILLISECONDS,
+                )
+                alloc = await allocator.allocate(
+                    AllocSpec(AllocConstraints(), host=1, gpu=4)
+                )
+                proc_mesh = await ProcMesh.from_alloc(alloc)
+                actor = await proc_mesh.spawn("test_actor", TestActor)
+                results = await actor.compute_world_size.call(
+                    master_addr="0.0.0.0", master_port=get_free_port()
+                )
+                self.assert_computed_world_size(results, 4)  # 1x4 mesh
+
+    @pytest.mark.oss_skip  # pyre-ignore[56] TODO T228752279
+    async def test_torchx_remote_alloc_initializer_with_match_label(self) -> None:
+        server = ServerSpec(
+            name="__UNUSED__",
+            state=AppState.RUNNING,
+            meshes=[
+                MeshSpec(
+                    name="x",
+                    num_hosts=1,
+                    transport="tcp",
+                    hostnames=["localhost"],
+                )
+            ],
+        )
+        port = get_free_port()
+        with remote_process_allocator(addr=f"tcp!{get_sockaddr('localhost', port)}"):
+            with mock.patch(SERVER_READY, return_value=server):
+                initializer = TorchXRemoteAllocInitializer("local:///test", port=port)
+                allocator = RemoteAllocator(
+                    world_id="test",
+                    initializer=initializer,
+                    heartbeat_interval=_100_MILLISECONDS,
+                )
+                alloc = await allocator.allocate(
+                    AllocSpec(
+                        AllocConstraints(
+                            match_labels={ALLOC_LABEL_PROC_MESH_NAME: "x"}
+                        ),
+                        host=1,
+                        gpu=3,
+                    )
+                )
+                proc_mesh = await ProcMesh.from_alloc(alloc)
+                actor = await proc_mesh.spawn("test_actor", TestActor)
+                results = await actor.compute_world_size.call(
+                    master_addr="0.0.0.0", master_port=get_free_port()
+                )
+                self.assert_computed_world_size(results, 3)  # 1x3 mesh
+
+    async def test_torchx_remote_alloc_initializer_with_match_label_no_match(
+        self,
+    ) -> None:
+        # assert that match label with a mesh name that does not exist should error out
+
+        server = ServerSpec(
+            name="test",
+            state=AppState.RUNNING,
+            meshes=[
+                MeshSpec(
+                    name="x",
+                    num_hosts=1,
+                    transport="tcp",
+                    hostnames=["localhost"],
+                )
+            ],
+        )
+
+        with mock.patch(SERVER_READY, return_value=server):
+            with self.assertRaisesRegex(RuntimeError, r"'y' not found in job: test"):
+                initializer = TorchXRemoteAllocInitializer("local:///test")
+                allocator = RemoteAllocator(world_id="test", initializer=initializer)
+                alloc = await allocator.allocate(
+                    AllocSpec(
+                        AllocConstraints(
+                            match_labels={ALLOC_LABEL_PROC_MESH_NAME: "y"}
+                        ),
+                        host=1,
+                        gpu=1,
+                    )
+                )
+                await ProcMesh.from_alloc(alloc)
tests/test_python_actors.py
CHANGED
@@ -6,7 +6,6 @@
 
 import asyncio
 import operator
-import os
 import re
 import threading
 import time
@@ -31,11 +30,14 @@ from monarch.actor_mesh import (
 from monarch.debugger import init_debugging
 from monarch.future import ActorFuture
 
-from monarch.mesh_controller import spawn_tensor_engine
-
 from monarch.proc_mesh import local_proc_mesh, proc_mesh
 from monarch.rdma import RDMABuffer
 
+needs_cuda = pytest.mark.skipif(
+    not torch.cuda.is_available(),
+    reason="CUDA not available",
+)
+
 
 class Counter(Actor):
     def __init__(self, v: int):
@@ -116,6 +118,7 @@ class ParameterClient(Actor):
         return self.buffer
 
 
+@needs_cuda
 async def test_proc_mesh_rdma():
     proc = await proc_mesh(gpus=1)
     server = await proc.spawn("server", ParameterServer)
@@ -284,6 +287,7 @@ class GeneratorActor(Actor):
         ), f"{torch.sum(self.generator.weight.data)=}, {self.step=}"
 
 
+@needs_cuda
 async def test_gpu_trainer_generator():
     trainer_proc = await proc_mesh(gpus=1)
    gen_proc = await proc_mesh(gpus=1)
@@ -313,6 +317,7 @@ async def test_sync_actor():
     assert r == 5
 
 
+@needs_cuda
 def test_gpu_trainer_generator_sync() -> None:
     trainer_proc = proc_mesh(gpus=1).get()
     gen_proc = proc_mesh(gpus=1).get()
@@ -403,30 +408,6 @@ def test_proc_mesh_liveness() -> None:
     counter.value.call().get()
 
 
-two_gpu = pytest.mark.skipif(
-    torch.cuda.device_count() < 2,
-    reason="Not enough GPUs, this test requires at least 2 GPUs",
-)
-
-
-@two_gpu
-def test_tensor_engine() -> None:
-    pm = proc_mesh(gpus=2).get()
-
-    dm = spawn_tensor_engine(pm)
-    with dm.activate():
-        r = monarch.inspect(2 * torch.zeros(3, 4))
-
-    fm = dm.flatten("all")
-    with fm.activate():
-        f = monarch.inspect(2 * torch.zeros(3, 4), all=1)
-
-    assert torch.allclose(torch.zeros(3, 4), r)
-    assert torch.allclose(torch.zeros(3, 4), f)
-
-    dm.exit()
-
-
 def _debugee_actor_internal(rank):
     if rank == 0:
         breakpoint()  # noqa
@@ -632,23 +613,6 @@ async def test_actor_tls_full_sync() -> None:
     assert 4 == await am.get.call_one()
 
 
-@two_gpu
-def test_proc_mesh_tensor_engine() -> None:
-    pm = proc_mesh(gpus=2).get()
-    with pm.activate():
-        f = 10 * pm.rank_tensor("gpus").cuda()
-        a = monarch.inspect(f, hosts=0, gpus=0)
-        b = monarch.inspect(f, hosts=0, gpus=1)
-
-    one = pm.slice(gpus=1)
-    with one.activate():
-        sliced_b = monarch.slice_mesh(f, gpus=1).to_mesh(one)
-        c = monarch.inspect(sliced_b * 10)
-    assert a == 0
-    assert b == 10
-    assert c == 100
-
-
 class AsyncActor(Actor):
     def __init__(self):
         self.should_exit = False
tests/test_tensor_engine.py
ADDED
@@ -0,0 +1,52 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import monarch
+import pytest
+import torch
+from monarch.mesh_controller import spawn_tensor_engine
+from monarch.proc_mesh import proc_mesh
+
+
+two_gpu = pytest.mark.skipif(
+    torch.cuda.device_count() < 2,
+    reason="Not enough GPUs, this test requires at least 2 GPUs",
+)
+
+
+@two_gpu
+def test_tensor_engine() -> None:
+    pm = proc_mesh(gpus=2).get()
+
+    dm = spawn_tensor_engine(pm)
+    with dm.activate():
+        r = monarch.inspect(2 * torch.zeros(3, 4))
+
+    fm = dm.flatten("all")
+    with fm.activate():
+        f = monarch.inspect(2 * torch.zeros(3, 4), all=1)
+
+    assert torch.allclose(torch.zeros(3, 4), r)
+    assert torch.allclose(torch.zeros(3, 4), f)
+
+    dm.exit()
+
+
+@two_gpu
+def test_proc_mesh_tensor_engine() -> None:
+    pm = proc_mesh(gpus=2).get()
+    with pm.activate():
+        f = 10 * pm.rank_tensor("gpus").cuda()
+        a = monarch.inspect(f, hosts=0, gpus=0)
+        b = monarch.inspect(f, hosts=0, gpus=1)
+
+    one = pm.slice(gpus=1)
+    with one.activate():
+        sliced_b = monarch.slice_mesh(f, gpus=1).to_mesh(one)
+        c = monarch.inspect(sliced_b * 10)
+    assert a == 0
+    assert b == 10
+    assert c == 100
{torchmonarch_nightly-2025.6.20.dist-info → torchmonarch_nightly-2025.6.27.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: torchmonarch-nightly
-Version: 2025.6.
+Version: 2025.6.27
 Summary: Monarch: Single controller library
 Author: Meta
 Author-email: oncall+monarch@xmail.facebook.com
@@ -42,7 +42,7 @@ Note: Monarch is currently only supported on Linux systems
 
 ## Installation
 
-`pip install torchmonarch`
+`pip install torchmonarch-nightly`
 
 or manually
 
{torchmonarch_nightly-2025.6.20.dist-info → torchmonarch_nightly-2025.6.27.dist-info}/RECORD
RENAMED
@@ -1,22 +1,23 @@
 monarch/__init__.py,sha256=iUvWHc0-7Q2tovRoRxOIiA3TsefMXCbWl-jEfQ2djew,6897
-monarch/_rust_bindings.so,sha256=
+monarch/_rust_bindings.so,sha256=SCTdGchlMLPZEiF4SNSbLSczRY7ZC3f7t0e-YZHGNDk,43327072
 monarch/_testing.py,sha256=jOIOG6jcZBzvEvG_DwSnwCkaMVXvSun6sJAG6nXemww,7859
-monarch/actor_mesh.py,sha256=
-monarch/allocator.py,sha256=
+monarch/actor_mesh.py,sha256=QqKHVTJk9H_I-v7GoxgOdOL8-ymnRpGvNFdda0-cNrE,24534
+monarch/allocator.py,sha256=l0_mN43AH3K2aCchb5fk8ml95rvdgR31nRC_PqRmZWg,7865
 monarch/bootstrap_main.py,sha256=RCUQhJk07yMFiKp6HzQuqZFUpkgsT9kVEyimiwjn6_E,1827
 monarch/cached_remote_function.py,sha256=kYdB6r4OHx_T_uX4q3tCNcp1t2DJwF8tPTIahUiT2pU,8785
-monarch/
+monarch/code_sync.py,sha256=SIqXx-zAKx60s2LbS_e9XSSlE1YSEo75vE05tMrOyYo,332
+monarch/debugger.py,sha256=AizU8MWBdloe0wj1ysxlOXmUhCwGoShVH_xGfVBCQjs,13354
 monarch/fetch.py,sha256=61jxo7sx4QNUTkc0_rF5NaJROen4tKbAaiIjrXWLOvg,1705
 monarch/future.py,sha256=g1VYJl8ReBBS6VbikwWilnFqEr5qJDiSKid92AnWFV4,2058
 monarch/gradient_generator.py,sha256=Rl3dmXGceTdCc1mYBg2JciR88ywGPnW7TVkL86KwqEA,6366
 monarch/memory.py,sha256=ol86dBhFAJqg78iF25-BuK0wuwj1onR8FIioZ_B0gjw,1377
 monarch/mesh_controller.py,sha256=am1QP7dvn0OH1z9ADSKm41APs1HY_dHcBAhOVP-QDmE,10427
-monarch/monarch_controller,sha256=
+monarch/monarch_controller,sha256=Vr5ym1QWSWyd02YCd5q8tC9X_V-ony1v7v-pFfrXVQA,21664144
 monarch/notebook.py,sha256=zu9MKDFKf1-rCM2TqFSRJjMBeiWuKcJSyUFLvoZRQzs,25949
 monarch/opaque_module.py,sha256=oajOu_WD1hD4hxE8HDdO-tvWY7KDHWd7VaAhJEa5L2I,10446
 monarch/opaque_object.py,sha256=IVpll4pyuKZMo_EnPh4s0qnx8RlAcJrJ1yoLX6E75wQ,2782
 monarch/pdb_wrapper.py,sha256=gm46AZnfR4amH1vYFWnWivEv5MaU3Nb6KIWjSM8KjWM,4052
-monarch/proc_mesh.py,sha256=
+monarch/proc_mesh.py,sha256=ZnNWjINoFTdkRVbu_ikos2jV4Ham-I9jqeWdEN-1ZtQ,10436
 monarch/profiler.py,sha256=TQ9fnVM8H7smBWtYdB_6Irtzz8DBOmcp7U1T3wlUmco,4911
 monarch/python_local_mesh.py,sha256=YsureIzR9uGlNVrKd4vRghxOXBeYabkt9lICRErfRAI,3536
 monarch/random.py,sha256=f9QR7Esu4Vxqxs-KCf5QYyVqlWvXJ3-UtG90L_h4j40,1527
@@ -108,8 +109,8 @@ monarch/timer/execution_timer_test.py,sha256=CSxTv44fFZQURJlCBmYvysQI1aS_zEGZs_u
 monarch/tools/__init__.py,sha256=J8qjUOysmcMAek2KFN13mViOXZxTYc5vCrF02t3VuFU,223
 monarch/tools/cli.py,sha256=EIdarsfuFX0WqRCe29_5GNKWJBhxx0lABalw3zPSagw,4977
 monarch/tools/commands.py,sha256=OuFDVAcl5LvBdBZ-HyemErR0IiDtiMMNgmGPD4MWTHY,8996
-monarch/tools/mesh_spec.py,sha256=
-monarch/tools/network.py,sha256=
+monarch/tools/mesh_spec.py,sha256=gj3p4fqLOVAnkrCcE0gY8tGhGBNi1Eu3KpQv5xzWCZ0,5484
+monarch/tools/network.py,sha256=mN8Fx9mervxM3VdFHRn4ZXt4z7yWxZp52BTxx2tfpus,2455
 monarch/tools/components/__init__.py,sha256=J8qjUOysmcMAek2KFN13mViOXZxTYc5vCrF02t3VuFU,223
 monarch/tools/components/hyperactor.py,sha256=Ryi1X07VLcaQVlpc4af65JNBbZtOb9IAlKxSKMZ1AW4,2120
 monarch/tools/config/__init__.py,sha256=OPSflEmJB2zxAaRVzzWSWXV5M5vlknLgpulGdW1ze5U,510
@@ -136,7 +137,7 @@ tests/error_test_binary.py,sha256=BRj13wAROsUWx4jcxc07HYN2n-xyBNhnnRAhjqah-A0,55
 tests/sleep_binary.py,sha256=XfLYaAfwm9xgzM-svs8fhAeFhwYIg6SyVEnx4e6wbUw,1009
 tests/test_actor_error.py,sha256=-0UJCEpyzsBh-RdbGhDiG1-sRtu7bJPQWmtjUD0ad48,8526
 tests/test_alloc.py,sha256=D6DdQbtOZEvvnnc7LV-WyWFMk0Xb77eblH6Oz90zJTA,745
-tests/test_allocator.py,sha256=
+tests/test_allocator.py,sha256=c7b4ylEjFV2WDhB8fbWiDuGi-vrBeD1E0Rpu-efrSVQ,14478
 tests/test_coalescing.py,sha256=JZ4YgQNlWWs7N-Z8KCCXQPANcuyyXEKjeHIXYbPnQhk,15606
 tests/test_controller.py,sha256=Rp_kW20zYT8ocsK5LX0Ha3LB9azS2LSKpp8n_dBlzVU,31384
 tests/test_device_mesh.py,sha256=DrbezYOM0thfP9MgLXb5-F0VoLOmSz5GR0GwjR_3bE4,5290
@@ -145,19 +146,20 @@ tests/test_future.py,sha256=cXzaNi2YDwVyjR541ScXmgktX1YFsKzbl8wep0DMVbk,3032
 tests/test_grad_generator.py,sha256=p4Pm4kMEeGldt2jUVAkGKCB0mLccKI28pltH6OTGbQA,3412
 tests/test_mock_cuda.py,sha256=5hisElxeLJ5MHw3KM9gwxBiXiMaG-Rm382u3AsQcDOI,3068
 tests/test_pdb_actor.py,sha256=5KJhuhcZDPWMdjC6eAtDdwnz1W7jNFXvIrMSFaCWaPw,3858
-tests/test_python_actors.py,sha256=
+tests/test_python_actors.py,sha256=0kF3LQpvPnAqT6xbNaBQxaG3gsMyBzzM4Ou7om9ZhoE,20069
 tests/test_remote_functions.py,sha256=5nxYB8dfA9NT9f9Od9O3htgQtPbiRNiXZ1Kgtn75sOQ,50056
 tests/test_rust_backend.py,sha256=94S3R995ZkyIhEiBsM5flcjf5X7bscEAHBtInbTRFe8,7776
 tests/test_signal_safe_block_on.py,sha256=bmal0XgzJowZXJV6T1Blow5a-vZluYWusCThLMGxyTE,3336
 tests/test_sim_backend.py,sha256=RckCkHO3DxKsAGdZMcIzRnd6YJXwDim1D5-xbBbgKio,1473
+tests/test_tensor_engine.py,sha256=ZYQlr77d1txMfQ4w7qqyCLhHGRwt57bsHs6E9oAd7SQ,1361
 tests/simulator/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/simulator/test_profiling.py,sha256=TGYCfzTLdkpIwnOuO6KApprmrgPIRQe60KRX3wkB0sg,4565
 tests/simulator/test_simulator.py,sha256=LO8lA0ssY-OGEBL5ipEu74f97Y765TEwfUOv-DtIptM,14568
 tests/simulator/test_task.py,sha256=ipqBDuDAysuo1xOB9S5psaFvwe6VATD43IovCTSs0t4,2327
 tests/simulator/test_worker.py,sha256=QrWWIJ3HDgDLkBPRc2mwYPlOQoXQcj1qRfc0WUfKkFY,3507
-torchmonarch_nightly-2025.6.
-torchmonarch_nightly-2025.6.
-torchmonarch_nightly-2025.6.
-torchmonarch_nightly-2025.6.
-torchmonarch_nightly-2025.6.
-torchmonarch_nightly-2025.6.
+torchmonarch_nightly-2025.6.27.dist-info/licenses/LICENSE,sha256=e0Eotbf_rHOYPuEUlppIbvwy4SN98CZnl_hqwvbDA4Q,1530
+torchmonarch_nightly-2025.6.27.dist-info/METADATA,sha256=0PKqq2myfJJjhPa9nAZVJCp4vymD0dBmC1w-RmRKgYI,2780
+torchmonarch_nightly-2025.6.27.dist-info/WHEEL,sha256=_wZSFk0d90K9wOBp8Q-UGxshyiJ987JoPiyUBNC6VLk,104
+torchmonarch_nightly-2025.6.27.dist-info/entry_points.txt,sha256=sqfQ16oZqjEvttUI-uj9BBXIIE6jt05bYFSmy-2hyXI,106
+torchmonarch_nightly-2025.6.27.dist-info/top_level.txt,sha256=E-ZssZzyM17glpVrh-S9--qJ-w9p2EjuYOuNw9tQ4Eg,33
+torchmonarch_nightly-2025.6.27.dist-info/RECORD,,
{torchmonarch_nightly-2025.6.20.dist-info → torchmonarch_nightly-2025.6.27.dist-info}/WHEEL
RENAMED
File without changes
{torchmonarch_nightly-2025.6.20.dist-info → torchmonarch_nightly-2025.6.27.dist-info}/entry_points.txt
RENAMED
File without changes
{torchmonarch_nightly-2025.6.20.dist-info → torchmonarch_nightly-2025.6.27.dist-info}/licenses/LICENSE
RENAMED
File without changes
{torchmonarch_nightly-2025.6.20.dist-info → torchmonarch_nightly-2025.6.27.dist-info}/top_level.txt
RENAMED
File without changes