torchmonarch-nightly 2025.6.30__cp312-cp312-manylinux2014_x86_64.whl → 2025.7.25__cp312-cp312-manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- monarch/__init__.py +13 -9
- monarch/_rust_bindings.so +0 -0
- monarch/{_monarch/selection → _src/actor}/__init__.py +3 -7
- monarch/_src/actor/actor_mesh.py +874 -0
- monarch/{allocator.py → _src/actor/allocator.py} +26 -17
- monarch/_src/actor/bootstrap_main.py +73 -0
- monarch/{code_sync.py → _src/actor/code_sync/__init__.py} +3 -1
- monarch/_src/actor/code_sync/auto_reload.py +223 -0
- monarch/_src/actor/debugger.py +565 -0
- monarch/_src/actor/endpoint.py +270 -0
- monarch/_src/actor/event_loop.py +97 -0
- monarch/_src/actor/future.py +100 -0
- monarch/{pdb_wrapper.py → _src/actor/pdb_wrapper.py} +47 -46
- monarch/{common/pickle_flatten.py → _src/actor/pickle.py} +26 -2
- monarch/_src/actor/proc_mesh.py +500 -0
- monarch/_src/actor/sync_state.py +18 -0
- monarch/{telemetry.py → _src/actor/telemetry/__init__.py} +1 -1
- monarch/_src/actor/telemetry/rust_span_tracing.py +159 -0
- monarch/_src/actor/tensor_engine_shim.py +56 -0
- monarch/_src/tensor_engine/rdma.py +180 -0
- monarch/_testing.py +3 -2
- monarch/actor/__init__.py +51 -0
- monarch/actor_mesh.py +6 -752
- monarch/bootstrap_main.py +8 -47
- monarch/common/client.py +1 -1
- monarch/common/controller_api.py +2 -1
- monarch/common/device_mesh.py +12 -2
- monarch/common/messages.py +12 -1
- monarch/common/recording.py +4 -3
- monarch/common/remote.py +135 -52
- monarch/common/tensor.py +2 -1
- monarch/controller/backend.py +2 -2
- monarch/controller/controller.py +2 -1
- monarch/controller/rust_backend/controller.py +2 -1
- monarch/fetch.py +3 -5
- monarch/mesh_controller.py +201 -139
- monarch/monarch_controller +0 -0
- monarch/opaque_module.py +4 -6
- monarch/opaque_object.py +3 -3
- monarch/proc_mesh.py +6 -309
- monarch/python_local_mesh.py +1 -1
- monarch/rust_backend_mesh.py +2 -1
- monarch/rust_local_mesh.py +4 -2
- monarch/sim_mesh.py +10 -19
- monarch/simulator/command_history.py +1 -1
- monarch/simulator/interface.py +2 -1
- monarch/simulator/mock_controller.py +1 -1
- monarch/simulator/simulator.py +1 -1
- monarch/tensor_engine/__init__.py +23 -0
- monarch/tensor_worker_main.py +3 -1
- monarch/tools/cli.py +3 -1
- monarch/tools/commands.py +95 -35
- monarch/tools/mesh_spec.py +55 -0
- monarch/tools/utils.py +38 -0
- monarch/worker/worker.py +1 -1
- monarch/world_mesh.py +2 -1
- monarch_supervisor/python_executable.py +6 -3
- tests/error_test_binary.py +75 -9
- tests/test_actor_error.py +370 -21
- tests/test_alloc.py +1 -1
- tests/test_allocator.py +373 -17
- tests/test_controller.py +2 -0
- tests/test_debugger.py +416 -0
- tests/test_env_before_cuda.py +162 -0
- tests/test_python_actors.py +184 -332
- tests/test_rdma.py +198 -0
- tests/test_remote_functions.py +40 -12
- tests/test_rust_backend.py +7 -5
- tests/test_sim_backend.py +1 -4
- tests/test_tensor_engine.py +55 -1
- {torchmonarch_nightly-2025.6.30.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/METADATA +6 -1
- {torchmonarch_nightly-2025.6.30.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/RECORD +80 -68
- torchmonarch_nightly-2025.7.25.dist-info/entry_points.txt +3 -0
- monarch/_monarch/hyperactor/__init__.py +0 -58
- monarch/_monarch/worker/debugger.py +0 -117
- monarch/_monarch/worker/logging.py +0 -107
- monarch/debugger.py +0 -379
- monarch/future.py +0 -76
- monarch/rdma.py +0 -162
- torchmonarch_nightly-2025.6.30.dist-info/entry_points.txt +0 -3
- /monarch/{_monarch/worker → _src}/__init__.py +0 -0
- /monarch/{common/_device_utils.py → _src/actor/device_utils.py} +0 -0
- /monarch/{common → _src/actor}/shape.py +0 -0
- /monarch/{_monarch → _src/tensor_engine}/__init__.py +0 -0
- {torchmonarch_nightly-2025.6.30.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/WHEEL +0 -0
- {torchmonarch_nightly-2025.6.30.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/licenses/LICENSE +0 -0
- {torchmonarch_nightly-2025.6.30.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/top_level.txt +0 -0
monarch/_src/actor/tensor_engine_shim.py
ADDED
@@ -0,0 +1,56 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import importlib
+from functools import partial
+from typing import Any, Optional, Sequence, TYPE_CHECKING
+
+"""
+This file provides a type annoated shim for using tensor engine functions
+from within the actor module which only optionally includes the tensor engine.
+
+Each function that is needed should have a @shim entry below which gives the name,
+module, and type of the function. Each function is resolved dynamically the first
+time it is used.
+"""
+
+if TYPE_CHECKING:
+    from monarch._src.actor.actor_mesh import ActorEndpoint, Port, Selection
+    from monarch._src.actor.endpoint import Endpoint
+
+
+def shim(fn=None, *, module=None):
+    if fn is None:
+        return partial(shim, module=module)
+
+    impl = None
+    name = fn.__name__
+
+    def wrap(*args, **kwargs):
+        nonlocal impl
+        if impl is None:
+            impl = getattr(importlib.import_module(module), name)
+        return impl(*args, **kwargs)
+
+    return wrap
+
+
+@shim(module="monarch.mesh_controller")
+def actor_send(
+    endpoint: "ActorEndpoint",
+    args_kwargs_tuple: bytes,
+    refs: "Sequence[Any]",
+    port: "Optional[Port[Any]]",
+    selection: "Selection",
+) -> None: ...
+
+
+@shim(module="monarch.common.remote")
+def _cached_propagation(_cache, rfunction: "Endpoint", args, kwargs) -> Any: ...
+
+
+@shim(module="monarch.common.fake")
+def fake_call(fn, *args, **kwargs): ...
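The @shim decorator above makes the actor package importable without the tensor engine: each decorated stub resolves its real implementation from the named module only on first call, then caches it. A minimal self-contained sketch of the same lazy-resolution pattern, using the standard library instead of Monarch modules (illustrative only, not part of the diff):

import importlib
from functools import partial

def shim(fn=None, *, module=None):
    # Same pattern as the shim above: defer the import until first use.
    if fn is None:
        return partial(shim, module=module)
    impl = None
    name = fn.__name__

    def wrap(*args, **kwargs):
        nonlocal impl
        if impl is None:  # resolve once, reuse the cached callable afterwards
            impl = getattr(importlib.import_module(module), name)
        return impl(*args, **kwargs)

    return wrap

@shim(module="math")
def sqrt(x): ...  # stub body never runs; math.sqrt is called instead

print(sqrt(9.0))  # imports math lazily and prints 3.0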
monarch/_src/tensor_engine/rdma.py
ADDED
@@ -0,0 +1,180 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import warnings
+from typing import Optional
+
+import torch
+
+try:
+    from monarch._rust_bindings.rdma import _RdmaBuffer
+except ImportError as e:
+    logging.error("RDMA is not available: {}".format(e))
+    raise e
+from monarch._src.actor.actor_mesh import MonarchContext
+from monarch._src.actor.future import Future
+
+
+# RDMARead/WriteTransferWarnings are warnings that are only printed once per process.
+# Remove these once GPU support is added.
+class RDMAReadTransferWarning(Warning):
+    pass
+
+
+class RDMAWriteTransferWarning(Warning):
+    pass
+
+
+warnings.simplefilter("once", RDMAReadTransferWarning)
+warnings.simplefilter("once", RDMAWriteTransferWarning)
+
+
+def is_available():
+    return _RdmaBuffer.rdma_supported()
+
+
+def _assert_tensor_is_1d_contiguous_uint8(t: torch.Tensor) -> None:
+    if t.ndim != 1:
+        raise ValueError(f"Tensor must be 1D, got {t.ndim}D")
+    if t.dtype != torch.uint8:
+        raise ValueError(f"Tensor must be uint8, got {t.dtype}")
+    if not t.is_contiguous():
+        raise ValueError("Tensor must be contiguous")
+
+
+class RDMABuffer:
+    def __init__(self, data: torch.Tensor) -> None:
+        """
+        RDMABuffer only supports 1D contiguous tensors that are 1 byte per item.
+
+        To create a 1 byte, 1D view, use t.view(torch.uint8).flatten()
+
+        TODO: Create TensorBuffer, which will be main user API supporting non-contiguous , multi-byte-per-elment tensors
+        """
+        assert (
+            is_available()
+        ), "Tried to create an RDMABuffer, but RDMA is not available on this platform."
+
+        if data.device.type != "cpu":
+            # TODO - CUDA support for RDMABuffer exists at the Rust layer, but
+            # runs into issues with MR creation. For now, only support CPU tensors.
+            # Remove this once GPU support is added.
+            raise ValueError(
+                "RDMABuffer currently only supports CPU tensors (got device {})".format(
+                    data.device
+                )
+            )
+
+        _assert_tensor_is_1d_contiguous_uint8(data)
+        assert data.storage_offset() == 0
+
+        try:
+            storage = data.untyped_storage()
+            addr: int = storage.data_ptr()
+            size = storage.element_size() * data.numel()
+            ctx = MonarchContext.get()
+            self._buffer: _RdmaBuffer = _RdmaBuffer.create_rdma_buffer_blocking(
+                addr=addr,
+                size=size,
+                proc_id=ctx.proc_id,
+                client=ctx.mailbox,
+            )
+        # TODO - specific exception
+        except Exception as e:
+            logging.error("Failed to create buffer %s", e)
+            raise e
+
+    def read_into(
+        self,
+        dst: torch.Tensor,
+        offset: int = 0,
+        timeout: int = 3,
+    ) -> Future[Optional[int]]:
+        """
+        Read data from the RDMABuffer into a destination tensor.
+
+        The destination tensor must be contiguous and 1 byte per item.
+
+        Returns an ActorFuture that can be awaited or called with .get() for blocking operation.
+        """
+        _assert_tensor_is_1d_contiguous_uint8(dst)
+        dst_gpu = None
+        if dst.device.type != "cpu":
+            # TODO - remove this once GPU support is added.
+            warnings.warn(
+                "note: read_into only supports CPU tensors, so `dst` is being copied to CPU.",
+                RDMAReadTransferWarning,
+                stacklevel=2,
+            )
+            dst_gpu = dst
+            dst = dst.cpu()
+        storage = dst.untyped_storage()
+        addr: int = storage.data_ptr() + offset
+        size = storage.element_size() * dst.numel()
+        if offset + size > dst.numel():
+            raise ValueError(
+                f"offset + size ({offset + size}) must be <= dst.numel() ({dst.numel()})"
+            )
+
+        async def read_into_nonblocking() -> Optional[int]:
+            res = await self._buffer.read_into(
+                addr=addr,
+                size=size,
+                local_proc_id=MonarchContext.get().proc_id,
+                client=MonarchContext.get().mailbox,
+                timeout=timeout,
+            )
+            # TODO - remove this once GPU support is added.
+            if dst_gpu is not None:
+                dst_gpu.copy_(dst)
+            return res
+
+        return Future(impl=read_into_nonblocking, requires_loop=False)
+
+    def write_from(
+        self, src: torch.Tensor, offset: int = 0, timeout: int = 3
+    ) -> Future[None]:
+        """
+        Write data from a source tensor into the RDMABuffer.
+
+        The source tensor must be contiguous and 1 byte per item.
+
+        Returns an ActorFuture that can be awaited or called with .get() for blocking operation.
+        """
+        _assert_tensor_is_1d_contiguous_uint8(src)
+        src_gpu = None
+        if src.device.type != "cpu":
+            # TODO - remove this once GPU support is added.
+            warnings.warn(
+                "note: write_from only supports CPU tensors, so we will write to CPU first, then transfer to `src` in place.",
+                RDMAWriteTransferWarning,
+                stacklevel=2,
+            )
+            src_gpu = src  # Save the original GPU tensor reference
+            src = src.cpu()  # Convert to CPU for RDMA operation
+        storage = src.untyped_storage()
+        addr: int = storage.data_ptr()
+        size = storage.element_size() * src.numel()
+        if size + offset > src.numel():
+            raise ValueError(
+                f"size + offset ({size + offset}) must be <= src.numel() ({src.numel()})"
+            )
+
+        async def write_from_nonblocking() -> None:
+            res = await self._buffer.write_from(
+                addr=addr,
+                size=size,
+                local_proc_id=MonarchContext.get().proc_id,
+                client=MonarchContext.get().mailbox,
+                timeout=timeout,
+            )
+            # TODO - remove this once GPU support is added.
+            if src_gpu is not None:
+                src_gpu.copy_(src)
+            return res
+
+        return Future(impl=write_from_nonblocking, requires_loop=False)
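Taken together, the constraints in the docstrings above mean callers hand RDMABuffer a 1D contiguous uint8 view of their data and drive transfers through Futures. A hedged usage sketch, assuming it runs inside a Monarch actor (so MonarchContext is available) on an RDMA-capable host; the tensor names are illustrative, and the re-export in monarch.tensor_engine may be the preferred import path:

import torch
from monarch._src.tensor_engine.rdma import RDMABuffer, is_available

def register_and_roundtrip() -> None:
    if not is_available():
        return
    # RDMABuffer requires a 1D, contiguous, uint8 view; reinterpret the bytes.
    data = torch.arange(16, dtype=torch.float32)
    byte_view = data.view(torch.uint8).flatten()
    buf = RDMABuffer(byte_view)

    # read_into / write_from return Futures; .get() blocks, or they can be awaited.
    dst = torch.zeros_like(byte_view)
    buf.read_into(dst, timeout=3).get()   # copy buffer contents into dst
    buf.write_from(dst, timeout=3).get()  # copy dst back into the buffer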
monarch/_testing.py
CHANGED
@@ -13,13 +13,13 @@ from contextlib import contextmanager, ExitStack
 from typing import Any, Callable, Dict, Generator, Literal, Optional
 
 import monarch_supervisor
+from monarch._src.actor.shape import NDSlice
+from monarch.actor import proc_mesh, ProcMesh
 from monarch.common.client import Client
 from monarch.common.device_mesh import DeviceMesh
 from monarch.common.invocation import DeviceException, RemoteException
-from monarch.common.shape import NDSlice
 from monarch.controller.backend import ProcessBackend
 from monarch.mesh_controller import spawn_tensor_engine
-from monarch.proc_mesh import proc_mesh, ProcMesh
 from monarch.python_local_mesh import PythonLocalContext
 from monarch.rust_local_mesh import (
     local_mesh,
@@ -228,3 +228,4 @@ def mock_mesh(hosts: int, gpus: int):
 class BackendType:
     PY = "py"
     RS = "rs"
+    MESH = "mesh"
monarch/actor/__init__.py
ADDED
@@ -0,0 +1,51 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Monarch Actor API - Public interface for actor functionality.
+"""
+
+from monarch._src.actor.actor_mesh import (
+    Accumulator,
+    Actor,
+    ActorError,
+    current_actor_name,
+    current_rank,
+    current_size,
+    Point,
+    port,
+    send,
+    ValueMesh,
+)
+from monarch._src.actor.endpoint import endpoint
+from monarch._src.actor.future import Future
+from monarch._src.actor.proc_mesh import (
+    debug_client,
+    local_proc_mesh,
+    proc_mesh,
+    ProcMesh,
+    sim_proc_mesh,
+)
+
+__all__ = [
+    "Accumulator",
+    "Actor",
+    "ActorError",
+    "current_actor_name",
+    "current_rank",
+    "current_size",
+    "endpoint",
+    "Future",
+    "local_proc_mesh",
+    "Point",
+    "proc_mesh",
+    "ProcMesh",
+    "port",
+    "send",
+    "sim_proc_mesh",
+    "ValueMesh",
+    "debug_client",
+]
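The new monarch.actor package consolidates what previously lived in monarch.actor_mesh, monarch.proc_mesh, and monarch.future into one public surface. A hedged sketch of how the exported names fit together; the Counter actor, the gpus argument, the spawn call, and the call_one adverb are assumptions for illustration and are not taken from this diff:

import asyncio
from monarch.actor import Actor, endpoint, proc_mesh

class Counter(Actor):
    def __init__(self) -> None:
        self.value = 0

    @endpoint
    async def incr(self) -> int:
        self.value += 1
        return self.value

async def main() -> None:
    # Spawn a process mesh, spawn the actor on it, and call one instance.
    procs = await proc_mesh(gpus=1)               # assumed signature
    counter = await procs.spawn("counter", Counter)
    print(await counter.incr.call_one())          # assumed endpoint adverb; .get() is the blocking form

asyncio.run(main())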