torchmonarch-nightly 2025.6.4__cp310-cp310-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157)
  1. monarch/__init__.py +189 -0
  2. monarch/_monarch/__init__.py +5 -0
  3. monarch/_monarch/hyperactor/__init__.py +74 -0
  4. monarch/_monarch/selection/__init__.py +13 -0
  5. monarch/_monarch/worker/__init__.py +0 -0
  6. monarch/_monarch/worker/debugger.py +117 -0
  7. monarch/_monarch/worker/logging.py +107 -0
  8. monarch/_rust_bindings.so +0 -0
  9. monarch/_testing.py +198 -0
  10. monarch/actor_mesh.py +692 -0
  11. monarch/allocator.py +62 -0
  12. monarch/bootstrap_main.py +75 -0
  13. monarch/builtins/__init__.py +14 -0
  14. monarch/builtins/log.py +22 -0
  15. monarch/builtins/random.py +69 -0
  16. monarch/cached_remote_function.py +257 -0
  17. monarch/common/_C.pyi +11 -0
  18. monarch/common/_C.so +0 -0
  19. monarch/common/__init__.py +0 -0
  20. monarch/common/_coalescing.py +308 -0
  21. monarch/common/_device_utils.py +18 -0
  22. monarch/common/_tensor_to_table.py +172 -0
  23. monarch/common/base_tensor.py +28 -0
  24. monarch/common/borrows.py +143 -0
  25. monarch/common/client.py +646 -0
  26. monarch/common/constants.py +10 -0
  27. monarch/common/context_manager.py +40 -0
  28. monarch/common/controller_api.py +104 -0
  29. monarch/common/device_mesh.py +443 -0
  30. monarch/common/fake.py +55 -0
  31. monarch/common/function.py +160 -0
  32. monarch/common/function_caching.py +164 -0
  33. monarch/common/future.py +168 -0
  34. monarch/common/invocation.py +125 -0
  35. monarch/common/mast.py +221 -0
  36. monarch/common/messages.py +572 -0
  37. monarch/common/mock_cuda.py +41 -0
  38. monarch/common/opaque_ref.py +98 -0
  39. monarch/common/pickle_flatten.py +48 -0
  40. monarch/common/pipe.py +152 -0
  41. monarch/common/process_group.py +55 -0
  42. monarch/common/recording.py +127 -0
  43. monarch/common/reference.py +33 -0
  44. monarch/common/remote.py +304 -0
  45. monarch/common/selection.py +9 -0
  46. monarch/common/shape.py +204 -0
  47. monarch/common/stream.py +111 -0
  48. monarch/common/tensor.py +793 -0
  49. monarch/common/tensor_factory.py +31 -0
  50. monarch/common/tree.py +73 -0
  51. monarch/controller/__init__.py +7 -0
  52. monarch/controller/backend.py +223 -0
  53. monarch/controller/controller.py +223 -0
  54. monarch/controller/debugger.py +47 -0
  55. monarch/controller/history.py +90 -0
  56. monarch/controller/rust_backend/__init__.py +7 -0
  57. monarch/controller/rust_backend/controller.py +245 -0
  58. monarch/fetch.py +55 -0
  59. monarch/future.py +25 -0
  60. monarch/gradient/__init__.py +11 -0
  61. monarch/gradient/_gradient_generator.pyi +22 -0
  62. monarch/gradient/_gradient_generator.so +0 -0
  63. monarch/gradient_generator.py +185 -0
  64. monarch/memory.py +43 -0
  65. monarch/monarch_controller +0 -0
  66. monarch/notebook.py +761 -0
  67. monarch/opaque_module.py +235 -0
  68. monarch/opaque_object.py +88 -0
  69. monarch/parallel/__init__.py +9 -0
  70. monarch/parallel/pipelining/__init__.py +7 -0
  71. monarch/parallel/pipelining/runtime.py +847 -0
  72. monarch/parallel/pipelining/schedule_ir.py +692 -0
  73. monarch/parallel/pipelining/scheduler.py +249 -0
  74. monarch/proc_mesh.py +188 -0
  75. monarch/profiler.py +160 -0
  76. monarch/python_local_mesh.py +107 -0
  77. monarch/random.py +61 -0
  78. monarch/rdma.py +190 -0
  79. monarch/remote_class.py +114 -0
  80. monarch/rust_backend_mesh.py +280 -0
  81. monarch/rust_local_mesh.py +1402 -0
  82. monarch/sim_mesh.py +357 -0
  83. monarch/simulator/__init__.py +7 -0
  84. monarch/simulator/command_history.py +424 -0
  85. monarch/simulator/config.py +21 -0
  86. monarch/simulator/interface.py +59 -0
  87. monarch/simulator/ir.py +770 -0
  88. monarch/simulator/mock_controller.py +214 -0
  89. monarch/simulator/profiling.py +424 -0
  90. monarch/simulator/simulator.py +1052 -0
  91. monarch/simulator/task.py +255 -0
  92. monarch/simulator/tensor.py +373 -0
  93. monarch/simulator/trace.py +395 -0
  94. monarch/simulator/utils.py +41 -0
  95. monarch/simulator/worker.py +389 -0
  96. monarch/tensor_worker_main.py +260 -0
  97. monarch/tensorboard.py +84 -0
  98. monarch/timer/__init__.py +21 -0
  99. monarch/timer/example_monarch.py +78 -0
  100. monarch/timer/example_spmd.py +55 -0
  101. monarch/timer/execution_timer.py +199 -0
  102. monarch/timer/execution_timer_test.py +131 -0
  103. monarch/tools/__init__.py +7 -0
  104. monarch/tools/cli.py +167 -0
  105. monarch/tools/commands.py +189 -0
  106. monarch/tools/components/__init__.py +7 -0
  107. monarch/tools/components/hyperactor.py +57 -0
  108. monarch/tools/config/__init__.py +20 -0
  109. monarch/tools/config/defaults.py +54 -0
  110. monarch/tools/mesh_spec.py +121 -0
  111. monarch/worker/__init__.py +7 -0
  112. monarch/worker/_testing_function.py +481 -0
  113. monarch/worker/compiled_block.py +270 -0
  114. monarch/worker/debugger.py +125 -0
  115. monarch/worker/lines.py +47 -0
  116. monarch/worker/monitor.py +53 -0
  117. monarch/worker/worker.py +1191 -0
  118. monarch/world_mesh.py +34 -0
  119. monarch_supervisor/__init__.py +1044 -0
  120. monarch_supervisor/_testing.py +44 -0
  121. monarch_supervisor/function_call.py +30 -0
  122. monarch_supervisor/host.py +386 -0
  123. monarch_supervisor/launchers.py +145 -0
  124. monarch_supervisor/log_pstree.py +48 -0
  125. monarch_supervisor/logging.py +103 -0
  126. monarch_supervisor/python_executable.py +42 -0
  127. tests/__init__.py +0 -0
  128. tests/dispatch_bench.py +124 -0
  129. tests/dispatch_bench_helper.py +25 -0
  130. tests/error_test_binary.py +139 -0
  131. tests/simulator/__init__.py +0 -0
  132. tests/simulator/test_profiling.py +136 -0
  133. tests/simulator/test_simulator.py +411 -0
  134. tests/simulator/test_task.py +64 -0
  135. tests/simulator/test_worker.py +102 -0
  136. tests/sleep_binary.py +35 -0
  137. tests/test_actor_error.py +112 -0
  138. tests/test_alloc.py +25 -0
  139. tests/test_coalescing.py +492 -0
  140. tests/test_controller.py +835 -0
  141. tests/test_device_mesh.py +132 -0
  142. tests/test_fault_tolerance.py +398 -0
  143. tests/test_future.py +94 -0
  144. tests/test_grad_generator.py +121 -0
  145. tests/test_mock_cuda.py +74 -0
  146. tests/test_pdb_actor.py +110 -0
  147. tests/test_python_actors.py +372 -0
  148. tests/test_remote_functions.py +1271 -0
  149. tests/test_rust_backend.py +182 -0
  150. tests/test_signal_safe_block_on.py +103 -0
  151. tests/test_sim_backend.py +54 -0
  152. torchmonarch_nightly-2025.6.4.dist-info/METADATA +94 -0
  153. torchmonarch_nightly-2025.6.4.dist-info/RECORD +157 -0
  154. torchmonarch_nightly-2025.6.4.dist-info/WHEEL +5 -0
  155. torchmonarch_nightly-2025.6.4.dist-info/entry_points.txt +3 -0
  156. torchmonarch_nightly-2025.6.4.dist-info/licenses/LICENSE +29 -0
  157. torchmonarch_nightly-2025.6.4.dist-info/top_level.txt +3 -0
monarch/allocator.py ADDED
@@ -0,0 +1,62 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ from typing import final
+
+ from monarch import ActorFuture as Future
+ from monarch._rust_bindings.hyperactor_extension.alloc import (  # @manual=//monarch/monarch_extension:monarch_extension
+     Alloc,
+     AllocSpec,
+ )
+
+ from monarch._rust_bindings.monarch_hyperactor.alloc import (  # @manual=//monarch/monarch_extension:monarch_extension
+     LocalAllocatorBase,
+     ProcessAllocatorBase,
+ )
+
+
+ @final
+ class ProcessAllocator(ProcessAllocatorBase):
+     """
+     An allocator that allocates by spawning local processes.
+     """
+
+     def allocate(self, spec: AllocSpec) -> Future[Alloc]:
+         """
+         Allocate a process according to the provided spec.
+
+         Arguments:
+         - `spec`: The spec to allocate according to.
+
+         Returns:
+         - A future that will be fulfilled when the requested allocation is fulfilled.
+         """
+         return Future(
+             lambda: self.allocate_nonblocking(spec),
+             lambda: self.allocate_blocking(spec),
+         )
+
+
+ @final
+ class LocalAllocator(LocalAllocatorBase):
+     """
+     An allocator that allocates by spawning actors into the current process.
+     """
+
+     def allocate(self, spec: AllocSpec) -> Future[Alloc]:
+         """
+         Allocate a process according to the provided spec.
+
+         Arguments:
+         - `spec`: The spec to allocate according to.
+
+         Returns:
+         - A future that will be fulfilled when the requested allocation is fulfilled.
+         """
+         return Future(
+             lambda: self.allocate_nonblocking(spec),
+             lambda: self.allocate_blocking(spec),
+         )
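For orientation, a minimal usage sketch of the allocators above. Only `allocate()` and the `Future` wrapper appear in this diff; the `ProcessAllocator` constructor arguments, the `AllocSpec` keyword arguments, and the `Future.get()` call are assumptions made for illustration.

# Hypothetical usage sketch -- not part of the published package.
from monarch._rust_bindings.hyperactor_extension.alloc import AllocSpec
from monarch.allocator import ProcessAllocator

allocator = ProcessAllocator("python", ["-m", "monarch.bootstrap_main"])  # assumed constructor args
spec = AllocSpec(hosts=1, gpus=2)   # assumed keyword arguments
future = allocator.allocate(spec)   # returns a Future[Alloc], as shown above
alloc = future.get()                # assumed blocking accessor on the Future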
monarch/bootstrap_main.py ADDED
@@ -0,0 +1,75 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ """
+ This is the main function for bootstrapping a new process using a ProcessAllocator.
+ """
+
+ import asyncio
+ import importlib.resources
+ import logging
+ import os
+ import sys
+
+ # Import torch to avoid import-time races if a spawned actor tries to import torch.
+ import torch  # noqa[F401]
+
+
+ async def main():
+     from monarch._rust_bindings.monarch_hyperactor.bootstrap import bootstrap_main
+
+     await bootstrap_main()
+
+
+ def invoke_main():
+     # If stdout is piped somewhere, print changes its buffering behavior,
+     # so default to the line-buffered behavior stdout has on a terminal.
+     sys.stdout.reconfigure(line_buffering=True)
+     global bootstrap_main
+     from monarch._rust_bindings.hyperactor_extension.telemetry import (  # @manual=//monarch/monarch_extension:monarch_extension
+         forward_to_tracing,
+     )
+
+     # TODO: figure out what from worker_main.py we should reproduce here.
+
+     class TracingForwarder(logging.Handler):
+         def emit(self, record: logging.LogRecord) -> None:
+             try:
+                 forward_to_tracing(
+                     record.getMessage(),
+                     record.filename or "",
+                     record.lineno or 0,
+                     record.levelno,
+                 )
+             except AttributeError:
+                 forward_to_tracing(
+                     record.__str__(),
+                     record.filename or "",
+                     record.lineno or 0,
+                     record.levelno,
+                 )
+
+     # Forward Python logs to Rust tracing. Defaults to on.
+     if os.environ.get("MONARCH_PYTHON_LOG_TRACING", "1") == "1":
+         logging.root.addHandler(TracingForwarder())
+
+     try:
+         with (
+             importlib.resources.path("monarch", "py-spy") as pyspy,
+         ):
+             if pyspy.exists():
+                 os.environ["PYSPY_BIN"] = str(pyspy)
+             # otherwise fall back to a locally installed py-spy
+     except Exception as e:
+         logging.warning(f"Failed to set up py-spy: {e}")
+
+     # Start an event loop for PythonActors to use.
+     asyncio.run(main())
+
+
+ if __name__ == "__main__":
+     invoke_main()  # pragma: no cover
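The TracingForwarder above follows a common pattern: a logging.Handler subclass that relays each record to an external sink. A standalone sketch of the same pattern, using a stand-in `forward` function in place of the Rust forward_to_tracing binding:

# Standalone sketch of the handler pattern used by TracingForwarder above.
# `forward` is a stand-in for the Rust forward_to_tracing binding.
import logging


def forward(message: str, filename: str, lineno: int, levelno: int) -> None:
    print(f"[{logging.getLevelName(levelno)}] {filename}:{lineno} {message}")


class ForwardingHandler(logging.Handler):
    def emit(self, record: logging.LogRecord) -> None:
        # Render the record and hand it off to the external sink.
        forward(record.getMessage(), record.filename or "", record.lineno or 0, record.levelno)


logging.root.addHandler(ForwardingHandler())
logging.warning("forwarded through the handler")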
monarch/builtins/__init__.py ADDED
@@ -0,0 +1,14 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-strict
+ """
+ Monarch builtins are a set of remote function definitions for PyTorch functions and other utilities.
+ """
+
+ from .log import log_remote, set_logging_level_remote
+
+ __all__ = ["log_remote", "set_logging_level_remote"]
monarch/builtins/log.py ADDED
@@ -0,0 +1,22 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import logging
+
+ from monarch.common.remote import remote
+
+
+ logger = logging.getLogger(__name__)
+
+
+ @remote(propagate="inspect")
+ def log_remote(*args, level: int = logging.WARNING, **kwargs) -> None:
+     logger.log(level, *args, **kwargs)
+
+
+ @remote(propagate="inspect")
+ def set_logging_level_remote(level: int) -> None:
+     logger.setLevel(level)
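Both functions are declared with `propagate="inspect"`, so invoking them from the controller runs the log call on the workers rather than locally. A hedged usage sketch, assuming a Monarch device mesh has already been created and activated (not shown in this diff):

# Usage sketch, assuming an active Monarch device mesh (setup not shown here).
import logging
from monarch.builtins import log_remote, set_logging_level_remote

set_logging_level_remote(logging.INFO)                   # raise worker logger verbosity
log_remote("step %d finished", 10, level=logging.INFO)   # logged on each worker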
monarch/builtins/random.py ADDED
@@ -0,0 +1,69 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-strict
+ from typing import Callable
+
+ import torch
+ from monarch.common.remote import remote
+
+
+ @remote(propagate="inspect")
+ def set_manual_seed_remote(seed: int, process_idx: int = 0) -> None:
+     torch.manual_seed(seed ^ process_idx)
+
+
+ @remote(propagate=lambda: 0)
+ def initial_seed_remote() -> int:
+     return torch.initial_seed()
+
+
+ @remote(propagate=lambda: torch.zeros(1))
+ def get_rng_state_remote() -> torch.Tensor:
+     return torch.get_rng_state()
+
+
+ @remote(propagate="inspect")
+ def set_rng_state_remote(new_state: torch.Tensor) -> None:
+     torch.set_rng_state(new_state)
+
+
+ def _run_no_return(f: Callable) -> None:
+     f()
+     return None
+
+
+ # TODO: return the result once uint64 is supported as a remote function return type.
+ @remote(propagate=lambda: _run_no_return(torch.seed))
+ def seed_remote() -> None:
+     torch.seed()
+
+
+ # Same underlying implementation as seed_remote (torch.seed).
+ # TODO: return the result once uint64 is supported as a remote function return type.
+ @remote(propagate=lambda: _run_no_return(torch.random.seed))
+ def random_seed_remote() -> None:
+     torch.random.seed()
+
+
+ @remote(propagate="inspect")
+ def manual_seed_cuda_remote(seed: int) -> None:
+     torch.cuda.manual_seed(seed)
+
+
+ @remote(propagate="inspect")
+ def manual_seed_all_cuda_remote(seed: int) -> None:
+     torch.cuda.manual_seed_all(seed)
+
+
+ @remote(propagate=lambda: [torch.zeros(1)])
+ def get_rng_state_all_cuda_remote() -> list[torch.Tensor]:
+     return torch.cuda.get_rng_state_all()
+
+
+ @remote(propagate="inspect")
+ def set_rng_state_all_cuda_remote(states: list[torch.Tensor]) -> None:
+     torch.cuda.set_rng_state_all(states)
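The `propagate` argument in the decorators above tells the controller what a call produces without running the real body there: `"inspect"` derives it from the signature, while a lambda (e.g. `lambda: torch.zeros(1)`) supplies a stand-in return value for the controller while the real body runs on the workers. A sketch of a new remote definition following the same pattern; the function itself is illustrative and not part of the package:

# Illustrative definition mirroring the pattern above; not part of the package.
import torch
from monarch.common.remote import remote


@remote(propagate=lambda: torch.zeros(3))
def draw_gaussian_remote() -> torch.Tensor:
    # Runs on the workers; the controller only sees the stand-in value
    # produced by the propagate lambda.
    return torch.randn(3)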
monarch/cached_remote_function.py ADDED
@@ -0,0 +1,257 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-unsafe
+ import importlib
+ import logging
+
+ from contextlib import contextmanager
+ from typing import Dict, List, Optional, Type, Union
+
+ import torch
+ from monarch.common.process_group import SingleControllerProcessGroupWrapper
+
+ from monarch.common.remote import DummyProcessGroup, remote, RemoteProcessGroup
+
+ from torch import autograd
+ from torch.utils._pytree import tree_flatten, tree_unflatten
+
+ logger = logging.getLogger(__name__)
+
+
+ def _controller_autograd_function_forward(
+     autograd_function_class: Type[autograd.Function],
+ ):
+     """
+     Decorator for authoring a controller remote function wrapper around an autograd.Function forward.
+     Sets up an autograd.function.FunctionCtx() to send over the wire, and populates the original ctx
+     with the ctx_tensors and ctx attributes.
+     """
+
+     def decorator(func):
+         def wrapper(ctx, *args):
+             # Need a dummy context because autograd.FunctionBackward cannot be pickled.
+             wire_ctx = autograd.function.FunctionCtx()
+             # Track arg tensors that have requires_grad set.
+             arg_tensors, _ = tree_flatten(args)
+             wire_ctx.args_requires_grads = []
+             for i, arg in enumerate(arg_tensors):
+                 if isinstance(arg, torch.Tensor) and arg.requires_grad:
+                     wire_ctx.args_requires_grads.append(i)
+             out, ctx_attrs, ctx_tensors = func(
+                 autograd_function_class.__module__,
+                 autograd_function_class.__name__,
+                 wire_ctx,
+                 *args,
+             )
+             if ctx is None:
+                 return out
+             ctx.save_for_backward(*ctx_tensors)
+             ctx.attr_names = ctx_attrs.keys()
+             ctx.pg_names = []
+             dim_to_remote_group = {}
+             for arg in args:
+                 if isinstance(arg, RemoteProcessGroup):
+                     dim_to_remote_group[arg.dims] = arg
+             for name, v in ctx_attrs.items():
+                 if isinstance(v, DummyProcessGroup):
+                     setattr(ctx, name, dim_to_remote_group[v.dims])
+                     ctx.pg_names.append(name)
+                 else:
+                     setattr(ctx, name, v)
+
+             return out
+
+         return wrapper
+
+     return decorator
+
+
+ def _controller_autograd_function_backward(
+     autograd_function_class: Type[autograd.Function],
+ ):
+     """
+     Decorator for authoring a controller remote function wrapper around an autograd.Function backward.
+     Manually sets up wire_ctx with the ctx tensors and attributes.
+     """
+
+     def decorator(func):
+         def wrapper(ctx, *grad_outputs):
+             # Manually set up wire_ctx with ctx tensors and attributes.
+             wire_ctx = autograd.function.FunctionCtx()
+             # Send over tensor references with ctx_tensors.
+             ctx_tensors = ctx.saved_tensors
+             wire_ctx.save_for_backward(ctx_tensors)
+             for name in ctx.attr_names:
+                 setattr(wire_ctx, name, getattr(ctx, name))
+             process_groups = {name: getattr(ctx, name) for name in ctx.pg_names}
+
+             return func(
+                 autograd_function_class.__module__,
+                 autograd_function_class.__name__,
+                 wire_ctx,
+                 ctx_tensors,
+                 # explicitly pass process groups to the worker
+                 process_groups,
+                 *grad_outputs,
+             )
+
+         return wrapper
+
+     return decorator
+
+
+ @contextmanager
+ def manage_grads(list_of_tensors, indices):
+     try:
+         for i in indices:
+             assert list_of_tensors[i].is_leaf, "can't have non-leaf tensors on worker"
+             list_of_tensors[i].requires_grad = True
+         yield list_of_tensors
+     finally:
+         for i in indices:
+             list_of_tensors[i].requires_grad = False
+
+
+ def worker_autograd_function_forward(
+     module_name: str,
+     class_name: str,
+     ctx: autograd.function.FunctionCtx,
+     *args,
+     **kwargs,
+ ):
+     # Capture the initial state of ctx attributes.
+     before = set()
+     before.add("to_save")
+     for attr in dir(ctx):
+         if not attr.startswith("_"):
+             before.add(attr)
+
+     # Set tensors that require grad from the additional arg.
+     flatten_args, spec = tree_flatten(args)
+     # pyre-ignore
+     with manage_grads(flatten_args, ctx.args_requires_grads) as args_with_grad:
+         args = tree_unflatten(args_with_grad, spec)
+
+         # Call the original forward function.
+         module = importlib.import_module(module_name)
+         class_ = getattr(module, class_name)
+         with torch.no_grad():
+             out = class_.forward(ctx, *args, **kwargs)
+
+         # Capture the state of ctx attributes after the function call.
+         after = set()
+         for attr in dir(ctx):
+             if not attr.startswith("_"):
+                 after.add(attr)
+         ctx_attrs = {attr: getattr(ctx, attr) for attr in after - before}
+         ctx_attrs["ctx_requires_grads"] = []
+
+         if not hasattr(ctx, "to_save"):
+             to_save = []
+         else:
+             # pyre-ignore
+             for idx, t in enumerate(ctx.to_save):
+                 # Generally, workers should not have requires_grad set. Restore the correct
+                 # state afterwards, but record requires_grad for the next forward.
+                 if isinstance(t, torch.Tensor) and t.requires_grad and t.is_leaf:
+                     t.requires_grad = False
+                     ctx_attrs["ctx_requires_grads"].append(idx)
+             to_save = ctx.to_save
+     return out, ctx_attrs, to_save
+
+
+ def worker_autograd_function_backward(
+     module_name: str,
+     class_name: str,
+     ctx: autograd.function.FunctionCtx,
+     ctx_tensors: List[torch.Tensor],
+     process_groups: Dict[
+         str, Union[SingleControllerProcessGroupWrapper, DummyProcessGroup]
+     ],
+     *grad_outputs: torch.Tensor,
+ ):
+     # Set the correct requires_grad state before backward.
+     # pyre-ignore
+     with manage_grads(ctx_tensors, ctx.ctx_requires_grads) as ctx_grad_tensors:
+         # for i in ctx.ctx_requires_grads:
+         #     ctx_tensors[i].requires_grad = True
+         if ctx_grad_tensors:
+             # pyre-ignore
+             ctx.saved_tensors = ctx_grad_tensors
+         for name, v in process_groups.items():
+             setattr(ctx, name, v)
+         # Call the original backward function.
+         module = importlib.import_module(module_name)
+         class_ = getattr(module, class_name)
+         with torch.no_grad():
+             out = class_.backward(ctx, *grad_outputs)
+     return out
+
+
+ forward_remote_fn = remote(
+     "monarch.cached_remote_function.worker_autograd_function_forward"
+ )
+
+ backward_remote_fn = remote(
+     "monarch.cached_remote_function.worker_autograd_function_backward"
+ )
+
+
+ class RemoteAutogradFunction(autograd.Function):
+     """
+     A new autograd.Function (custom forward/backward) that will run on the worker as a UDF RemoteFunction.
+
+     Example::
+
+         my_remote_autograd_function = remote_autograd_function(my_custom_autograd_function)
+     """
+
+     @staticmethod
+     def forward(ctx, *args):
+         raise NotImplementedError()
+
+     @staticmethod
+     def backward(ctx, *grads):
+         raise NotImplementedError()
+
+
+ def remote_autograd_function(
+     target_class: Type[autograd.Function], name: Optional[str] = None
+ ) -> Type[RemoteAutogradFunction]:
+     """
+     Returns a new autograd.Function (custom forward/backward) that will run on the worker as a UDF RemoteFunction.
+     Logic is done on the controller (e.g., DTensors are set up and saved for backward).
+     The autograd.function.FunctionCtx() is sent over the wire to the worker.
+     Special handling is done for ctx_tensors, requires_grad of tensors, and process groups.
+
+     Args:
+         target_class: autograd.Function class to be run remotely
+         name: name of the new autograd.Function to be called on the worker
+     """
+     if issubclass(target_class, RemoteAutogradFunction):
+         logging.warning(
+             f"{target_class} is already an autograd.Function UDF! You are likely monkey-patching too many times"
+         )
+         return target_class
+     assert issubclass(
+         target_class, autograd.Function
+     ), f"{target_class} is not a torch.autograd.Function!"
+     if name is None:
+         name = f"Remote_{target_class.__name__}"
+
+     return type(
+         name,
+         (RemoteAutogradFunction,),
+         {
+             "forward": staticmethod(
+                 _controller_autograd_function_forward(target_class)(forward_remote_fn)
+             ),
+             "backward": staticmethod(
+                 _controller_autograd_function_backward(target_class)(backward_remote_fn)
+             ),
+         },
+     )
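As the docstring's example indicates, `remote_autograd_function` wraps an existing `autograd.Function` so that its forward and backward run on the workers. A sketch with a toy function; the `Scale2` class is illustrative, and an active Monarch controller/device mesh is assumed (setup not shown in this diff):

# Illustrative wrapping of a toy autograd.Function; assumes an active Monarch
# controller/device mesh.
import torch
from torch import autograd
from monarch.cached_remote_function import remote_autograd_function


class Scale2(autograd.Function):
    @staticmethod
    def forward(ctx, x):
        ctx.save_for_backward(x)
        return x * 2

    @staticmethod
    def backward(ctx, grad_out):
        (x,) = ctx.saved_tensors
        return grad_out * 2


RemoteScale2 = remote_autograd_function(Scale2)  # forward/backward now run as worker UDFs
# y = RemoteScale2.apply(x)  # used like any autograd.Function on mesh tensors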
monarch/common/_C.pyi ADDED
@@ -0,0 +1,11 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-strict
+
+ def patch_cuda() -> None: ...
+ def mock_cuda() -> None: ...
+ def unmock_cuda() -> None: ...
monarch/common/_C.so ADDED
Binary file
File without changes