torchmonarch-nightly 2025.6.27__cp311-cp311-manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- monarch/__init__.py +189 -0
- monarch/_monarch/__init__.py +5 -0
- monarch/_monarch/hyperactor/__init__.py +58 -0
- monarch/_monarch/selection/__init__.py +13 -0
- monarch/_monarch/worker/__init__.py +0 -0
- monarch/_monarch/worker/debugger.py +117 -0
- monarch/_monarch/worker/logging.py +107 -0
- monarch/_rust_bindings.so +0 -0
- monarch/_testing.py +230 -0
- monarch/actor_mesh.py +761 -0
- monarch/allocator.py +220 -0
- monarch/bootstrap_main.py +59 -0
- monarch/builtins/__init__.py +14 -0
- monarch/builtins/log.py +22 -0
- monarch/builtins/random.py +68 -0
- monarch/cached_remote_function.py +257 -0
- monarch/code_sync.py +10 -0
- monarch/common/_C.pyi +11 -0
- monarch/common/_C.so +0 -0
- monarch/common/__init__.py +0 -0
- monarch/common/_coalescing.py +308 -0
- monarch/common/_device_utils.py +18 -0
- monarch/common/_tensor_to_table.py +172 -0
- monarch/common/base_tensor.py +28 -0
- monarch/common/borrows.py +143 -0
- monarch/common/client.py +690 -0
- monarch/common/constants.py +10 -0
- monarch/common/context_manager.py +40 -0
- monarch/common/controller_api.py +104 -0
- monarch/common/device_mesh.py +417 -0
- monarch/common/fake.py +55 -0
- monarch/common/function.py +160 -0
- monarch/common/function_caching.py +164 -0
- monarch/common/future.py +168 -0
- monarch/common/invocation.py +125 -0
- monarch/common/mast.py +221 -0
- monarch/common/messages.py +573 -0
- monarch/common/mock_cuda.py +41 -0
- monarch/common/opaque_ref.py +98 -0
- monarch/common/pickle_flatten.py +48 -0
- monarch/common/pipe.py +152 -0
- monarch/common/process_group.py +55 -0
- monarch/common/recording.py +127 -0
- monarch/common/reference.py +33 -0
- monarch/common/remote.py +297 -0
- monarch/common/selection.py +9 -0
- monarch/common/shape.py +229 -0
- monarch/common/stream.py +114 -0
- monarch/common/tensor.py +814 -0
- monarch/common/tensor_factory.py +31 -0
- monarch/common/tree.py +73 -0
- monarch/controller/__init__.py +7 -0
- monarch/controller/backend.py +223 -0
- monarch/controller/controller.py +223 -0
- monarch/controller/debugger.py +47 -0
- monarch/controller/history.py +90 -0
- monarch/controller/rust_backend/__init__.py +7 -0
- monarch/controller/rust_backend/controller.py +245 -0
- monarch/debugger.py +379 -0
- monarch/fetch.py +55 -0
- monarch/future.py +76 -0
- monarch/gradient/__init__.py +11 -0
- monarch/gradient/_gradient_generator.pyi +22 -0
- monarch/gradient/_gradient_generator.so +0 -0
- monarch/gradient_generator.py +185 -0
- monarch/memory.py +43 -0
- monarch/mesh_controller.py +271 -0
- monarch/monarch_controller +0 -0
- monarch/notebook.py +761 -0
- monarch/opaque_module.py +235 -0
- monarch/opaque_object.py +88 -0
- monarch/parallel/__init__.py +9 -0
- monarch/parallel/pipelining/__init__.py +7 -0
- monarch/parallel/pipelining/runtime.py +847 -0
- monarch/parallel/pipelining/schedule_ir.py +692 -0
- monarch/parallel/pipelining/scheduler.py +249 -0
- monarch/pdb_wrapper.py +135 -0
- monarch/proc_mesh.py +299 -0
- monarch/profiler.py +160 -0
- monarch/python_local_mesh.py +107 -0
- monarch/random.py +61 -0
- monarch/rdma.py +162 -0
- monarch/remote_class.py +114 -0
- monarch/rust_backend_mesh.py +280 -0
- monarch/rust_local_mesh.py +1402 -0
- monarch/sim_mesh.py +359 -0
- monarch/simulator/__init__.py +7 -0
- monarch/simulator/command_history.py +424 -0
- monarch/simulator/config.py +21 -0
- monarch/simulator/interface.py +59 -0
- monarch/simulator/ir.py +770 -0
- monarch/simulator/mock_controller.py +214 -0
- monarch/simulator/profiling.py +424 -0
- monarch/simulator/simulator.py +1052 -0
- monarch/simulator/task.py +255 -0
- monarch/simulator/tensor.py +373 -0
- monarch/simulator/trace.py +395 -0
- monarch/simulator/utils.py +41 -0
- monarch/simulator/worker.py +389 -0
- monarch/telemetry.py +19 -0
- monarch/tensor_worker_main.py +260 -0
- monarch/tensorboard.py +84 -0
- monarch/timer/__init__.py +21 -0
- monarch/timer/example_monarch.py +78 -0
- monarch/timer/example_spmd.py +55 -0
- monarch/timer/execution_timer.py +199 -0
- monarch/timer/execution_timer_test.py +131 -0
- monarch/tools/__init__.py +7 -0
- monarch/tools/cli.py +167 -0
- monarch/tools/commands.py +251 -0
- monarch/tools/components/__init__.py +7 -0
- monarch/tools/components/hyperactor.py +58 -0
- monarch/tools/config/__init__.py +20 -0
- monarch/tools/config/defaults.py +54 -0
- monarch/tools/mesh_spec.py +165 -0
- monarch/tools/network.py +69 -0
- monarch/worker/__init__.py +7 -0
- monarch/worker/_testing_function.py +481 -0
- monarch/worker/compiled_block.py +270 -0
- monarch/worker/debugger.py +125 -0
- monarch/worker/lines.py +47 -0
- monarch/worker/monitor.py +53 -0
- monarch/worker/worker.py +1191 -0
- monarch/world_mesh.py +34 -0
- monarch_supervisor/__init__.py +1044 -0
- monarch_supervisor/_testing.py +44 -0
- monarch_supervisor/function_call.py +30 -0
- monarch_supervisor/host.py +386 -0
- monarch_supervisor/launchers.py +145 -0
- monarch_supervisor/log_pstree.py +48 -0
- monarch_supervisor/logging.py +103 -0
- monarch_supervisor/python_executable.py +42 -0
- tests/__init__.py +0 -0
- tests/dispatch_bench.py +124 -0
- tests/dispatch_bench_helper.py +25 -0
- tests/error_test_binary.py +180 -0
- tests/simulator/__init__.py +0 -0
- tests/simulator/test_profiling.py +136 -0
- tests/simulator/test_simulator.py +411 -0
- tests/simulator/test_task.py +64 -0
- tests/simulator/test_worker.py +102 -0
- tests/sleep_binary.py +35 -0
- tests/test_actor_error.py +240 -0
- tests/test_alloc.py +25 -0
- tests/test_allocator.py +365 -0
- tests/test_coalescing.py +492 -0
- tests/test_controller.py +845 -0
- tests/test_device_mesh.py +132 -0
- tests/test_fault_tolerance.py +398 -0
- tests/test_future.py +94 -0
- tests/test_grad_generator.py +121 -0
- tests/test_mock_cuda.py +74 -0
- tests/test_pdb_actor.py +110 -0
- tests/test_python_actors.py +736 -0
- tests/test_remote_functions.py +1271 -0
- tests/test_rust_backend.py +217 -0
- tests/test_signal_safe_block_on.py +103 -0
- tests/test_sim_backend.py +54 -0
- tests/test_tensor_engine.py +52 -0
- torchmonarch_nightly-2025.6.27.dist-info/METADATA +94 -0
- torchmonarch_nightly-2025.6.27.dist-info/RECORD +165 -0
- torchmonarch_nightly-2025.6.27.dist-info/WHEEL +5 -0
- torchmonarch_nightly-2025.6.27.dist-info/entry_points.txt +3 -0
- torchmonarch_nightly-2025.6.27.dist-info/licenses/LICENSE +29 -0
- torchmonarch_nightly-2025.6.27.dist-info/top_level.txt +3 -0
monarch/common/pickle_flatten.py
ADDED
@@ -0,0 +1,48 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import io
import pickle
from typing import Any, Callable, Iterable, List, Tuple

import cloudpickle


class _Pickler(cloudpickle.Pickler):
    def __init__(self, filter):
        self.f = io.BytesIO()
        super().__init__(self.f)
        self._filter = filter
        self._saved = []

    def persistent_id(self, obj):
        if not self._filter(obj):
            return None
        self._saved.append(obj)
        return len(self._saved) - 1


class _Unpickler(pickle.Unpickler):
    def __init__(self, data, sequence: Iterable[Any]):
        super().__init__(io.BytesIO(data))
        self._iter = iter(sequence)
        self._values = []

    def persistent_load(self, id):
        while id >= len(self._values):
            self._values.append(next(self._iter))
        return self._values[id]


def flatten(obj: Any, filter: Callable[[Any], bool]) -> Tuple[List[Any], bytes]:
    pickler = _Pickler(filter)
    pickler.dump(obj)
    return pickler._saved, pickler.f.getvalue()


def unflatten(data: bytes, values: Iterable[Any]) -> Any:
    up = _Unpickler(data, values)
    return up.load()
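For orientation: flatten/unflatten use pickle's persistent-ID hook to pull filtered objects out of the byte stream and splice them back in later, in order. A minimal round-trip sketch (illustrative only, not part of the package):

    from monarch.common.pickle_flatten import flatten, unflatten

    payload = {"name": "batch0", "data": [bytearray(b"abc"), 123]}
    # Extract every bytearray out-of-band; everything else is pickled normally.
    values, blob = flatten(payload, lambda x: isinstance(x, bytearray))
    assert values == [bytearray(b"abc")]
    # Rebuild the object, re-injecting the extracted values in the same order.
    restored = unflatten(blob, values)
    assert restored == payload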
monarch/common/pipe.py
ADDED
@@ -0,0 +1,152 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# pyre-unsafe
import uuid
from collections import deque
from typing import Any, Dict

import torch
from monarch.common.remote import Remote, remote

from . import device_mesh, messages, stream
from .fake import fake_call
from .function import ResolvableFunctionFromPath
from .reference import Referenceable
from .tensor import dtensor_check, Tensor
from .tree import flatten


def remote_generator(path: str, max_messages: int = 50):
    def wrapper(annotation):
        fn = remote(path, propagate=annotation)
        return lambda *args, **kwargs: create_pipe(
            fn, *args, max_messages=max_messages, **kwargs
        )

    return wrapper


def create_pipe(fn, *args, max_messages: int = 50, **kwargs):
    return Pipe(fn, max_messages, args, kwargs)


class Pipe(Referenceable):
    """
    Pipe abstraction on the controller. Designed to be used with ipc PAIR sockets, e.g. dataloaders and trainers.

    Example::
        @remote_generator('dataloader.main')
        def dataloader_pipe(pipe: Pipe, batch_size: int, sequence_length: int):
            while True:
                yield {
                    'input': torch.zeros(batch_size, sequence_length),
                    'target': torch.zeros(batch_size)
                }

        # On the controller
        with mesh.activate():
            dataloader = dataloader_pipe(1, 1)
            input, target = dataloader.recv()
    """

    def __init__(self, fn: Remote, max_messages: int, args, kwargs):
        mesh = device_mesh._active
        if mesh is None:
            raise ValueError(
                "Remote generators require an active device mesh (use `with mesh.activate():`)"
            )
        mesh.define_remotely()

        def no_references(x):
            if isinstance(x, Referenceable):
                raise ValueError("Cannot pass references to external generators")

        flatten((args, kwargs), no_references)
        self._fake_pipe = FakePipe()
        if not isinstance(fn, Remote):
            raise TypeError("expected fn to be a monarch.remote function.")
        args_ = (self._fake_pipe, *args)
        # we do not pass references to generators so fake_args == args
        self._iterator = iter(fn._pipe_propagate(args_, kwargs, args_, kwargs))
        self.ref = mesh.client.new_ref()
        self.mesh = mesh
        key = f"ipc:///tmp/proc-{uuid.uuid4()}"
        self.mesh._send(
            messages.CreatePipe(
                self, key, fn._resolvable, max_messages, mesh, args, kwargs
            )
        )

    def send(self, obj: Any):
        client = self.mesh.client
        _fake_result, dtensors, _mutates, device_mesh = dtensor_check(
            (lambda args, kwargs, fake_args, fake_kwargs: fake_args[0]),
            ResolvableFunctionFromPath("ident"),
            (obj,),
            {},
            self.mesh,
            stream._active,
        )
        if self.mesh is not device_mesh:
            raise ValueError(
                f"Pipe is defined on mesh {self.mesh} but inputs are defined on mesh {device_mesh}"
            )
        self._fake_pipe._fake_sends.append(_fake_result)
        seq = client.new_node((), dtensors)
        self.mesh._send(
            messages.SendValue(
                seq, self, (), None, (obj,), {}, stream._active._to_ref(client)
            )
        )

    def recv(self) -> Any:
        mesh = self.mesh
        fake_result = fake_call(next, self._iterator)
        fake_result_tensors, unflatten = flatten(
            fake_result, lambda x: isinstance(x, torch.Tensor)
        )
        tensors = tuple(
            Tensor(fake, mesh, stream._active) for fake in fake_result_tensors
        )
        seq = mesh.client.new_node(tensors, ())
        result = unflatten(tensors)
        mesh._send(
            messages.PipeRecv(seq, result, self, stream._active._to_ref(mesh.client))
        )
        return result

    def delete_ref(self, ref: int):
        if not self.mesh.client._shutdown:
            self.mesh.client.handle_deletes(self.mesh.processes, [ref])

    # make typechecking happy for actual process functions
    @property
    def ranks(self) -> Dict["str", int]:
        raise ValueError("cannot be accessed on controller")

    @property
    def sizes(self) -> Dict["str", int]:
        raise ValueError("cannot be accessed on controller")


class FakePipe(Pipe):
    """
    Container to observe faked objects that the controller sent to the process
    """

    def __init__(self):
        self._fake_sends = deque[Any]()
        self.ref = None

    def send(self, obj: Any):
        raise RuntimeError(
            "Rather than p.send(x) use yield x to simulate a pipe worker sending data."
        )

    def recv(self):
        if self._fake_sends:
            return self._fake_sends.popleft()
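A plausible reading of remote_generator, based on the code above: the decorated function is only the controller-side propagation annotation (driven with fake tensors via fake_call to predict shapes and dtypes), while the string path names the real generator that the workers resolve and run after receiving the CreatePipe message. A hedged sketch, where the path and the surrounding `mesh` are assumptions for illustration:

    import torch
    from monarch.common.pipe import Pipe, remote_generator

    @remote_generator("my_project.dataloader.main")  # hypothetical worker-side path
    def batches(pipe: Pipe, batch_size: int):
        # Runs on the controller under fake_call; each yield describes the
        # shape/dtype of what the worker generator will actually produce.
        while True:
            yield torch.zeros(batch_size, 1024)

    with mesh.activate():        # assumes an active DeviceMesh named `mesh`
        loader = batches(32)     # sends messages.CreatePipe to the workers
        x = loader.recv()        # a monarch Tensor backed by the worker's next yield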
monarch/common/process_group.py
ADDED
@@ -0,0 +1,55 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# pyre-unsafe

import logging

import torch.distributed as dist

logger = logging.getLogger(__name__)


def _wrap_method(process_group: dist.ProcessGroup, method):
    def wrapper(*args, **kwargs):
        logger.debug(
            "ProcessGroup Call: %s with args %s and kwargs %s", method, args, kwargs
        )
        fn = getattr(process_group, method)
        try:
            return fn(*args, **kwargs)
        except Exception as e:
            logger.warning(
                "ProcessGroup Call: %s with args %s and kwargs %s failed with exception: %s",
                method,
                args,
                kwargs,
                str(e),
            )
            # TODO(rajeshn): send a message back to the controller that this
            # worker had a failed communication event
            raise e

    return wrapper


class SingleControllerProcessGroupWrapper:
    """
    Wraps a ProcessGroup object to provide a single-controller process group. This gives us
    a hook to surface all the operations on the process group to the controller.
    """

    def __new__(cls, pg: dist.ProcessGroup):
        instance = super().__new__(cls)

        for attr in dir(type(pg)):
            if not attr.startswith("__") and callable(getattr(type(pg), attr)):
                setattr(instance, attr, _wrap_method(pg, attr))

        return instance

    def __init__(self, process_group):
        self.process_group = process_group
monarch/common/recording.py
ADDED
@@ -0,0 +1,127 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# pyre-unsafe
import logging
import traceback
from collections import defaultdict
from typing import cast, Dict, Generator, List, NamedTuple, Tuple, TYPE_CHECKING, Union

from monarch.common.reference import Ref

from monarch.common.shape import iter_ranks

from monarch.common.tensor import InputChecker

from . import messages

if TYPE_CHECKING:
    from monarch.common.client import Client

from .reference import Referenceable
from .shape import NDSlice
from .tensor import Tensor

logger = logging.getLogger(__name__)

_MAX_MESSAGES_PER_DEFINE_RECORDING = 1000


def flatten_messages(
    messages: List[Tuple[Union[NDSlice, List[NDSlice]], NamedTuple]],
) -> Dict[int, List[NamedTuple]]:
    result: Dict[int, List[NamedTuple]] = defaultdict(list)
    for ranks, msg in messages:
        for rank in iter_ranks(ranks):
            result[rank].append(msg)
    return result


class Recording(Referenceable):
    def __init__(
        self,
        client: "Client",
        uses: List["Tensor"],
        mutates: List["Tensor"],
        mutated_formal_indices: List[int],
        tracebacks: List[List[traceback.FrameSummary]],
        buffered_messages: List[Tuple[Union[NDSlice, List[NDSlice]], NamedTuple]],
        nresults: int,
        nformals: int,
        first_ref: int,
    ):
        self.uses = uses
        self.mutates = mutates
        # on future invocations of this recording, new aliases for our mutated tensors exist
        # and we will technically mutate them as well. This would be simpler and faster if our
        # node tracking worked with storages rather than tensors, but for now we have to collect
        # all the aliases on each invocation
        self.mutate_aliases = [m._aliases.aliases for m in self.mutates]
        self.mutated_formal_indices = mutated_formal_indices
        self.tracebacks = tracebacks
        self.ref = client.new_ref()
        self.first_ref = first_ref
        self.client = client
        self.buffered_messages = buffered_messages
        flat_messages = flatten_messages(self.buffered_messages)
        self.ranks = NDSlice.from_list(sorted(flat_messages.keys()))
        for rank, msgs in flat_messages.items():
            ndslice = NDSlice(offset=rank, sizes=[], strides=[])
            ntotal_messages = len(msgs) // _MAX_MESSAGES_PER_DEFINE_RECORDING + (
                1 if len(msgs) % _MAX_MESSAGES_PER_DEFINE_RECORDING else 0
            )
            for enum_index, msg_index in enumerate(
                range(0, len(msgs), _MAX_MESSAGES_PER_DEFINE_RECORDING)
            ):
                self.client.send_nocoalesce(
                    ndslice,
                    messages.DefineRecording(
                        self,
                        nresults,
                        nformals,
                        msgs[
                            msg_index : msg_index  # noqa: E203
                            + _MAX_MESSAGES_PER_DEFINE_RECORDING
                        ],
                        ntotal_messages,
                        enum_index,
                    ),
                )

    def run(self, results: Generator[Tensor, None, None], actuals: List[Tensor]):
        all_uses: List[Tensor] = [*self.uses, *actuals]
        with InputChecker.from_flat_args(
            "recording", all_uses, lambda ts: (tuple(ts), {})
        ) as checker:
            mutates_actuals = [
                actuals[i]._aliases.aliases for i in self.mutated_formal_indices
            ]
            mutates = list(set().union(*self.mutate_aliases, *mutates_actuals))
            checker.check_permission(mutates)
        # we are careful to not generate the results tensors until
        # after the input checker so that we do not create tensor objects
        # for tensors that will never be defined by CallRecording
        results_tuple = list(results)
        seq = self.client.new_node(
            results_tuple + mutates,
            all_uses,
            None,
            self.tracebacks,
        )
        self.client.send(
            self.ranks,
            messages.CallRecording(
                seq,
                self,
                cast(List[Tensor | Ref], results_tuple),
                cast(List[Tensor | Ref], actuals),
            ),
        )
        return results_tuple

    def delete_ref(self, ref: int):
        if not self.client.has_shutdown:
            self.client.handle_deletes(self.ranks, [ref])
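DefineRecording messages are chunked so no single message carries more than _MAX_MESSAGES_PER_DEFINE_RECORDING entries; each chunk is sent with the total chunk count (ntotal_messages) and its own index (enum_index) so the worker can reassemble the recording in order. A small standalone sketch of the same chunking arithmetic (illustrative only, not package code):

    def chunk_sizes(n_msgs: int, max_per_chunk: int = 1000) -> list[int]:
        # mirrors: len(msgs) // MAX + (1 if len(msgs) % MAX else 0) chunks
        return [
            min(max_per_chunk, n_msgs - start)
            for start in range(0, n_msgs, max_per_chunk)
        ]

    assert chunk_sizes(2500) == [1000, 1000, 500]   # ntotal_messages == 3
    assert chunk_sizes(1000) == [1000]              # exactly one full chunk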
monarch/common/reference.py
ADDED
@@ -0,0 +1,33 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# pyre-unsafe
from typing import Optional

from monarch._rust_bindings.monarch_extension.tensor_worker import Ref


class Referenceable:
    def __init__(self):
        self.ref: Optional[int] = None

    def delete_ref(self, ref):
        raise NotImplementedError("no delete_ref method")

    def __reduce_ex__(self, protocol):
        assert (
            self.ref is not None
        ), f"{self} is being sent but does not have a reference"
        return Ref, (self.ref,)

    # Used by rust backend to get the ref for this object
    def __monarch_ref__(self) -> int:
        assert self.ref is not None
        return self.ref

    def __del__(self):
        if self.ref is not None:
            self.delete_ref(self.ref)
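Referenceable gives controller-side handles a uniform lifecycle: pickling one substitutes its integer ref (as the Rust tensor_worker.Ref), and when the Python object is garbage-collected, __del__ invokes delete_ref so a subclass can tell the workers to drop the remote resource. A hedged sketch of a subclass, where the `client` object and its all_ranks attribute are hypothetical stand-ins (the new_ref/handle_deletes calls mirror what pipe.py and recording.py do with the real client):

    from typing import Optional
    from monarch.common.reference import Referenceable

    class MyHandle(Referenceable):
        """Illustrative subclass; `client` and `client.all_ranks` are hypothetical."""

        def __init__(self, client):
            self.client = client
            self.ref: Optional[int] = client.new_ref()   # controller-assigned id

        def delete_ref(self, ref: int):
            # invoked from __del__ once this controller-side handle is collected
            self.client.handle_deletes(self.client.all_ranks, [ref])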