torchft-nightly 2026.1.3 cp310-cp310-manylinux_2_24_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- torchft/__init__.py +34 -0
- torchft/_test/diloco_trainer.py +287 -0
- torchft/_test/managed_work_test.py +320 -0
- torchft/_test_utils.py +111 -0
- torchft/_torchft.cpython-310-x86_64-linux-gnu.so +0 -0
- torchft/_torchft.pyi +116 -0
- torchft/checkpointing/__init__.py +20 -0
- torchft/checkpointing/_rwlock.py +136 -0
- torchft/checkpointing/_serialization.py +39 -0
- torchft/checkpointing/http_transport.py +299 -0
- torchft/checkpointing/http_transport_bench.py +61 -0
- torchft/checkpointing/http_transport_test.py +146 -0
- torchft/checkpointing/pg_transport.py +306 -0
- torchft/checkpointing/pg_transport_bench.py +99 -0
- torchft/checkpointing/pg_transport_test.py +101 -0
- torchft/checkpointing/rwlock_test.py +58 -0
- torchft/checkpointing/transport.py +68 -0
- torchft/checkpointing/transport_test.py +161 -0
- torchft/collectives.py +415 -0
- torchft/collectives_test.py +212 -0
- torchft/coordination.py +39 -0
- torchft/coordination_test.py +29 -0
- torchft/data.py +77 -0
- torchft/data_test.py +39 -0
- torchft/ddp.py +105 -0
- torchft/ddp_test.py +68 -0
- torchft/diloco_regression_test.py +644 -0
- torchft/examples/slurm/README.md +34 -0
- torchft/examples/slurm/punisher.py +95 -0
- torchft/examples/slurm/runner.py +221 -0
- torchft/fsdp_test.py +102 -0
- torchft/futures.py +353 -0
- torchft/futures_test.py +140 -0
- torchft/http.py +13 -0
- torchft/lighthouse_test.py +163 -0
- torchft/local_sgd.py +796 -0
- torchft/local_sgd_integ_test.py +600 -0
- torchft/local_sgd_test.py +324 -0
- torchft/manager.py +1358 -0
- torchft/manager_integ_test.py +653 -0
- torchft/manager_test.py +911 -0
- torchft/multiprocessing.py +38 -0
- torchft/multiprocessing_dummy_context.py +135 -0
- torchft/multiprocessing_test.py +58 -0
- torchft/optim.py +63 -0
- torchft/optim_test.py +50 -0
- torchft/otel.py +134 -0
- torchft/parameter_server.py +195 -0
- torchft/parameter_server_test.py +47 -0
- torchft/process_group.py +2118 -0
- torchft/process_group_test.py +1028 -0
- torchft/quantization.py +686 -0
- torchft/quantization_test.py +131 -0
- torchft/torchx.py +89 -0
- torchft/utils.py +67 -0
- torchft/work.py +26 -0
- torchft_nightly-2026.1.3.dist-info/METADATA +308 -0
- torchft_nightly-2026.1.3.dist-info/RECORD +61 -0
- torchft_nightly-2026.1.3.dist-info/WHEEL +4 -0
- torchft_nightly-2026.1.3.dist-info/entry_points.txt +2 -0
- torchft_nightly-2026.1.3.dist-info/licenses/LICENSE +34 -0
torchft/checkpointing/pg_transport.py
@@ -0,0 +1,306 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import pickle
+import time
+from contextlib import contextmanager
+from dataclasses import dataclass
+from datetime import timedelta
+from typing import Callable, cast, Generator, Optional, TypeVar, Union
+
+import torch
+from torch.distributed import Work
+from torch.distributed.tensor import _DTensorSpec, DTensor
+from torch.utils._pytree import (
+    KeyPath,
+    tree_flatten_with_path,
+    tree_unflatten,
+    TreeSpec,
+)
+
+from torchft.checkpointing.transport import CheckpointTransport
+from torchft.process_group import ProcessGroup
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+T = TypeVar("T")
+
+
+@dataclass
+class _TensorMeta:
+    """
+    This is the metadata for a tensor that is used to transfer checkpoints.
+    It contains the shape, the dtype, the storage offset and the stride of the
+    tensor.
+
+    This must be pickleable so that it can be sent over the wire.
+    """
+
+    shape: torch.Size
+    dtype: torch.dtype
+    storage_offset: int
+    stride: tuple[int, ...]
+    nbytes: int
+
+
+@dataclass
+class _DTensorMeta:
+    """
+    This is the metadata for a DTensor that is used to transfer checkpoints.
+    It contains the metadata for the local tensor and the spec of the DTensor.
+
+    This must be pickleable so that it can be sent over the wire.
+    """
+
+    local: _TensorMeta
+    spec: _DTensorSpec
+
+
+@dataclass
+class _StateDictMeta:
+    """
+    This is the metadata for a state dict that is used to transfer checkpoints.
+    It contains the step, the pytree spec of the state dict and the metadata for
+    each tensor in the state dict.
+
+    This must be pickleable so that it can be sent over the wire.
+
+    Args:
+        step: the step of the checkpoint to verify consistency
+        treespec: the pytree spec of the state dict
+        paths: the path of each leaf in the state dict
+        non_tensor_leaves: the metadata for each tensor in the state dict and any
+            non-tensor leaves in the state dict
+    """
+
+    step: int
+    treespec: TreeSpec
+    paths: list[KeyPath]
+    non_tensor_leaves: list[Union[object, _TensorMeta, _DTensorMeta]]
+
+
+@contextmanager
+def _timeit(name: str) -> Generator[None, None, None]:
+    start = time.perf_counter()
+    yield
+    dur = time.perf_counter() - start
+    logger.info(f"{name} took {dur}s")
+
+
+def _prepare_tensor(tensor: torch.Tensor) -> tuple[torch.Tensor, _TensorMeta]:
+    return (
+        _cast_tensor(tensor, torch.uint8),
+        _TensorMeta(
+            shape=tensor.shape,
+            dtype=tensor.dtype,
+            storage_offset=cast(int, tensor.storage_offset()),
+            stride=tensor.stride(),
+            nbytes=tensor.untyped_storage().nbytes(),
+        ),
+    )
+
+
+def _prepare_state_dict(
+    state_dict: object,
+    step: int,
+    device: torch.device,
+) -> tuple[_StateDictMeta, list[torch.Tensor]]:
+    leaves: list[tuple[KeyPath, object]]
+    leaves, treespec = tree_flatten_with_path(state_dict)
+
+    paths: list[KeyPath] = []
+    non_tensor_leaves: list[Union[object, _TensorMeta, _DTensorMeta]] = []
+    tensors: list[torch.Tensor] = []
+    for key_path, v in leaves:
+        paths.append(key_path)
+
+        if isinstance(v, DTensor):
+            tensor, tensor_meta = _prepare_tensor(v._local_tensor)
+
+            tensors.append(tensor)
+
+            non_tensor_leaves.append(
+                _DTensorMeta(
+                    local=tensor_meta,
+                    spec=v._spec,
+                )
+            )
+        elif isinstance(v, torch.Tensor):
+            tensor, tensor_meta = _prepare_tensor(v)
+            tensors.append(tensor)
+            non_tensor_leaves.append(tensor_meta)
+        else:
+            non_tensor_leaves.append(v)
+
+    return (
+        _StateDictMeta(
+            step=step,
+            treespec=treespec,
+            paths=paths,
+            non_tensor_leaves=non_tensor_leaves,
+        ),
+        tensors,
+    )
+
+
+def _cast_tensor(tensor: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
+    """
+    Casts the underlying storage to a tensor of the given dtype.
+
+    The returned tensor will be of size ``storage.nbytes``.
+
+    This works for all datatypes and supports strided/offset tensors with the
+    caveat that the cast tensor may be larger than the original tensor due to
+    the differences in striding.
+    """
+    assert (
+        type(tensor) is torch.Tensor
+    ), f"can only cast standard tensors not {type(tensor)}"
+    storage = tensor.untyped_storage()
+    ret = torch.tensor(storage, dtype=dtype, device=tensor.device)
+    assert ret.untyped_storage() is storage, "storage should be the same"
+    return ret
+
+
+class PGTransport(CheckpointTransport[T]):
+    """
+    This is a checkpoint transport that uses the process group to transfer checkpoints.
+    This allows for fast recovery of workers by fetching the current weights
+    from an existing worker.
+
+    Args:
+        pg: the process group to use for communication
+        timeout: the timeout for communication
+        device: the device to use for tensors
+        state_dict: if specified this function will be called to do an inplace
+            receive into the returned state_dict. This is much faster than
+            having to allocate new tensors and transferring them to the CPU.
+    """
+
+    def __init__(
+        self,
+        pg: ProcessGroup,
+        timeout: timedelta,
+        device: torch.device,
+        state_dict: Optional[Callable[[], object]] = None,
+    ) -> None:
+        self._work: list[Work] = []
+        self._pg = pg
+        self._timeout = timeout
+        self._device = device
+        self._state_dict = state_dict
+
+    def metadata(self) -> str:
+        return "<n/a>"
+
+    def disallow_checkpoint(self) -> None:
+        pass
+
+    def send_checkpoint(
+        self, dst_ranks: list[int], step: int, state_dict: T, timeout: timedelta
+    ) -> None:
+        with _timeit("preparing state_dict"):
+            meta, tensors = _prepare_state_dict(state_dict, step, device=self._device)
+
+        work = []
+
+        with _timeit("send pickle"):
+            buf = pickle.dumps(meta)
+            len_t = torch.tensor([len(buf)], dtype=torch.int64, device=self._device)
+            buf_t = torch.frombuffer(buf, dtype=torch.uint8).to(self._device)
+            for dst_rank in dst_ranks:
+                work.append(self._pg.send([len_t], dst_rank, tag=1))
+                work.append(self._pg.send([buf_t], dst_rank, tag=2))
+
+        with _timeit("send tensors"):
+            for i, t in enumerate(tensors):
+                original_device = t.device
+                t = t.to(self._device)
+                for dst_rank in dst_ranks:
+                    work.append(self._pg.send([t], dst_rank, tag=3 + i))
+
+                # if we did a copy we should wait for the work to complete so we
+                # can free the memory to avoid OOMs
+                if original_device == torch.device("cpu"):
+                    for w in work:
+                        w.wait(timeout)
+                    work = []
+
+        for w in work:
+            w.wait(timeout)
+
+    def recv_checkpoint(
+        self, src_rank: int, metadata: str, step: int, timeout: timedelta
+    ) -> T:
+        state_dict = self._state_dict() if self._state_dict else {}
+        state_dict_leaves, _ = tree_flatten_with_path(state_dict)
+
+        dst_tensors: dict[KeyPath, object] = dict(state_dict_leaves)
+
+        len_t = torch.zeros(1, dtype=torch.int64, device=self._device)
+        self._pg.recv([len_t], src_rank, tag=1).wait(timeout)
+        length = cast(int, len_t.item())
+
+        assert length > 0, f"invalid metadata length {length=}"
+
+        buf = torch.empty(length, dtype=torch.uint8, device=self._device)
+        self._pg.recv([buf], src_rank, tag=2).wait(timeout)
+
+        meta: _StateDictMeta = pickle.loads(buf.cpu().numpy().tobytes())
+        assert meta.step == step
+
+        i: int = 0
+        works: list[Work] = []
+
+        def recv(path: KeyPath, v: _TensorMeta) -> torch.Tensor:
+            nonlocal i
+
+            inplace = dst_tensors.get(path)
+            if (
+                isinstance(inplace, torch.Tensor)
+                and inplace.device.type == self._device.type
+            ):
+                if isinstance(inplace, DTensor):
+                    inplace = inplace._local_tensor
+                t = _cast_tensor(inplace, torch.uint8)
+                assert (
+                    t.nbytes == v.nbytes
+                ), "inplace tensor storage must be the same size"
+            else:
+                t = torch.empty(v.nbytes, dtype=torch.uint8, device=self._device)
+
+            work = self._pg.recv([t], src_rank, tag=3 + i)
+            i += 1
+
+            if inplace is None:
+                # if not inplace we need to copy it to CPU to avoid OOMing
+                work.wait(timeout)
+                t = t.cpu()
+            else:
+                works.append(work)
+
+            return torch.as_strided(
+                t.view(v.dtype),
+                size=v.shape,
+                stride=v.stride,
+                storage_offset=v.storage_offset,
+            )
+
+        values = []
+        for path, v in zip(meta.paths, meta.non_tensor_leaves):
+            if isinstance(v, _TensorMeta):
+                values.append(recv(path, v))
+            elif isinstance(v, _DTensorMeta):
+                tensor = recv(path, v.local)
+                values.append(DTensor(tensor, v.spec, requires_grad=False))
+            else:
+                values.append(v)
+
+        for work in works:
+            work.wait(timeout)
+
+        return tree_unflatten(values, meta.treespec)
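The key mechanism in pg_transport.py above is that every tensor, whatever its dtype, stride, or storage offset, is shipped as a flat uint8 view of its untyped storage and rebuilt on the receiving side with torch.as_strided from the pickled _TensorMeta. A minimal round-trip sketch of that idea, using the private _prepare_tensor helper from the file above with the actual process-group send/recv elided:

import torch

from torchft.checkpointing.pg_transport import _prepare_tensor

# A non-contiguous view: shape (3, 2), stride (4, 2). The stride and
# offset must survive the round trip.
base = torch.arange(12, dtype=torch.float32).reshape(3, 4)
view = base[:, ::2]

# Sender side: flat uint8 view of the storage plus pickleable metadata.
wire, meta = _prepare_tensor(view)

# Receiver side: pretend `wire` arrived via ProcessGroup.send/recv, then
# reinterpret the bytes and re-apply shape/stride/offset.
restored = torch.as_strided(
    wire.view(meta.dtype),
    size=meta.shape,
    stride=meta.stride,
    storage_offset=meta.storage_offset,
)
assert torch.equal(restored, view)

Note that the uint8 wire tensor covers the whole underlying storage (meta.nbytes), which is why, per the _cast_tensor docstring, a strided view can be larger on the wire than the data it exposes.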
torchft/checkpointing/pg_transport_bench.py
@@ -0,0 +1,99 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import sys
+from concurrent.futures import ThreadPoolExecutor
+from datetime import timedelta
+
+import torch
+import torch.distributed as dist
+
+from torchft.checkpointing.pg_transport import _timeit, PGTransport
+from torchft.process_group import ProcessGroupBabyNCCL
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+def main(argv: list[str]) -> None:
+    import argparse
+
+    logging.basicConfig(level=logging.INFO)
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--inplace", action="store_true")
+    parser.add_argument("--device", type=str, default="cpu")
+    parser.add_argument("--chunk-size", type=int, default=3_000_000)  # 3MB
+    parser.add_argument("--total-size", type=int, default=12_000_000_000)  # 12GB
+    args = parser.parse_args(argv)
+
+    CHUNK_SIZE: int = args.chunk_size
+    TOTAL_SIZE: int = args.total_size
+    INPLACE: bool = args.inplace
+    DEVICE: str = args.device
+
+    timeout: timedelta = timedelta(seconds=10)
+
+    store = dist.TCPStore(
+        "localhost",
+        0,
+        is_master=True,
+        timeout=timeout,
+        wait_for_workers=False,
+    )
+    store_addr: str = f"localhost:{store.port}"
+
+    def run(rank: int) -> None:
+        torch.cuda.set_device(rank)
+
+        device = torch.device(DEVICE)
+
+        with _timeit("init_pg"):
+            pg = ProcessGroupBabyNCCL(timeout=timeout)
+            pg.configure(store_addr=store_addr, replica_id="0", rank=rank, world_size=2)
+
+            t = torch.zeros(10, device=device, dtype=torch.float32)
+            pg.allreduce([t], dist.ReduceOp.SUM).wait(timeout=timeout)
+
+        with _timeit("create state_dict"):
+            state_dict: dict[str, torch.Tensor] = {}
+            for i in range(0, TOTAL_SIZE, CHUNK_SIZE):
+                state_dict[f"chunk/{i}"] = torch.zeros(
+                    CHUNK_SIZE // 4, dtype=torch.float32, device=device
+                )
+
+        def get_state_dict() -> object:
+            return state_dict
+
+        transport = PGTransport(
+            pg=pg,
+            timeout=timeout,
+            device=device,
+            state_dict=get_state_dict if INPLACE else None,
+        )
+        metadata = transport.metadata()
+
+        if rank == 0:
+            with _timeit("send_checkpoint"):
+                transport.send_checkpoint(
+                    dst_ranks=[1],
+                    step=1,
+                    state_dict=state_dict,
+                    timeout=timedelta(seconds=60),
+                )
+        elif rank == 1:
+            with _timeit("recv_checkpoint"):
+                transport.recv_checkpoint(
+                    src_rank=0, metadata=metadata, step=1, timeout=timedelta(seconds=60)
+                )
+
+    with ThreadPoolExecutor(max_workers=2) as executor:
+        results = executor.map(run, range(2))
+        list(results)
+
+
+if __name__ == "__main__":
+    main(sys.argv[1:])
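Since main() above takes an argv list, the benchmark can be driven programmatically as well as from the command line. A hedged example (it assumes two local CUDA devices are available, since the runner always calls torch.cuda.set_device; the flag values are illustrative):

from torchft.checkpointing.pg_transport_bench import main

# Benchmark an inplace receive of ~1 GB between two local GPUs.
main(["--device", "cuda", "--inplace", "--total-size", "1_000_000_000"])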
torchft/checkpointing/pg_transport_test.py
@@ -0,0 +1,101 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import sys
+from datetime import timedelta
+from unittest import skipIf, skipUnless, TestCase
+
+import torch
+from torch.distributed import TCPStore
+
+from torchft.checkpointing.pg_transport import PGTransport
+from torchft.checkpointing.transport import CheckpointTransport
+from torchft.checkpointing.transport_test import (
+    make_state_dict,
+    run_multi_recovery_test,
+)
+from torchft.process_group import ProcessGroupBabyNCCL, ProcessGroupGloo
+
+
+class PGTransportTest(TestCase):
+    # pyre-fixme[56]: Pyre was not able to infer the type of argument
+    @skipIf(sys.platform == "darwin", "not passing on mac")
+    def test_pg_transport_gloo(self) -> None:
+        store: TCPStore = TCPStore(
+            host_name="localhost", port=0, is_master=True, wait_for_workers=False
+        )
+        device: torch.device = torch.device("cpu")
+
+        def init(rank: int, world_size: int) -> CheckpointTransport[dict[str, object]]:
+            pg = ProcessGroupGloo()
+            pg.configure(
+                store_addr=f"localhost:{store.port}/prefix",
+                replica_id="0",
+                rank=rank,
+                world_size=world_size,
+            )
+
+            return PGTransport[dict[str, object]](
+                pg, timeout=timedelta(seconds=10), device=device
+            )
+
+        run_multi_recovery_test(self, init, device=device)
+
+    # pyre-fixme[56]: Pyre was not able to infer the type of argument
+    @skipUnless(torch.cuda.device_count() >= 3, "need three CUDA devices")
+    def test_pg_transport_baby_nccl(self) -> None:
+        store: TCPStore = TCPStore(
+            host_name="localhost", port=0, is_master=True, wait_for_workers=False
+        )
+        device: torch.device = torch.device("cuda")
+        timeout: timedelta = timedelta(seconds=10)
+
+        def init(rank: int, world_size: int) -> CheckpointTransport[dict[str, object]]:
+            torch.cuda.set_device(rank)
+
+            pg = ProcessGroupBabyNCCL(timeout=timeout)
+            pg.configure(
+                store_addr=f"localhost:{store.port}/prefix",
+                replica_id="0",
+                rank=rank,
+                world_size=world_size,
+            )
+
+            return PGTransport[dict[str, object]](pg, timeout=timeout, device=device)
+
+        run_multi_recovery_test(self, init, device=device)
+
+    # pyre-fixme[56]: Pyre was not able to infer the type of argument
+    @skipUnless(torch.cuda.device_count() >= 3, "need three CUDA devices")
+    def test_pg_transport_baby_nccl_inplace(self) -> None:
+        store: TCPStore = TCPStore(
+            host_name="localhost", port=0, is_master=True, wait_for_workers=False
+        )
+        device: torch.device = torch.device("cuda")
+        timeout: timedelta = timedelta(seconds=10)
+
+        def state_dict() -> dict[str, object]:
+            return make_state_dict(device)
+
+        def init(rank: int, world_size: int) -> CheckpointTransport[dict[str, object]]:
+            torch.cuda.set_device(rank)
+
+            pg = ProcessGroupBabyNCCL(timeout=timeout)
+            pg.configure(
+                store_addr=f"localhost:{store.port}/prefix",
+                replica_id="0",
+                rank=rank,
+                world_size=world_size,
+            )
+
+            return PGTransport[dict[str, object]](
+                pg,
+                timeout=timeout,
+                device=device,
+                state_dict=state_dict,
+            )
+
+        run_multi_recovery_test(self, init, device=device)
torchft/checkpointing/rwlock_test.py
@@ -0,0 +1,58 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import pytest
+
+from torchft.checkpointing._rwlock import RWLock
+
+
+def test_w_locked() -> None:
+    lock = RWLock()
+
+    with lock.w_lock():
+        assert lock.w_locked()
+    assert not lock.w_locked()
+
+
+def test_w_lock_timeout() -> None:
+    lock = RWLock(timeout=0.01)
+
+    lock.r_acquire()
+    lock.r_acquire()
+
+    with pytest.raises(TimeoutError):
+        lock.w_acquire()
+
+    with pytest.raises(TimeoutError):
+        with lock.w_lock():
+            pass
+
+    lock.r_release()
+    with pytest.raises(TimeoutError):
+        lock.w_acquire()
+
+    lock.r_release()
+    with lock.w_lock():
+        pass
+    lock.w_acquire()
+
+
+def test_r_lock_timeout() -> None:
+    lock = RWLock(timeout=0.01)
+
+    lock.w_acquire()
+
+    with pytest.raises(TimeoutError):
+        lock.r_acquire()
+
+    with pytest.raises(TimeoutError):
+        with lock.r_lock():
+            pass
+
+    lock.w_release()
+    with lock.r_lock():
+        pass
+    lock.r_acquire()
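The tests above pin down RWLock's contract: read acquisitions are shared, write acquisitions are exclusive, and both paths raise TimeoutError once the lock's configured timeout elapses. A minimal sketch of the pattern these semantics support (the checkpoint-serving framing is an assumption, not code from the package):

from torchft.checkpointing._rwlock import RWLock

lock = RWLock(timeout=5.0)

def serve_checkpoint() -> None:
    # Shared: any number of concurrent readers may stream the checkpoint.
    with lock.r_lock():
        ...

def update_weights() -> None:
    # Exclusive: blocks until all readers drain, or raises TimeoutError
    # after the 5 second timeout configured above.
    with lock.w_lock():
        ...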
torchft/checkpointing/transport.py
@@ -0,0 +1,68 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from abc import ABC, abstractmethod
+from datetime import timedelta
+from typing import Generic, List, TypeVar
+
+T = TypeVar("T")
+
+
+class CheckpointTransport(Generic[T], ABC):
+    @abstractmethod
+    def metadata(self) -> str:
+        """
+        Returns a string that will be used by the remote CheckpointTransport to fetch the checkpoint.
+        """
+        ...
+
+    @abstractmethod
+    def send_checkpoint(
+        self, dst_ranks: List[int], step: int, state_dict: T, timeout: timedelta
+    ) -> None:
+        """
+        Sends the checkpoint, only called when there is a rank that is behind.
+
+        This may be async.
+
+        Args:
+            dst_ranks: the ranks to send to
+            step: the step number to send
+            state_dict: the state dict to send
+            timeout: the timeout to wait for the checkpoint to be sent
+        """
+        ...
+
+    def disallow_checkpoint(self) -> None:
+        """
+        Called after send_checkpoint to wait for the checkpoint to be sent.
+
+        Once this returns, the state_dict may be mutated so no further data should be sent.
+        """
+        ...
+
+    @abstractmethod
+    def recv_checkpoint(
+        self, src_rank: int, metadata: str, step: int, timeout: timedelta
+    ) -> T:
+        """
+        Receives the checkpoint from the given rank.
+
+        Args:
+            src_rank: the rank to receive the checkpoint from
+            metadata: the metadata returned by the remote CheckpointTransport
+            step: the step number to receive
+            timeout: the timeout to wait for the checkpoint
+        """
+        ...
+
+    def shutdown(self, wait: bool = True) -> None:
+        """
+        Called to shutdown the checkpoint transport.
+
+        Args:
+            wait: whether to wait for the transport to shutdown
+        """