PyPI - torchft-nightly - Versions diffs - 2026.1.3__cp310-cp310-manylinux_2_24_x86_64.whl - Mend

torchft-nightly 2026.1.3__cp310-cp310-manylinux_2_24_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (61)

torchft/__init__.py +34 -0
torchft/_test/diloco_trainer.py +287 -0
torchft/_test/managed_work_test.py +320 -0
torchft/_test_utils.py +111 -0
torchft/_torchft.cpython-310-x86_64-linux-gnu.so +0 -0
torchft/_torchft.pyi +116 -0
torchft/checkpointing/__init__.py +20 -0
torchft/checkpointing/_rwlock.py +136 -0
torchft/checkpointing/_serialization.py +39 -0
torchft/checkpointing/http_transport.py +299 -0
torchft/checkpointing/http_transport_bench.py +61 -0
torchft/checkpointing/http_transport_test.py +146 -0
torchft/checkpointing/pg_transport.py +306 -0
torchft/checkpointing/pg_transport_bench.py +99 -0
torchft/checkpointing/pg_transport_test.py +101 -0
torchft/checkpointing/rwlock_test.py +58 -0
torchft/checkpointing/transport.py +68 -0
torchft/checkpointing/transport_test.py +161 -0
torchft/collectives.py +415 -0
torchft/collectives_test.py +212 -0
torchft/coordination.py +39 -0
torchft/coordination_test.py +29 -0
torchft/data.py +77 -0
torchft/data_test.py +39 -0
torchft/ddp.py +105 -0
torchft/ddp_test.py +68 -0
torchft/diloco_regression_test.py +644 -0
torchft/examples/slurm/README.md +34 -0
torchft/examples/slurm/punisher.py +95 -0
torchft/examples/slurm/runner.py +221 -0
torchft/fsdp_test.py +102 -0
torchft/futures.py +353 -0
torchft/futures_test.py +140 -0
torchft/http.py +13 -0
torchft/lighthouse_test.py +163 -0
torchft/local_sgd.py +796 -0
torchft/local_sgd_integ_test.py +600 -0
torchft/local_sgd_test.py +324 -0
torchft/manager.py +1358 -0
torchft/manager_integ_test.py +653 -0
torchft/manager_test.py +911 -0
torchft/multiprocessing.py +38 -0
torchft/multiprocessing_dummy_context.py +135 -0
torchft/multiprocessing_test.py +58 -0
torchft/optim.py +63 -0
torchft/optim_test.py +50 -0
torchft/otel.py +134 -0
torchft/parameter_server.py +195 -0
torchft/parameter_server_test.py +47 -0
torchft/process_group.py +2118 -0
torchft/process_group_test.py +1028 -0
torchft/quantization.py +686 -0
torchft/quantization_test.py +131 -0
torchft/torchx.py +89 -0
torchft/utils.py +67 -0
torchft/work.py +26 -0
torchft_nightly-2026.1.3.dist-info/METADATA +308 -0
torchft_nightly-2026.1.3.dist-info/RECORD +61 -0
torchft_nightly-2026.1.3.dist-info/WHEEL +4 -0
torchft_nightly-2026.1.3.dist-info/entry_points.txt +2 -0
torchft_nightly-2026.1.3.dist-info/licenses/LICENSE +34 -0

torchft/checkpointing/http_transport.py ADDED Viewed

@@ -0,0 +1,299 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+import logging
+import socket
+import threading
+import time
+import urllib.request
+from concurrent.futures import ThreadPoolExecutor
+from contextlib import contextmanager, nullcontext
+from datetime import timedelta
+from http.server import BaseHTTPRequestHandler
+from typing import cast, Generator, List, Optional, TypeVar
+import torch
+from torch.utils._pytree import tree_flatten, tree_unflatten, TreeSpec
+from torchft.checkpointing._rwlock import RWLock
+from torchft.checkpointing._serialization import _streaming_load, _streaming_save
+from torchft.checkpointing.transport import CheckpointTransport
+from torchft.http import _IPv6HTTPServer
+logger: logging.Logger = logging.getLogger(__name__)
+T = TypeVar("T")
+@contextmanager
+def _time(desc: str) -> Generator[None, None, None]:
+    """
+    Context manager that logs how long the wrapped block took to run.
+    The duration is logged even when the wrapped block raises, so failed
+    operations still report how long they ran before failing.
+    Args:
+        desc: description used as the prefix of the log message
+    """
+    start = time.perf_counter()
+    try:
+        yield
+    finally:
+        # Exceptions from the wrapped block are re-raised at the yield; logging
+        # in finally keeps the timing message from being skipped in that case.
+        end = time.perf_counter()
+        logger.info(f"{desc} took {end - start}s")
+class HTTPTransport(CheckpointTransport[T]):
+    """
+    This is an HTTP server that can be used to transfer checkpoints
+    between workers.
+    This allows for fast recovery of workers by fetching the current weights
+    from an existing worker.
+    The constructor binds the server and starts serving from a daemon thread.
+    Requests block on the checkpoint lock until the first
+    send_checkpoint/allow_checkpoint call.
+    Args:
+        timeout: the timeout for HTTP requests
+        num_chunks: the number of chunks to split the checkpoint into (0 for no chunking)
+    """
+    def __init__(self, timeout: timedelta, num_chunks: int) -> None:
+        self._checkpoint_lock = RWLock(timeout=timeout.total_seconds())
+        self._disallowed = False
+        self._step = -1
+        self._timeout = timeout
+        self._state_dict: Optional[T] = None
+        self._num_chunks = num_chunks
+        self._stream: Optional[torch.cuda.Stream] = (
+            torch.cuda.Stream() if torch.cuda.is_available() else None
+        )
+        # staged checkpoint information
+        self._spec: Optional[TreeSpec] = None
+        self._chunks: Optional[List[List[object]]] = None
+        # We don't allow checkpoints until the first send_checkpoint to avoid
+        # serving the default step=-1 invalid checkpoint.
+        self.disallow_checkpoint()
+        # The handler class below has its own `self`, so keep a separate
+        # reference to the transport for use inside do_GET.
+        ckpt_server = self
+        class RequestHandler(BaseHTTPRequestHandler):
+            # set request socket timeout to avoid hanging forever
+            timeout = self._timeout.total_seconds()
+            def do_GET(self):
+                """
+                Serves /checkpoint/<step>/<key> where key is "full",
+                "metadata" or a chunk index. Invalid requests get a 400,
+                unexpected failures a 500.
+                """
+                try:
+                    # validate socket timeout is actually set
+                    assert self.connection.gettimeout() == self.timeout
+                    with ckpt_server._checkpoint_lock.r_lock():
+                        parts = self.path.split("/")
+                        # Reject malformed paths with a 400 rather than an
+                        # assert, which would surface as a 500 and is stripped
+                        # entirely under python -O.
+                        if len(parts) != 4 or parts[1] != "checkpoint":
+                            self.send_error(
+                                400,
+                                f"invalid url format, expected /checkpoint/step/key but got {self.path}",
+                            )
+                            return
+                        try:
+                            step = int(parts[2])
+                        except ValueError:
+                            self.send_error(
+                                400,
+                                f"invalid url format, expected an integer step but got {self.path}",
+                            )
+                            return
+                        if step != ckpt_server._step:
+                            self.send_error(
+                                400,
+                                f"invalid checkpoint requested, serving {ckpt_server._step} but got {step=}",
+                            )
+                            return
+                        key = parts[3]
+                        if key == "full":
+                            payload = ckpt_server._state_dict
+                        elif key == "metadata":
+                            payload = ckpt_server._spec
+                        else:
+                            chunks = ckpt_server._chunks
+                            try:
+                                index = int(key)
+                            except ValueError:
+                                index = -1
+                            # Only indices of staged chunks are valid; this
+                            # also rejects every chunk request when chunking
+                            # is disabled or nothing has been staged yet.
+                            if chunks is None or not 0 <= index < len(chunks):
+                                self.send_error(
+                                    400,
+                                    f"invalid chunk requested, got {key=}",
+                                )
+                                return
+                            payload = chunks[index]
+                        self.send_response(200)
+                        self.send_header("Content-type", "application/octet-stream")
+                        self.end_headers()
+                        _streaming_save(payload, self.wfile)
+                except Exception as e:
+                    logger.exception(
+                        f"Exception in checkpoint server when handling {self.path=}: {e}",
+                    )
+                    self.send_error(500, str(e))
+        # empty host and port 0: bind all interfaces on an OS-assigned port
+        server_address = ("", 0)
+        self._server = _IPv6HTTPServer(server_address, RequestHandler)
+        logger.info(f"Started CheckpointServer on {self.address()}...")
+        self._thread = threading.Thread(
+            target=self._serve,
+            args=(),
+            daemon=True,
+        )
+        self._thread.start()
+    @classmethod
+    def _load_from_address(cls, address: str, timeout: timedelta) -> object:
+        """
+        Loads a checkpoint from the given address.
+        Args:
+            address: the HTTP address to load the checkpoint from
+            timeout: the timeout passed to urlopen for the request
+        Returns:
+            the deserialized object served at the address
+        """
+        msg = f"fetching checkpoint from {address}"
+        logger.info(msg)
+        with (
+            _time(msg),
+            urllib.request.urlopen(address, timeout=timeout.total_seconds()) as f,
+        ):
+            # We have to set weights_only to False as there are some non-tensor
+            # states like lr_scheduler.
+            # NOTE(review): weights_only=False presumably permits arbitrary
+            # unpickling, so this should only fetch from trusted peers -
+            # confirm against _streaming_load.
+            # pyre-fixme[16]: needs torch>=2.7
+            return cast(T, _streaming_load(f, weights_only=False))
+    def address(self) -> str:
+        """
+        Returns the HTTP address to fetch a checkpoint from this server. Step must be appended to the end of the address.
+        Format: http://host:port/checkpoint/1234
+        Returns:
+            an HTTP address
+        """
+        port = self._server.socket.getsockname()[1]
+        return f"http://{socket.gethostname()}:{port}/checkpoint/"
+    def _serve(self) -> None:
+        """
+        Thread target that runs the HTTP server loop and logs any exception
+        that escapes it.
+        """
+        try:
+            self._server.serve_forever()
+        except Exception:
+            logger.exception("got exception in checkpoint server")
+    def disallow_checkpoint(self) -> None:
+        """
+        Disallows serving the checkpoint.
+        All requests will block until allow_checkpoint is called.
+        Calling this while already disallowed is a no-op.
+        """
+        if not self._disallowed:
+            self._disallowed = True
+            self._checkpoint_lock.w_acquire()
+    def allow_checkpoint(self, step: int) -> None:
+        """
+        Allows serving the checkpoint with the specified step number.
+        The step is always updated; the write lock is only released if
+        serving was previously disallowed.
+        Args:
+            step: the step number to serve
+        """
+        self._step = step
+        if self._disallowed:
+            self._disallowed = False
+            self._checkpoint_lock.w_release()
+    def shutdown(self, wait: bool = True) -> None:
+        """
+        Shutdown the server.
+        Args:
+            wait: if True, stop the server and join the serving thread. If
+                False, only request the serve loop to stop and return
+                immediately.
+        """
+        if not wait:
+            # hack for nonblocking shutdown of socketserver threads
+            # The flag is a private (name-mangled) attribute of
+            # socketserver.BaseServer. Writing `self._server.__shutdown_request`
+            # inside this class would be mangled to
+            # `_HTTPTransport__shutdown_request` and never be read by
+            # serve_forever, so the BaseServer-mangled name is set explicitly.
+            setattr(self._server, "_BaseServer__shutdown_request", True)
+        else:
+            self._server.shutdown()
+            self._thread.join()
+    def metadata(self) -> str:
+        """
+        Returns the metadata peers need to fetch from this transport, which
+        is the server address.
+        """
+        return self.address()
+    def send_checkpoint(
+        self, dst_ranks: List[int], step: int, state_dict: T, timeout: timedelta
+    ) -> None:
+        """
+        Stages state_dict on the CPU and allows it to be served for step.
+        Args:
+            dst_ranks: unused by this transport
+            step: the step number the staged checkpoint is served under
+            state_dict: the state dict to stage
+            timeout: unused by this transport
+        """
+        values, spec = tree_flatten(state_dict)
+        with (
+            torch.cuda.stream(self._stream)
+            if self._stream is not None
+            else nullcontext()
+        ):
+            with _time("transferring state_dict to CPU"):
+                values = _to_cpu(values, pin_memory=False)
+                if self._stream is not None:
+                    self._stream.synchronize()
+        # Unflatten so non-chunked transfer uses CPU tensors
+        self._state_dict = tree_unflatten(values, spec)
+        # Save spec for chunked
+        self._spec = spec
+        self._chunks = _split_chunks(values, self._num_chunks)
+        self.allow_checkpoint(step)
+    def recv_checkpoint(
+        self, src_rank: int, metadata: str, step: int, timeout: timedelta
+    ) -> T:
+        """
+        Fetches the checkpoint for step from the server described by metadata.
+        Args:
+            src_rank: unused by this transport
+            metadata: the address returned by the sender's metadata()
+            step: the step number to fetch
+            timeout: the timeout for each HTTP request
+        Returns:
+            the received state dict
+        """
+        base_url = f"{metadata}{step}"
+        if self._num_chunks == 0:
+            return cast(T, self._load_from_address(f"{base_url}/full", timeout))
+        else:
+            # fetch the tree spec and all chunks concurrently, one thread each
+            urls = [f"{base_url}/metadata"] + [
+                f"{base_url}/{i}" for i in range(self._num_chunks)
+            ]
+            with ThreadPoolExecutor(max_workers=len(urls)) as executor:
+                futures = [
+                    executor.submit(self._load_from_address, url, timeout)
+                    for url in urls
+                ]
+                spec, *chunks = [future.result() for future in futures]
+                spec = cast(TreeSpec, spec)
+                chunks = cast(List[List[object]], chunks)
+            values = _merge_chunks(chunks, self._num_chunks)
+            return tree_unflatten(values, spec)
+def _to_cpu(values: List[T], pin_memory: bool) -> List[T]:
+    """
+    Returns a new list with every CUDA tensor in values copied to the CPU.
+    Non-tensor values and tensors that are not on a CUDA device are passed
+    through as the same objects, not copies.
+    Args:
+        values: flat list of values to transfer
+        pin_memory: if True, CUDA tensors are copied into newly allocated
+            pinned CPU buffers with a non-blocking copy, so the caller must
+            synchronize before reading them. If False, Tensor.cpu() is used.
+    Returns:
+        a list of the same length and order as values
+    """
+    out = []
+    for v in values:
+        if not (isinstance(v, torch.Tensor) and v.device.type == "cuda"):
+            out.append(v)
+        elif pin_memory:
+            # Pass the size as a single argument so 0-dim (scalar) tensors
+            # work; unpacking an empty size calls torch.empty() with no shape.
+            cpu = torch.empty(v.size(), dtype=v.dtype, pin_memory=True)
+            cpu.copy_(v, non_blocking=True)
+            out.append(cpu)
+        else:
+            out.append(v.cpu())
+    return out
+def _split_chunks(values: List[T], num_chunks: int) -> List[List[T]]:
+    """
+    Splits values into num_chunks interleaved chunks.
+    Chunk i holds values[i], values[i + num_chunks], and so on. An empty
+    list is returned when num_chunks is 0.
+    """
+    chunks: List[List[T]] = []
+    for offset in range(num_chunks):
+        chunks.append(values[offset::num_chunks])
+    return chunks
+def _merge_chunks(chunks: List[List[T]], num_chunks: int) -> List[T]:
+    """
+    Merges interleaved chunks back into a single flat list.
+    Element i of every chunk is emitted, in chunk order, before element
+    i + 1 of any chunk; chunks shorter than the longest one are skipped once
+    exhausted.
+    Args:
+        chunks: the chunks to merge, in chunk index order
+        num_chunks: unused, kept so existing callers keep working
+    Returns:
+        the merged values, or an empty list when there are no chunks
+    """
+    # default=0 so an empty chunk list merges to [] instead of raising
+    # ValueError from max() on an empty sequence
+    max_len = max((len(lst) for lst in chunks), default=0)
+    return [lst[i] for i in range(max_len) for lst in chunks if i < len(lst)]

torchft/checkpointing/http_transport_bench.py ADDED Viewed

@@ -0,0 +1,61 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+import logging
+import sys
+from datetime import timedelta
+from typing import List
+import torch
+from torchft.checkpointing.http_transport import _time, HTTPTransport
+logger: logging.Logger = logging.getLogger(__name__)
+def main(argv: List[str]) -> None:
+    """
+    Benchmarks sending and fetching a synthetic checkpoint via HTTPTransport.
+    The state dict is built from zero-filled float32 tensors of --chunk-size
+    bytes each until --total-size bytes are reached, then sent and fetched
+    back through the same transport instance.
+    Args:
+        argv: command line arguments without the program name; supports
+            --num-chunks, --device, --chunk-size and --total-size
+    """
+    import argparse
+    logging.basicConfig(level=logging.INFO)
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--num-chunks", type=int, default=0)
+    parser.add_argument("--device", type=str, default="cpu")
+    parser.add_argument("--chunk-size", type=int, default=3_000_000)  # 3MB
+    parser.add_argument("--total-size", type=int, default=12_000_000_000)  # 12GB
+    args = parser.parse_args(argv)
+    device = torch.device(args.device)
+    num_chunks: int = args.num_chunks
+    # These names appear verbatim in the log output via the f-string `=`
+    # specifier below, so they are part of the emitted text.
+    CHUNK_SIZE = args.chunk_size
+    TOTAL_SIZE = args.total_size
+    transport = HTTPTransport(timedelta(seconds=60), num_chunks=num_chunks)
+    try:
+        metadata = transport.metadata()
+        logger.info(f"creating state_dict... {CHUNK_SIZE=} {TOTAL_SIZE=}")
+        with _time("create state_dict"):
+            state_dict = {}
+            for i in range(0, TOTAL_SIZE, CHUNK_SIZE):
+                # float32 is 4 bytes, so CHUNK_SIZE // 4 elements per tensor
+                state_dict[f"chunk/{i}"] = torch.zeros(
+                    CHUNK_SIZE // 4, dtype=torch.float32, device=device
+                )
+        logger.info(
+            f"fetching from {metadata=} {device=} {num_chunks=} {len(state_dict)=}"
+        )
+        transport.send_checkpoint(
+            dst_ranks=[0], step=1, state_dict=state_dict, timeout=timedelta(seconds=60)
+        )
+        with _time("fetching checkpoint"):
+            transport.recv_checkpoint(
+                src_rank=1, metadata=metadata, step=1, timeout=timedelta(seconds=60)
+            )
+    finally:
+        # Stop the server and its thread so repeated calls (e.g. from tests)
+        # do not leak a listening socket per invocation.
+        transport.shutdown()
+# Run the benchmark when executed as a script; the program name is dropped
+# from argv before it is handed to main.
+if __name__ == "__main__":
+    main(sys.argv[1:])

torchft/checkpointing/http_transport_test.py ADDED Viewed

@@ -0,0 +1,146 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+import urllib.error
+from datetime import timedelta
+from typing import Dict
+from unittest import skipUnless, TestCase
+from unittest.mock import MagicMock
+import torch
+from parameterized import parameterized
+from torchft.checkpointing.http_transport import HTTPTransport
+from torchft.checkpointing.http_transport_bench import main as bench_main
+from torchft.checkpointing.transport import CheckpointTransport
+from torchft.checkpointing.transport_test import (
+    assertStateDictEqual,
+    run_multi_recovery_test,
+)
+class TestHTTPTransport(TestCase):
+    """
+    Tests for HTTPTransport: single-server round trips, the allow/disallow
+    locking state and the shared multi-worker recovery test.
+    """
+    @parameterized.expand(
+        [
+            ("no chunks", 0),
+            ("chunked", 3),
+        ]
+    )
+    def test_checkpoint_server(self, name: str, num_chunks: int) -> None:
+        """
+        Round trips a state dict through one server, then checks the timeout
+        and step mismatch error paths.
+        """
+        expected: Dict[str, object] = {
+            "state": "dict",
+            "tensor": torch.rand(5, 2),
+            "cuda": torch.rand(
+                2, 3, device="cuda" if torch.cuda.is_available() else "cpu"
+            ),
+        }
+        server = HTTPTransport(
+            timeout=timedelta(seconds=10),
+            num_chunks=num_chunks,
+        )
+        # registered as a cleanup so the server is stopped even when an
+        # assertion below fails
+        self.addCleanup(server.shutdown)
+        server.send_checkpoint(
+            dst_ranks=[],
+            step=1234,
+            state_dict=expected,
+            timeout=timedelta(seconds=10),
+        )
+        metadata = server.metadata()
+        out = server.recv_checkpoint(
+            src_rank=0, metadata=metadata, step=1234, timeout=timedelta(seconds=10)
+        )
+        assertStateDictEqual(self, out, expected)
+        # test timeout
+        with self.assertRaisesRegex(urllib.error.URLError, r"urlopen error"):
+            server.recv_checkpoint(
+                src_rank=0, metadata=metadata, step=1234, timeout=timedelta(seconds=0.0)
+            )
+        # test mismatch case
+        server.send_checkpoint(
+            dst_ranks=[],
+            step=2345,
+            state_dict=expected,
+            timeout=timedelta(seconds=10),
+        )
+        with self.assertRaisesRegex(
+            urllib.error.HTTPError, r"Error 400.*serving 2345 but got step=1234"
+        ):
+            server.recv_checkpoint(
+                src_rank=0, metadata=metadata, step=1234, timeout=timedelta(seconds=10)
+            )
+    def test_checkpoint_server_locking(self) -> None:
+        """
+        Checks the write lock, disallowed flag and step across
+        allow_checkpoint/disallow_checkpoint calls.
+        """
+        server = HTTPTransport(
+            timeout=timedelta(seconds=10),
+            num_chunks=0,
+        )
+        # registered as a cleanup so the server is stopped even when an
+        # assertion below fails
+        self.addCleanup(server.shutdown)
+        # server should start up in a disallowed state this will block incoming
+        # requests until allow_checkpoint is called
+        self.assertTrue(server._checkpoint_lock.w_locked())
+        self.assertTrue(server._disallowed)
+        self.assertEqual(server._step, -1)
+        # allow requests
+        server.allow_checkpoint(1)
+        self.assertFalse(server._checkpoint_lock.w_locked())
+        self.assertFalse(server._disallowed)
+        self.assertEqual(server._step, 1)
+        # duplicate allow/disallow is fine
+        server.allow_checkpoint(2)
+        self.assertEqual(server._step, 2)
+        server.disallow_checkpoint()
+        server.disallow_checkpoint()
+        self.assertTrue(server._checkpoint_lock.w_locked())
+        self.assertTrue(server._disallowed)
+    def test_multi_http_transport_cpu(self) -> None:
+        """
+        Runs the shared multi-worker recovery test on CPU.
+        """
+        device = torch.device("cpu")
+        def init(rank: int, world_size: int) -> CheckpointTransport[Dict[str, object]]:
+            return HTTPTransport(
+                timeout=timedelta(seconds=10),
+                num_chunks=0,
+            )
+        run_multi_recovery_test(self, init, device=device)
+    # pyre-fixme[56]: Pyre was not able to infer the type of the decorator
+    @skipUnless(torch.cuda.is_available(), "CUDA is not available")
+    def test_multi_http_transport_cuda(self) -> None:
+        """
+        Runs the shared multi-worker recovery test on CUDA.
+        """
+        device = torch.device("cuda")
+        def init(rank: int, world_size: int) -> CheckpointTransport[Dict[str, object]]:
+            return HTTPTransport(
+                timeout=timedelta(seconds=10),
+                num_chunks=0,
+            )
+        run_multi_recovery_test(self, init, device=device)
+    def test_benchmark(self) -> None:
+        """
+        Smoke tests the benchmark entry point with a tiny CPU state dict.
+        """
+        bench_main(
+            [
+                "--chunk-size=10",
+                "--num-chunks=0",
+                "--total-size=100",
+                "--device=cpu",
+            ]
+        )