torchft-nightly 2026.1.3__cp310-cp310-manylinux_2_24_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- torchft/__init__.py +34 -0
- torchft/_test/diloco_trainer.py +287 -0
- torchft/_test/managed_work_test.py +320 -0
- torchft/_test_utils.py +111 -0
- torchft/_torchft.cpython-310-x86_64-linux-gnu.so +0 -0
- torchft/_torchft.pyi +116 -0
- torchft/checkpointing/__init__.py +20 -0
- torchft/checkpointing/_rwlock.py +136 -0
- torchft/checkpointing/_serialization.py +39 -0
- torchft/checkpointing/http_transport.py +299 -0
- torchft/checkpointing/http_transport_bench.py +61 -0
- torchft/checkpointing/http_transport_test.py +146 -0
- torchft/checkpointing/pg_transport.py +306 -0
- torchft/checkpointing/pg_transport_bench.py +99 -0
- torchft/checkpointing/pg_transport_test.py +101 -0
- torchft/checkpointing/rwlock_test.py +58 -0
- torchft/checkpointing/transport.py +68 -0
- torchft/checkpointing/transport_test.py +161 -0
- torchft/collectives.py +415 -0
- torchft/collectives_test.py +212 -0
- torchft/coordination.py +39 -0
- torchft/coordination_test.py +29 -0
- torchft/data.py +77 -0
- torchft/data_test.py +39 -0
- torchft/ddp.py +105 -0
- torchft/ddp_test.py +68 -0
- torchft/diloco_regression_test.py +644 -0
- torchft/examples/slurm/README.md +34 -0
- torchft/examples/slurm/punisher.py +95 -0
- torchft/examples/slurm/runner.py +221 -0
- torchft/fsdp_test.py +102 -0
- torchft/futures.py +353 -0
- torchft/futures_test.py +140 -0
- torchft/http.py +13 -0
- torchft/lighthouse_test.py +163 -0
- torchft/local_sgd.py +796 -0
- torchft/local_sgd_integ_test.py +600 -0
- torchft/local_sgd_test.py +324 -0
- torchft/manager.py +1358 -0
- torchft/manager_integ_test.py +653 -0
- torchft/manager_test.py +911 -0
- torchft/multiprocessing.py +38 -0
- torchft/multiprocessing_dummy_context.py +135 -0
- torchft/multiprocessing_test.py +58 -0
- torchft/optim.py +63 -0
- torchft/optim_test.py +50 -0
- torchft/otel.py +134 -0
- torchft/parameter_server.py +195 -0
- torchft/parameter_server_test.py +47 -0
- torchft/process_group.py +2118 -0
- torchft/process_group_test.py +1028 -0
- torchft/quantization.py +686 -0
- torchft/quantization_test.py +131 -0
- torchft/torchx.py +89 -0
- torchft/utils.py +67 -0
- torchft/work.py +26 -0
- torchft_nightly-2026.1.3.dist-info/METADATA +308 -0
- torchft_nightly-2026.1.3.dist-info/RECORD +61 -0
- torchft_nightly-2026.1.3.dist-info/WHEEL +4 -0
- torchft_nightly-2026.1.3.dist-info/entry_points.txt +2 -0
- torchft_nightly-2026.1.3.dist-info/licenses/LICENSE +34 -0
torchft/futures.py
ADDED
@@ -0,0 +1,353 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import asyncio
+import os
+import queue
+import sys
+import threading
+import time
+from contextlib import contextmanager, nullcontext
+from datetime import timedelta
+from typing import Callable, Generator, Optional, TypeVar
+from unittest.mock import Mock
+
+import torch
+from torch.futures import Future
+
+from torchft.utils import get_stream_context
+
+T = TypeVar("T")
+
+WATCHDOG_TIMEOUT_SEC = "TORCHFT_WATCHDOG_TIMEOUT_SEC"
+
+
+class _TimerHandle:
+    def __init__(self) -> None:
+        self._lock = threading.Lock()
+        self._timer_handle: Optional[asyncio.TimerHandle] = None
+        self._cancelled = False
+
+    def set_timer_handle(self, timer_handle: asyncio.TimerHandle) -> None:
+        with self._lock:
+            if self._cancelled:
+                timer_handle.cancel()
+                self._timer_handle = None
+            else:
+                self._timer_handle = timer_handle
+
+    def cancel(self) -> None:
+        with self._lock:
+            assert not self._cancelled, "timer can only be cancelled once"
+            self._cancelled = True
+            if self._timer_handle is not None:
+                self._timer_handle.cancel()
+                self._timer_handle = None
+
+
+class _TimeoutManager:
+    """
+    This class manages timeouts for code blocks, futures and CUDA events. It
+    uses a background thread with an event loop to schedule the timeouts and
+    call the callback function when the timeout is reached.
+
+    Generally there is a single instance of this class that is used for all
+    timeouts. The callbacks should not block, otherwise other timeouts may not
+    be processed.
+    """
+
+    def __init__(self) -> None:
+        self._lock = threading.Lock()
+        self._event_loop: Optional[asyncio.AbstractEventLoop] = None
+        self._event_loop_thread: Optional[threading.Thread] = None
+        self._next_timer_id = 0
+
+        # Ensures `_event_loop_thread` is not stuck
+        self._watchdog_thread: Optional[threading.Thread] = None
+
+        # Give the `_event_loop_thread` this much time to confirm that
+        # it is not stuck
+        self._watchdog_interval = timedelta(
+            seconds=int(os.environ.get(WATCHDOG_TIMEOUT_SEC, "30"))
+        )
+
+        # This queue is used to delete events on the main thread as cudaEventDestroy
+        # can block if the CUDA queue is full.
+        self._del_queue: queue.SimpleQueue[object] = queue.SimpleQueue()
+
+    def _maybe_start_event_loop(self) -> asyncio.AbstractEventLoop:
+        """
+        Start the event loop if it has not already been started.
+        """
+        with self._lock:
+            if self._event_loop is None:
+                self._event_loop = asyncio.new_event_loop()
+                self._event_loop_thread = threading.Thread(
+                    target=self._event_loop.run_forever,
+                    daemon=True,
+                    name="TimeoutManager",
+                )
+                self._event_loop_thread.start()
+
+                self._watchdog_thread = threading.Thread(
+                    target=self._watchdog_loop, daemon=True
+                )
+                self._watchdog_thread.start()
+
+            # pyre-fixme[7]: optional
+            return self._event_loop
+
+    def _watchdog_loop(self) -> None:
+        while True:
+            is_healthy = False
+
+            def updated_health() -> None:
+                nonlocal is_healthy
+                is_healthy = True
+
+            with self._lock:
+                if self._event_loop is None:
+                    return
+
+                # The method passed to the event loop should finish fast.
+                # It just updates a bool, which is also thread safe.
+                self._event_loop.call_soon_threadsafe(updated_health)
+
+            time.sleep(self._watchdog_interval.total_seconds())
+
+            if not is_healthy:
+                print("TimeoutManager is stuck. Exiting.")
+                sys.exit(1)
+                # Needed because `sys.exit` is mocked in unit tests.
+                # If we don't return here, we don't break out of the loop.
+                return
+
+    def shutdown(self) -> None:
+        """
+        Shutdown the event loop and cancel all pending timeouts.
+        """
+        watchdog_thread = None
+        with self._lock:
+            if self._event_loop is not None:
+                self._event_loop.call_soon_threadsafe(self._event_loop.stop)
+                assert self._event_loop_thread is not None
+                self._event_loop_thread.join()
+                self._event_loop = None
+                self._event_loop_thread = None
+
+            # We can't join the watchdog thread here because it grabs `_lock`
+            watchdog_thread = self._watchdog_thread
+
+        if watchdog_thread is not None:
+            # If `_maybe_start_event_loop` is called again, it is possible the `join`
+            # below will never finish.
+            # This class assumes `_maybe_start_event_loop` will not be called after `shutdown`.
+            # If this functionality is required in the future, we could change the class to
+            # support this. Or create multiple instances of this class.
+            watchdog_thread.join()
+
+    def register(self, fut: Future[T], timeout: timedelta) -> Future[T]:
+        """
+        Registers a future that will be cancelled after the specified timeout.
+        """
+        # bypass timeout for mock futures
+        if isinstance(fut, Mock):
+            return fut
+
+        self._clear_del_queue()
+
+        loop = self._maybe_start_event_loop()
+
+        timed_fut: Future[T] = Future()
+        handle: _TimerHandle = _TimerHandle()
+        loop.call_soon_threadsafe(
+            self._register_callback,
+            loop,
+            lambda: timed_fut.set_exception(
+                # pyre-fixme[6]: e is not T
+                TimeoutError(f"future did not complete within {timeout}")
+            ),
+            timeout,
+            handle,
+        )
+
+        stream: Optional[torch.Stream] = (
+            torch.accelerator.current_stream()
+            if torch.accelerator.is_available()
+            else None
+        )
+
+        def callback(fut: Future[T]) -> None:
+            with get_stream_context(stream):
+                handle.cancel()
+                try:
+                    timed_fut.set_result(fut.wait())
+                except Exception as e:
+                    try:
+                        # this can throw if the future is already done
+                        # pyre-fixme[6]: e is not T
+                        timed_fut.set_exception(e)
+                    except Exception:
+                        pass
+
+        fut.add_done_callback(callback)
+        return timed_fut
+
+    def stream_timeout(self, callback: Callable[[], None], timeout: timedelta) -> None:
+        self._clear_del_queue()
+
+        loop = self._maybe_start_event_loop()
+
+        event: torch.Event = torch.Event()
+        event.record()
+
+        def handler() -> None:
+            if not event.query():
+                callback()
+
+            # cudaEventDestroy can block so we never want to delete in the event
+            # loop. Put it on the del queue so we can delete it in the main
+            # thread.
+            self._del_queue.put(event)
+
+        loop.call_soon_threadsafe(
+            self._register_callback, loop, handler, timeout, _TimerHandle()
+        )
+
+    @classmethod
+    def _register_callback(
+        cls,
+        loop: asyncio.AbstractEventLoop,
+        callback: Callable[[], None],
+        timeout: timedelta,
+        handle: _TimerHandle,
+    ) -> None:
+        timer_handle = loop.call_later(
+            timeout.total_seconds(),
+            callback,
+        )
+        handle.set_timer_handle(timer_handle)
+
+    @contextmanager
+    def context_timeout(
+        self, callback: Callable[[], None], timeout: timedelta
+    ) -> Generator[None, None, None]:
+        self._clear_del_queue()
+
+        loop = self._maybe_start_event_loop()
+        handle = _TimerHandle()
+
+        loop.call_soon_threadsafe(
+            self._register_callback, loop, callback, timeout, handle
+        )
+
+        yield
+
+        handle.cancel()
+
+    def _clear_del_queue(self) -> int:
+        """
+        Clear the queue of futures to be deleted.
+
+        Returns the number of items deleted.
+        """
+        count = 0
+        while True:
+            try:
+                # get and immediately discard item
+                item = self._del_queue.get_nowait()
+                refcount = sys.getrefcount(item)
+                assert (
+                    # 1 from item, 1 from getrefcount
+                    refcount == 2
+                ), f"items in del_queue reference should not have other references, found {refcount=}"
+                del item
+
+                count += 1
+            except queue.Empty:
+                break
+
+        return count
+
+
+_TIMEOUT_MANAGER = _TimeoutManager()
+
+
+def future_timeout(fut: Future[T], timeout: timedelta) -> Future[T]:
+    """
+    Return a Future that completes with the result of the given Future within
+    the given timeout or with a TimeoutError.
+
+    Args:
+        fut: The Future to wait for
+        timeout: The timeout to wait for the Future to complete
+
+    Returns:
+        The future with a timeout
+    """
+    return _TIMEOUT_MANAGER.register(fut, timeout)
+
+
+def future_wait(fut: Future[T], timeout: timedelta) -> T:
+    """
+    Wait for a Future to complete up to a timeout.
+
+    Args:
+        fut: The Future to wait for
+        timeout: The timeout to wait for the Future to complete
+
+    Returns:
+        The result of the Future if it completed within the timeout.
+
+    Raises:
+        TimeoutError if the Future did not complete within the timeout.
+        Any other exception that occurred in the Future.
+    """
+
+    event: threading.Event = threading.Event()
+
+    def callback(fut: Future[T]) -> T:
+        event.set()
+        return fut.wait()
+
+    fut = fut.then(callback)
+
+    if not event.wait(timeout=timeout.total_seconds()):
+        raise TimeoutError(f"future did not complete within {timeout}")
+
+    return fut.wait()
+
+
+def stream_timeout(callback: Callable[[], None], timeout: timedelta) -> None:
+    """
+    Registers a callback that will be called after the specified timeout if
+    the current stream doesn't complete in time.
+
+    This uses a cuda Event to track the completion of the current stream. If
+    the stream is not complete after the timeout, the callback is called.
+
+    Args:
+        callback: The callback to call if the stream doesn't complete in time.
+        timeout: The timeout to wait for the stream to complete.
+    """
+    _TIMEOUT_MANAGER.stream_timeout(callback, timeout)
+
+
+@contextmanager
+def context_timeout(
+    callback: Callable[[], None], timeout: timedelta
+) -> Generator[None, None, None]:
+    """
+    Registers a callback that will be called after the specified timeout if
+    the current contextmanager doesn't exit in time.
+
+    Args:
+        callback: The callback to call if we time out.
+        timeout: How long to wait for the contextmanager to exit.
+    """

+    with _TIMEOUT_MANAGER.context_timeout(callback, timeout):
+        yield
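Usage sketch (not part of the wheel contents): `future_timeout`, `future_wait`, and `context_timeout` above all route through the shared `_TIMEOUT_MANAGER` event loop. The snippet below is a minimal, self-contained illustration of combining them; the worker thread and the timeout values are assumptions chosen for the example, and `stream_timeout` is omitted since it requires an accelerator-backed stream.

import threading
import time
from datetime import timedelta

from torch.futures import Future

from torchft.futures import context_timeout, future_timeout, future_wait

fut: Future = Future()

def worker() -> None:
    # Stand-in for an asynchronous operation (e.g. a collective).
    time.sleep(0.1)
    fut.set_result(42)

threading.Thread(target=worker, daemon=True).start()

# Wrap the future so a hang surfaces as TimeoutError instead of blocking forever.
guarded = future_timeout(fut, timeout=timedelta(seconds=5))

def on_stall() -> None:
    # Runs on the shared TimeoutManager event loop, so it must not block.
    print("block did not exit within the timeout")

with context_timeout(on_stall, timedelta(seconds=5)):
    print(future_wait(guarded, timeout=timedelta(seconds=5)))  # prints 42

As the `_TimeoutManager` docstring notes, all callbacks share one event loop thread, so anything passed to `context_timeout` or `stream_timeout` should return quickly.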
torchft/futures_test.py
ADDED
@@ -0,0 +1,140 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import threading
+from datetime import timedelta
+from unittest import skipUnless, TestCase
+from unittest.mock import Mock, patch
+
+import torch
+from torch.futures import Future
+
+from torchft.futures import (
+    _TIMEOUT_MANAGER,
+    context_timeout,
+    future_timeout,
+    future_wait,
+    stream_timeout,
+)
+
+
+class FuturesTest(TestCase):
+    def setUp(self) -> None:
+        self._original_watchdog_interval = _TIMEOUT_MANAGER._watchdog_interval
+        _TIMEOUT_MANAGER._watchdog_interval = timedelta(seconds=1)
+
+    def tearDown(self) -> None:
+        _TIMEOUT_MANAGER._watchdog_interval = self._original_watchdog_interval
+
+    def test_future_wait(self) -> None:
+        fut = Future()
+        with self.assertRaisesRegex(TimeoutError, "future did not complete within"):
+            future_wait(fut, timeout=timedelta(seconds=0.01))
+
+        fut = Future()
+        fut.set_result(1)
+        self.assertEqual(future_wait(fut, timeout=timedelta(seconds=1.0)), 1)
+
+        fut = Future()
+        fut.set_exception(RuntimeError("test"))
+        with self.assertRaisesRegex(RuntimeError, "test"):
+            future_wait(fut, timeout=timedelta(seconds=1.0))
+
+    def test_future_timeout(self) -> None:
+        fut = Future()
+        timed_fut = future_timeout(fut, timeout=timedelta(seconds=0.01))
+        with self.assertRaisesRegex(TimeoutError, "future did not complete within"):
+            timed_fut.wait()
+
+    def test_future_timeout_result(self) -> None:
+        fut = Future()
+        timed_fut = future_timeout(fut, timeout=timedelta(seconds=10))
+        fut.set_result(1)
+        self.assertEqual(timed_fut.wait(), 1)
+
+    def test_future_timeout_exception(self) -> None:
+        fut = Future()
+        timed_fut = future_timeout(fut, timeout=timedelta(seconds=10))
+        fut.set_exception(RuntimeError("test"))
+        with self.assertRaisesRegex(RuntimeError, "test"):
+            timed_fut.wait()
+
+    def test_context_timeout(self) -> None:
+        barrier: threading.Barrier = threading.Barrier(2)
+
+        def callback() -> None:
+            barrier.wait()
+
+        with context_timeout(callback, timedelta(seconds=0.01)):
+            # block until timeout fires
+            barrier.wait()
+
+        def fail() -> None:
+            self.fail("timeout should be cancelled")
+
+        with context_timeout(fail, timedelta(seconds=10)):
+            pass
+
+    # pyre-fixme[56]: Pyre was not able to infer the type of decorator
+    @skipUnless(torch.cuda.is_available(), "CUDA is required for this test")
+    def test_stream_timeout(self) -> None:
+        torch.cuda.synchronize()
+
+        def callback() -> None:
+            self.fail()
+
+        stream_timeout(callback, timeout=timedelta(seconds=0.01))
+
+        # make sure event completes
+        torch.cuda.synchronize()
+
+        # make sure that event is deleted on the deletion queue
+        item = _TIMEOUT_MANAGER._del_queue.get(timeout=10.0)
+        _TIMEOUT_MANAGER._del_queue.put(item)
+        del item
+
+        self.assertEqual(_TIMEOUT_MANAGER._clear_del_queue(), 1)
+
+    # Test that when a timeout handle gets stuck, `sys.exit(1)` is called
+    @patch("sys.exit")
+    def test_exit_on_stuck_callback(self, mock_exit: Mock) -> None:
+        exit_event: threading.Event = threading.Event()
+
+        def custom_exit(_) -> None:
+            # 3. When event loop is stuck, exit(1) is called
+            nonlocal exit_event
+            exit_event.set()
+
+        mock_exit.side_effect = custom_exit
+
+        callback_event: threading.Event = threading.Event()
+
+        def callback() -> None:
+            # 2. Make sure callback blocks event loop
+            nonlocal callback_event
+            callback_event.wait()
+
+        context_event: threading.Event = threading.Event()
+
+        def thread_fn() -> None:
+            with context_timeout(callback, timedelta(seconds=0.01)):
+                # 1. Make sure context doesn't finish in time
+                nonlocal context_event
+                context_event.wait()
+
+        thread = threading.Thread(target=thread_fn)
+        thread.start()
+
+        # 4. exit event will wake this up
+        exit_event.wait()
+        mock_exit.assert_called_once_with(1)
+
+        # 5. event loop is still stuck, so let's unblock it
+        callback_event.set()
+
+        # 6. unblock the context and make sure it exits
+        context_event.set()
+        thread.join()
torchft/http.py
ADDED
@@ -0,0 +1,13 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import socket
+from http.server import ThreadingHTTPServer
+
+
+class _IPv6HTTPServer(ThreadingHTTPServer):
+    address_family: socket.AddressFamily = socket.AF_INET6
+    request_queue_size: int = 1024
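`_IPv6HTTPServer` is a thin tweak of the standard library `ThreadingHTTPServer`: it binds IPv6 sockets and allows a deeper accept backlog. A hedged sketch of standalone use follows; the `_PingHandler` class and the wildcard bind address are illustrative assumptions, not part of torchft.

from http.server import BaseHTTPRequestHandler

from torchft.http import _IPv6HTTPServer  # internal helper, imported here only for illustration

class _PingHandler(BaseHTTPRequestHandler):
    def do_GET(self) -> None:
        self.send_response(200)
        self.end_headers()
        self.wfile.write(b"ok")

# Port 0 asks the OS for a free port; "::" listens on all IPv6 interfaces.
server = _IPv6HTTPServer(("::", 0), _PingHandler)
print("listening on", server.server_address)
# server.serve_forever()  # blocks; run in a background thread in real usage
server.server_close()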
torchft/lighthouse_test.py
ADDED
@@ -0,0 +1,163 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import time
+from datetime import timedelta
+from unittest import TestCase
+
+import torch.distributed as dist
+
+from torchft import Manager, ProcessGroupGloo
+from torchft._torchft import LighthouseClient, LighthouseServer, Quorum, QuorumMember
+
+
+class TestLighthouse(TestCase):
+    def test_join_timeout_behavior(self) -> None:
+        """Test that join_timeout_ms affects joining behavior"""
+        # To test, we create a lighthouse with 100ms and 400ms join timeouts
+        # and measure the time taken to validate the quorum.
+        lighthouse = LighthouseServer(
+            bind="[::]:0",
+            min_replicas=1,
+            join_timeout_ms=100,
+        )
+
+        # Create a manager that tries to join
+        try:
+            store = dist.TCPStore(
+                host_name="localhost",
+                port=0,
+                is_master=True,
+                wait_for_workers=False,
+            )
+            pg = ProcessGroupGloo()
+            manager = Manager(
+                pg=pg,
+                min_replica_size=1,
+                load_state_dict=lambda x: None,
+                state_dict=lambda: None,
+                replica_id="lighthouse_test",
+                store_addr="localhost",
+                store_port=store.port,
+                rank=0,
+                world_size=1,
+                use_async_quorum=False,
+                lighthouse_addr=lighthouse.address(),
+            )
+
+            start_time = time.time()
+            manager.start_quorum()
+            time_taken = time.time() - start_time
+            assert time_taken < 0.4, f"Time taken to join: {time_taken} > 0.4s"
+
+        finally:
+            # Cleanup
+            lighthouse.shutdown()
+            if "manager" in locals():
+                manager.shutdown()
+
+        lighthouse = LighthouseServer(
+            bind="[::]:0",
+            min_replicas=1,
+            join_timeout_ms=400,
+        )
+
+    def test_heartbeat_timeout_ms_sanity(self) -> None:
+        lighthouse = LighthouseServer(
+            bind="[::]:0",
+            min_replicas=1,
+            heartbeat_timeout_ms=100,
+        )
+        lighthouse.shutdown()
+
+    def test_lighthouse_client_behavior(self) -> None:
+        """Test generic quorum behavior when using LighthouseClient directly"""
+        # Create a lighthouse with a 100ms join timeout and issue quorum
+        # requests directly through the LighthouseClient.
+        lighthouse = LighthouseServer(
+            bind="[::]:0",
+            min_replicas=1,
+            join_timeout_ms=100,
+        )
+
+        # Create a client that tries to join
+        try:
+            client = LighthouseClient(
+                addr=lighthouse.address(),
+                connect_timeout=timedelta(seconds=1),
+            )
+            store = dist.TCPStore(
+                host_name="localhost",
+                port=0,
+                is_master=True,
+                wait_for_workers=False,
+            )
+            result = client.quorum(
+                replica_id="lighthouse_test",
+                address="localhost",
+                store_address=f"localhost:{store.port}",
+                step=1,
+                world_size=1,
+                shrink_only=False,
+                timeout=timedelta(seconds=1),
+                data={"my_data": 1234},
+            )
+            assert result is not None
+            assert isinstance(result, Quorum)
+            assert len(result.participants) == 1
+            for member in result.participants:
+                assert isinstance(member, QuorumMember)
+                assert member.replica_id == "lighthouse_test"
+                assert member.data is not None
+                assert "my_data" in member.data
+                assert member.data["my_data"] == 1234
+
+            # Test the optional args
+            result = client.quorum(
+                replica_id="lighthouse_test",
+                timeout=timedelta(seconds=1),
+            )
+            assert result is not None
+            for member in result.participants:
+                assert member.replica_id == "lighthouse_test"
+
+        finally:
+            # Cleanup
+            lighthouse.shutdown()
+
+    def test_heartbeat_round_trip(self) -> None:
+        lighthouse = LighthouseServer(
+            bind="[::]:0",
+            min_replicas=1,
+            heartbeat_timeout_ms=200,
+        )
+        try:
+            client = LighthouseClient(
+                addr=lighthouse.address(),
+                connect_timeout=timedelta(seconds=1),
+            )
+
+            client.heartbeat("rep0")
+
+            # (Should still be alive, as sleep time is less than timeout)
+            time.sleep(0.15)
+            q = client.quorum(
+                replica_id="rep0",
+                timeout=timedelta(milliseconds=500),
+            )
+            assert any(m.replica_id == "rep0" for m in q.participants)
+
+            # (Wait long enough for timeout to trigger)
+            time.sleep(0.25)
+            # "Probe" with different replica so we don't revive rep0
+            probe = client.quorum(
+                replica_id="probe",
+                timeout=timedelta(milliseconds=500),
+            )
+            assert all(m.replica_id != "rep0" for m in probe.participants)
+
+        finally:
+            lighthouse.shutdown()
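The heartbeat round-trip test above implies the intended liveness pattern: a replica must heartbeat more often than `heartbeat_timeout_ms` to stay in quorums. The following is a minimal sketch of driving that from a background thread; the 50 ms interval is an assumption for the example, not a value taken from this diff.

import threading
import time
from datetime import timedelta

from torchft._torchft import LighthouseClient, LighthouseServer

lighthouse = LighthouseServer(bind="[::]:0", min_replicas=1, heartbeat_timeout_ms=200)
client = LighthouseClient(
    addr=lighthouse.address(),
    connect_timeout=timedelta(seconds=1),
)

stop = threading.Event()

def heartbeat_loop() -> None:
    # Same call the round-trip test uses; repeat well under the 200ms timeout.
    while not stop.is_set():
        client.heartbeat("rep0")
        time.sleep(0.05)  # assumed interval, comfortably below heartbeat_timeout_ms

t = threading.Thread(target=heartbeat_loop, daemon=True)
t.start()
# ... replica does its work; "rep0" remains visible in quorums ...
stop.set()
t.join()
lighthouse.shutdown()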