torchft-nightly 2026.1.3 (torchft_nightly-2026.1.3-cp310-cp310-manylinux_2_24_x86_64.whl)

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. torchft/__init__.py +34 -0
  2. torchft/_test/diloco_trainer.py +287 -0
  3. torchft/_test/managed_work_test.py +320 -0
  4. torchft/_test_utils.py +111 -0
  5. torchft/_torchft.cpython-310-x86_64-linux-gnu.so +0 -0
  6. torchft/_torchft.pyi +116 -0
  7. torchft/checkpointing/__init__.py +20 -0
  8. torchft/checkpointing/_rwlock.py +136 -0
  9. torchft/checkpointing/_serialization.py +39 -0
  10. torchft/checkpointing/http_transport.py +299 -0
  11. torchft/checkpointing/http_transport_bench.py +61 -0
  12. torchft/checkpointing/http_transport_test.py +146 -0
  13. torchft/checkpointing/pg_transport.py +306 -0
  14. torchft/checkpointing/pg_transport_bench.py +99 -0
  15. torchft/checkpointing/pg_transport_test.py +101 -0
  16. torchft/checkpointing/rwlock_test.py +58 -0
  17. torchft/checkpointing/transport.py +68 -0
  18. torchft/checkpointing/transport_test.py +161 -0
  19. torchft/collectives.py +415 -0
  20. torchft/collectives_test.py +212 -0
  21. torchft/coordination.py +39 -0
  22. torchft/coordination_test.py +29 -0
  23. torchft/data.py +77 -0
  24. torchft/data_test.py +39 -0
  25. torchft/ddp.py +105 -0
  26. torchft/ddp_test.py +68 -0
  27. torchft/diloco_regression_test.py +644 -0
  28. torchft/examples/slurm/README.md +34 -0
  29. torchft/examples/slurm/punisher.py +95 -0
  30. torchft/examples/slurm/runner.py +221 -0
  31. torchft/fsdp_test.py +102 -0
  32. torchft/futures.py +353 -0
  33. torchft/futures_test.py +140 -0
  34. torchft/http.py +13 -0
  35. torchft/lighthouse_test.py +163 -0
  36. torchft/local_sgd.py +796 -0
  37. torchft/local_sgd_integ_test.py +600 -0
  38. torchft/local_sgd_test.py +324 -0
  39. torchft/manager.py +1358 -0
  40. torchft/manager_integ_test.py +653 -0
  41. torchft/manager_test.py +911 -0
  42. torchft/multiprocessing.py +38 -0
  43. torchft/multiprocessing_dummy_context.py +135 -0
  44. torchft/multiprocessing_test.py +58 -0
  45. torchft/optim.py +63 -0
  46. torchft/optim_test.py +50 -0
  47. torchft/otel.py +134 -0
  48. torchft/parameter_server.py +195 -0
  49. torchft/parameter_server_test.py +47 -0
  50. torchft/process_group.py +2118 -0
  51. torchft/process_group_test.py +1028 -0
  52. torchft/quantization.py +686 -0
  53. torchft/quantization_test.py +131 -0
  54. torchft/torchx.py +89 -0
  55. torchft/utils.py +67 -0
  56. torchft/work.py +26 -0
  57. torchft_nightly-2026.1.3.dist-info/METADATA +308 -0
  58. torchft_nightly-2026.1.3.dist-info/RECORD +61 -0
  59. torchft_nightly-2026.1.3.dist-info/WHEEL +4 -0
  60. torchft_nightly-2026.1.3.dist-info/entry_points.txt +2 -0
  61. torchft_nightly-2026.1.3.dist-info/licenses/LICENSE +34 -0
torchft/__init__.py ADDED
@@ -0,0 +1,34 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ from torchft.data import DistributedSampler
+ from torchft.ddp import DistributedDataParallel
+ from torchft.manager import Manager
+ from torchft.optim import OptimizerWrapper as Optimizer
+ from torchft.otel import setup_logger
+ from torchft.process_group import (
+     ProcessGroupBabyNCCL,
+     ProcessGroupBabyXCCL,
+     ProcessGroupGloo,
+     ProcessGroupNCCL,
+     ProcessGroupXCCL,
+ )
+
+ setup_logger("torchft_quorums")
+ setup_logger("torchft_commits")
+ setup_logger("torchft_errors")
+
+ __all__ = (
+     "DistributedDataParallel",
+     "DistributedSampler",
+     "Manager",
+     "Optimizer",
+     "ProcessGroupNCCL",
+     "ProcessGroupXCCL",
+     "ProcessGroupBabyNCCL",
+     "ProcessGroupBabyXCCL",
+     "ProcessGroupGloo",
+ )
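
For orientation, here is a minimal sketch of how the names re-exported above are typically wired together. It assumes only what is visible elsewhere in this diff (the Manager keyword arguments pg, min_replica_size, load_state_dict, and state_dict appear in _test/diloco_trainer.py below, and Optimizer is the OptimizerWrapper alias from this file); the toy model, the optimizer choice, and the argument values are illustrative, and a real deployment typically also supplies replica and store settings via arguments or environment.

    import torch
    from torchft import Manager, Optimizer, ProcessGroupGloo

    # Toy module; any torch.nn.Module works the same way.
    model = torch.nn.Linear(3, 4)

    manager = Manager(
        pg=ProcessGroupGloo(),
        min_replica_size=1,
        load_state_dict=model.load_state_dict,
        state_dict=model.state_dict,
    )

    # Wrap the inner optimizer so each step is coordinated through the manager.
    optim = Optimizer(manager, torch.optim.AdamW(model.parameters()))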
torchft/_test/diloco_trainer.py ADDED
@@ -0,0 +1,287 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import copy
+ import logging
+ import os
+ from datetime import timedelta
+ from typing import Any, Dict
+
+ import torch
+ from torch import nn
+ from torch.distributed.tensor import DeviceMesh, DTensor
+
+ from torchft.local_sgd import DiLoCo
+ from torchft.manager import Manager
+ from torchft.manager_integ_test import MyModel, Runner
+ from torchft.process_group import (
+     FakeProcessGroupWrapper,
+     ProcessGroupBabyNCCL,
+     ProcessGroupGloo,
+ )
+
+ logger: logging.Logger = logging.getLogger(__name__)
+
+
+ class MultiModel(torch.nn.Module):
+     def __init__(self, in_dim: int = 3, out_dim: int = 4, n_layers: int = 1) -> None:
+         super().__init__()
+         self.layers = torch.nn.ModuleList()
+
+     def get_rand_inputs(
+         self, batch_size: int, device: torch.device = torch.device("cpu")
+     ) -> torch.Tensor:
+         raise NotImplementedError
+
+     def get_rand_labels(
+         self, batch_size: int, device: torch.device = torch.device("cpu")
+     ) -> torch.Tensor:
+         raise NotImplementedError
+
+
+ class MultiMyModel(MultiModel):
+     def __init__(self, in_dim: int = 3, out_dim: int = 4, n_layers: int = 1) -> None:
+         super().__init__()
+         self.in_dim = in_dim
+
+         for _ in range(n_layers):
+             self.layers.append(MyModel(in_dim, out_dim))
+             in_dim, out_dim = out_dim, in_dim
+
+         self.out_dim = in_dim
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         for layer in self.layers:
+             x = layer(x)
+         return x
+
+     def get_rand_inputs(
+         self, batch_size: int, device: torch.device = torch.device("cpu")
+     ) -> torch.Tensor:
+         return torch.rand(batch_size, self.in_dim, device=device)
+
+     def get_rand_labels(
+         self, batch_size: int, device: torch.device = torch.device("cpu")
+     ) -> torch.Tensor:
+         return torch.randint(self.out_dim, (batch_size,), device=device)
+
+
+ class DiLoCoTrainer:
+     """
+     A class that encapsulates the DiLoCo training process.
+     """
+
+     def __init__(
+         self,
+         rank: int,
+         store_port: int,
+         device: torch.device,
+         runner: Runner,
+         model_state_dict: dict[str, Any],
+         n_fragments: int,
+         diloco_args: dict[str, Any],
+     ) -> None:
+         """
+         Initialize the DiLoCoTrainer.
+
+         Args:
+             rank: The rank of the current process.
+             store_port: The port for the store.
+             device: The device to use for training.
+             runner: The runner instance.
+             model_state_dict: The state dict used to initialize the model.
+             n_fragments: The number of model fragments (layers) to train.
+             diloco_args: Additional arguments for DiLoCo.
+         """
+         self.rank: int = rank
+         self.store_port: int = store_port
+         self.device: torch.device = device
+         self.runner: Runner = runner
+
+         self.model_state_dict: Dict[str, Any] = model_state_dict
+         self.n_fragments: int = n_fragments
+         self.diloco_args: dict[str, Any] = diloco_args
+
+         # Initialize components
+         self.model: MultiModel = self.setup_model()
+         self.inner_optimizer: torch.optim.Optimizer = self.setup_inner_optimizer()
+         self.outer_optimizers: list[torch.optim.Optimizer] = (
+             self.setup_outer_optimizers()
+         )
+
+         self.pg: FakeProcessGroupWrapper = self.setup_pg()
+         # Set up the process group for the event injector
+         self.runner.event_injector.set_pg(self.pg)
+
+         self.manager: Manager = self.setup_manager()
+
+         self.device_mesh: None | DeviceMesh = None
+         self.setup_distributed()
+
+         self.criterion: nn.CrossEntropyLoss = nn.CrossEntropyLoss()
+
+         self.diloco: DiLoCo | None = None
+
+     def setup_model(self) -> MultiModel:
+         """Set up the model and move it to the device."""
+         model = MultiMyModel(2, 3, self.n_fragments)
+         model.load_state_dict(self.model_state_dict)
+         model.to(self.device)
+         return model
+
+     def setup_inner_optimizer(self) -> torch.optim.Optimizer:
+         """Set up the inner optimizer."""
+         return torch.optim.AdamW(
+             self.model.parameters(), lr=4e-4, weight_decay=0.1, betas=(0.9, 0.95)
+         )
+
+     def setup_outer_optimizers(self) -> list[torch.optim.Optimizer]:
+         """Set up outer optimizers."""
+         # Create one outer optimizer per fragment
+         outer_optimizers = []
+         for layers in self.model.layers:
+             outer_optimizers.append(
+                 torch.optim.SGD(
+                     layers.parameters(), lr=0.7, momentum=0.9, nesterov=True
+                 )
+             )
+         return outer_optimizers
+
+     def setup_pg(self) -> FakeProcessGroupWrapper:
+         if self.device.type == "cuda":
+             return FakeProcessGroupWrapper(ProcessGroupBabyNCCL())
+         else:
+             return FakeProcessGroupWrapper(
+                 ProcessGroupGloo(timeout=timedelta(seconds=10))
+             )
+
+     def setup_manager(self) -> Manager:
+         """Set up the process group and manager."""
+         print(
+             f"worker {self.runner.replica_id=} {self.rank=} {self.runner.world_size=} starting"
+         )
+
+         # Create manager with all arguments passed directly
+         return Manager(
+             pg=self.pg,
+             min_replica_size=2,
+             use_async_quorum=False,
+             load_state_dict=self.load_state_dict,
+             state_dict=self.state_dict,
+             replica_id=str(self.runner.replica_id),
+             store_addr="localhost",
+             store_port=self.store_port,
+             rank=self.rank,
+             world_size=self.runner.world_size,
+             lighthouse_addr=self.runner.lighthouse_address,
+             port=19530 + self.runner.replica_id,
+             connect_timeout=timedelta(seconds=10),
+             quorum_timeout=timedelta(seconds=10),
+             timeout=timedelta(seconds=10),
+             **self.runner.manager_args,  # type: ignore
+         )
+
+     def setup_distributed(self) -> None:
+         """Set up distributed training."""
+         # Initialize default group for device mesh to work
+         if not torch.distributed.is_initialized():
+             # TODO: remove this try-except once pytorch is updated to 2.8.0 and can use localhost:0
+             try:
+                 torch.distributed.init_process_group(
+                     init_method="tcp://localhost:0",
+                     rank=self.rank,
+                     world_size=self.runner.world_size,
+                 )
+             except ValueError:
+                 os.environ["MASTER_ADDR"] = "localhost"
+                 os.environ["MASTER_PORT"] = "0"
+                 os.environ["WORLD_SIZE"] = str(self.runner.world_size)
+                 os.environ["RANK"] = str(self.rank)
+
+         self.device_mesh = DeviceMesh(
+             self.device.type,
+             torch.arange(self.runner.world_size),
+         )
+
+         # Convert model parameters to DTensor
+         for layer in self.model.layers:
+             if isinstance(layer, nn.Linear):
+                 for param in layer.parameters():
+                     param = DTensor.from_local(
+                         param,
+                         device_mesh=self.device_mesh,
+                     )
+
+     def load_state_dict(self, state_dict: Dict[str, Dict[str, object]]) -> None:
+         """
+         Load the state dictionary.
+
+         Args:
+             state_dict: The state dictionary to load.
+         """
+         assert self.diloco is not None
+
+         self.model.load_state_dict(state_dict["model"])
+         self.model.to(self.device)
+
+         self.inner_optimizer.load_state_dict(state_dict["inner_optim"])
+
+     def state_dict(self) -> Dict[str, Dict[str, object]]:
+         """
+         Get the state dictionary.
+
+         Returns:
+             The state dictionary.
+         """
+         assert self.diloco is not None
+
+         return {
+             "model": self.model.state_dict(),
+             "inner_optim": self.inner_optimizer.state_dict(),
+         }
+
+     def train_loop(self) -> dict[str, Any]:
+         """Run the training loop."""
+         all_state_dicts = {}
+
+         # Ensure sync_every is set in diloco_args
+         if "sync_every" not in self.diloco_args:
+             self.diloco_args["sync_every"] = 2
+
+         with DiLoCo(
+             self.manager,
+             [layer for layer in self.model.layers],
+             self.inner_optimizer,
+             self.outer_optimizers,
+             backup_device=self.device,
+             **self.diloco_args,
+         ) as self.diloco:
+             while True:
+                 self.runner.event_injector.check(self.rank, self.manager.current_step())
+
+                 manager_curr_step = self.manager.current_step()
+                 if manager_curr_step not in all_state_dicts:
+                     # Store a deep copy of the manager state dict for this step
+                     all_state_dicts[manager_curr_step] = copy.deepcopy(
+                         self.manager._manager_state_dict()
+                     )
+
+                 batch_size = 1
+                 inputs = self.model.get_rand_inputs(batch_size, device=self.device)
+                 labels = self.model.get_rand_labels(batch_size, device=self.device)
+
+                 out = self.model(inputs)
+                 loss = self.criterion(out, labels)
+
+                 self.inner_optimizer.zero_grad()
+                 loss.backward()
+                 self.inner_optimizer.step()
+
+                 # Break after four model updates
+                 if self.manager.current_step() >= 4:
+                     break
+
+         return all_state_dicts
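
To make the class above concrete, here is a hedged sketch of how a test harness might drive DiLoCoTrainer, inferred purely from the constructor signature and train_loop in this file. The Runner object (from torchft.manager_integ_test) and the lighthouse and store plumbing it carries are assumed to be set up elsewhere; the remaining argument values are illustrative.

    import torch
    from torchft._test.diloco_trainer import DiLoCoTrainer, MultiMyModel

    def run_replica(rank: int, store_port: int, runner) -> dict:
        # Shared initial weights; MultiMyModel(2, 3, ...) matches setup_model above.
        init_state = MultiMyModel(2, 3, n_layers=2).state_dict()
        trainer = DiLoCoTrainer(
            rank=rank,
            store_port=store_port,
            device=torch.device("cpu"),
            runner=runner,  # torchft.manager_integ_test.Runner, assumed configured
            model_state_dict=init_state,
            n_fragments=2,  # one outer SGD optimizer per fragment
            diloco_args={"sync_every": 2},
        )
        # Runs until manager.current_step() >= 4 and returns the per-step
        # manager state dicts captured along the way.
        return trainer.train_loop()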
torchft/_test/managed_work_test.py ADDED
@@ -0,0 +1,320 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import types
+ import unittest
+ from datetime import timedelta
+ from typing import cast, Dict, List, Optional, Tuple, TypeVar
+
+ import parameterized
+ import torch
+ from torch.distributed.distributed_c10d import Work
+ from torch.futures import Future
+
+ from torchft.manager import _ManagedWork, Manager
+
+ # Type variable for the Future's value type
+ T = TypeVar("T")
+
+
+ class SimpleWork(Work):
+     """A simple implementation of torch.distributed.Work for testing."""
+
+     def __init__(self, tensors: List[torch.Tensor]) -> None:
+         super().__init__()
+         self._tensors = tensors
+         self._future: Future[List[torch.Tensor]] = torch.futures.Future()
+         self._is_completed: bool = False
+
+     def wait(self, timeout: Optional[timedelta] = None) -> bool:
+         self._is_completed = True
+         self._future.set_result(self._tensors)
+         return True
+
+     def get_future(self) -> Future[List[torch.Tensor]]:
+         return self._future
+
+
+ class TestManagedWork(unittest.TestCase):
+     @parameterized.parameterized.expand(
+         [
+             ("cpu", torch.device("cpu")),
+             ("cuda", torch.device("cuda:0")),
+         ]
+     )
+     def test_callbacks_execute_after_wait(
+         self, name: str, device: torch.device
+     ) -> None:
+         """Test that callbacks are only executed after wait() is called."""
+         # Skip if CUDA is requested but not available
+         if device.type == "cuda" and not torch.cuda.is_available():
+             self.skipTest("CUDA not available")
+
+         # Create a tensor to work with
+         tensor: torch.Tensor = torch.ones(1, dtype=torch.float32, device=device)
+
+         # Create a simple work object
+         work = SimpleWork([tensor])
+
+         # Create a minimal manager object with just the wrap_future method
+         manager = Manager.__new__(Manager)  # Create instance without calling __init__
+         # We're using types.MethodType to attach a method to the manager instance
+         # This is just for testing purposes
+         manager.wrap_future = types.MethodType(  # type: ignore
+             lambda self, fut, default, timeout=None: fut, manager
+         )
+
+         # Create the managed work
+         managed_work = _ManagedWork(manager, work, [tensor])
+
+         # Track callback execution
+         callback_executed: bool = False
+
+         def callback(fut: Future[object]) -> List[torch.Tensor]:
+             nonlocal callback_executed, tensor
+             callback_executed = True
+             # Multiply tensor by 2 to verify the callback ran
+             tensor.mul_(2)
+             return [tensor]
+
+         # Add the callback
+         fut = managed_work.get_future()
+         fut = fut.then(callback)
+
+         # Verify callback hasn't executed yet
+         self.assertFalse(callback_executed)
+         self.assertEqual(tensor.item(), 1.0)
+
+         # Call wait() which should trigger the callback
+         managed_work.wait()
+
+         # Verify callback has executed
+         self.assertTrue(callback_executed)
+         self.assertEqual(tensor.item(), 2.0)
+
+     @parameterized.parameterized.expand(
+         [
+             ("cpu", torch.device("cpu")),
+             ("cuda", torch.device("cuda:0")),
+         ]
+     )
+     def test_multiple_callbacks_execute_in_order(
+         self, name: str, device: torch.device
+     ) -> None:
+         """Test that multiple callbacks are executed in the order they were added."""
+         # Skip if CUDA is requested but not available
+         if device.type == "cuda" and not torch.cuda.is_available():
+             self.skipTest("CUDA not available")
+
+         # Create a tensor to work with
+         tensor: torch.Tensor = torch.ones(1, dtype=torch.float32, device=device)
+
+         # Create a simple work object
+         work = SimpleWork([tensor])
+
+         # Create a minimal manager object with just the wrap_future method
+         manager = Manager.__new__(Manager)  # Create instance without calling __init__
+         manager.wrap_future = types.MethodType(  # type: ignore
+             lambda self, fut, default, timeout=None: fut, manager
+         )
+
+         # Create the managed work
+         managed_work = _ManagedWork(manager, work, [tensor])
+
+         # Track execution order
+         execution_order: List[int] = []
+
+         def callback1(fut: Future[list[torch.Tensor]]) -> List[torch.Tensor]:
+             nonlocal tensor
+             execution_order.append(1)
+             tensor.add_(1)
+             return [tensor]
+
+         def callback2(fut: Future[list[torch.Tensor]]) -> List[torch.Tensor]:
+             nonlocal tensor
+             execution_order.append(2)
+             tensor.add_(2)
+             return [tensor]
+
+         def callback3(fut: Future[list[torch.Tensor]]) -> List[torch.Tensor]:
+             nonlocal tensor
+             execution_order.append(3)
+             tensor.add_(3)
+             return [tensor]
+
+         # Add callbacks
+         fut = managed_work.get_future()
+         fut = cast(Future[list[torch.Tensor]], fut)
+         fut = fut.then(callback1)
+         fut = fut.then(callback2)
+         fut = fut.then(callback3)
+
+         # Verify no callbacks have executed yet
+         self.assertEqual(len(execution_order), 0)
+         self.assertEqual(tensor.item(), 1.0)
+
+         # Call wait() which should trigger the callbacks
+         managed_work.wait()
+
+         # Verify callbacks executed in order
+         self.assertEqual(execution_order, [1, 2, 3])
+
+         # Each callback adds to the tensor, so the final value should be 1 + 1 + 2 + 3 = 7
+         self.assertEqual(tensor.item(), 7.0)
+
+     @parameterized.parameterized.expand(
+         [
+             ("cpu", torch.device("cpu")),
+             ("cuda", torch.device("cuda:0")),
+         ]
+     )
+     def test_future_then_api(self, name: str, device: torch.device) -> None:
+         """Test that the future's then API works correctly with ManagedWork."""
+         # Skip if CUDA is requested but not available
+         if device.type == "cuda" and not torch.cuda.is_available():
+             self.skipTest("CUDA not available")
+
+         # Create a tensor to work with
+         tensor: torch.Tensor = torch.ones(1, dtype=torch.float32, device=device)
+
+         # Create a simple work object
+         work = SimpleWork([tensor])
+
+         # Create a minimal manager object with just the wrap_future method
+         manager = Manager.__new__(Manager)  # Create instance without calling __init__
+         manager.wrap_future = types.MethodType(  # type: ignore
+             lambda self, fut, default, timeout=None: fut, manager
+         )
+
+         # Create the managed work
+         managed_work = _ManagedWork(manager, work, [tensor])
+
+         # Get the future
+         future = managed_work.get_future()
+
+         # Track callback execution
+         callback_executed: bool = False
+
+         def callback(fut: Future[object]) -> List[torch.Tensor]:
+             nonlocal callback_executed, tensor
+             callback_executed = True
+             # Multiply tensor by 3 to verify the callback ran
+             tensor.mul_(3)
+             return [tensor]
+
+         # Use the then API
+         future = future.then(callback)
+
+         # Verify callback hasn't executed yet
+         self.assertFalse(callback_executed)
+         self.assertEqual(tensor.item(), 1.0)
+
+         # Call wait() on the managed_work first to set up the future properly
+         managed_work.wait()
+
+         # Verify callback has executed
+         self.assertTrue(callback_executed)
+         self.assertEqual(tensor.item(), 3.0)
+
+     @parameterized.parameterized.expand(
+         [
+             ("cpu", torch.device("cpu")),
+             ("cuda", torch.device("cuda:0")),
+         ]
+     )
+     def test_callbacks_changing_return_types(
+         self, name: str, device: torch.device
+     ) -> None:
+         """
+         Test that callbacks can change return types and that tensors are modified in-place.
+         This test demonstrates:
+         1. Callbacks changing return types (List[Tensor] -> Dict -> Tuple)
+         2. Using Future.value() instead of nonlocal
+         3. Verifying tensors are modified in-place for both approaches
+         """
+         # Skip if CUDA is requested but not available
+         if device.type == "cuda" and not torch.cuda.is_available():
+             self.skipTest("CUDA not available")
+
+         # Create tensors to work with
+         tensor1: torch.Tensor = torch.ones(1, dtype=torch.float32, device=device)
+         tensor2: torch.Tensor = torch.ones(1, dtype=torch.float32, device=device) * 2
+
+         # Store original tensor memory addresses to verify in-place modification
+         tensor1_address = tensor1.data_ptr()
+         tensor2_address = tensor2.data_ptr()
+
+         # Create a simple work object
+         work = SimpleWork([tensor1, tensor2])
+
+         # Create a minimal manager object with just the wrap_future method
+         manager = Manager.__new__(Manager)  # Create instance without calling __init__
+         manager.wrap_future = types.MethodType(  # type: ignore
+             lambda self, fut, default, timeout=None: fut, manager
+         )
+
+         # Create the managed work
+         managed_work = _ManagedWork(manager, work, [tensor1, tensor2])
+
+         # Get the future
+         future = managed_work.get_future()
+         future = cast(Future[List[torch.Tensor]], future)
+
+         # First callback: Takes List[Tensor] and returns Dict[str, Tensor]
+         # Uses nonlocal to modify tensor1
+         def callback1(fut: Future[List[torch.Tensor]]) -> Dict[str, torch.Tensor]:
+             tensors = fut.value()
+             nonlocal tensor1
+             # Modify tensor1 in-place using nonlocal
+             tensor1.mul_(3)
+             # Return a dictionary instead of a list
+             return {"first": tensors[0], "second": tensors[1]}
+
+         # Second callback: Takes Dict[str, Tensor] and returns Tuple[Tensor, float]
+         # Uses Future.value() to modify tensor2
+         def callback2(
+             fut: Future[Dict[str, torch.Tensor]],
+         ) -> Tuple[torch.Tensor, float]:
+             data = fut.value()
+             # Modify tensor2 in-place using the value from the future
+             data["second"].add_(5)  # Should modify tensor2 in-place
+             # Return a tuple instead of a dict
+             return (data["second"], data["first"].item())
+
+         # Third callback: Takes Tuple[Tensor, float] and returns a single Tensor
+         def callback3(fut: Future[Tuple[torch.Tensor, float]]) -> torch.Tensor:
+             tensor, value = fut.value()
+             # Create a new tensor based on the tuple values
+             result = tensor * value
+             return result
+
+         # Chain the callbacks
+         future = future.then(callback1)
+         future = future.then(callback2)
+         future = future.then(callback3)
+
+         # Call wait() to trigger the callbacks
+         managed_work.wait()
+
+         # Verify tensor1 was modified in-place (using nonlocal)
+         self.assertEqual(tensor1.item(), 3.0)  # 1 * 3 = 3
+         self.assertEqual(tensor1.data_ptr(), tensor1_address)  # Same memory address
+
+         # Verify tensor2 was modified in-place (using Future.value())
+         self.assertEqual(tensor2.item(), 7.0)  # 2 + 5 = 7
+         self.assertEqual(tensor2.data_ptr(), tensor2_address)  # Same memory address
+
+         # Get the final result from the future
+         final_result = future.wait()
+
+         # The final result should be tensor2 * tensor1.item() = 7 * 3 = 21
+         self.assertEqual(final_result.item(), 21.0)
+
+
+ if __name__ == "__main__":
+     unittest.main()
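
The chaining behavior these tests exercise is the standard torch.futures contract; as a minimal, torchft-free sketch, the snippet below shows that then() callbacks stay queued until the upstream future completes and that each callback may change the value type along the chain:

    import torch
    from torch.futures import Future

    # An unfulfilled future; callbacks added via then() are queued until set_result().
    fut: Future = Future()

    # Each then() returns a new Future carrying its callback's return value,
    # so the chain may change types: List[Tensor] -> Tensor -> float.
    chained = fut.then(lambda f: f.value()[0].sum()).then(lambda f: f.value().item())

    fut.set_result([torch.ones(3)])  # completing the future fires the whole chain
    assert chained.wait() == 3.0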