torchft-nightly 2026.1.3__cp310-cp310-manylinux_2_24_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. torchft/__init__.py +34 -0
  2. torchft/_test/diloco_trainer.py +287 -0
  3. torchft/_test/managed_work_test.py +320 -0
  4. torchft/_test_utils.py +111 -0
  5. torchft/_torchft.cpython-310-x86_64-linux-gnu.so +0 -0
  6. torchft/_torchft.pyi +116 -0
  7. torchft/checkpointing/__init__.py +20 -0
  8. torchft/checkpointing/_rwlock.py +136 -0
  9. torchft/checkpointing/_serialization.py +39 -0
  10. torchft/checkpointing/http_transport.py +299 -0
  11. torchft/checkpointing/http_transport_bench.py +61 -0
  12. torchft/checkpointing/http_transport_test.py +146 -0
  13. torchft/checkpointing/pg_transport.py +306 -0
  14. torchft/checkpointing/pg_transport_bench.py +99 -0
  15. torchft/checkpointing/pg_transport_test.py +101 -0
  16. torchft/checkpointing/rwlock_test.py +58 -0
  17. torchft/checkpointing/transport.py +68 -0
  18. torchft/checkpointing/transport_test.py +161 -0
  19. torchft/collectives.py +415 -0
  20. torchft/collectives_test.py +212 -0
  21. torchft/coordination.py +39 -0
  22. torchft/coordination_test.py +29 -0
  23. torchft/data.py +77 -0
  24. torchft/data_test.py +39 -0
  25. torchft/ddp.py +105 -0
  26. torchft/ddp_test.py +68 -0
  27. torchft/diloco_regression_test.py +644 -0
  28. torchft/examples/slurm/README.md +34 -0
  29. torchft/examples/slurm/punisher.py +95 -0
  30. torchft/examples/slurm/runner.py +221 -0
  31. torchft/fsdp_test.py +102 -0
  32. torchft/futures.py +353 -0
  33. torchft/futures_test.py +140 -0
  34. torchft/http.py +13 -0
  35. torchft/lighthouse_test.py +163 -0
  36. torchft/local_sgd.py +796 -0
  37. torchft/local_sgd_integ_test.py +600 -0
  38. torchft/local_sgd_test.py +324 -0
  39. torchft/manager.py +1358 -0
  40. torchft/manager_integ_test.py +653 -0
  41. torchft/manager_test.py +911 -0
  42. torchft/multiprocessing.py +38 -0
  43. torchft/multiprocessing_dummy_context.py +135 -0
  44. torchft/multiprocessing_test.py +58 -0
  45. torchft/optim.py +63 -0
  46. torchft/optim_test.py +50 -0
  47. torchft/otel.py +134 -0
  48. torchft/parameter_server.py +195 -0
  49. torchft/parameter_server_test.py +47 -0
  50. torchft/process_group.py +2118 -0
  51. torchft/process_group_test.py +1028 -0
  52. torchft/quantization.py +686 -0
  53. torchft/quantization_test.py +131 -0
  54. torchft/torchx.py +89 -0
  55. torchft/utils.py +67 -0
  56. torchft/work.py +26 -0
  57. torchft_nightly-2026.1.3.dist-info/METADATA +308 -0
  58. torchft_nightly-2026.1.3.dist-info/RECORD +61 -0
  59. torchft_nightly-2026.1.3.dist-info/WHEEL +4 -0
  60. torchft_nightly-2026.1.3.dist-info/entry_points.txt +2 -0
  61. torchft_nightly-2026.1.3.dist-info/licenses/LICENSE +34 -0
torchft/examples/slurm/punisher.py ADDED
@@ -0,0 +1,95 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import argparse
+ import logging
+ import random
+ import time
+
+ from torchx import specs
+ from torchx.runner import get_runner, Runner
+
+ logging.basicConfig(level=logging.INFO)
+ logger: logging.Logger = logging.getLogger(__name__)
+
+ _SCHEDULER = "slurm"
+
+
+ def kill_all(runner: Runner) -> None:
+     jobs = runner.list(_SCHEDULER)
+     jobs = [job for job in jobs if job.state == specs.AppState.RUNNING]
+     for job in jobs:
+         if "ft_" not in job.name:
+             continue
+         print(f"killing {job.app_handle}")
+         runner.cancel(job.app_handle)
+
+
+ def kill_one(runner: Runner) -> None:
+     jobs = runner.list(_SCHEDULER)
+     jobs = [job for job in jobs if job.state == specs.AppState.RUNNING]
+     candidates = []
+     for job in jobs:
+         if "ft_" not in job.name:
+             continue
+         if "ft_0" in job.name:
+             continue
+         candidates.append(job.app_handle)
+     choice = random.choice(candidates)
+     print(f"killing {choice=} {candidates=}")
+     runner.cancel(choice)
+
+
+ def kill_loop(runner: Runner, args: argparse.Namespace) -> None:
+     for _ in range(args.num_failures):
+         kill_one(runner)
+         dur = random.random() * (2 * args.mtbf_secs)
+         print(f"sleeping for {dur=} {args.mtbf_secs=}")
+         time.sleep(dur)
+
+
+ def main() -> None:
+     parser = argparse.ArgumentParser(description="CLI tool to inject failures on slurm")
+     subparsers = parser.add_subparsers(dest="command", help="Available commands")
+
+     # kill_loop subcommand
+     kill_loop_parser = subparsers.add_parser("kill_loop", help="Kill jobs in a loop")
+     kill_loop_parser.add_argument(
+         "--mtbf-secs",
+         type=float,
+         default=5,
+         help="Mean time between failures",
+     )
+     kill_loop_parser.add_argument(
+         "--num-failures",
+         type=int,
+         default=1,
+         help="Number of failures to inject",
+     )
+
+     # kill_one subcommand
+     subparsers.add_parser("kill_one", help="Kill a single job")
+
+     # kill_all subcommand
+     subparsers.add_parser("kill_all", help="Kill all jobs")
+
+     args = parser.parse_args()
+
+     if args.command is None:
+         parser.print_help()
+         return
+
+     with get_runner() as runner:
+         if args.command == "kill_loop":
+             kill_loop(runner, args)
+         elif args.command == "kill_one":
+             kill_one(runner)
+         elif args.command == "kill_all":
+             kill_all(runner)
+
+
+ if __name__ == "__main__":
+     main()
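
punisher.py is a thin torchx-based failure injector: kill_all cancels every running Slurm job whose name contains "ft_", kill_one cancels one random replica other than ft_0, and kill_loop repeats that with a randomized interval around --mtbf-secs. Below is a minimal sketch of driving it programmatically instead of through its CLI; the import path `punisher` is hypothetical and a Slurm scheduler reachable via torchx is assumed.

import argparse

from torchx.runner import get_runner

from punisher import kill_loop, kill_one  # hypothetical import path for the file above

with get_runner() as runner:
    # Cancel one random non-ft_0 replica right away.
    kill_one(runner)
    # Or inject three failures with a ~60 s mean time between failures.
    kill_loop(runner, argparse.Namespace(num_failures=3, mtbf_secs=60.0))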
torchft/examples/slurm/runner.py ADDED
@@ -0,0 +1,221 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import argparse
+ import logging
+ import os
+ import time
+
+ from torchx import specs
+ from torchx.components.dist import ddp
+ from torchx.runner import get_runner, Runner
+
+ logging.basicConfig(level=logging.INFO)
+ logger: logging.Logger = logging.getLogger(__name__)
+
+ _SCHEDULER = "slurm"
+
+
+ def _make_app(replica_id: int, cli_args: argparse.Namespace) -> specs.AppDef:
+     args = [
+         "--comm.trace_buf_size=0",
+         "--comm.train_timeout_seconds=60",
+         "--metrics.log_freq=1",
+         "--profiling.enable_profiling",
+         "--experimental.custom_args_module=torchtitan.components.ft.config",
+         "--job.config_file=./torchtitan/models/llama3/train_configs/llama3_8b.toml",
+         "--model.name=llama3_ft",
+         "--training.dataset=c4",
+         "--training.steps=10000",
+         "--training.local_batch_size=2",
+         f"--parallelism.data_parallel_shard_degree={cli_args.nodes * cli_args.nproc_per_node}",
+         "--fault_tolerance.enable",
+         f"--fault_tolerance.replica_id={replica_id}",
+         f"--fault_tolerance.group_size={cli_args.replica_count}",
+         f"--fault_tolerance.process_group={cli_args.process_group}",
+         f"--fault_tolerance.process_group_timeout_ms={600 * 1000}",
+     ]
+
+     if cli_args.enable_semi_sync:
+         args += [
+             f"--fault_tolerance.semi_sync_method={cli_args.semi_sync_method}",
+         ]
+
+         if cli_args.semi_sync_method == "diloco":
+             args += [
+                 "--fault_tolerance.sync_steps=20",
+                 "--fault_tolerance.fragment_sync_delay=1",
+                 f"--fault_tolerance.num_fragments={cli_args.num_fragments}",
+             ]
+
+     if replica_id == 0:
+         args += [
+             "--metrics.enable-wandb",
+             "--checkpoint.interval=100",
+         ]
+
+     env = {}
+
+     # use agent store in torchelastic to avoid TCPStore init race condition
+     env["TORCH_SHARE_RDZV_TCP_STORE"] = "1"
+     env["TORCH_CPP_LOG_LEVEL"] = "INFO"
+
+     env["TORCH_CUDA_SANITIZER"] = "1"
+
+     # NCCL envs for debugging
+     env["NCCL_DEBUG"] = "INFO"
+     env["NCCL_DEBUG_SUBSYS"] = "ALL"
+     env["NCCL_PROTO"] = "Simple"
+
+     # gloo
+     if os.environ.get("GLOO_SOCKET_IFNAME") is not None:
+         env["GLOO_SOCKET_IFNAME"] = os.environ.get("GLOO_SOCKET_IFNAME")
+
+     # application log levels
+     env["LOGLEVEL"] = "INFO"
+     env["RUST_LOGS"] = "INFO"
+     env["TORCH_CPP_LOG_LEVEL"] = "INFO"
+
+     # application timeouts
+     env["TORCHFT_QUORUM_TIMEOUT_SEC"] = "900"
+     env["TORCHFT_TIMEOUT_SEC"] = "600"
+     env["TORCHFT_QUORUM_RETRIES"] = "0"
+
+     env["TORCHFT_LIGHTHOUSE"] = os.environ.get(
+         "TORCHFT_LIGHTHOUSE", "http://slurm-head-node-0:29510"
+     )
+
+     env["WANDB_PROJECT"] = "torchft"
+
+     app = ddp(
+         *args,
+         name=f"ft_{replica_id}",
+         env=env,
+         script="./torchtitan/train.py",
+         gpu=cli_args.nproc_per_node,
+         j=f"{cli_args.nodes}x{cli_args.nproc_per_node}",
+     )
+     app.roles[0].name = app.name
+     return app
+
+
+ def start_replica(
+     runner: Runner, replica_id: int, args: argparse.Namespace
+ ) -> specs.AppHandle:
+     app = _make_app(replica_id, args)
+
+     app_handle = runner.run(
+         app,
+         scheduler=_SCHEDULER,
+     )
+
+     return app_handle
+
+
+ def monitor(runner: Runner, args: argparse.Namespace) -> None:
+     jobs = runner.list(_SCHEDULER)
+     jobs = [job for job in jobs if job.state == specs.AppState.RUNNING]
+
+     active_replicas = {}
+
+     for job in jobs:
+         if "ft_" not in job.name:
+             continue
+         name, _, _ = job.name.partition("-")
+         _, _, replica_id_str = name.partition("_")
+         replica_id = int(replica_id_str)
+         active_replicas[replica_id] = job
+
+     to_launch = set()
+     for replica_id in range(args.replica_count):
+         alive = replica_id in active_replicas
+
+         if alive:
+             job = active_replicas[replica_id]
+             print(f" - {replica_id=:2d}: ALIVE {job.app_handle}")
+         else:
+             print(f" - {replica_id=:2d}: DEAD")
+             to_launch.add(replica_id)
+
+     for replica_id in to_launch:
+         app_handle = start_replica(
+             runner,
+             replica_id,
+             args,
+         )
+         print(f"launched {replica_id=}: {app_handle=}")
+
+
+ def main() -> None:
+     parser = argparse.ArgumentParser(
+         description="CLI tool to launch data parallel replicas on slurm"
+     )
+
+     parser.add_argument(
+         "--workspace-dir", type=str, help="Location of torchtitan folder"
+     )
+
+     parser.add_argument(
+         "--nodes",
+         type=int,
+         default=10,
+         help="Number of nodes per replica",
+     )
+
+     parser.add_argument(
+         "--nproc-per-node",
+         type=int,
+         default=10,
+         help="Number of ranks per node",
+     )
+
+     parser.add_argument(
+         "--replica-count",
+         type=int,
+         default=10,
+         help="Number of data parallel replicas",
+     )
+
+     parser.add_argument(
+         "--process-group",
+         type=str,
+         default="gloo",
+         help="The process group to use for data parallel",
+     )
+
+     parser.add_argument(
+         "--enable-semi-sync",
+         type=bool,
+         default=True,
+         help="Whether to enable semi-sync method for data parallel",
+     )
+
+     parser.add_argument(
+         "--semi-sync-method",
+         type=str,
+         default="diloco",
+         help="The semi-sync method to use for data parallel. Options: diloco, local_sgd",
+     )
+
+     parser.add_argument(
+         "--num-fragments",
+         type=int,
+         default=2,
+         help="The number of fragments to use for data parallel. Only used for diloco semi-sync method",
+     )
+
+     args = parser.parse_args()
+
+     os.chdir(args.workspace_dir)
+
+     with get_runner() as runner:
+         while True:
+             monitor(runner, args)
+             time.sleep(10)
+
+
+ if __name__ == "__main__":
+     main()
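
runner.py builds one torchx ddp AppDef per data-parallel replica (torchtitan llama3 flags plus torchft fault-tolerance settings), submits it to Slurm, and then loops in monitor() relaunching any replica that is no longer RUNNING. The following is a hedged sketch of using it programmatically; the import path `runner` is hypothetical and every concrete value (paths, sizes) is a placeholder, not something shipped with the package.

import argparse
import os

from torchx.runner import get_runner

from runner import monitor  # hypothetical import path for the file above

args = argparse.Namespace(
    workspace_dir="/path/to/torchtitan",  # placeholder checkout location
    nodes=2,
    nproc_per_node=8,
    replica_count=2,
    process_group="gloo",
    enable_semi_sync=True,
    semi_sync_method="diloco",
    num_fragments=2,
)

os.chdir(args.workspace_dir)
with get_runner() as torchx_runner:
    # A single pass launches every replica that is not currently RUNNING;
    # the CLI wraps this in a `while True` loop with a 10 s sleep.
    monitor(torchx_runner, args)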
torchft/fsdp_test.py ADDED
@@ -0,0 +1,102 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import multiprocessing
+ import os
+ import unittest
+ from concurrent.futures import ProcessPoolExecutor
+ from unittest.mock import Mock
+
+ import torch
+ import torch.distributed as dist
+ from torch import nn
+ from torch._C._distributed_c10d import ReduceOp
+ from torch.distributed._composable.fsdp import FSDPModule, fully_shard
+ from torch.distributed.tensor import init_device_mesh
+ from torch.distributed.tensor.parallel import ColwiseParallel, parallelize_module
+
+ from torchft.manager import Manager
+ from torchft.process_group import ProcessGroupGloo
+
+
+ class FSDPTest(unittest.TestCase):
+     @staticmethod
+     def _test_fsdp(
+         world_size: int,
+         rank: int,
+         dp_replicate: int = 2,
+         dp_shard: int = 2,
+         tp: int = 1,
+     ) -> None:
+         torch.cuda.set_device(rank)
+
+         group_size = world_size // dp_replicate
+         group = rank // group_size
+         group_rank = rank % group_size
+
+         os.environ["MASTER_ADDR"] = "127.0.0.1"
+         os.environ["MASTER_PORT"] = str(12346 + group)
+         os.environ["RANK"] = str(group_rank)
+         os.environ["WORLD_SIZE"] = str(group_size)
+
+         manager = Mock(spec=Manager)
+         pg: ProcessGroupGloo = Mock(spec=ProcessGroupGloo)
+         device_mesh = init_device_mesh(
+             device_type="cuda",
+             mesh_shape=(dp_shard, tp),
+             mesh_dim_names=("dp_shard", "tp"),
+         )
+         manager.num_participants.return_value = 1
+         model = nn.Linear(128, 128).cuda()
+         batch = torch.randn(4, 128).cuda()
+
+         fsdp_mesh = device_mesh["dp_shard"]
+
+         def all_reduce_hook(output: torch.Tensor) -> None:
+             dist.all_reduce(output, group=pg, op=ReduceOp.AVG)
+
+         def apply_set_all_reduce_hook(m: nn.Module) -> None:
+             assert isinstance(m, FSDPModule)
+             m.set_all_reduce_hook(all_reduce_hook)
+
+         if tp > 1:
+             tp_mesh = device_mesh["tp"]
+             model = parallelize_module(
+                 model,
+                 tp_mesh,
+                 ColwiseParallel(),
+             )
+         shard_model = fully_shard(model, mesh=fsdp_mesh)
+         shard_model.apply(apply_set_all_reduce_hook)
+         shard_model(batch).mean().backward()
+
+     # pyre-ignore[56]: Pyre was not able to infer the type of argument
+     @unittest.skipIf(torch.cuda.device_count() < 4, "Not enough GPUs")
+     def test_fsdp(self) -> None:
+         context = multiprocessing.get_context("spawn")
+         with ProcessPoolExecutor(max_workers=4, mp_context=context) as executor:
+             futures = []
+             for i in range(4):
+                 future = executor.submit(self._test_fsdp, 4, i)
+                 futures.append(future)
+
+             for fut in futures:
+                 fut.result()
+
+     # pyre-ignore[56]: Pyre was not able to infer the type of argument
+     @unittest.skipIf(torch.cuda.device_count() < 4, "Not enough GPUs")
+     def test_fsdp_tp(self) -> None:
+         context = multiprocessing.get_context("spawn")
+         with ProcessPoolExecutor(max_workers=4, mp_context=context) as executor:
+             futures = []
+             for i in range(4):
+                 future = executor.submit(
+                     self._test_fsdp, 4, i, dp_replicate=1, dp_shard=2, tp=2
+                 )
+                 futures.append(future)
+
+             for fut in futures:
+                 fut.result()
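
The pattern this test exercises is FSDP2's per-module all-reduce hook: after fully_shard, each module's reduce-scattered gradient shard is handed to a user hook, which torchft uses to average gradients across replicas over a separately managed process group. A minimal sketch of that wiring outside the test harness follows; it assumes a process group for the cross-replica dimension already exists, and the helper name install_cross_replica_hook is illustrative, not a torchft API.

import torch
import torch.distributed as dist
from torch import nn
from torch.distributed._composable.fsdp import FSDPModule


def install_cross_replica_hook(model: nn.Module, group: dist.ProcessGroup) -> None:
    # Register the hook on every FSDP-wrapped submodule of an already-sharded model.
    def all_reduce_hook(output: torch.Tensor) -> None:
        # Average the reduce-scattered gradient shard across data-parallel replicas.
        dist.all_reduce(output, group=group, op=dist.ReduceOp.AVG)

    def apply_hook(m: nn.Module) -> None:
        if isinstance(m, FSDPModule):
            m.set_all_reduce_hook(all_reduce_hook)

    model.apply(apply_hook)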