torchft-nightly 2026.1.3__cp310-cp310-manylinux_2_24_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. torchft/__init__.py +34 -0
  2. torchft/_test/diloco_trainer.py +287 -0
  3. torchft/_test/managed_work_test.py +320 -0
  4. torchft/_test_utils.py +111 -0
  5. torchft/_torchft.cpython-310-x86_64-linux-gnu.so +0 -0
  6. torchft/_torchft.pyi +116 -0
  7. torchft/checkpointing/__init__.py +20 -0
  8. torchft/checkpointing/_rwlock.py +136 -0
  9. torchft/checkpointing/_serialization.py +39 -0
  10. torchft/checkpointing/http_transport.py +299 -0
  11. torchft/checkpointing/http_transport_bench.py +61 -0
  12. torchft/checkpointing/http_transport_test.py +146 -0
  13. torchft/checkpointing/pg_transport.py +306 -0
  14. torchft/checkpointing/pg_transport_bench.py +99 -0
  15. torchft/checkpointing/pg_transport_test.py +101 -0
  16. torchft/checkpointing/rwlock_test.py +58 -0
  17. torchft/checkpointing/transport.py +68 -0
  18. torchft/checkpointing/transport_test.py +161 -0
  19. torchft/collectives.py +415 -0
  20. torchft/collectives_test.py +212 -0
  21. torchft/coordination.py +39 -0
  22. torchft/coordination_test.py +29 -0
  23. torchft/data.py +77 -0
  24. torchft/data_test.py +39 -0
  25. torchft/ddp.py +105 -0
  26. torchft/ddp_test.py +68 -0
  27. torchft/diloco_regression_test.py +644 -0
  28. torchft/examples/slurm/README.md +34 -0
  29. torchft/examples/slurm/punisher.py +95 -0
  30. torchft/examples/slurm/runner.py +221 -0
  31. torchft/fsdp_test.py +102 -0
  32. torchft/futures.py +353 -0
  33. torchft/futures_test.py +140 -0
  34. torchft/http.py +13 -0
  35. torchft/lighthouse_test.py +163 -0
  36. torchft/local_sgd.py +796 -0
  37. torchft/local_sgd_integ_test.py +600 -0
  38. torchft/local_sgd_test.py +324 -0
  39. torchft/manager.py +1358 -0
  40. torchft/manager_integ_test.py +653 -0
  41. torchft/manager_test.py +911 -0
  42. torchft/multiprocessing.py +38 -0
  43. torchft/multiprocessing_dummy_context.py +135 -0
  44. torchft/multiprocessing_test.py +58 -0
  45. torchft/optim.py +63 -0
  46. torchft/optim_test.py +50 -0
  47. torchft/otel.py +134 -0
  48. torchft/parameter_server.py +195 -0
  49. torchft/parameter_server_test.py +47 -0
  50. torchft/process_group.py +2118 -0
  51. torchft/process_group_test.py +1028 -0
  52. torchft/quantization.py +686 -0
  53. torchft/quantization_test.py +131 -0
  54. torchft/torchx.py +89 -0
  55. torchft/utils.py +67 -0
  56. torchft/work.py +26 -0
  57. torchft_nightly-2026.1.3.dist-info/METADATA +308 -0
  58. torchft_nightly-2026.1.3.dist-info/RECORD +61 -0
  59. torchft_nightly-2026.1.3.dist-info/WHEEL +4 -0
  60. torchft_nightly-2026.1.3.dist-info/entry_points.txt +2 -0
  61. torchft_nightly-2026.1.3.dist-info/licenses/LICENSE +34 -0
torchft/process_group.py
@@ -0,0 +1,2118 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ Process Groups
9
+ =========================
10
+
11
+ This module implements fault tolerant process groups that can be reconfigured
12
+ and resized at runtime.
13
+
14
+ These extend the standard PyTorch ProcessGroup API and can be used in most
15
+ places that would accept a standard process group. As these can change size at
16
+ runtime, users need to take care not to assume a static rank or world size.
17
+ """
18
+
19
+ import logging
20
+ import os
21
+ import threading
22
+ import time
23
+ import warnings
24
+ from contextlib import contextmanager, nullcontext
25
+ from dataclasses import dataclass
26
+ from datetime import timedelta
27
+ from multiprocessing.connection import Connection
28
+ from typing import (
29
+ Any,
30
+ Callable,
31
+ cast,
32
+ Dict,
33
+ Generator,
34
+ List,
35
+ Optional,
36
+ Tuple,
37
+ TYPE_CHECKING,
38
+ TypeVar,
39
+ Union,
40
+ )
41
+
42
+ import torch
43
+ import torch.distributed as dist
44
+ import torch.multiprocessing as mp
45
+
46
+ # pyre-fixme[21]: no attribute ProcessGroupGloo
47
+ from torch.distributed import (
48
+ PrefixStore,
49
+ ProcessGroup as BaseProcessGroup,
50
+ ProcessGroupGloo as BaseProcessGroupGloo,
51
+ Store,
52
+ TCPStore,
53
+ )
54
+ from torch.distributed.distributed_c10d import (
55
+ AllgatherOptions,
56
+ AllreduceCoalescedOptions,
57
+ AllreduceOptions,
58
+ AllToAllOptions,
59
+ BarrierOptions,
60
+ BroadcastOptions,
61
+ ReduceOp,
62
+ ReduceScatterOptions,
63
+ Work,
64
+ )
65
+ from torch.futures import Future
66
+ from torch.utils._pytree import tree_any
67
+
68
+ # We import these for backwards compatibility
69
+ from torchft.futures import context_timeout, stream_timeout
70
+ from torchft.multiprocessing import _MonitoredPipe
71
+ from torchft.utils import get_stream_context, record_event, synchronize
72
+ from torchft.work import _DummyWork
73
+
74
+ if TYPE_CHECKING:
75
+ from torchft.manager import Manager
76
+
77
+ logger: logging.Logger = logging.getLogger(__name__)
78
+
79
+ # TODO: use non strings which are cheaper
80
+ _QUEUE_CLOSE = "queue_close"
81
+ _FUTURE_RESULT = "fut_result"
82
+ _FUTURE_EXCEPTION = "fut_exception"
83
+
84
+
85
+ T = TypeVar("T")
86
+
87
+ TORCH_NCCL_DEBUG_INFO_PIPE_FILE_ENV_VAR = "TORCH_NCCL_DEBUG_INFO_PIPE_FILE"
88
+ # Used to trigger flight recorder if we trigger abort on the process group
89
+ TORCHFT_TRIGGER_FR_ON_ABORT = "TORCHFT_TRIGGER_FR_ON_ABORT"
90
+
91
+
92
+ def trigger_nccl_fr_trace_through_pipe(rank: int) -> bool:
93
+ """Collect NCCL flight recorder trace through the pipe."""
94
+ dump_file_prefix = os.environ.get(TORCH_NCCL_DEBUG_INFO_PIPE_FILE_ENV_VAR, "")
95
+ if not dump_file_prefix:
96
+ logging.info(
97
+ f"[rank {rank}] Triggering FR trace dump through pipe failed: pipe is not enabled."
98
+ )
99
+ return False
100
+ pipe_name = f"{dump_file_prefix}{rank}.pipe"
101
+ with open(pipe_name, "w") as f:
102
+ # Trigger fr trace dump through pipe
103
+ logging.info(f"[rank {rank}] Triggering FR trace dump through pipe...")
104
+ f.write("1\n")
105
+ time.sleep(60)
106
+ return True
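Both environment variables used above come from this module's constants; a sketch of how a launcher might enable the dump path before workers start (the /tmp prefix is an arbitrary example, and it assumes the PyTorch NCCL dump pipe is enabled in this build; the helper writes to "{prefix}{rank}.pipe"):

    import os

    # Trigger a flight recorder dump whenever the process group is aborted ...
    os.environ["TORCHFT_TRIGGER_FR_ON_ABORT"] = "true"
    # ... using a named-pipe prefix that this helper and torch's NCCL dump pipe agree on.
    os.environ["TORCH_NCCL_DEBUG_INFO_PIPE_FILE"] = "/tmp/nccl_fr_"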
107
+
108
+
109
+ def create_store_client(store_addr: str, timeout: timedelta) -> Store:
110
+ """
111
+ Creates a PrefixStore(TCPStore(...)) client from an address in the format:
112
+
113
+ host:port/prefix
114
+
115
+ Ex: localhost:1234/my/prefix
116
+ """
117
+ host, _, rest = store_addr.partition(":")
118
+ port, _, prefix = rest.partition("/")
119
+
120
+ store = TCPStore(
121
+ host_name=host,
122
+ port=int(port),
123
+ is_master=False,
124
+ wait_for_workers=False,
125
+ timeout=timeout,
126
+ )
127
+ store = PrefixStore(prefix, store)
128
+ return store
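For example (a sketch; it assumes a TCPStore server is already running on localhost:1234, since the client is created with is_master=False):

    from datetime import timedelta

    store = create_store_client("localhost:1234/my/prefix", timeout=timedelta(seconds=30))
    store.set("status", "ready")            # key is stored under the "my/prefix" prefix
    assert store.get("status") == b"ready"  # TCPStore returns bytes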
129
+
130
+
131
+ class ProcessGroup(BaseProcessGroup):
132
+ def __init__(self, *args: object, **kwargs: object) -> None:
133
+ # pyre-fixme[6]: got object
134
+ super().__init__(*args, **kwargs)
135
+
136
+ self._group_name: Optional[str] = None
137
+
138
+ # pyre-fixme[14]: inconsistent override
139
+ def allgather(
140
+ self,
141
+ output_tensors: List[List[torch.Tensor]],
142
+ input_tensor: List[torch.Tensor],
143
+ opts: AllgatherOptions,
144
+ ) -> Work:
145
+ """
146
+ Gathers tensors from the whole group in a list.
147
+
148
+ See torch.distributed.all_gather for more details.
149
+ """
150
+ raise NotImplementedError("not implemented")
151
+
152
+ # pyre-fixme[14]: inconsistent override
153
+ def allgather_into_tensor_coalesced(
154
+ self,
155
+ output_tensors: List[torch.Tensor],
156
+ input_tensors: List[torch.Tensor],
157
+ opts: AllgatherOptions,
158
+ ) -> Work:
159
+ """
160
+ Performs an allgather operation on coalesced tensors.
161
+
162
+ See torch.distributed.allgather_coalesced for more details.
163
+ """
164
+ raise NotImplementedError("not implemented")
165
+
166
+ # pyre-fixme[14]: inconsistent override
167
+ def allreduce(
168
+ self,
169
+ tensors: List[torch.Tensor],
170
+ opts: Union[AllreduceOptions, ReduceOp],
171
+ ) -> Work:
172
+ """
173
+ Reduces the tensor data across all machines in such a way that all get the final result.
174
+
175
+ See torch.distributed.all_reduce for more details.
176
+ """
177
+ raise NotImplementedError("not implemented")
178
+
179
+ def allreduce_coalesced(
180
+ self,
181
+ tensors: List[torch.Tensor],
182
+ opts: AllreduceCoalescedOptions,
183
+ ) -> Work:
184
+ """
185
+ Performs an all_reduce operation in a coalesced manner.
186
+
187
+ See torch.distributed.all_reduce_coalesced for more details.
188
+ """
189
+ raise NotImplementedError("not implemented")
190
+
191
+ # pyre-fixme[14]: inconsistent override
192
+ def alltoall_base(
193
+ self,
194
+ output_buffer: torch.Tensor,
195
+ input_buffer: torch.Tensor,
196
+ output_split_sizes: List[int],
197
+ input_split_sizes: List[int],
198
+ opts: AllToAllOptions,
199
+ ) -> Work:
200
+ """
201
+ Performs an all_to_all operation.
202
+
203
+ See torch.distributed.all_to_all_single for more details.
204
+ """
205
+ raise NotImplementedError("not implemented")
206
+
207
+ # pyre-fixme[14]: inconsistent override
208
+ def barrier(self, opts: BarrierOptions) -> Work:
209
+ """
210
+ Synchronizes all processes.
211
+
212
+ See torch.distributed.barrier for more details.
213
+ """
214
+ raise NotImplementedError("not implemented")
215
+
216
+ # pyre-fixme[14]: inconsistent override
217
+ def broadcast(
218
+ self, tensor_list: List[torch.Tensor], opts: BroadcastOptions
219
+ ) -> Work:
220
+ """
221
+ Broadcasts the tensor to the whole group.
222
+
223
+ See torch.distributed.broadcast for more details.
224
+ """
225
+ raise NotImplementedError("not implemented")
226
+
227
+ def broadcast_one(self, tensor: torch.Tensor, root: int) -> Work:
228
+ opts = BroadcastOptions()
229
+ opts.rootRank = root
230
+ return self.broadcast([tensor], opts)
231
+
232
+ # pyre-fixme[14]: inconsistent override
233
+ def recv(self, tensors: List[torch.Tensor], src_rank: int, tag: int) -> Work:
234
+ """
235
+ Receives a list of tensors from the process with rank `src_rank`.
236
+
237
+ See torch.distributed.recv for more details.
238
+ """
239
+ raise NotImplementedError("not implemented")
240
+
241
+ # pyre-fixme[14]: inconsistent override
242
+ def reduce_scatter(
243
+ self,
244
+ output_tensors: List[torch.Tensor],
245
+ input_tensors: List[List[torch.Tensor]],
246
+ opts: ReduceScatterOptions,
247
+ ) -> Work:
248
+ """
249
+ Reduces, then scatters a list of tensors to all processes in a group.
250
+
251
+ See torch.distributed.reduce_scatter for more details.
252
+ """
253
+ raise NotImplementedError("not implemented")
254
+
255
+ # pyre-fixme[14]: inconsistent override
256
+ def reduce_scatter_tensor_coalesced(
257
+ self,
258
+ output_tensors: List[torch.Tensor],
259
+ input_tensors: List[torch.Tensor],
260
+ opts: ReduceScatterOptions,
261
+ ) -> Work:
262
+ """
263
+ Performs a reduce-scatter operation on coalesced tensors.
264
+
265
+ See torch.distributed.reduce_scatter_tensor for more details.
266
+ """
267
+ raise NotImplementedError("not implemented")
268
+
269
+ # pyre-fixme[14]: inconsistent override
270
+ def send(self, tensors: List[torch.Tensor], dst_rank: int, tag: int) -> Work:
271
+ """
272
+ Sends a list of tensors to the process with rank `dst_rank`.
273
+
274
+ See torch.distributed.send for more details.
275
+ """
276
+ raise NotImplementedError("not implemented")
277
+
278
+ def configure(
279
+ self,
280
+ store_addr: str,
281
+ replica_id: str,
282
+ rank: int,
283
+ world_size: int,
284
+ quorum_id: Optional[int] = None,
285
+ group_rank: Optional[int] = None,
286
+ group_world_size: Optional[int] = None,
287
+ global_ranks: Optional[list[int]] = None,
288
+ ) -> None:
289
+ """
290
+ This reconfigures the ProcessGroup to use a new store, rank and world size.
291
+
292
+ Every time this is called it must be provided with a unique prefixed
293
+ store address. I.e. localhost:1234/my/prefix/1
294
+
295
+ This function will block until the underlying ProcessGroup is created.
296
+ If an error occurs this will throw.
297
+
298
+ Args:
299
+ store_addr: address of the store to use
300
+ replica_id: the replica_id for this group
301
+ rank: rank of this process
302
+ world_size: world size of this process group
303
+ quorum_id: current quorum's identifier
304
+ group_rank: local rank within the replica group
305
+ group_world_size: the number of ranks within a replica
306
+ global_ranks: the global ranks part of this process group
307
+ """
308
+ raise NotImplementedError("not implemented")
309
+
310
+ def size(self) -> int:
311
+ raise NotImplementedError("not implemented")
312
+
313
+ def getBackendName(self) -> str:
314
+ raise NotImplementedError("not implemented")
315
+
316
+ def _register(self, name: str) -> str:
317
+ group_name = f"{self.getBackendName()}:{name}"
318
+
319
+ # This is needed for DeviceMesh and functional collectives to work.
320
+ # Resizable worlds don't fit well into DeviceMesh so we register a world
321
+ # size 1 PG.
322
+
323
+ def create_pg(
324
+ prefix_store: PrefixStore, rank: int, world_size: int, timeout: float
325
+ ) -> ProcessGroup:
326
+ return self
327
+
328
+ devices = ["cpu"]
329
+ if torch.cuda.is_available():
330
+ devices.append("cuda")
331
+ elif torch.xpu.is_available():
332
+ devices.append("xpu")
333
+ dist.Backend.register_backend(group_name, create_pg, devices=devices)
334
+
335
+ return group_name
336
+
337
+ def register(self, name: str) -> "ProcessGroup":
338
+ """
339
+ Registers the process group with the global registry. This enables usage
340
+ with things like functional_collectives which are compilable.
341
+
342
+ This should only be called once.
343
+
344
+ Args:
345
+ name: name must be a unique name for this process group
346
+ """
347
+
348
+ group_name = self._register(name)
349
+
350
+ return dist.new_group(
351
+ ranks=[dist.get_rank()],
352
+ backend=group_name,
353
+ group_desc=group_name,
354
+ timeout=timedelta(seconds=60.0), # this timeout isn't used
355
+ )
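A sketch of how register() is typically combined with functional collectives (it assumes a default process group is already initialized in this process, since dist.get_rank() and dist.new_group() are called under the hood; names are placeholders):

    import torch
    import torch.distributed._functional_collectives as ft_c

    pg = ProcessGroupGloo()
    pg.configure("localhost:29500/torchft/quorum_0", "replica_0", rank=0, world_size=2)

    # Register once; the returned world-size-1 group is accepted by
    # functional collectives and DeviceMesh.
    registered = pg.register("train_pg")

    t = torch.ones(4)
    t = ft_c.all_reduce(t, "sum", registered)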
356
+
357
+ @property
358
+ def group_name(self) -> str:
359
+ if self._group_name is None:
360
+ raise ValueError("ProcessGroup name not set")
361
+ return self._group_name
362
+
363
+ def _set_group_name(self, name: str) -> None:
364
+ self._group_name = name
365
+
366
+ def unregister(self) -> None:
367
+ """
368
+ Unregisters the process group with the global registry.
369
+
370
+ Must be registered first.
371
+ """
372
+ dist.destroy_process_group(self)
373
+
374
+ def abort(self) -> None:
375
+ """
376
+ Aborts the process group.
377
+ """
378
+ pass
379
+
380
+ def shutdown(self) -> None:
381
+ """
382
+ Shuts down the process group.
383
+ """
384
+ pass
385
+
386
+ def errored(self) -> Optional[Exception]:
387
+ """
388
+ Whether an async error occurred that requires reconfiguration.
389
+ """
390
+ return None
391
+
392
+ def set_timeout(self, timeout: timedelta) -> None:
393
+ """
394
+ Sets the default timeout for the process group.
395
+ """
396
+ raise NotImplementedError("set_timeout not implemented")
397
+
398
+ def __repr__(self) -> str:
399
+ return f"{self.__class__.__name__}()"
400
+
401
+
402
+ class ProcessGroupWrapper(ProcessGroup):
403
+ """
404
+ This is a wrapper around any ProcessGroup with a reconfiguration method.
405
+
406
+ Args:
407
+ timeout: timeout for reconfiguration for TCPStore
408
+ pg: optional ProcessGroup to use, if None a new one will be created
409
+ """
410
+
411
+ def __init__(
412
+ self,
413
+ timeout: timedelta = timedelta(seconds=60),
414
+ pg: Optional[ProcessGroup] = None,
415
+ ) -> None:
416
+ super().__init__(0, 1)
417
+ self._pg: Optional[BaseProcessGroup] = pg
418
+ self._timeout = timeout
419
+ self._replica_id: str | None = None
420
+ self._rank: int | None = None
421
+ self._quorum_id: int | None = None
422
+ self._group_rank: int | None = None
423
+ self._group_world_size: int | None = None
424
+ self._global_ranks: list[int] | None = None
425
+
426
+ self.errors_logger: logging.Logger = logging.getLogger("torchft_errors")
427
+
428
+ def getBackendName(self) -> str:
429
+ pg = self._pg
430
+ if isinstance(pg, ProcessGroup):
431
+ return pg.getBackendName()
432
+
433
+ raise NotImplementedError("not implemented")
434
+
435
+ def configure(
436
+ self,
437
+ store_addr: str,
438
+ replica_id: str,
439
+ rank: int,
440
+ world_size: int,
441
+ quorum_id: Optional[int] = None,
442
+ group_rank: Optional[int] = None,
443
+ group_world_size: Optional[int] = None,
444
+ global_ranks: Optional[list[int]] = None,
445
+ ) -> None:
446
+ pg = self._pg
447
+ self._replica_id = replica_id
448
+ self._quorum_id = quorum_id
449
+ self._group_rank = group_rank
450
+ self._group_world_size = group_world_size
451
+ self._rank = rank
452
+ self._global_ranks = global_ranks
453
+ if isinstance(pg, ProcessGroup):
454
+ pg.configure(
455
+ store_addr,
456
+ replica_id,
457
+ rank,
458
+ world_size,
459
+ quorum_id,
460
+ group_rank,
461
+ group_world_size,
462
+ global_ranks,
463
+ )
464
+ return
465
+
466
+ # abort if already initialized
467
+ self.abort(errored=False)
468
+
469
+ store = create_store_client(store_addr, timeout=self._timeout)
470
+
471
+ self._pg = self._create_pg(store, rank, world_size)
472
+
473
+ def abort(self, errored: bool = True) -> None:
474
+ if errored:
475
+ self.errors_logger.info(
476
+ "",
477
+ extra={
478
+ "job_id": os.environ.get("JOB_ID", "unknown"),
479
+ "replica_id": self._replica_id,
480
+ "rank": self._rank,
481
+ "quorum_id": self._quorum_id,
482
+ "error": "process_group_abort",
483
+ },
484
+ )
485
+ pg = self._pg
486
+ if pg is not None:
487
+ if hasattr(pg, "abort"):
488
+ pg.abort()
489
+ else:
490
+ backend = None
491
+ try:
492
+ if torch.cuda.is_available():
493
+ backend = pg._get_backend(torch.device("cuda"))
494
+ elif torch.xpu.is_available():
495
+ backend = pg._get_backend(torch.device("xpu"))
496
+ except RuntimeError:
497
+ backend = None
498
+ if backend is not None and hasattr(backend, "abort"):
499
+ backend.abort()
500
+
501
+ self._pg = None
502
+
503
+ def shutdown(self) -> None:
504
+ # TODO: abort PG if possible
505
+ self._pg = None
506
+
507
+ def _create_pg(self, store: Store, rank: int, world_size: int) -> BaseProcessGroup:
508
+ raise NotImplementedError("not implemented")
509
+
510
+ def _wrap_work(self, work: Work, opts: object) -> Work:
511
+ return work
512
+
513
+ def _opts_hook(self, opts: T) -> T:
514
+ return opts
515
+
516
+ @contextmanager
517
+ def _run_context(self) -> Generator[None, None, None]:
518
+ yield
519
+
520
+ def set_timeout(self, timeout: timedelta) -> None:
521
+ self._timeout = timeout
522
+
523
+ def allgather(
524
+ self,
525
+ output_tensors: List[List[torch.Tensor]],
526
+ input_tensor: List[torch.Tensor],
527
+ opts: AllgatherOptions,
528
+ ) -> Work:
529
+ with self._run_context():
530
+ return self._wrap_work(
531
+ self.parent.allgather(
532
+ output_tensors, input_tensor, self._opts_hook(opts)
533
+ ),
534
+ opts,
535
+ )
536
+
537
+ def allgather_into_tensor_coalesced(
538
+ self,
539
+ output_tensors: List[torch.Tensor],
540
+ input_tensors: List[torch.Tensor],
541
+ opts: AllgatherOptions,
542
+ ) -> Work:
543
+ with self._run_context():
544
+ return self._wrap_work(
545
+ self.parent.allgather_into_tensor_coalesced(
546
+ output_tensors, input_tensors, self._opts_hook(opts)
547
+ ),
548
+ opts,
549
+ )
550
+
551
+ def allreduce(self, tensors: List[torch.Tensor], opts: object) -> Work:
552
+ with self._run_context():
553
+ return self._wrap_work(
554
+ self.parent.allreduce(tensors, self._opts_hook(opts)), opts
555
+ )
556
+
557
+ def allreduce_coalesced(
558
+ self, tensors: List[torch.Tensor], opts: Union[AllreduceOptions, ReduceOp]
559
+ ) -> Work:
560
+ with self._run_context():
561
+ return self._wrap_work(
562
+ self.parent.allreduce_coalesced(tensors, self._opts_hook(opts)), opts
563
+ )
564
+
565
+ def alltoall_base(
566
+ self,
567
+ output_buffer: torch.Tensor,
568
+ input_buffer: torch.Tensor,
569
+ output_split_sizes: List[int],
570
+ input_split_sizes: List[int],
571
+ opts: AllToAllOptions,
572
+ ) -> Work:
573
+ with self._run_context():
574
+ return self._wrap_work(
575
+ self.parent.alltoall_base(
576
+ output_buffer,
577
+ input_buffer,
578
+ output_split_sizes,
579
+ input_split_sizes,
580
+ self._opts_hook(opts),
581
+ ),
582
+ opts,
583
+ )
584
+
585
+ def barrier(self, opts: Optional[BarrierOptions] = None) -> Work:
586
+ with self._run_context():
587
+ return self._wrap_work(self.parent.barrier(self._opts_hook(opts)), opts)
588
+
589
+ def broadcast(self, tensor_list: List[torch.Tensor], opts: object) -> Work:
590
+ with self._run_context():
591
+ return self._wrap_work(
592
+ self.parent.broadcast(tensor_list, self._opts_hook(opts)), opts
593
+ )
594
+
595
+ def recv(self, tensors: List[torch.Tensor], src_rank: int, tag: int) -> Work:
596
+ with self._run_context():
597
+ return self._wrap_work(self.parent.recv(tensors, src_rank, tag), None)
598
+
599
+ def reduce_scatter(
600
+ self,
601
+ output_tensors: List[torch.Tensor],
602
+ input_tensors: List[List[torch.Tensor]],
603
+ opts: object,
604
+ ) -> Work:
605
+ with self._run_context():
606
+ return self._wrap_work(
607
+ self.parent.reduce_scatter(
608
+ output_tensors, input_tensors, self._opts_hook(opts)
609
+ ),
610
+ opts,
611
+ )
612
+
613
+ def reduce_scatter_tensor_coalesced(
614
+ self,
615
+ output_tensors: List[torch.Tensor],
616
+ input_tensors: List[torch.Tensor],
617
+ opts: ReduceScatterOptions,
618
+ ) -> Work:
619
+ with self._run_context():
620
+ return self._wrap_work(
621
+ self.parent.reduce_scatter_tensor_coalesced(
622
+ output_tensors, input_tensors, self._opts_hook(opts)
623
+ ),
624
+ opts,
625
+ )
626
+
627
+ def send(self, tensors: List[torch.Tensor], dst_rank: int, tag: int) -> Work:
628
+ with self._run_context():
629
+ return self._wrap_work(self.parent.send(tensors, dst_rank, tag), None)
630
+
631
+ def size(self) -> int:
632
+ return self.parent.size()
633
+
634
+ @property
635
+ def parent(self) -> BaseProcessGroup:
636
+ assert self._pg is not None, "process group not initialized"
637
+ return self._pg
638
+
639
+ def __repr__(self) -> str:
640
+ return f"{self.__class__.__name__}(pg={self._pg})"
641
+
642
+
643
+ class ProcessGroupGloo(ProcessGroupWrapper):
644
+ """
645
+ This is a reconfigurable version of ProcessGroupGloo.
646
+ """
647
+
648
+ def _create_pg(self, store: Store, rank: int, world_size: int) -> BaseProcessGroup:
649
+ pg = BaseProcessGroup(store, rank, world_size)
650
+ pg._set_default_backend(ProcessGroup.BackendType.GLOO)
651
+ # pyre-fixme[16]: no attribute ProcessGroupGloo
652
+ backend_class = BaseProcessGroupGloo(store, rank, world_size, self._timeout)
653
+ backend_class._set_sequence_number_for_group()
654
+
655
+ if self._global_ranks:
656
+ backend_class.options.global_ranks_in_group = self._global_ranks
657
+ if self._group_rank and self._group_world_size:
658
+ backend_class.options.group_name = f"torchft_quorum_{self._quorum_id}_rank_{self._group_rank % self._group_world_size}"
659
+
660
+ pg._register_backend(
661
+ torch.device("cpu"), ProcessGroup.BackendType.GLOO, backend_class
662
+ )
663
+ if torch.cuda.is_available():
664
+ pg._register_backend(
665
+ torch.device("cuda"), ProcessGroup.BackendType.GLOO, backend_class
666
+ )
667
+ return pg
668
+
669
+ def getBackendName(self) -> str:
670
+ return "torchft-gloo"
671
+
672
+ # pyre-fixme[14,15]: inconsistent override
673
+ def reduce_scatter(
674
+ self,
675
+ output_tensors: List[torch.Tensor],
676
+ input_tensors: List[List[torch.Tensor]],
677
+ opts: ReduceScatterOptions,
678
+ ) -> None:
679
+ """
680
+ This function is a placeholder for the reduce_scatter operation in the
681
+ ProcessGroupGloo class. However, this operation is not supported by the
682
+ Gloo backend, and thus, calling this function will raise a
683
+ RuntimeError.
684
+
685
+ Raises:
686
+ RuntimeError: Always raised since reduce_scatter is not
687
+ supported by ProcessGroupGloo.
688
+ """
689
+ raise RuntimeError("ProcessGroupGloo does not support reduce_scatter.")
690
+
691
+ # pyre-fixme[15]: inconsistent override
692
+ def reduce_scatter_tensor_coalesced(
693
+ self,
694
+ output_tensors: List[torch.Tensor],
695
+ input_tensors: List[torch.Tensor],
696
+ opts: ReduceScatterOptions,
697
+ ) -> None:
698
+ """
699
+ This function is a placeholder for the reduce_scatter_tensor_coalesced
700
+ operation in the ProcessGroupGloo class.
701
+ However, this operation is not supported by the
702
+ Gloo backend, and thus, calling this function will raise a
703
+ RuntimeError.
704
+
705
+ Raises:
706
+ RuntimeError: Always raised since reduce_scatter_tensor_coalesced is not
707
+ supported by ProcessGroupGloo.
708
+ """
709
+ raise RuntimeError(
710
+ "ProcessGroupGloo does not support reduce_scatter_tensor_coalesced."
711
+ )
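A sketch of the reconfiguration cycle for this Gloo wrapper across two successive quorums (host, port, and prefix naming are illustrative):

    from datetime import timedelta

    pg = ProcessGroupGloo(timeout=timedelta(seconds=30))

    for quorum_id, world_size in [(0, 4), (1, 3)]:
        # A fresh, unique store prefix per configure() call avoids reusing
        # stale keys from the previous quorum.
        pg.configure(
            store_addr=f"localhost:29500/torchft/quorum_{quorum_id}",
            replica_id="replica_0",
            rank=0,
            world_size=world_size,
            quorum_id=quorum_id,
        )
        # ... run collectives for this quorum ...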
712
+
713
+
714
+ class _WorkAcceleratorTimeout(Work):
715
+ def __init__(self, pg: ProcessGroup, work: Work, timeout: timedelta) -> None:
716
+ super().__init__()
717
+ self._pg = pg
718
+ self._work = work
719
+ self._timeout = timeout
720
+
721
+ def wait(self, timeout: Optional[timedelta] = None) -> bool:
722
+ async_timeout = timeout or self._timeout
723
+ with self._stream_timeout(self._pg, async_timeout):
724
+ # In newer versions of PyTorch work may not exist if the call was
725
+ # not async. In these cases we can just schedule the stream timeout
726
+ # and return.
727
+ if self._work is not None:
728
+ if not self._work.wait():
729
+ return False
730
+
731
+ # Always use cuda stream for timeout to avoid ProcessGroupNCCL
732
+ # watchdog firing and crashing the process.
733
+ if timeout is not None:
734
+ torch.cuda.synchronize()
735
+
736
+ return True
737
+
738
+ @classmethod
739
+ @contextmanager
740
+ def _stream_timeout(
741
+ cls, pg: ProcessGroup, timeout: timedelta
742
+ ) -> Generator[None, None, None]:
743
+ """
744
+ Set a timeout on the CUDA stream for the given process group.
745
+
746
+ This does not hold a reference to self to avoid holding the work
747
+ object/tensors longer than necessary.
748
+
749
+ Args:
750
+ pg: The process group to call abort on.
751
+ timeout: The timeout to set on the CUDA stream.
752
+ """
753
+
754
+ def callback() -> None:
755
+ logger.error(f"aborting after {timeout}!")
756
+ pg.abort()
757
+
758
+ # make sure .wait() can be cancelled if it blocks i.e. in barrier
759
+ with context_timeout(callback, timeout):
760
+ yield
761
+
762
+ # Cancel work if the cuda stream doesn't complete
763
+ stream_timeout(callback, timeout)
764
+
765
+ def get_future(self) -> torch.futures.Future[object]:
766
+ fut = self._work.get_future()
767
+
768
+ def done_callback(fut: torch.futures.Future[object]) -> None:
769
+ try:
770
+ with self._stream_timeout(self._pg, self._timeout):
771
+ fut.wait()
772
+
773
+ except Exception as e:
774
+ logger.error(f"done callback failed: {e}")
775
+
776
+ fut.add_done_callback(done_callback)
777
+ return fut
778
+
779
+
780
+ class ProcessGroupNCCL(ProcessGroupWrapper):
781
+ """
782
+ This is a reconfigurable version of ProcessGroupNCCL.
783
+
784
+ If you are using a supported version of NCCL (NCCL >= 2.26, torch >= 2.7)
785
+ this will attempt to use ncclCommAbort to recover from any timeouts.
786
+
787
+ This uses a Python user space event loop to asynchronously wait for the NCCL
788
+ operations to complete. This should not be used with very long timeouts as
789
+ the timeout entries are not cleaned up until the elapsed duration completes
790
+ which may result in slowness or excess memory usage.
791
+
792
+ WARNING: this may result in deadlocks due to NCCL error handling and on old
793
+ versions of torch/NCCL will result in deadlocks.
794
+
795
+ Args:
796
+ timeout: the timeout to use for NCCL operations.
797
+ """
798
+
799
+ def __init__(self, timeout: timedelta = timedelta(seconds=60.0)) -> None:
800
+ super().__init__(timeout)
801
+ self._use_abort: bool = torch.cuda.nccl.version() >= (2, 25)
802
+
803
+ self._errored: Optional[Exception] = None
804
+
805
+ NONBLOCKING_TIMEOUT_ENV = "TORCH_NCCL_NONBLOCKING_TIMEOUT"
806
+ if NONBLOCKING_TIMEOUT_ENV not in os.environ:
807
+ warnings.warn(
808
+ f"{NONBLOCKING_TIMEOUT_ENV} is not set, defaulting to {timeout}. "
809
+ "If any nonblocking NCCL operations have already run this may "
810
+ "result in the default timeout of 30 minutes and hangs on error."
811
+ )
812
+ os.environ[NONBLOCKING_TIMEOUT_ENV] = str(timeout.total_seconds())
813
+
814
+ def _opts_hook(self, opts: T) -> T:
815
+ if not self._use_abort:
816
+ return opts
817
+
818
+ # We need to clear the timeout to apply our own timeout that doesn't
819
+ # crash the whole program.
820
+ if hasattr(opts, "timeout"):
821
+ # apply default timeout to disable
822
+ opts.timeout = AllgatherOptions().timeout
823
+ return opts
824
+
825
+ def _wrap_work(self, work: Work, opts: object) -> Work:
826
+ if not self._use_abort:
827
+ return work
828
+
829
+ timeout = self._timeout
830
+ # pyre-fixme[16]: no attribute timeout
831
+ if hasattr(opts, "timeout") and opts.timeout.total_seconds() > 0:
832
+ timeout = opts.timeout
833
+ return _WorkAcceleratorTimeout(self, work, timeout)
834
+
835
+ @contextmanager
836
+ def _run_context(self) -> Generator[None, None, None]:
837
+ timeout: timedelta = self._timeout
838
+
839
+ def callback() -> None:
840
+ logger.error(f"aborting after {timeout}!")
841
+ self.abort()
842
+
843
+ # when running in blocking mode we need to make sure collectives can
844
+ # timeout
845
+ with context_timeout(callback, timeout):
846
+ yield
847
+
848
+ def _create_pg(self, store: Store, rank: int, world_size: int) -> BaseProcessGroup:
849
+ # pyre-fixme[21]: no attribute ProcessGroupNCCL
850
+ from torch.distributed import ProcessGroupNCCL as BaseProcessGroupNCCL
851
+
852
+ self._errored = None
853
+
854
+ # pyre-fixme[16]: no attribute ProcessGroupNCCL
855
+ opts = BaseProcessGroupNCCL.Options()
856
+ opts.config.blocking = False
857
+ if self._global_ranks:
858
+ opts.global_ranks_in_group = self._global_ranks
859
+ if self._group_rank and self._group_world_size:
860
+ opts.group_name = f"torchft_quorum_{self._quorum_id}_rank_{self._group_rank % self._group_world_size}"
861
+
862
+ pg = BaseProcessGroup(store, rank, world_size)
863
+ pg._set_default_backend(ProcessGroup.BackendType.NCCL)
864
+ # pyre-fixme[16]: no attribute ProcessGroupNCCL
865
+ backend_class = BaseProcessGroupNCCL(store, rank, world_size, opts)
866
+ backend_class._set_sequence_number_for_group()
867
+ backend_class.eager_connect_single_device(
868
+ torch.device(torch.accelerator.current_device_index())
869
+ )
870
+ pg._register_backend(
871
+ torch.device("cuda"), ProcessGroup.BackendType.NCCL, backend_class
872
+ )
873
+ return pg
874
+
875
+ def abort(self, errored: bool = True) -> None:
876
+ # We need to set the error before aborting to ensure that errored()
877
+ # returns the error correctly when NCCL abort fires and unblocks the
878
+ # stream.
879
+ if os.environ.get("TORCHFT_TRIGGER_FR_ON_ABORT", "false") == "true":
880
+ trigger_nccl_fr_trace_through_pipe(dist.get_rank())
881
+ self._errored = RuntimeError("aborted")
882
+
883
+ super().abort(errored=errored)
884
+
885
+ def errored(self) -> Optional[Exception]:
886
+ # force a synchronization to ensure all work is complete
887
+ synchronize()
888
+ return self._errored
889
+
890
+ def getBackendName(self) -> str:
891
+ return "torchft-nccl"
892
+
893
+
894
+ class ProcessGroupXCCL(ProcessGroupWrapper):
895
+ """
896
+ This is a reconfigurable version of ProcessGroupXCCL for Intel XPU devices.
897
+
898
+ This process group is designed to work with Intel XPU devices using XCCL
899
+ (eXtended Collective Communication Library). It provides similar functionality
900
+ to ProcessGroupNCCL but optimized for Intel XPU architecture.
901
+
902
+ If you are using a supported version of XCCL, this will attempt to use
903
+ xccl abort mechanisms to recover from any timeouts.
904
+
905
+ This uses a Python user space event loop to asynchronously wait for the XCCL
906
+ operations to complete. This should not be used with very long timeouts as
907
+ the timeout entries are not cleaned up until the elapsed duration completes
908
+ which may result in slowness or excess memory usage.
909
+
910
+ Args:
911
+ timeout: the timeout to use for XCCL operations.
912
+ """
913
+
914
+ def __init__(self, timeout: timedelta = timedelta(seconds=60.0)) -> None:
915
+ super().__init__(timeout)
916
+ # Check if XPU is available and XCCL is supported
917
+ self._use_abort: bool = torch.xpu.is_available()
918
+
919
+ self._errored: Optional[Exception] = None
920
+
921
+ NONBLOCKING_TIMEOUT_ENV = "TORCH_XCCL_NONBLOCKING_TIMEOUT"
922
+ if NONBLOCKING_TIMEOUT_ENV not in os.environ:
923
+ warnings.warn(
924
+ f"{NONBLOCKING_TIMEOUT_ENV} is not set, defaulting to {timeout}. "
925
+ "If any nonblocking XCCL operations have already run this may "
926
+ "result in the default timeout of 30 minutes and hangs on error."
927
+ )
928
+ os.environ[NONBLOCKING_TIMEOUT_ENV] = str(timeout.total_seconds())
929
+
930
+ def _opts_hook(self, opts: T) -> T:
931
+ if not self._use_abort:
932
+ return opts
933
+
934
+ # We need to clear the timeout to apply our own timeout that doesn't
935
+ # crash the whole program.
936
+ if hasattr(opts, "timeout"):
937
+ # apply default timeout to disable
938
+ opts.timeout = AllgatherOptions().timeout
939
+ return opts
940
+
941
+ def _wrap_work(self, work: Work, opts: object) -> Work:
942
+ if not self._use_abort:
943
+ return work
944
+
945
+ timeout = self._timeout
946
+ # pyre-fixme[16]: no attribute timeout
947
+ if hasattr(opts, "timeout") and opts.timeout.total_seconds() > 0:
948
+ timeout = opts.timeout
949
+ return _WorkAcceleratorTimeout(self, work, timeout)
950
+
951
+ @contextmanager
952
+ def _run_context(self) -> Generator[None, None, None]:
953
+ timeout: timedelta = self._timeout
954
+
955
+ def callback() -> None:
956
+ logger.error(f"aborting after {timeout}!")
957
+ self.abort()
958
+
959
+ # when running in blocking mode we need to make sure collectives can
960
+ # timeout
961
+ with context_timeout(callback, timeout):
962
+ yield
963
+
964
+ def _create_pg(self, store: Store, rank: int, world_size: int) -> BaseProcessGroup:
965
+ # pyre-fixme[21]: no attribute ProcessGroupXCCL
966
+ from torch.distributed import ProcessGroupXCCL as BaseProcessGroupXCCL
967
+
968
+ self._errored = None
969
+
970
+ # pyre-fixme[16]: no attribute ProcessGroupXCCL
971
+ opts = BaseProcessGroupXCCL.Options()
972
+ # opts.config.blocking = False
973
+
974
+ pg = BaseProcessGroup(store, rank, world_size)
975
+ pg._set_default_backend(ProcessGroup.BackendType.XCCL)
976
+ # pyre-fixme[16]: no attribute ProcessGroupXCCL
977
+ backend_class = BaseProcessGroupXCCL(store, rank, world_size, opts)
978
+ backend_class._set_sequence_number_for_group()
979
+ backend_class.eager_connect_single_device(
980
+ torch.device(torch.accelerator.current_device_index())
981
+ )
982
+ pg._register_backend(
983
+ torch.device("xpu"), ProcessGroup.BackendType.XCCL, backend_class
984
+ )
985
+ return pg
986
+
987
+ def abort(self, errored: bool = True) -> None:
988
+ # We need to set the error before aborting to ensure that errored()
989
+ # returns the error correctly when XCCL abort fires and unblocks the
990
+ # stream.
991
+ self._errored = RuntimeError("aborted")
992
+
993
+ super().abort(errored)
994
+
995
+ def errored(self) -> Optional[Exception]:
996
+ # force a synchronization to ensure all work is complete
997
+ torch.xpu.current_stream().synchronize()
998
+
999
+ return self._errored
1000
+
1001
+ def getBackendName(self) -> str:
1002
+ return "torchft-xccl"
1003
+
1004
+
1005
+ class ProcessGroupDummy(ProcessGroup):
1006
+ """
1007
+ This process group discards all data passed to it and returns success. This
1008
+ is intended for rare cases where we want to discard certain operations
1009
+ without modifying the underlying library.
1010
+
1011
+ This PG only supports world_size of 1.
1012
+ """
1013
+
1014
+ def __init__(self, rank: int, world: int) -> None:
1015
+ super().__init__(rank, world)
1016
+ assert rank == 0
1017
+ assert world == 1
1018
+
1019
+ self._rank = rank
1020
+ self._world = world
1021
+ self.wait_count = 0
1022
+ self.get_future_count = 0
1023
+ self._work: List[Work] = []
1024
+ self.configure_count = 0
1025
+
1026
+ def configure(
1027
+ self,
1028
+ store_addr: str,
1029
+ replica_id: str,
1030
+ rank: int,
1031
+ world_size: int,
1032
+ quorum_id: Optional[int] = None,
1033
+ group_rank: Optional[int] = None,
1034
+ group_world_size: Optional[int] = None,
1035
+ global_ranks: Optional[list[int]] = None,
1036
+ ) -> None:
1037
+ self.configure_count += 1
1038
+
1039
+ def allgather(
1040
+ self,
1041
+ output_tensors: List[List[torch.Tensor]],
1042
+ input_tensor: List[torch.Tensor],
1043
+ opts: object,
1044
+ ) -> Work:
1045
+ for o, i in zip(output_tensors[0], input_tensor):
1046
+ o.copy_(i)
1047
+
1048
+ res = _DummyWork(output_tensors)
1049
+ self._work.append(res)
1050
+ return res
1051
+
1052
+ def allgather_into_tensor_coalesced(
1053
+ self,
1054
+ output_tensors: List[torch.Tensor],
1055
+ input_tensors: List[torch.Tensor],
1056
+ opts: AllgatherOptions,
1057
+ ) -> Work:
1058
+ for o, i in zip(output_tensors, input_tensors):
1059
+ o.copy_(i)
1060
+
1061
+ res = _DummyWork(output_tensors)
1062
+ self._work.append(res)
1063
+ return res
1064
+
1065
+ def allreduce(self, tensors: List[torch.Tensor], opts: object) -> Work:
1066
+ res = _DummyWork(tensors)
1067
+ self._work.append(res)
1068
+ return res
1069
+
1070
+ def allreduce_coalesced(
1071
+ self, tensors: List[torch.Tensor], opts: Union[AllreduceOptions, ReduceOp]
1072
+ ) -> Work:
1073
+ res = _DummyWork(tensors)
1074
+ self._work.append(res)
1075
+ return res
1076
+
1077
+ def alltoall_base(
1078
+ self,
1079
+ output_buffer: torch.Tensor,
1080
+ input_buffer: torch.Tensor,
1081
+ output_split_sizes: List[int],
1082
+ input_split_sizes: List[int],
1083
+ opts: AllToAllOptions,
1084
+ ) -> Work:
1085
+ output_buffer.copy_(input_buffer)
1086
+ res = _DummyWork([output_buffer])
1087
+ self._work.append(res)
1088
+ return res
1089
+
1090
+ def barrier(self, opts: Optional[BarrierOptions] = None) -> Work:
1091
+ return _DummyWork(None)
1092
+
1093
+ def broadcast(self, tensor_list: List[torch.Tensor], opts: object) -> Work:
1094
+ res = _DummyWork(tensor_list)
1095
+ self._work.append(res)
1096
+ return res
1097
+
1098
+ def recv(self, tensors: List[torch.Tensor], src_rank: int, tag: int) -> Work:
1099
+ return _DummyWork(None)
1100
+
1101
+ def reduce_scatter(
1102
+ self,
1103
+ output_tensors: List[torch.Tensor],
1104
+ input_tensors: List[List[torch.Tensor]],
1105
+ opts: object,
1106
+ ) -> Work:
1107
+ for o, i in zip(output_tensors, input_tensors[0]):
1108
+ o.copy_(i)
1109
+
1110
+ res = _DummyWork(output_tensors)
1111
+ self._work.append(res)
1112
+ return res
1113
+
1114
+ def reduce_scatter_tensor_coalesced(
1115
+ self,
1116
+ output_tensors: List[torch.Tensor],
1117
+ input_tensors: List[torch.Tensor],
1118
+ opts: ReduceScatterOptions,
1119
+ ) -> Work:
1120
+ for o, i in zip(output_tensors, input_tensors):
1121
+ o.copy_(i)
1122
+
1123
+ res = _DummyWork(output_tensors)
1124
+ self._work.append(res)
1125
+ return res
1126
+
1127
+ def send(self, tensors: List[torch.Tensor], dst_rank: int, tag: int) -> Work:
1128
+ return _DummyWork(None)
1129
+
1130
+ def size(self) -> int:
1131
+ return self._world
1132
+
1133
+ def getBackendName(self) -> str:
1134
+ return "torchft-dummy"
1135
+
1136
+
1137
+ class _ErrorSwallowingWork(Work):
1138
+ def __init__(
1139
+ self,
1140
+ pg: "ErrorSwallowingProcessGroupWrapper",
1141
+ work: Work,
1142
+ default_result: object,
1143
+ ) -> None:
1144
+ super().__init__()
1145
+
1146
+ self._pg = pg
1147
+ self._work = work
1148
+ self._default_result = default_result
1149
+
1150
+ def wait(self, timeout: Optional[timedelta] = None) -> bool:
1151
+ try:
1152
+ self._work.wait()
1153
+ except Exception as e:
1154
+ self._pg.report_error(e)
1155
+
1156
+ return True
1157
+
1158
+ def get_future(self) -> Future[object]:
1159
+ fut = self._work.get_future()
1160
+
1161
+ # schedule error handling as a continuation on the Future
1162
+ def callback(
1163
+ fut: torch.futures.Future[List[torch.Tensor]],
1164
+ ) -> object:
1165
+ try:
1166
+ return fut.value()
1167
+ except Exception as e:
1168
+ logger.exception(f"got exception in future -- skipping remaining: {e}")
1169
+ self._pg.report_error(e)
1170
+ return self._default_result
1171
+
1172
+ fut = fut.then(callback)
1173
+ return fut
1174
+
1175
+
1176
+ class ErrorSwallowingProcessGroupWrapper(ProcessGroupWrapper):
1177
+ """
1178
+ This is a wrapper around any ProcessGroup that will swallow errors and
1179
+ return dummy results on error.
1180
+
1181
+ This is intended to allow handling errors outside of the training loop to
1182
+ avoid having to modify modeling code to support error handling.
1183
+
1184
+ After an error occurs all future operations will be skipped until the
1185
+ process group is reconfigured via ``configure``.
1186
+ """
1187
+
1188
+ def __init__(self, pg: ProcessGroup) -> None:
1189
+ super().__init__(pg=pg)
1190
+
1191
+ self._error: Optional[Exception] = None
1192
+
1193
+ def configure(
1194
+ self,
1195
+ store_addr: str,
1196
+ replica_id: str,
1197
+ rank: int,
1198
+ world_size: int,
1199
+ quorum_id: Optional[int] = None,
1200
+ group_rank: Optional[int] = None,
1201
+ group_world_size: Optional[int] = None,
1202
+ global_ranks: Optional[list[int]] = None,
1203
+ ) -> None:
1204
+ self._error = None
1205
+
1206
+ super().configure(
1207
+ store_addr,
1208
+ replica_id,
1209
+ rank,
1210
+ world_size,
1211
+ quorum_id,
1212
+ group_rank,
1213
+ group_world_size,
1214
+ global_ranks,
1215
+ )
1216
+
1217
+ def report_error(self, e: Exception) -> None:
1218
+ """
1219
+ Report an error to this process group. This will cause all future
1220
+ operations to be skipped until the process group is reconfigured via
1221
+ ``configure``.
1222
+
1223
+ Args:
1224
+ e: exception to report
1225
+ """
1226
+ self._error = e
1227
+
1228
+ def error(self) -> Optional[Exception]:
1229
+ """
1230
+ Returns the error that was reported to this process group.
1231
+
1232
+ Returns:
1233
+ exception that was reported
1234
+ """
1235
+ return self._error
1236
+
1237
+ def allreduce(self, tensors: List[torch.Tensor], opts: object) -> Work:
1238
+ if self._error is not None:
1239
+ return _DummyWork(tensors)
1240
+
1241
+ try:
1242
+ return _ErrorSwallowingWork(
1243
+ self,
1244
+ super().allreduce(tensors, opts),
1245
+ tensors,
1246
+ )
1247
+ except Exception as e:
1248
+ self.report_error(e)
1249
+ return _DummyWork(tensors)
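A sketch of checking for swallowed errors outside the training step (addresses are placeholders):

    import torch
    from torch.distributed.distributed_c10d import AllreduceOptions

    pg = ErrorSwallowingProcessGroupWrapper(ProcessGroupGloo())
    pg.configure("localhost:29500/torchft/quorum_0", "replica_0", rank=0, world_size=2)

    pg.allreduce([torch.ones(4)], AllreduceOptions()).wait()  # never raises

    if pg.error() is not None:
        # The collective failed and a dummy result was returned; reconfigure
        # before trusting any further results.
        pass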
1250
+
1251
+
1252
+ class FakeProcessGroupWrapper(ProcessGroupWrapper):
1253
+ """
1254
+ This is a wrapper around any ProcessGroup that can be used to inject
1255
+ errors into the process group at various points.
1256
+
1257
+ This is intended to be used for tests so that they can test cases
1258
+ in which process group operations error out.
1259
+ """
1260
+
1261
+ def __init__(self, pg: ProcessGroup) -> None:
1262
+ super().__init__(pg=pg)
1263
+
1264
+ self._future_error: Optional[Exception] = None
1265
+
1266
+ def configure(
1267
+ self,
1268
+ store_addr: str,
1269
+ replica_id: str,
1270
+ rank: int,
1271
+ world_size: int,
1272
+ quorum_id: Optional[int] = None,
1273
+ group_rank: Optional[int] = None,
1274
+ group_world_size: Optional[int] = None,
1275
+ global_ranks: Optional[list[int]] = None,
1276
+ ) -> None:
1277
+ self._future_error = None
1278
+
1279
+ super().configure(
1280
+ store_addr,
1281
+ replica_id,
1282
+ rank,
1283
+ world_size,
1284
+ quorum_id,
1285
+ group_rank,
1286
+ group_world_size,
1287
+ global_ranks,
1288
+ )
1289
+
1290
+ def report_future_error(self, e: Exception) -> None:
1291
+ """
1292
+ Report an error to this process group. This will cause the
1293
+ future attached to the next operation to error out.
1294
+
1295
+ Args:
1296
+ e: exception to report
1297
+ """
1298
+ self._future_error = e
1299
+
1300
+ def allreduce(self, tensors: List[torch.Tensor], opts: object) -> Work:
1301
+ work = super().allreduce(tensors, opts)
1302
+
1303
+ if self._future_error is None:
1304
+ return work
1305
+
1306
+ fut = work.get_future()
1307
+
1308
+ def callback(
1309
+ fut: torch.futures.Future[List[torch.Tensor]],
1310
+ ) -> List[torch.Tensor]:
1311
+ future_error, self._future_error = self._future_error, None
1312
+ assert future_error is not None
1313
+ raise future_error
1314
+
1315
+ fut = fut.then(callback)
1316
+
1317
+ return work
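A sketch of how a test might use this wrapper to inject a failure into the future chain of the next collective (illustrative; the real tests live in torchft/process_group_test.py):

    import torch
    from torch.distributed.distributed_c10d import AllreduceOptions

    pg = FakeProcessGroupWrapper(ProcessGroupGloo())
    pg.configure("localhost:29500/torchft/test", "replica_0", rank=0, world_size=1)

    # The continuation attached to the next allreduce's future raises this error.
    pg.report_future_error(RuntimeError("injected failure"))
    work = pg.allreduce([torch.ones(2)], AllreduceOptions())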
1318
+
1319
+
1320
+ class ManagedProcessGroup(ProcessGroupWrapper):
1321
+ """
1322
+ This is a wrapper around any ProcessGroup that is managed by a torchft
1323
+ Manager.
1324
+
1325
+ This uses the ProcessGroup that is configured in the Manager. The world size
1326
+ is dynamic and will report the number of active participants in the quorum to
1327
+ the model.
1328
+
1329
+ Any errors will be asynchronously reported to the manager and only successes
1330
+ will be returned to the caller.
1331
+ """
1332
+
1333
+ def __init__(self, manager: "Manager") -> None:
1334
+ super().__init__(pg=manager._pg)
1335
+
1336
+ self._manager = manager
1337
+
1338
+ def allreduce(self, tensors: List[torch.Tensor], opts: object) -> Work:
1339
+ assert len(tensors) == 1
1340
+
1341
+ if isinstance(opts, ReduceOp):
1342
+ return self._manager.allreduce(tensors[0], reduce_op=opts)
1343
+
1344
+ if isinstance(opts, AllreduceOptions):
1345
+ return self._manager.allreduce(tensors[0], reduce_op=opts.reduceOp)
1346
+
1347
+ assert False, "unreachable"
1348
+
1349
+ def size(self) -> int:
1350
+ return self._manager.num_participants()
1351
+
1352
+ def getBackendName(self) -> str:
1353
+ return self._manager._pg.getBackendName()
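A sketch of handing the managed wrapper to model code, with the Manager owning quorum membership (Manager construction is omitted here; see torchft/manager.py):

    import torch
    from torch.distributed.distributed_c10d import AllreduceOptions
    from torchft.manager import Manager

    manager: Manager = ...  # constructed elsewhere with its own ProcessGroup
    pg = ManagedProcessGroup(manager)

    grad = torch.ones(10)
    pg.allreduce([grad], AllreduceOptions()).wait()  # routed through manager.allreduce
    live_world_size = pg.size()                      # number of participants in the quorum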
1354
+
1355
+
1356
+ class _BabyWork(Work):
1357
+ def __init__(
1358
+ self,
1359
+ pg: "ProcessGroupBaby",
1360
+ op_id: int,
1361
+ stream: Optional[torch.Stream],
1362
+ ) -> None:
1363
+ super().__init__()
1364
+
1365
+ self._pg = pg
1366
+ self._op_id = op_id
1367
+ self._stream = stream
1368
+
1369
+ def wait(self, timeout: Optional[timedelta] = None) -> bool:
1370
+ return self._pg._wait(self._op_id, timeout)
1371
+
1372
+ def synchronize(self) -> None:
1373
+ # TODO: No one seems to use this and NCCL wait already only waits the
1374
+ # stream and is non-blocking on the CPU side so no real need for a
1375
+ # separate call.
1376
+ raise NotImplementedError("not implemented")
1377
+
1378
+ def get_future(self) -> Future[object]:
1379
+ return self._pg._get_future(self._op_id, self._stream)
1380
+
1381
+ def __del__(self) -> None:
1382
+ self._pg._del(self._op_id)
1383
+
1384
+
1385
+ def _is_any_cuda(obj: object) -> bool:
1386
+ """
1387
+ Returns true if any of the tensors in the object are CUDA tensors.
1388
+
1389
+ Supports lists, tuples, dicts, and tensors.
1390
+ """
1391
+ return tree_any(lambda obj: isinstance(obj, torch.Tensor) and obj.is_cuda, obj)
1392
+
1393
+
1394
+ def _is_any_xpu(obj: object) -> bool:
1395
+ """
1396
+ Returns true if any of the tensors in the object are XPU tensors.
1397
+
1398
+ Supports lists, tuples, dicts, and tensors.
1399
+ """
1400
+ return tree_any(lambda obj: isinstance(obj, torch.Tensor) and obj.is_xpu, obj)
1401
+
1402
+
1403
+ @dataclass
1404
+ class _OpMetadata:
1405
+ work: Work
1406
+ stream: Optional[torch.Stream]
1407
+
1408
+ @contextmanager
1409
+ def set_stream(self) -> Generator[None, None, None]:
1410
+ with get_stream_context(self.stream):
1411
+ yield
1412
+
1413
+
1414
+ @dataclass
1415
+ class _FutureMetadata:
1416
+ future: Future[object]
1417
+ stream: Optional[torch.Stream]
1418
+
1419
+ @contextmanager
1420
+ def set_stream(self) -> Generator[None, None, None]:
1421
+ with get_stream_context(self.stream):
1422
+ yield
1423
+
1424
+
1425
+ def _maybe_share_tensors(
1426
+ tensor: Union[List[List[torch.Tensor]], List[torch.Tensor], torch.Tensor],
1427
+ ) -> None:
1428
+ """Move a tensor / list of tensors to shared memory if not already in shared memory."""
1429
+ if isinstance(tensor, list):
1430
+ for t in tensor:
1431
+ _maybe_share_tensors(t)
1432
+ elif isinstance(tensor, torch.Tensor):
1433
+ if not tensor.is_shared():
1434
+ tensor.share_memory_()
1435
+ else:
1436
+ raise TypeError(f"expected tensor or list but got {type(tensor)}")
1437
+
1438
+
1439
+ def _assert_list(tensors: Union[List[torch.Tensor], List[List[torch.Tensor]]]) -> None:
1440
+ """Assert that the input is a list of tensors or a nested list of tensors."""
1441
+ if not isinstance(tensors, list):
1442
+ raise TypeError(f"expected list but got {type(tensors)}")
1443
+
1444
+
1445
+ class ProcessGroupBaby(ProcessGroup):
1446
+ """
1447
+ This is a process group that runs the underlying process group in a
1448
+ subprocess. Since it's running in a subprocess all tensors need to be in
1449
+ shared memory or will be moved to shared memory. CUDA/XPU tensors are implicitly
1450
+ shareable and don't need any changes.
1451
+ """
1452
+
1453
+ def __init__(self, timeout: Union[float, timedelta] = 60.0) -> None:
1454
+ super().__init__(0, 1)
1455
+
1456
+ self._world_size = -1
1457
+
1458
+ self._p: Optional[mp.Process] = None
1459
+ self._pipe: Optional[_MonitoredPipe] = None
1460
+ self._future_pipe: Optional[_MonitoredPipe] = None
1461
+ self._future_thread: Optional[threading.Thread] = None
1462
+ self._futures: Dict[int, _FutureMetadata] = {}
1463
+ self._futures_lock = threading.Lock()
1464
+
1465
+ self._next_op_id = 0
1466
+
1467
+ if isinstance(timeout, timedelta):
1468
+ timeout = timeout.total_seconds()
1469
+
1470
+ self._timeout: float = timeout
1471
+
1472
+ def shutdown(self) -> None:
1473
+ """
1474
+ Shutdown the process group. This will kill the underlying process and
1475
+ close all queues.
1476
+
1477
+ This is a no-op if the process group is already shutdown.
1478
+
1479
+ ProcessGroup can be reconfigured after shutdown.
1480
+ """
1481
+
1482
+ if self._pipe is not None:
1483
+ self._pipe.close()
1484
+
1485
+ future_pipe = self._future_pipe
1486
+ if future_pipe is not None:
1487
+ # wait for the future thread to exit and then close the queue
1488
+ future_pipe.close()
1489
+
1490
+ future_thread = self._future_thread
1491
+ assert future_thread is not None
1492
+
1493
+ future_thread.join(timeout=10.0)
1494
+ if future_thread.is_alive():
1495
+ raise RuntimeError("future thread did not exit")
1496
+
1497
+ # Kill after closing queues to avoid log spam.
1498
+ if self._p is not None:
1499
+ self._p.kill()
1500
+
1501
+ def configure(
1502
+ self,
1503
+ store_addr: str,
1504
+ replica_id: str,
1505
+ rank: int,
1506
+ world_size: int,
1507
+ quorum_id: Optional[int] = None,
1508
+ group_rank: Optional[int] = None,
1509
+ group_world_size: Optional[int] = None,
1510
+ global_ranks: Optional[list[int]] = None,
1511
+ ) -> None:
1512
+ self._world_size = world_size
1513
+
1514
+ self.shutdown()
1515
+
1516
+ ctx = mp.get_context("spawn")
1517
+ req_local, req_remote = ctx.Pipe()
1518
+ future_local, future_remote = ctx.Pipe()
1519
+
1520
+ self._pipe = req_local = _MonitoredPipe(req_local)
1521
+ self._future_pipe = future_local = _MonitoredPipe(future_local)
1522
+
1523
+ curr_device = (
1524
+ torch.accelerator.current_device_index()
1525
+ if torch.accelerator.is_available()
1526
+ else -1
1527
+ )
1528
+
1529
+ self._p = p = ctx.Process(
1530
+ target=self._worker,
1531
+ args=(
1532
+ store_addr,
1533
+ rank,
1534
+ world_size,
1535
+ req_remote,
1536
+ future_remote,
1537
+ curr_device,
1538
+ ),
1539
+ daemon=True,
1540
+ )
1541
+ p.start()
1542
+
1543
+ # futures need thread to fire callbacks
1544
+ # this lock needs to be held when manipulating _futures
1545
+ self._futures_lock = threading.Lock()
1546
+ self._futures = {}
1547
+ self._future_thread = threading.Thread(
1548
+ target=self._future_handler,
1549
+ args=(future_local,),
1550
+ daemon=True,
1551
+ )
1552
+ self._future_thread.start()
1553
+
1554
+ # fetch the status of the PG init
1555
+ # if an exception was returned get will throw
1556
+ assert req_local.recv(self._timeout) is None
1557
+
1558
+ @classmethod
1559
+ def _create_pg(cls, store: Store, rank: int, world_size: int) -> BaseProcessGroup:
1560
+ """
1561
+ This is a class method to avoid pickling the class.
1562
+ """
1563
+ raise NotImplementedError("not implemented")
1564
+
1565
+ @classmethod
1566
+ def _worker(
1567
+ cls,
1568
+ store_addr: str,
1569
+ rank: int,
1570
+ world_size: int,
1571
+ req_pipe: "Connection[object, object]", # type: ignore
1572
+ future_pipe: "Connection[object, object]", # type: ignore
1573
+ curr_device: int,
1574
+ ) -> None:
1575
+ try:
1576
+ if curr_device >= 0 and torch.accelerator.is_available():
1577
+ torch.accelerator.set_device_index(curr_device)
1578
+
1579
+ store = create_store_client(
1580
+ store_addr,
1581
+ # default TCPStore timeout is 5 minutes
1582
+ timeout=timedelta(minutes=5),
1583
+ )
1584
+
1585
+ try:
1586
+ pg = cls._create_pg(store, rank, world_size)
1587
+ except Exception as e:
1588
+ logger.exception(f"got exception in worker: {e}")
1589
+ req_pipe.send(e)
1590
+ return
1591
+ req_pipe.send(None)
1592
+
1593
+ streams: Dict[str, torch.Stream] = {}
1594
+ work: Dict[int, _OpMetadata] = {}
1595
+
1596
+ while True:
1597
+ op = cast(list[object], req_pipe.recv())
1598
+ cmd = op[0]
1599
+ if cmd == "func":
1600
+ op_id: int
1601
+ op_id, func_name, args, kwargs, stream_device, stream_id, event = (
1602
+ cast(
1603
+ Tuple[
1604
+ int,
1605
+ str,
1606
+ list[object],
1607
+ dict[str, object],
1608
+ int,
1609
+ int,
1610
+ Optional[Union[torch.cuda.Event, torch.xpu.Event]],
1611
+ ],
1612
+ op[1:],
1613
+ )
1614
+ )
1615
+
1616
+ # To avoid potential deadlocks we need to preserve the
1617
+ # stream/synchronization behavior of the parent process.
1618
+ # We allocate one Stream per stream_id to make sure that we
1619
+ # don't accidentally introduce cross stream synchronization
1620
+ # points.
1621
+ if stream_id is not None:
1622
+ stream_key = f"{stream_device}/{stream_id}"
1623
+ if stream_key not in streams:
1624
+ streams[stream_key] = torch.Stream(device=stream_device)
1625
+ stream = streams[stream_key]
1626
+ else:
1627
+ stream = None
1628
+
1629
+ with get_stream_context(stream):
1630
+ # Make the stream wait on the cuda event to make sure we
1631
+ # don't start the operation until the tensor is ready.
1632
+ if event is not None:
1633
+ event.wait()
1634
+
1635
+ args = _PickleSafeOptions.unsafe_args(args)
1636
+ fn = getattr(pg, func_name)
1637
+
1638
+ work[op_id] = _OpMetadata(
1639
+ work=fn(*args, **kwargs),
1640
+ stream=stream,
1641
+ )
1642
+
1643
+ elif cmd == "wait":
1644
+ op_id, timeout = cast(tuple[int, timedelta], op[1:])
1645
+
1646
+ metadata = work[op_id]
1647
+
1648
+ with metadata.set_stream():
1649
+ # With WorkNCCL this makes the stream wait not the CPU when
1650
+ # no timeout is passed.
1651
+ if timeout is not None:
1652
+ metadata.work.wait(timeout)
1653
+ else:
1654
+ metadata.work.wait()
1655
+
1656
+ # Register event on the stream that we can pass to the main
1657
+ # process.
1658
+ event = record_event() if metadata.stream is not None else None
1659
+
1660
+ req_pipe.send((op_id, event))
1661
+ elif cmd == "del":
1662
+ op_id: int = cast(int, op[1])
1663
+ del work[op_id]
1664
+ elif cmd == "future":
1665
+ op_id: int = cast(int, op[1])
1666
+ metadata: _OpMetadata = work[op_id]
1667
+
1668
+ def callback(fut: Future[object], metadata: _OpMetadata) -> None:
1669
+ try:
1670
+ # create an event after the collective has been issued
1671
+ # to wait on this before we call "future"
1672
+ with metadata.set_stream():
1673
+ fut.wait()
1674
+ event = (
1675
+ record_event()
1676
+ if metadata.stream is not None
1677
+ else None
1678
+ )
1679
+
1680
+ future_pipe.send((op_id, _FUTURE_RESULT, None, event))
1681
+ except Exception as e:
1682
+ future_pipe.send((op_id, _FUTURE_EXCEPTION, e, None))
1683
+
1684
+ metadata.work.get_future().add_done_callback(
1685
+ lambda fut: callback(fut, metadata)
1686
+ )
1687
+ elif cmd == "num_active_work":
1688
+ req_pipe.send(len(work))
1689
+ else:
1690
+ raise ValueError(f"unknown cmd: {cmd}")
1691
+
1692
+ except Exception as e:
1693
+ logger.exception(f"worker errored: {e}")
1694
+ req_pipe.send(e)
1695
+ raise
1696
+
1697
+ def _future_handler(self, future_pipe: _MonitoredPipe) -> None:
1698
+ try:
1699
+ while True:
1700
+ try:
1701
+ cmd = future_pipe.recv(timedelta(seconds=10))
1702
+ except TimeoutError:
1703
+ continue
1704
+ except OSError:
1705
+ # subprocess exited
1706
+ break
1707
+
1708
+ op_id, mode, data, event = cast(
1709
+ Tuple[
1710
+ int,
1711
+ str,
1712
+ object,
1713
+ Optional[Union[torch.cuda.Event, torch.xpu.Event]],
1714
+ ],
1715
+ cmd,
1716
+ )
1717
+ with self._futures_lock:
1718
+ meta = self._futures[op_id]
1719
+ del self._futures[op_id]
1720
+ with meta.set_stream():
1721
+ if mode == _FUTURE_RESULT:
1722
+ if event is not None:
1723
+ event.wait()
1724
+ meta.future.set_result(data)
1725
+ elif mode == _FUTURE_EXCEPTION:
1726
+ meta.future.set_exception(data)
1727
+ else:
1728
+ raise ValueError(f"unknown mode {mode}")
1729
+ except Exception as e:
1730
+ logger.exception(f"got unexpected error in future handler: {e}")
1731
+
1732
+ def _get_future(self, op_id: int, stream: Optional[torch.Stream]) -> Future[object]:
1733
+ with self._futures_lock:
1734
+ fut = Future()
1735
+ self._futures[op_id] = _FutureMetadata(future=fut, stream=stream)
1736
+ assert self._pipe is not None
1737
+ self._pipe.send(("future", op_id))
1738
+
1739
+ # TODO: return correct tensor instead of None
1740
+ return fut
1741
+
1742
+ def _wait(self, op_id: int, timeout: Optional[timedelta] = None) -> bool:
1743
+ assert self._pipe is not None
1744
+ self._pipe.send(("wait", op_id, timeout))
1745
+
1746
+ assert self._pipe is not None
1747
+ resp_op_id, event = cast(
1748
+ Tuple[int, Optional[Union[torch.cuda.Event, torch.xpu.Event]]],
1749
+ self._pipe.recv(timeout or self._timeout),
1750
+ )
1751
+ assert resp_op_id == op_id
1752
+ if event is not None:
1753
+ event.wait()
1754
+
1755
+ return True
1756
+
1757
+ def _del(self, op_id: int) -> None:
1758
+ assert self._pipe is not None
1759
+ try:
1760
+ self._pipe.send(("del", op_id))
1761
+ except OSError:
1762
+ # if pipe is closed we can safely do nothing
1763
+ pass
1764
+
1765
+ def _run_func(self, func: str, *args: object, **kwargs: object) -> Work:
1766
+ pipe = self._pipe
1767
+ assert pipe is not None
1768
+
1769
+ is_accelerator = _is_any_cuda(args) or _is_any_xpu(args)
1770
+
1771
+ stream_device = (
1772
+ torch.accelerator.current_stream().device if is_accelerator else None
1773
+ )
1774
+ stream_id = (
1775
+ torch.accelerator.current_stream().stream_id if is_accelerator else None
1776
+ )
1777
+ event = record_event() if is_accelerator else None
1778
+
1779
+ op_id = self._next_op_id
1780
+ self._next_op_id += 1
1781
+
1782
+ pipe.send(
1783
+ (
1784
+ "func",
1785
+ op_id,
1786
+ func,
1787
+ _PickleSafeOptions.safe_args(args),
1788
+ kwargs,
1789
+ stream_device,
1790
+ stream_id,
1791
+ event,
1792
+ ),
1793
+ )
1794
+
1795
+ return _BabyWork(
1796
+ pg=self,
1797
+ op_id=op_id,
1798
+ stream=torch.accelerator.current_stream() if is_accelerator else None,
1799
+ )
1800
+
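The methods above and the _worker loop exchange messages over two pipes. A compact restatement of the message formats as they are constructed in this file (no new behavior, just a summary for readability):

    # request pipe (parent -> worker):
    #   ("func", op_id, func_name, safe_args, kwargs, stream_device, stream_id, event)
    #   ("wait", op_id, timeout)    -> worker replies (op_id, event)
    #   ("del", op_id)              -> no reply; worker drops its Work handle
    #   ("future", op_id)           -> reply arrives on the future pipe
    #   ("num_active_work",)        -> worker replies len(work)
    #
    # future pipe (worker -> parent, consumed by _future_handler):
    #   (op_id, _FUTURE_RESULT, None, event)
    #   (op_id, _FUTURE_EXCEPTION, exception, None)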
1801
+ def allgather(
1802
+ self,
1803
+ output_tensors: List[List[torch.Tensor]],
1804
+ input_tensor: List[torch.Tensor],
1805
+ opts: AllgatherOptions,
1806
+ ) -> Work:
1807
+ _assert_list(output_tensors)
1808
+ _assert_list(input_tensor)
1809
+ _maybe_share_tensors(output_tensors)
1810
+ _maybe_share_tensors(input_tensor)
1811
+ return self._run_func("allgather", output_tensors, input_tensor, opts)
1812
+
1813
+ def allgather_into_tensor_coalesced(
1814
+ self,
1815
+ output_tensors: List[torch.Tensor],
1816
+ input_tensors: List[torch.Tensor],
1817
+ opts: AllgatherOptions,
1818
+ ) -> Work:
1819
+ _assert_list(output_tensors)
1820
+ _assert_list(input_tensors)
1821
+ _maybe_share_tensors(output_tensors)
1822
+ _maybe_share_tensors(input_tensors)
1823
+ return self._run_func(
1824
+ "allgather_into_tensor_coalesced", output_tensors, input_tensors, opts
1825
+ )
1826
+
1827
+ def allreduce(
1828
+ self,
1829
+ tensors: List[torch.Tensor],
1830
+ opts: Union[dist.AllreduceOptions, dist.ReduceOp],
1831
+ ) -> Work:
1832
+ _assert_list(tensors)
1833
+ _maybe_share_tensors(tensors)
1834
+ return self._run_func("allreduce", tensors, opts)
1835
+
1836
+ def allreduce_coalesced(
1837
+ self,
1838
+ tensors: List[torch.Tensor],
1839
+ opts: Union[dist.AllreduceCoalescedOptions, dist.ReduceOp],
1840
+ ) -> Work:
1841
+ _assert_list(tensors)
1842
+ _maybe_share_tensors(tensors)
1843
+ return self._run_func("allreduce_coalesced", tensors, opts)
1844
+
1845
+ def alltoall_base(
1846
+ self,
1847
+ output_buffer: torch.Tensor,
1848
+ input_buffer: torch.Tensor,
1849
+ output_split_sizes: List[int],
1850
+ input_split_sizes: List[int],
1851
+ opts: AllToAllOptions,
1852
+ ) -> Work:
1853
+ _maybe_share_tensors(output_buffer)
1854
+ _maybe_share_tensors(input_buffer)
1855
+ return self._run_func(
1856
+ "alltoall_base",
1857
+ output_buffer,
1858
+ input_buffer,
1859
+ output_split_sizes,
1860
+ input_split_sizes,
1861
+ opts,
1862
+ )
1863
+
1864
+ def barrier(self, opts: Optional[BarrierOptions] = None) -> Work:
1865
+ return self._run_func("barrier", opts)
1866
+
1867
+ def broadcast(
1868
+ self,
1869
+ tensor_list: List[torch.Tensor],
1870
+ opts: BroadcastOptions,
1871
+ ) -> Work:
1872
+ _assert_list(tensor_list)
1873
+ _maybe_share_tensors(tensor_list)
1874
+ return self._run_func("broadcast", tensor_list, opts)
1875
+
1876
+ def recv(self, tensors: List[torch.Tensor], src_rank: int, tag: int) -> Work:
1877
+ _assert_list(tensors)
1878
+ _maybe_share_tensors(tensors)
1879
+ return self._run_func("recv", tensors, src_rank, tag)
1880
+
1881
+ def reduce_scatter(
1882
+ self,
1883
+ output_tensors: List[torch.Tensor],
1884
+ input_tensors: List[List[torch.Tensor]],
1885
+ opts: ReduceScatterOptions,
1886
+ ) -> Work:
1887
+ _assert_list(output_tensors)
1888
+ _assert_list(input_tensors)
1889
+ _maybe_share_tensors(output_tensors)
1890
+ _maybe_share_tensors(input_tensors)
1891
+ return self._run_func("reduce_scatter", output_tensors, input_tensors, opts)
1892
+
1893
+ def reduce_scatter_tensor_coalesced(
1894
+ self,
1895
+ output_tensors: List[torch.Tensor],
1896
+ input_tensors: List[torch.Tensor],
1897
+ opts: ReduceScatterOptions,
1898
+ ) -> Work:
1899
+ _assert_list(output_tensors)
1900
+ _assert_list(input_tensors)
1901
+ _maybe_share_tensors(output_tensors)
1902
+ _maybe_share_tensors(input_tensors)
1903
+ return self._run_func(
1904
+ "reduce_scatter_tensor_coalesced", output_tensors, input_tensors, opts
1905
+ )
1906
+
1907
+ def send(self, tensors: List[torch.Tensor], dst_rank: int, tag: int) -> Work:
1908
+ _assert_list(tensors)
1909
+ _maybe_share_tensors(tensors)
1910
+ return self._run_func("send", tensors, dst_rank, tag)
1911
+
1912
+ def size(self) -> int:
1913
+ return self._world_size
1914
+
1915
+ def num_active_work(self) -> int:
1916
+ assert self._pipe is not None
1917
+ self._pipe.send(("num_active_work",))
1918
+
1919
+ assert self._pipe is not None
1920
+ return cast(int, self._pipe.recv(self._timeout))
1921
+
1922
+ def set_timeout(self, timeout: timedelta) -> None:
1923
+ self._timeout = timeout.total_seconds()
1924
+
1925
+
1926
+ @dataclass
1927
+ class _PickleSafeOptions:
1928
+ func: Callable[[], object]
1929
+ fields: Dict[str, object]
1930
+
1931
+ @classmethod
1932
+ def safe_args(cls, args: T) -> T:
1933
+ if isinstance(args, tuple):
1934
+ return tuple(cls.safe_args(arg) for arg in args)
1935
+ elif isinstance(args, list):
1936
+ return [cls.safe_args(arg) for arg in args]
1937
+ elif isinstance(
1938
+ args,
1939
+ (
1940
+ AllgatherOptions,
1941
+ AllreduceOptions,
1942
+ AllreduceCoalescedOptions,
1943
+ AllToAllOptions,
1944
+ BarrierOptions,
1945
+ BroadcastOptions,
1946
+ ReduceScatterOptions,
1947
+ ),
1948
+ ):
1949
+ return cls.from_torch(args)
1950
+ else:
1951
+ return args
1952
+
1953
+ @classmethod
1954
+ def unsafe_args(cls, args: T) -> T:
1955
+ if isinstance(args, tuple):
1956
+ return tuple(cls.unsafe_args(arg) for arg in args)
1957
+ elif isinstance(args, list):
1958
+ return [cls.unsafe_args(arg) for arg in args]
1959
+ elif isinstance(args, cls):
1960
+ return args.to_torch()
1961
+ else:
1962
+ return args
1963
+
1964
+ @classmethod
1965
+ def from_torch(cls, opts: object) -> "_PickleSafeOptions":
1966
+ return cls(
1967
+ func=opts.__class__,
1968
+ fields={k: getattr(opts, k) for k in dir(opts) if not k.startswith("_")},
1969
+ )
1970
+
1971
+ def to_torch(self) -> object:
1972
+ opts = self.func()
1973
+ for k, v in self.fields.items():
1974
+ setattr(opts, k, v)
1975
+ return opts
1976
+
1977
+
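A minimal sketch of how the conversion above is used when collective arguments cross the pipe; the option class is one already imported by this module, and the tensor/variable names are illustrative only:

    # Illustrative sketch, not part of this file.
    tensor = torch.zeros(4)
    opts = AllreduceOptions()
    safe = _PickleSafeOptions.safe_args(([tensor], opts))
    # `safe` wraps the option object as (constructor, public fields) so the
    # whole tuple can be pickled and sent to the worker, which reverses it:
    restored = _PickleSafeOptions.unsafe_args(safe)
    assert isinstance(restored[1], AllreduceOptions)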
1978
+ class ProcessGroupBabyGloo(ProcessGroupBaby):
1979
+ """
1980
+ This is a ProcessGroup that runs Gloo in a subprocess.
1981
+
1982
+ For most use cases you should prefer ProcessGroupGloo or
1983
+ ProcessGroupBabyNCCL.
1984
+ """
1985
+
1986
+ @classmethod
1987
+ def _create_pg(cls, store: Store, rank: int, world_size: int) -> BaseProcessGroup:
1988
+ pg = BaseProcessGroup(store, rank, world_size)
1989
+ pg._set_default_backend(ProcessGroup.BackendType.GLOO)
1990
+ # pyre-fixme[16]: no attribute ProcessGroupGloo
1991
+ backend_class = BaseProcessGroupGloo(store, rank, world_size)
1992
+ pg._register_backend(
1993
+ torch.device("cpu"), ProcessGroup.BackendType.GLOO, backend_class
1994
+ )
1995
+ return pg
1996
+
1997
+ def getBackendName(self) -> str:
1998
+ return "torchft-baby-gloo"
1999
+
2000
+ # pyre-fixme[15]: inconsistent override
2001
+ def reduce_scatter(
2002
+ self,
2003
+ output_tensors: List[torch.Tensor],
2004
+ input_tensors: List[List[torch.Tensor]],
2005
+ opts: ReduceScatterOptions,
2006
+ ) -> None:
2007
+ """
2008
+ This function is a placeholder for the reduce_scatter operation in the
2009
+ ProcessGroupBabyGloo class. This operation is not supported by the
2010
+ Gloo backend, so calling this function will raise a
2011
+ RuntimeError.
2012
+
2013
+ Raises:
2014
+ RuntimeError: Always raised since reduce_scatter is not
2015
+ supported by ProcessGroupBabyGloo.
2016
+ """
2017
+ raise RuntimeError("ProcessGroupBabyGloo does not support reduce_scatter.")
2018
+
2019
+ # pyre-fixme[15]: inconsistent override
2020
+ def reduce_scatter_tensor_coalesced(
2021
+ self,
2022
+ output_tensors: List[torch.Tensor],
2023
+ input_tensors: List[torch.Tensor],
2024
+ opts: ReduceScatterOptions,
2025
+ ) -> None:
2026
+ """
2027
+ This function is a placeholder for the reduce_scatter_tensor_coalesced
2028
+ operation in the ProcessGroupBabyGloo class.
2029
+ This operation is not supported by the
2030
+ Gloo backend, so calling this function will raise a
2031
+ RuntimeError.
2032
+
2033
+ Raises:
2034
+ RuntimeError: Always raised since reduce_scatter_tensor_coalesced is not
2035
+ supported by ProcessGroupBabyGloo.
2036
+ """
2037
+ raise RuntimeError(
2038
+ "ProcessGroupBabyGloo does not support reduce_scatter_tensor_coalesced."
2039
+ )
2040
+
2041
+
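Because the Gloo backend lacks reduce_scatter, CPU callers sometimes emulate it with an allreduce followed by taking the local chunk. A rough, hypothetical sketch of that workaround (not part of this file, and notably less efficient than a native reduce_scatter):

    # Hypothetical helper built only on operations ProcessGroupBabyGloo supports.
    def emulated_reduce_scatter(pg, output, inputs, rank):
        # inputs: one tensor per rank on every rank; output: this rank's result.
        flat = torch.stack(inputs)
        pg.allreduce([flat], AllreduceOptions()).wait()  # default reduce op is SUM
        output.copy_(flat[rank])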
2042
+ class ProcessGroupBabyNCCL(ProcessGroupBaby):
2043
+ """
2044
+ This is a ProcessGroup that runs NCCL in a subprocess.
2045
+
2046
+ For the NCCL backend, extra memory will be used by the subprocess's CUDA
2047
+ context compared to running NCCL in the main process. This is typically
2048
+ around ~1GB.
2049
+
2050
+ The returned Work objects only synchronize on the CUDA stream and not on the
2051
+ CPU side. This works by passing CUDA Events between the processes. To
2052
+ synchronize with the CPU, call torch.cuda.synchronize() after wait().
2053
+
2054
+ WARNING: If the child process is killed while an operation is running, CUDA
2055
+ tensors may leak in the current PyTorch implementation. TODO fix
2056
+
2057
+ WARNING: As this uses a separate CUDA context for the subprocess, performance
2058
+ may be slower than using NCCL directly. Separate CUDA contexts cannot run
2059
+ at the same time, so network and compute kernels will not overlap execution
2060
+ and will instead time-share, which may reduce GPU utilization.
2061
+ """
2062
+
2063
+ @classmethod
2064
+ def _create_pg(cls, store: Store, rank: int, world_size: int) -> BaseProcessGroup:
2065
+ from torch.distributed import ProcessGroupNCCL as BaseProcessGroupNCCL
2066
+
2067
+ pg = BaseProcessGroup(store, rank, world_size)
2068
+ pg._set_default_backend(ProcessGroup.BackendType.NCCL)
2069
+ # pyre-fixme[16]: no attribute ProcessGroupNCCL
2070
+ backend_class = BaseProcessGroupNCCL(store, rank, world_size)
2071
+ backend_class._set_sequence_number_for_group()
2072
+ pg._register_backend(
2073
+ torch.device("cuda"), ProcessGroup.BackendType.NCCL, backend_class
2074
+ )
2075
+ return pg
2076
+
2077
+ def getBackendName(self) -> str:
2078
+ return "torchft-baby-nccl"
2079
+
2080
+
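A minimal usage sketch for the stream-only synchronization described above; the configure() call and its arguments are assumed from the surrounding module and may differ, and the key point is the explicit CPU-side sync:

    # Illustrative sketch, not part of this file.
    pg = ProcessGroupBabyNCCL()
    pg.configure(store_addr, rank, world_size)  # signature assumed
    t = torch.ones(8, device="cuda")
    work = pg.allreduce([t], AllreduceOptions())
    work.wait()               # only makes the current CUDA stream wait
    torch.cuda.synchronize()  # required before reading `t` on the CPU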
2081
+ class ProcessGroupBabyXCCL(ProcessGroupBaby):
2082
+ """
2083
+ This is a ProcessGroup that runs XCCL in a subprocess for Intel XPU devices.
2084
+
2085
+ For the XCCL backend, extra memory will be used by the subprocess's XPU
2086
+ context compared to running XCCL in the main process. This is typically
2087
+ dependent on the XPU memory architecture.
2088
+
2089
+ The returned Work objects only synchronize on the XPU stream and not on the
2090
+ CPU side. This works by passing XPU Events between the processes. To
2092
+ synchronize with the CPU, call torch.xpu.synchronize() after wait().
2092
+
2093
+ WARNING: If the child process is killed while an operation is running, XPU
2094
+ tensors may leak in the current PyTorch implementation. TODO fix
2095
+
2096
+ WARNING: As this uses a separate XPU context for the subprocess, performance
2097
+ may be slower than using XCCL directly. Separate XPU contexts cannot run
2098
+ at the same time, so network and compute kernels will not overlap execution
2099
+ and will instead time-share, which may reduce XPU utilization.
2100
+ """
2101
+
2102
+ @classmethod
2103
+ def _create_pg(cls, store: Store, rank: int, world_size: int) -> BaseProcessGroup:
2104
+ # Check if XPU and XCCL are available
2105
+ from torch.distributed import ProcessGroupXCCL as BaseProcessGroupXCCL
2106
+
2107
+ pg = BaseProcessGroup(store, rank, world_size)
2108
+ pg._set_default_backend(ProcessGroup.BackendType.XCCL)
2109
+ # pyre-fixme[16]: no attribute ProcessGroupXCCL
2110
+ backend_class = BaseProcessGroupXCCL(store, rank, world_size)
2111
+ backend_class._set_sequence_number_for_group()
2112
+ pg._register_backend(
2113
+ torch.device("xpu"), ProcessGroup.BackendType.XCCL, backend_class
2114
+ )
2115
+ return pg
2116
+
2117
+ def getBackendName(self) -> str:
2118
+ return "torchft-baby-xccl"