d9d-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238)
  1. d9d/__init__.py +0 -0
  2. d9d/core/__init__.py +0 -0
  3. d9d/core/autograd/__init__.py +7 -0
  4. d9d/core/autograd/grad_context.py +85 -0
  5. d9d/core/dist_context/__init__.py +19 -0
  6. d9d/core/dist_context/configured.py +215 -0
  7. d9d/core/dist_context/device_mesh_domains.py +185 -0
  8. d9d/core/dist_context/log.py +30 -0
  9. d9d/core/dist_context/params.py +113 -0
  10. d9d/core/dist_ops/__init__.py +16 -0
  11. d9d/core/dist_ops/object.py +68 -0
  12. d9d/core/dist_ops/tensor.py +192 -0
  13. d9d/core/protocol/__init__.py +8 -0
  14. d9d/core/protocol/training.py +38 -0
  15. d9d/core/sharding/__init__.py +15 -0
  16. d9d/core/sharding/auto_spec.py +66 -0
  17. d9d/core/sharding/shard.py +154 -0
  18. d9d/core/sharding/spec.py +28 -0
  19. d9d/core/sharding/unshard.py +117 -0
  20. d9d/core/types/__init__.py +12 -0
  21. d9d/core/types/data.py +14 -0
  22. d9d/core/types/pytree.py +26 -0
  23. d9d/dataset/__init__.py +17 -0
  24. d9d/dataset/buffer_sorted.py +143 -0
  25. d9d/dataset/padding.py +79 -0
  26. d9d/dataset/sharded.py +195 -0
  27. d9d/internals/__init__.py +0 -0
  28. d9d/internals/determinism/__init__.py +10 -0
  29. d9d/internals/determinism/seed.py +63 -0
  30. d9d/internals/grad_norm/__init__.py +8 -0
  31. d9d/internals/grad_norm/group.py +87 -0
  32. d9d/internals/grad_norm/norm.py +169 -0
  33. d9d/internals/grad_sync/__init__.py +14 -0
  34. d9d/internals/grad_sync/bucket.py +317 -0
  35. d9d/internals/grad_sync/placement_helper.py +23 -0
  36. d9d/internals/grad_sync/synchronizer.py +257 -0
  37. d9d/internals/pipeline_state/__init__.py +14 -0
  38. d9d/internals/pipeline_state/api.py +45 -0
  39. d9d/internals/pipeline_state/handler.py +111 -0
  40. d9d/internals/pipeline_state/storage.py +236 -0
  41. d9d/internals/profiling/__init__.py +7 -0
  42. d9d/internals/profiling/profile.py +112 -0
  43. d9d/internals/state/__init__.py +6 -0
  44. d9d/internals/state/main_process.py +44 -0
  45. d9d/kernel/__init__.py +0 -0
  46. d9d/kernel/cce/__init__.py +5 -0
  47. d9d/kernel/cce/cce.py +298 -0
  48. d9d/kernel/cce/main.py +282 -0
  49. d9d/kernel/general/__init__.py +5 -0
  50. d9d/kernel/general/get_int_dtype.py +7 -0
  51. d9d/kernel/gmm/__init__.py +5 -0
  52. d9d/kernel/gmm/function.py +78 -0
  53. d9d/kernel/moe/__init__.py +8 -0
  54. d9d/kernel/moe/indices_to_multihot.py +268 -0
  55. d9d/kernel/moe/permute_with_probs.py +1035 -0
  56. d9d/kernel/stochastic/__init__.py +11 -0
  57. d9d/kernel/stochastic/adamw_step.py +204 -0
  58. d9d/kernel/stochastic/copy.py +104 -0
  59. d9d/kernel/stochastic/ops/__init__.py +5 -0
  60. d9d/kernel/stochastic/ops/round.py +22 -0
  61. d9d/kernel/swiglu/__init__.py +5 -0
  62. d9d/kernel/swiglu/function.py +36 -0
  63. d9d/kernel/swiglu/op.py +167 -0
  64. d9d/loop/__init__.py +0 -0
  65. d9d/loop/auto/__init__.py +9 -0
  66. d9d/loop/auto/auto_lr_scheduler.py +46 -0
  67. d9d/loop/auto/auto_optimizer.py +196 -0
  68. d9d/loop/component/__init__.py +35 -0
  69. d9d/loop/component/batch_maths.py +106 -0
  70. d9d/loop/component/checkpointer.py +172 -0
  71. d9d/loop/component/data_loader_factory.py +258 -0
  72. d9d/loop/component/garbage_collector.py +94 -0
  73. d9d/loop/component/gradient_clipper.py +89 -0
  74. d9d/loop/component/gradient_manager.py +149 -0
  75. d9d/loop/component/job_logger.py +146 -0
  76. d9d/loop/component/job_profiler.py +62 -0
  77. d9d/loop/component/loss_computer.py +86 -0
  78. d9d/loop/component/model_stage_exporter.py +37 -0
  79. d9d/loop/component/model_stage_factory.py +261 -0
  80. d9d/loop/component/optimizer_factory.py +88 -0
  81. d9d/loop/component/stepper.py +52 -0
  82. d9d/loop/component/timeout_manager.py +54 -0
  83. d9d/loop/component/train_task_operator.py +152 -0
  84. d9d/loop/config/__init__.py +36 -0
  85. d9d/loop/config/config.py +225 -0
  86. d9d/loop/config/types.py +24 -0
  87. d9d/loop/control/__init__.py +61 -0
  88. d9d/loop/control/dataset_provider.py +58 -0
  89. d9d/loop/control/lr_scheduler_provider.py +47 -0
  90. d9d/loop/control/model_provider.py +162 -0
  91. d9d/loop/control/optimizer_provider.py +45 -0
  92. d9d/loop/control/task.py +304 -0
  93. d9d/loop/run/__init__.py +6 -0
  94. d9d/loop/run/train.py +355 -0
  95. d9d/loop/state.py +143 -0
  96. d9d/lr_scheduler/__init__.py +9 -0
  97. d9d/lr_scheduler/piecewise/__init__.py +18 -0
  98. d9d/lr_scheduler/piecewise/builder.py +152 -0
  99. d9d/lr_scheduler/piecewise/config.py +176 -0
  100. d9d/lr_scheduler/piecewise/curves.py +75 -0
  101. d9d/lr_scheduler/piecewise/engine.py +76 -0
  102. d9d/lr_scheduler/visualizer.py +74 -0
  103. d9d/metric/__init__.py +10 -0
  104. d9d/metric/abc.py +79 -0
  105. d9d/metric/impl/__init__.py +7 -0
  106. d9d/metric/impl/compose.py +54 -0
  107. d9d/metric/impl/mean.py +94 -0
  108. d9d/model_state/__init__.py +0 -0
  109. d9d/model_state/io/__init__.py +21 -0
  110. d9d/model_state/io/dto.py +30 -0
  111. d9d/model_state/io/module_reader.py +75 -0
  112. d9d/model_state/io/module_writer.py +123 -0
  113. d9d/model_state/io/reader.py +125 -0
  114. d9d/model_state/io/writer.py +309 -0
  115. d9d/model_state/mapper/__init__.py +10 -0
  116. d9d/model_state/mapper/abc.py +70 -0
  117. d9d/model_state/mapper/adapters/__init__.py +12 -0
  118. d9d/model_state/mapper/adapters/mapper.py +27 -0
  119. d9d/model_state/mapper/adapters/module.py +22 -0
  120. d9d/model_state/mapper/compose/__init__.py +17 -0
  121. d9d/model_state/mapper/compose/helper.py +22 -0
  122. d9d/model_state/mapper/compose/parallel.py +58 -0
  123. d9d/model_state/mapper/compose/sequential.py +131 -0
  124. d9d/model_state/mapper/compose/shard.py +36 -0
  125. d9d/model_state/mapper/leaf/__init__.py +18 -0
  126. d9d/model_state/mapper/leaf/dtensor.py +56 -0
  127. d9d/model_state/mapper/leaf/identity.py +23 -0
  128. d9d/model_state/mapper/leaf/rename.py +26 -0
  129. d9d/model_state/mapper/leaf/select_child.py +37 -0
  130. d9d/model_state/mapper/leaf/stack.py +29 -0
  131. d9d/module/__init__.py +0 -0
  132. d9d/module/base/__init__.py +7 -0
  133. d9d/module/base/late_init.py +10 -0
  134. d9d/module/block/__init__.py +0 -0
  135. d9d/module/block/attention/__init__.py +7 -0
  136. d9d/module/block/attention/grouped_query.py +139 -0
  137. d9d/module/block/attention/sdpa/__init__.py +5 -0
  138. d9d/module/block/attention/sdpa/flash.py +52 -0
  139. d9d/module/block/embedding/__init__.py +7 -0
  140. d9d/module/block/embedding/shard_token_embedding.py +103 -0
  141. d9d/module/block/ffn/__init__.py +5 -0
  142. d9d/module/block/ffn/swiglu.py +60 -0
  143. d9d/module/block/head/__init__.py +6 -0
  144. d9d/module/block/head/language_modelling.py +87 -0
  145. d9d/module/block/hidden_states_aggregator/__init__.py +12 -0
  146. d9d/module/block/hidden_states_aggregator/base.py +35 -0
  147. d9d/module/block/hidden_states_aggregator/factory.py +48 -0
  148. d9d/module/block/hidden_states_aggregator/mean.py +61 -0
  149. d9d/module/block/hidden_states_aggregator/noop.py +27 -0
  150. d9d/module/block/moe/__init__.py +13 -0
  151. d9d/module/block/moe/communications/__init__.py +11 -0
  152. d9d/module/block/moe/communications/base.py +58 -0
  153. d9d/module/block/moe/communications/deepep.py +300 -0
  154. d9d/module/block/moe/communications/naive.py +68 -0
  155. d9d/module/block/moe/grouped_experts.py +81 -0
  156. d9d/module/block/moe/grouped_linear.py +78 -0
  157. d9d/module/block/moe/layer.py +122 -0
  158. d9d/module/block/moe/router.py +103 -0
  159. d9d/module/block/positional/__init__.py +8 -0
  160. d9d/module/block/positional/rope.py +150 -0
  161. d9d/module/model/__init__.py +0 -0
  162. d9d/module/model/qwen3_moe/__init__.py +16 -0
  163. d9d/module/model/qwen3_moe/decoder_layer.py +110 -0
  164. d9d/module/model/qwen3_moe/model.py +373 -0
  165. d9d/module/model/qwen3_moe/params.py +69 -0
  166. d9d/module/parallelism/__init__.py +0 -0
  167. d9d/module/parallelism/api/__init__.py +18 -0
  168. d9d/module/parallelism/api/expert_parallel.py +36 -0
  169. d9d/module/parallelism/api/fully_sharded.py +43 -0
  170. d9d/module/parallelism/api/hybrid_sharded.py +49 -0
  171. d9d/module/parallelism/api/replicate_parallel.py +33 -0
  172. d9d/module/parallelism/model/__init__.py +0 -0
  173. d9d/module/parallelism/model/qwen3_moe.py +99 -0
  174. d9d/module/parallelism/style/__init__.py +7 -0
  175. d9d/module/parallelism/style/shard_experts.py +60 -0
  176. d9d/module/parallelism/style/to_local.py +86 -0
  177. d9d/optim/__init__.py +0 -0
  178. d9d/optim/stochastic/__init__.py +5 -0
  179. d9d/optim/stochastic/adamw.py +158 -0
  180. d9d/peft/__init__.py +13 -0
  181. d9d/peft/all/__init__.py +12 -0
  182. d9d/peft/all/config.py +31 -0
  183. d9d/peft/all/method.py +76 -0
  184. d9d/peft/applicator.py +47 -0
  185. d9d/peft/base.py +70 -0
  186. d9d/peft/full_tune/__init__.py +11 -0
  187. d9d/peft/full_tune/config.py +20 -0
  188. d9d/peft/full_tune/method.py +46 -0
  189. d9d/peft/lora/__init__.py +15 -0
  190. d9d/peft/lora/config.py +35 -0
  191. d9d/peft/lora/layer.py +177 -0
  192. d9d/peft/lora/method.py +132 -0
  193. d9d/pipelining/__init__.py +0 -0
  194. d9d/pipelining/api/__init__.py +19 -0
  195. d9d/pipelining/api/module.py +149 -0
  196. d9d/pipelining/api/schedule.py +50 -0
  197. d9d/pipelining/api/sharding.py +9 -0
  198. d9d/pipelining/factory/__init__.py +21 -0
  199. d9d/pipelining/factory/config.py +89 -0
  200. d9d/pipelining/factory/factory.py +114 -0
  201. d9d/pipelining/factory/registry.py +82 -0
  202. d9d/pipelining/infra/__init__.py +0 -0
  203. d9d/pipelining/infra/schedule/__init__.py +0 -0
  204. d9d/pipelining/infra/schedule/component/__init__.py +0 -0
  205. d9d/pipelining/infra/schedule/component/program/__init__.py +22 -0
  206. d9d/pipelining/infra/schedule/component/program/base.py +35 -0
  207. d9d/pipelining/infra/schedule/component/program/communications.py +203 -0
  208. d9d/pipelining/infra/schedule/component/program/topology.py +78 -0
  209. d9d/pipelining/infra/schedule/component/runtime/__init__.py +29 -0
  210. d9d/pipelining/infra/schedule/component/runtime/action.py +361 -0
  211. d9d/pipelining/infra/schedule/component/runtime/communications.py +101 -0
  212. d9d/pipelining/infra/schedule/component/runtime/executor.py +113 -0
  213. d9d/pipelining/infra/schedule/component/runtime/loss.py +55 -0
  214. d9d/pipelining/infra/schedule/program/__init__.py +15 -0
  215. d9d/pipelining/infra/schedule/program/bfs.py +86 -0
  216. d9d/pipelining/infra/schedule/program/dualpipev.py +234 -0
  217. d9d/pipelining/infra/schedule/program/interleaved.py +240 -0
  218. d9d/pipelining/infra/schedule/program/zerobubblev.py +227 -0
  219. d9d/pipelining/infra/stage/__init__.py +5 -0
  220. d9d/pipelining/infra/stage/communications.py +274 -0
  221. d9d/pipelining/infra/stage/computations.py +317 -0
  222. d9d/pipelining/infra/stage/splitgrad.py +377 -0
  223. d9d/pipelining/infra/stage/stage.py +321 -0
  224. d9d/pipelining/infra/stage/struct_helper.py +46 -0
  225. d9d/pipelining/training/__init__.py +7 -0
  226. d9d/pipelining/training/optimizer.py +41 -0
  227. d9d/pipelining/training/scheduler.py +34 -0
  228. d9d/tracker/__init__.py +14 -0
  229. d9d/tracker/base.py +124 -0
  230. d9d/tracker/factory.py +57 -0
  231. d9d/tracker/provider/__init__.py +0 -0
  232. d9d/tracker/provider/aim/__init__.py +0 -0
  233. d9d/tracker/provider/aim/config.py +23 -0
  234. d9d/tracker/provider/aim/tracker.py +114 -0
  235. d9d/tracker/provider/null.py +61 -0
  236. d9d-0.1.0.dist-info/METADATA +90 -0
  237. d9d-0.1.0.dist-info/RECORD +238 -0
  238. d9d-0.1.0.dist-info/WHEEL +4 -0
d9d/pipelining/infra/schedule/program/zerobubblev.py
@@ -0,0 +1,227 @@
+ from ..component.program import (
+     PipelineProgramBuilder,
+     ScheduleStyle,
+     add_communication_ops,
+     build_stage_to_host_rank_topology,
+ )
+ from ..component.runtime import (
+     ActionBase,
+     BackwardFullInputComputeAction,
+     BackwardWeightComputeAction,
+     ForwardComputeAction,
+ )
+
+
+ class ZeroBubbleVPipelineProgramBuilder(PipelineProgramBuilder):
+     """
+     Builder for the Zero Bubble V (ZBV) Pipeline Schedule.
+
+     This schedule targets V-shape topologies with exactly two stages per rank and
+     applies the Zero Bubble optimization: backward passes are split into
+     input-gradient and weight-gradient computations to improve pipeline throughput.
+
+     References:
+         https://arxiv.org/pdf/2401.10241, Section 6
+     """
+
+     def __init__(self):
+         """Constructs the ZBV builder."""
+
+     def compose(
+         self, num_microbatches: int, pp_size: int
+     ) -> dict[int, list[ActionBase]]:
+         num_stages = self.num_stages_per_rank * pp_size
+
+         # 1. Topology
+         # V-style: Rank 0 gets Stage 0 & Stage N-1. Rank 1 gets Stage 1 & Stage N-2...
+         stage_to_rank = build_stage_to_host_rank_topology(
+             pp_size=pp_size, num_stages=num_stages, style=ScheduleStyle.v
+         )
+
+         actions: dict[int, list[ActionBase]] = {}
+
+         for rank in range(pp_size):
+             actions[rank] = self._generate_rank_schedule(
+                 rank=rank,
+                 pp_size=pp_size,
+                 num_stages=num_stages,
+                 target_microbatches=num_microbatches,
+             )
+
+         # 2. Inject Communications
+         return add_communication_ops(
+             compute_actions=actions,
+             stage_to_rank=stage_to_rank,
+             num_stages=num_stages
+         )
+
+     def _generate_rank_schedule(  # noqa: C901
+         self,
+         rank: int,
+         pp_size: int,
+         num_stages: int,
+         target_microbatches: int,
+     ) -> list[ActionBase]:
+         # ZBV logic assumes the pipeline is fully saturated to define the loop bounds.
+         # We simulate enough steps to cover the topology startup, then filter
+         # down to the user's requested microbatches at the end.
+         simulated_n_micro = max(2 * pp_size - 1, target_microbatches)
+
+         rank_ops: list[ActionBase] = []
+
+         # -- Stage Identification (V-Shape) --
+         # s0: The "Forward-going" chunk (e.g., Stage 0 for Rank 0)
+         # s1: The "Backward-coming" chunk (e.g., Stage N-1 for Rank 0)
+         s0 = rank
+         s1 = num_stages - 1 - rank
+
+         # -- Counters --
+         # Track the next microbatch index for each operation type on each chunk.
+         # F: Forward, I: Backward Input, W: Backward Weight
+         f0_cnt = 0
+         b0_cnt = 0  # Input Grad Counter (Chunk 0)
+         w0_cnt = 0  # Weight Grad Counter (Chunk 0)
+
+         f1_cnt = 0
+         b1_cnt = 0  # Input Grad Counter (Chunk 1)
+         w1_cnt = 0  # Weight Grad Counter (Chunk 1)
+
+         # -- Helpers --
+
+         def emit_f(stage: int, idx: int):
+             rank_ops.append(ForwardComputeAction(stage_idx=stage, microbatch_idx=idx))
+
+         def emit_i_and_w(stage: int, idx: int):
+             rank_ops.append(
+                 BackwardFullInputComputeAction(
+                     stage_idx=stage, microbatch_idx=idx, full_backward=False
+                 )
+             )
+             rank_ops.append(
+                 BackwardWeightComputeAction(stage_idx=stage, microbatch_idx=idx)
+             )
+
+         def emit_i(stage: int, idx: int):
+             rank_ops.append(
+                 BackwardFullInputComputeAction(
+                     stage_idx=stage, microbatch_idx=idx, full_backward=False
+                 )
+             )
+
+         def emit_w(stage: int, idx: int):
+             rank_ops.append(
+                 BackwardWeightComputeAction(stage_idx=stage, microbatch_idx=idx)
+             )
+
+         # -- Phase 1: Warmup 1 (Chunk 0 Forwards) --
+         warmup_n1 = 2 * (pp_size - rank) - 1
+         for _ in range(warmup_n1):
+             emit_f(s0, f0_cnt)
+             f0_cnt += 1
+
+         # -- Phase 2: Warmup 2 (Interleave F1, F0) --
+         warmup_n2 = rank
+         for _ in range(warmup_n2):
+             emit_f(s1, f1_cnt)
+             f1_cnt += 1
+             emit_f(s0, f0_cnt)
+             f0_cnt += 1
+
+         # -- Phase 3: Warmup 3 (F1, then B1 I+W) --
+         warmup_n3 = pp_size - rank
+         for _ in range(warmup_n3):
+             emit_f(s1, f1_cnt)
+             f1_cnt += 1
+
+             emit_i_and_w(s1, b1_cnt)
+             b1_cnt += 1
+             w1_cnt += 1
+
+         # -- Phase 4: Stable State --
+         while f1_cnt < f0_cnt or f0_cnt < simulated_n_micro:
+             # Emit F0 if within bounds
+             if f0_cnt < simulated_n_micro:
+                 emit_f(s0, f0_cnt)
+                 f0_cnt += 1
+
+             # Emit B0 (I+W)
+             emit_i_and_w(s0, b0_cnt)
+             b0_cnt += 1
+             w0_cnt += 1
+
+             # Emit F1
+             emit_f(s1, f1_cnt)
+             f1_cnt += 1
+
+             # Emit B1 (I+W)
+             emit_i_and_w(s1, b1_cnt)
+             b1_cnt += 1
+             w1_cnt += 1
+
+         # -- Phase 5: Cooldown 1 (Splitting I and W) --
+         # In cooldown, the I and W streams diverge to fill bubbles.
+         cooldown_n1 = rank
+         for _ in range(cooldown_n1):
+             emit_i(s0, b0_cnt)
+             b0_cnt += 1
+
+             emit_i(s1, b1_cnt)
+             b1_cnt += 1
+
+         # -- Phase 6: Cooldown 2 (I0, then W0) --
+         cooldown_n2 = pp_size - rank
+         for _ in range(cooldown_n2):
+             # Input Grad Chunk 0
+             emit_i(s0, b0_cnt)
+             b0_cnt += 1
+
+             # Weight Grad Chunk 0 (delayed from previous steps)
+             emit_w(s0, w0_cnt)
+             w0_cnt += 1
+
+         # -- Phase 7: Flush Remaining Weights --
+
+         # Flush W1
+         while w1_cnt < b1_cnt:
+             emit_w(s1, w1_cnt)
+             w1_cnt += 1
+
+         # Flush W0
+         while w0_cnt < b0_cnt:
+             emit_w(s0, w0_cnt)
+             w0_cnt += 1
+
+         # -- Integrity Check --
+         if not (w0_cnt == b0_cnt == f0_cnt):
+             raise RuntimeError(
+                 f"ZBV Schedule Failed (Chunk 0): F={f0_cnt}, I={b0_cnt}, W={w0_cnt}"
+             )
+         if not (w1_cnt == b1_cnt == f1_cnt):
+             raise RuntimeError(
+                 f"ZBV Schedule Failed (Chunk 1): F={f1_cnt}, I={b1_cnt}, W={w1_cnt}"
+             )
+
+         # -- Post-Process: Filter to Target Microbatches --
+         # Remove any actions involving simulated microbatches beyond the user's request.
+         final_ops: list[ActionBase] = []
+         for action in rank_ops:
+             if isinstance(action, (ForwardComputeAction,
+                                    BackwardFullInputComputeAction,
+                                    BackwardWeightComputeAction)):
+                 if action.microbatch_idx < target_microbatches:
+                     final_ops.append(action)
+             else:
+                 final_ops.append(action)
+
+         return final_ops
+
+     @property
+     def num_stages_per_rank(self) -> int:
+         return 2
+
+     @property
+     def topology_style(self) -> ScheduleStyle:
+         return ScheduleStyle.v
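
For orientation, a minimal standalone sketch of the warmup/cooldown arithmetic used by _generate_rank_schedule above. The formulas are copied from the hunk; the pp_size and num_microbatches values are assumptions chosen purely for illustration, and the snippet is not part of the package.

    pp_size = 4                    # pipeline-parallel world size (assumed)
    num_microbatches = 8           # requested microbatches (assumed)
    simulated = max(2 * pp_size - 1, num_microbatches)   # loop bound from the hunk above

    for rank in range(pp_size):
        warmup_n1 = 2 * (pp_size - rank) - 1   # Phase 1: chunk-0 forwards
        warmup_n2 = rank                       # Phase 2: interleaved F1/F0 pairs
        warmup_n3 = pp_size - rank             # Phase 3: F1 followed by I+W on chunk 1
        cooldown_n1 = rank                     # Phase 5: input-grad-only steps on both chunks
        cooldown_n2 = pp_size - rank           # Phase 6: I0 followed by delayed W0
        print(
            f"rank {rank}: warmup=({warmup_n1}, {warmup_n2}, {warmup_n3}), "
            f"cooldown=({cooldown_n1}, {cooldown_n2}), simulated_microbatches={simulated}"
        )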
d9d/pipelining/infra/stage/__init__.py
@@ -0,0 +1,5 @@
+ from .stage import PipelineStage
+
+ __all__ = [
+     "PipelineStage"
+ ]
d9d/pipelining/infra/stage/communications.py
@@ -0,0 +1,274 @@
+ import dataclasses
+
+ import torch
+ import torch.distributed as dist
+
+
+ @dataclasses.dataclass(kw_only=True, slots=True)
+ class ReceiveStageInput:
+     """
+     Instruction to receive a specific tensor from a previous stage (or next stage during backward).
+
+     Attributes:
+         name: A unique identifier for the communication operation.
+         from_stage: The stage index sending the data.
+         buffer: The pre-allocated tensor buffer where data will be received.
+     """
+
+     name: str
+     from_stage: int
+     buffer: torch.Tensor
+
+
+ @dataclasses.dataclass
+ class StartStageInput:
+     """
+     Instruction indicating that the input for this stage does not come from communication
+     (e.g., this is the first stage receiving data loader inputs).
+     """
+
+
+ StageInput = ReceiveStageInput | StartStageInput
+
+
+ @dataclasses.dataclass(kw_only=True, slots=True)
+ class SendStageOutput:
+     """
+     Instruction to send a specific tensor to the next stage (or the previous stage during backward).
+
+     Attributes:
+         to_stage: The stage index receiving the data.
+     """
+
+     to_stage: int
+
+
+ @dataclasses.dataclass
+ class EndStageOutput:
+     """
+     Instruction indicating that the output of this stage is not sent anywhere
+     (e.g., this is the last stage computing loss).
+     """
+
+
+ StageOutput = SendStageOutput | EndStageOutput
+
+
+ class StageCommunicationHandler:
+     """
+     Manages Point-to-Point (P2P) communication descriptors for a specific data flow direction within a pipeline stage.
+
+     This class handles the creation of P2P operations (send/recv) across multiple microbatches,
+     managing buffers and mapping logical stage indices to physical ranks.
+     """
+
+     def __init__(
+         self,
+
+         name: str,
+         stage_index: int,
+         num_microbatches: int,
+
+         input_stage_index: int | None,
+         input_args: dict[str, torch.Tensor],
+
+         output_stage_index: int | None,
+         output_args: dict[str, torch.Tensor],
+
+         stage_idx_to_host_rank: dict[int, int],
+         group: dist.ProcessGroup
+     ):
+         """
+         Constructs a StageCommunicationHandler object.
+
+         Args:
+             name: Name prefix for this handler (e.g., 'fwd', 'bwd').
+             stage_index: The logical index of the current stage.
+             num_microbatches: Total number of microbatches ("chunks") to schedule.
+             input_stage_index: The logical index of the stage providing inputs, or None if inputs are local.
+             input_args: Metadata (shapes/dtypes) for input tensors.
+             output_stage_index: The logical index of the stage consuming outputs, or None if outputs are terminal.
+             output_args: Metadata (shapes/dtypes) for output tensors.
+             stage_idx_to_host_rank: Mapping from logical stage indices to physical world ranks.
+             group: The process group strictly for pipeline communication.
+         """
+
+         self._input_handlers = self._build_inputs(
+             name=name,
+             stage_index=stage_index,
+             num_microbatches=num_microbatches,
+             input_stage_index=input_stage_index,
+             input_args=input_args
+         )
+         self._output_handlers = self._build_outputs(
+             output_stage_index=output_stage_index,
+             output_args=output_args
+         )
+
+         self._stage_idx_to_host_rank = stage_idx_to_host_rank
+         self._group = group
+
+     @staticmethod
+     def _build_inputs(
+         name: str,
+         stage_index: int,
+         num_microbatches: int,
+         input_stage_index: int | None,
+         input_args: dict[str, torch.Tensor]
+     ) -> dict[int, dict[str, StageInput]]:
+         handlers: dict[int, dict[str, StageInput]] = {}
+
+         for chunk_id in range(num_microbatches):
+             handlers[chunk_id] = {}
+             for input_name, input_tensor_meta in input_args.items():
+                 if input_stage_index is None:
+                     handlers[chunk_id][input_name] = StartStageInput()
+                 else:
+                     handlers[chunk_id][input_name] = ReceiveStageInput(
+                         name=f"{name}_recv_from_{input_stage_index}_to_{stage_index}[{chunk_id}][{input_name}]",
+                         from_stage=input_stage_index,
+                         buffer=torch.empty(
+                             input_tensor_meta.size(),
+                             dtype=input_tensor_meta.dtype,
+                             layout=input_tensor_meta.layout,
+                             device="cuda"  # force device
+                         )
+                     )
+         return handlers
+
+     @staticmethod
+     def _build_outputs(
+         output_stage_index: int | None,
+         output_args: dict[str, torch.Tensor]
+     ) -> dict[str, StageOutput]:
+         handlers: dict[str, StageOutput] = {}
+
+         for output_name in output_args:
+             if output_stage_index is None:
+                 handlers[output_name] = EndStageOutput()
+             else:
+                 handlers[output_name] = SendStageOutput(
+                     to_stage=output_stage_index
+                 )
+         return handlers
+
+     def set_input_requires_grad_(self, requires_grad: bool):
+         """
+         Sets the `requires_grad` flag for all internal input buffers.
+
+         Typically used to enable gradient flow from backward stages to forward stages.
+
+         Args:
+             requires_grad: Whether the buffers should require gradients.
+         """
+
+         for inputs in self._input_handlers.values():
+             for info in inputs.values():
+                 if isinstance(info, ReceiveStageInput):
+                     info.buffer.requires_grad_(requires_grad)
+
+     def set_inputs_local(self, inputs: dict[str, torch.Tensor], microbatch_index: int):
+         """
+         Manually fills the input buffer for a specific microbatch with local data.
+
+         This is used when the stage is the first in the pipeline or receives data
+         from a dataloader rather than via network communication.
+
+         Args:
+             inputs: Dictionary of input tensors.
+             microbatch_index: The microbatch identifier.
+         """
+
+         for input_name, input_value in inputs.items():
+             handler = self._input_handlers[microbatch_index][input_name]
+             if not isinstance(handler, ReceiveStageInput):
+                 raise RuntimeError("Tried to set a buffer of a non-receive stage input")
+             prev_requires_grad = handler.buffer.requires_grad
+             handler.buffer = input_value.detach().requires_grad_(prev_requires_grad)
+
+     def get_inputs(self, microbatch_index: int) -> dict[str, torch.Tensor]:
+         """
+         Retrieves the input tensors for a specific microbatch from the internal buffers.
+
+         Args:
+             microbatch_index: The microbatch identifier.
+
+         Returns:
+             Dictionary mapping input names to tensors.
+         """
+         outputs: dict[str, torch.Tensor] = {}
+
+         for input_name, input_info in self._input_handlers[microbatch_index].items():
+             if not isinstance(input_info, ReceiveStageInput):
+                 raise RuntimeError("Tried to get a buffer of a non-receive stage input")
+             outputs[input_name] = input_info.buffer
+
+         return outputs
+
+     def create_receive_ops(self, microbatch_index: int) -> list[dist.P2POp]:
+         """
+         Generates the PyTorch P2P receive operations for a specific microbatch.
+
+         Args:
+             microbatch_index: The microbatch identifier.
+
+         Returns:
+             A list of `dist.P2POp` objects configured for `dist.irecv`.
+         """
+
+         ops = []
+
+         inputs = self._input_handlers[microbatch_index]
+         # sort ops by parameter names to ensure receive ops are ordered the same for send and recv
+         for _input_name, input_info in sorted(inputs.items(), key=lambda x: x[0]):
+             match input_info:
+                 case StartStageInput():
+                     pass
+                 case ReceiveStageInput():
+                     peer_rank = self._stage_idx_to_host_rank[input_info.from_stage]
+                     peer_global_rank = dist.get_global_rank(self._group, peer_rank)
+                     op = dist.P2POp(dist.irecv, input_info.buffer, peer_global_rank, self._group)
+                     ops.append(op)
+                 case _:
+                     raise ValueError()
+
+         return ops
+
+     def create_send_ops(self, send_contents: dict[str, torch.Tensor]) -> list[dist.P2POp]:
+         """
+         Generates the PyTorch P2P send operations for the provided tensors.
+
+         Args:
+             send_contents: Dictionary of tensors to send.
+
+         Returns:
+             A list of `dist.P2POp` objects configured for `dist.isend`.
+         """
+
+         ops = []
+
+         # sort ops by parameter names to ensure send ops are ordered the same for send and recv
+         for output_name, output_info in sorted(self._output_handlers.items(), key=lambda x: x[0]):
+             output_tensor = send_contents[output_name]
+
+             match output_info:
+                 case EndStageOutput():
+                     pass
+                 case SendStageOutput():
+                     peer_rank = self._stage_idx_to_host_rank[output_info.to_stage]
+                     peer_global_rank = dist.get_global_rank(self._group, peer_rank)
+                     op = dist.P2POp(dist.isend, output_tensor, peer_global_rank, self._group)
+                     ops.append(op)
+                 case _:
+                     raise ValueError()
+
+         return ops
+
+     def reset(self):
+         """Resets the internal state, specifically clearing gradients on input buffers."""
+
+         for inp_handlers in self._input_handlers.values():
+             for inp_handler in inp_handlers.values():
+                 if isinstance(inp_handler, ReceiveStageInput):
+                     inp_handler.buffer.grad = None
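
As a usage sketch (assumed, not taken from the package): the lists returned by create_receive_ops and create_send_ops contain ordinary torch.distributed.P2POp objects, so a schedule executor would typically batch them with dist.batch_isend_irecv and wait on the returned work handles before reading the receive buffers. The helper below is hypothetical; it assumes an already-constructed StageCommunicationHandler, an initialized process group, and a stage whose inputs all arrive over the network (ReceiveStageInput), since get_inputs raises for local inputs.

    import torch.distributed as dist

    def exchange_microbatch(handler, microbatch_index, outputs_to_send):
        # Build isend ops for this stage's outputs and irecv ops into the
        # pre-allocated input buffers for the given microbatch.
        ops = handler.create_send_ops(outputs_to_send)
        ops += handler.create_receive_ops(microbatch_index)
        if ops:
            # batch_isend_irecv issues all P2P ops together and returns work handles.
            for work in dist.batch_isend_irecv(ops):
                work.wait()
        # The receive buffers are now filled and can be fed to the stage module.
        return handler.get_inputs(microbatch_index)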