d9d-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238)
  1. d9d/__init__.py +0 -0
  2. d9d/core/__init__.py +0 -0
  3. d9d/core/autograd/__init__.py +7 -0
  4. d9d/core/autograd/grad_context.py +85 -0
  5. d9d/core/dist_context/__init__.py +19 -0
  6. d9d/core/dist_context/configured.py +215 -0
  7. d9d/core/dist_context/device_mesh_domains.py +185 -0
  8. d9d/core/dist_context/log.py +30 -0
  9. d9d/core/dist_context/params.py +113 -0
  10. d9d/core/dist_ops/__init__.py +16 -0
  11. d9d/core/dist_ops/object.py +68 -0
  12. d9d/core/dist_ops/tensor.py +192 -0
  13. d9d/core/protocol/__init__.py +8 -0
  14. d9d/core/protocol/training.py +38 -0
  15. d9d/core/sharding/__init__.py +15 -0
  16. d9d/core/sharding/auto_spec.py +66 -0
  17. d9d/core/sharding/shard.py +154 -0
  18. d9d/core/sharding/spec.py +28 -0
  19. d9d/core/sharding/unshard.py +117 -0
  20. d9d/core/types/__init__.py +12 -0
  21. d9d/core/types/data.py +14 -0
  22. d9d/core/types/pytree.py +26 -0
  23. d9d/dataset/__init__.py +17 -0
  24. d9d/dataset/buffer_sorted.py +143 -0
  25. d9d/dataset/padding.py +79 -0
  26. d9d/dataset/sharded.py +195 -0
  27. d9d/internals/__init__.py +0 -0
  28. d9d/internals/determinism/__init__.py +10 -0
  29. d9d/internals/determinism/seed.py +63 -0
  30. d9d/internals/grad_norm/__init__.py +8 -0
  31. d9d/internals/grad_norm/group.py +87 -0
  32. d9d/internals/grad_norm/norm.py +169 -0
  33. d9d/internals/grad_sync/__init__.py +14 -0
  34. d9d/internals/grad_sync/bucket.py +317 -0
  35. d9d/internals/grad_sync/placement_helper.py +23 -0
  36. d9d/internals/grad_sync/synchronizer.py +257 -0
  37. d9d/internals/pipeline_state/__init__.py +14 -0
  38. d9d/internals/pipeline_state/api.py +45 -0
  39. d9d/internals/pipeline_state/handler.py +111 -0
  40. d9d/internals/pipeline_state/storage.py +236 -0
  41. d9d/internals/profiling/__init__.py +7 -0
  42. d9d/internals/profiling/profile.py +112 -0
  43. d9d/internals/state/__init__.py +6 -0
  44. d9d/internals/state/main_process.py +44 -0
  45. d9d/kernel/__init__.py +0 -0
  46. d9d/kernel/cce/__init__.py +5 -0
  47. d9d/kernel/cce/cce.py +298 -0
  48. d9d/kernel/cce/main.py +282 -0
  49. d9d/kernel/general/__init__.py +5 -0
  50. d9d/kernel/general/get_int_dtype.py +7 -0
  51. d9d/kernel/gmm/__init__.py +5 -0
  52. d9d/kernel/gmm/function.py +78 -0
  53. d9d/kernel/moe/__init__.py +8 -0
  54. d9d/kernel/moe/indices_to_multihot.py +268 -0
  55. d9d/kernel/moe/permute_with_probs.py +1035 -0
  56. d9d/kernel/stochastic/__init__.py +11 -0
  57. d9d/kernel/stochastic/adamw_step.py +204 -0
  58. d9d/kernel/stochastic/copy.py +104 -0
  59. d9d/kernel/stochastic/ops/__init__.py +5 -0
  60. d9d/kernel/stochastic/ops/round.py +22 -0
  61. d9d/kernel/swiglu/__init__.py +5 -0
  62. d9d/kernel/swiglu/function.py +36 -0
  63. d9d/kernel/swiglu/op.py +167 -0
  64. d9d/loop/__init__.py +0 -0
  65. d9d/loop/auto/__init__.py +9 -0
  66. d9d/loop/auto/auto_lr_scheduler.py +46 -0
  67. d9d/loop/auto/auto_optimizer.py +196 -0
  68. d9d/loop/component/__init__.py +35 -0
  69. d9d/loop/component/batch_maths.py +106 -0
  70. d9d/loop/component/checkpointer.py +172 -0
  71. d9d/loop/component/data_loader_factory.py +258 -0
  72. d9d/loop/component/garbage_collector.py +94 -0
  73. d9d/loop/component/gradient_clipper.py +89 -0
  74. d9d/loop/component/gradient_manager.py +149 -0
  75. d9d/loop/component/job_logger.py +146 -0
  76. d9d/loop/component/job_profiler.py +62 -0
  77. d9d/loop/component/loss_computer.py +86 -0
  78. d9d/loop/component/model_stage_exporter.py +37 -0
  79. d9d/loop/component/model_stage_factory.py +261 -0
  80. d9d/loop/component/optimizer_factory.py +88 -0
  81. d9d/loop/component/stepper.py +52 -0
  82. d9d/loop/component/timeout_manager.py +54 -0
  83. d9d/loop/component/train_task_operator.py +152 -0
  84. d9d/loop/config/__init__.py +36 -0
  85. d9d/loop/config/config.py +225 -0
  86. d9d/loop/config/types.py +24 -0
  87. d9d/loop/control/__init__.py +61 -0
  88. d9d/loop/control/dataset_provider.py +58 -0
  89. d9d/loop/control/lr_scheduler_provider.py +47 -0
  90. d9d/loop/control/model_provider.py +162 -0
  91. d9d/loop/control/optimizer_provider.py +45 -0
  92. d9d/loop/control/task.py +304 -0
  93. d9d/loop/run/__init__.py +6 -0
  94. d9d/loop/run/train.py +355 -0
  95. d9d/loop/state.py +143 -0
  96. d9d/lr_scheduler/__init__.py +9 -0
  97. d9d/lr_scheduler/piecewise/__init__.py +18 -0
  98. d9d/lr_scheduler/piecewise/builder.py +152 -0
  99. d9d/lr_scheduler/piecewise/config.py +176 -0
  100. d9d/lr_scheduler/piecewise/curves.py +75 -0
  101. d9d/lr_scheduler/piecewise/engine.py +76 -0
  102. d9d/lr_scheduler/visualizer.py +74 -0
  103. d9d/metric/__init__.py +10 -0
  104. d9d/metric/abc.py +79 -0
  105. d9d/metric/impl/__init__.py +7 -0
  106. d9d/metric/impl/compose.py +54 -0
  107. d9d/metric/impl/mean.py +94 -0
  108. d9d/model_state/__init__.py +0 -0
  109. d9d/model_state/io/__init__.py +21 -0
  110. d9d/model_state/io/dto.py +30 -0
  111. d9d/model_state/io/module_reader.py +75 -0
  112. d9d/model_state/io/module_writer.py +123 -0
  113. d9d/model_state/io/reader.py +125 -0
  114. d9d/model_state/io/writer.py +309 -0
  115. d9d/model_state/mapper/__init__.py +10 -0
  116. d9d/model_state/mapper/abc.py +70 -0
  117. d9d/model_state/mapper/adapters/__init__.py +12 -0
  118. d9d/model_state/mapper/adapters/mapper.py +27 -0
  119. d9d/model_state/mapper/adapters/module.py +22 -0
  120. d9d/model_state/mapper/compose/__init__.py +17 -0
  121. d9d/model_state/mapper/compose/helper.py +22 -0
  122. d9d/model_state/mapper/compose/parallel.py +58 -0
  123. d9d/model_state/mapper/compose/sequential.py +131 -0
  124. d9d/model_state/mapper/compose/shard.py +36 -0
  125. d9d/model_state/mapper/leaf/__init__.py +18 -0
  126. d9d/model_state/mapper/leaf/dtensor.py +56 -0
  127. d9d/model_state/mapper/leaf/identity.py +23 -0
  128. d9d/model_state/mapper/leaf/rename.py +26 -0
  129. d9d/model_state/mapper/leaf/select_child.py +37 -0
  130. d9d/model_state/mapper/leaf/stack.py +29 -0
  131. d9d/module/__init__.py +0 -0
  132. d9d/module/base/__init__.py +7 -0
  133. d9d/module/base/late_init.py +10 -0
  134. d9d/module/block/__init__.py +0 -0
  135. d9d/module/block/attention/__init__.py +7 -0
  136. d9d/module/block/attention/grouped_query.py +139 -0
  137. d9d/module/block/attention/sdpa/__init__.py +5 -0
  138. d9d/module/block/attention/sdpa/flash.py +52 -0
  139. d9d/module/block/embedding/__init__.py +7 -0
  140. d9d/module/block/embedding/shard_token_embedding.py +103 -0
  141. d9d/module/block/ffn/__init__.py +5 -0
  142. d9d/module/block/ffn/swiglu.py +60 -0
  143. d9d/module/block/head/__init__.py +6 -0
  144. d9d/module/block/head/language_modelling.py +87 -0
  145. d9d/module/block/hidden_states_aggregator/__init__.py +12 -0
  146. d9d/module/block/hidden_states_aggregator/base.py +35 -0
  147. d9d/module/block/hidden_states_aggregator/factory.py +48 -0
  148. d9d/module/block/hidden_states_aggregator/mean.py +61 -0
  149. d9d/module/block/hidden_states_aggregator/noop.py +27 -0
  150. d9d/module/block/moe/__init__.py +13 -0
  151. d9d/module/block/moe/communications/__init__.py +11 -0
  152. d9d/module/block/moe/communications/base.py +58 -0
  153. d9d/module/block/moe/communications/deepep.py +300 -0
  154. d9d/module/block/moe/communications/naive.py +68 -0
  155. d9d/module/block/moe/grouped_experts.py +81 -0
  156. d9d/module/block/moe/grouped_linear.py +78 -0
  157. d9d/module/block/moe/layer.py +122 -0
  158. d9d/module/block/moe/router.py +103 -0
  159. d9d/module/block/positional/__init__.py +8 -0
  160. d9d/module/block/positional/rope.py +150 -0
  161. d9d/module/model/__init__.py +0 -0
  162. d9d/module/model/qwen3_moe/__init__.py +16 -0
  163. d9d/module/model/qwen3_moe/decoder_layer.py +110 -0
  164. d9d/module/model/qwen3_moe/model.py +373 -0
  165. d9d/module/model/qwen3_moe/params.py +69 -0
  166. d9d/module/parallelism/__init__.py +0 -0
  167. d9d/module/parallelism/api/__init__.py +18 -0
  168. d9d/module/parallelism/api/expert_parallel.py +36 -0
  169. d9d/module/parallelism/api/fully_sharded.py +43 -0
  170. d9d/module/parallelism/api/hybrid_sharded.py +49 -0
  171. d9d/module/parallelism/api/replicate_parallel.py +33 -0
  172. d9d/module/parallelism/model/__init__.py +0 -0
  173. d9d/module/parallelism/model/qwen3_moe.py +99 -0
  174. d9d/module/parallelism/style/__init__.py +7 -0
  175. d9d/module/parallelism/style/shard_experts.py +60 -0
  176. d9d/module/parallelism/style/to_local.py +86 -0
  177. d9d/optim/__init__.py +0 -0
  178. d9d/optim/stochastic/__init__.py +5 -0
  179. d9d/optim/stochastic/adamw.py +158 -0
  180. d9d/peft/__init__.py +13 -0
  181. d9d/peft/all/__init__.py +12 -0
  182. d9d/peft/all/config.py +31 -0
  183. d9d/peft/all/method.py +76 -0
  184. d9d/peft/applicator.py +47 -0
  185. d9d/peft/base.py +70 -0
  186. d9d/peft/full_tune/__init__.py +11 -0
  187. d9d/peft/full_tune/config.py +20 -0
  188. d9d/peft/full_tune/method.py +46 -0
  189. d9d/peft/lora/__init__.py +15 -0
  190. d9d/peft/lora/config.py +35 -0
  191. d9d/peft/lora/layer.py +177 -0
  192. d9d/peft/lora/method.py +132 -0
  193. d9d/pipelining/__init__.py +0 -0
  194. d9d/pipelining/api/__init__.py +19 -0
  195. d9d/pipelining/api/module.py +149 -0
  196. d9d/pipelining/api/schedule.py +50 -0
  197. d9d/pipelining/api/sharding.py +9 -0
  198. d9d/pipelining/factory/__init__.py +21 -0
  199. d9d/pipelining/factory/config.py +89 -0
  200. d9d/pipelining/factory/factory.py +114 -0
  201. d9d/pipelining/factory/registry.py +82 -0
  202. d9d/pipelining/infra/__init__.py +0 -0
  203. d9d/pipelining/infra/schedule/__init__.py +0 -0
  204. d9d/pipelining/infra/schedule/component/__init__.py +0 -0
  205. d9d/pipelining/infra/schedule/component/program/__init__.py +22 -0
  206. d9d/pipelining/infra/schedule/component/program/base.py +35 -0
  207. d9d/pipelining/infra/schedule/component/program/communications.py +203 -0
  208. d9d/pipelining/infra/schedule/component/program/topology.py +78 -0
  209. d9d/pipelining/infra/schedule/component/runtime/__init__.py +29 -0
  210. d9d/pipelining/infra/schedule/component/runtime/action.py +361 -0
  211. d9d/pipelining/infra/schedule/component/runtime/communications.py +101 -0
  212. d9d/pipelining/infra/schedule/component/runtime/executor.py +113 -0
  213. d9d/pipelining/infra/schedule/component/runtime/loss.py +55 -0
  214. d9d/pipelining/infra/schedule/program/__init__.py +15 -0
  215. d9d/pipelining/infra/schedule/program/bfs.py +86 -0
  216. d9d/pipelining/infra/schedule/program/dualpipev.py +234 -0
  217. d9d/pipelining/infra/schedule/program/interleaved.py +240 -0
  218. d9d/pipelining/infra/schedule/program/zerobubblev.py +227 -0
  219. d9d/pipelining/infra/stage/__init__.py +5 -0
  220. d9d/pipelining/infra/stage/communications.py +274 -0
  221. d9d/pipelining/infra/stage/computations.py +317 -0
  222. d9d/pipelining/infra/stage/splitgrad.py +377 -0
  223. d9d/pipelining/infra/stage/stage.py +321 -0
  224. d9d/pipelining/infra/stage/struct_helper.py +46 -0
  225. d9d/pipelining/training/__init__.py +7 -0
  226. d9d/pipelining/training/optimizer.py +41 -0
  227. d9d/pipelining/training/scheduler.py +34 -0
  228. d9d/tracker/__init__.py +14 -0
  229. d9d/tracker/base.py +124 -0
  230. d9d/tracker/factory.py +57 -0
  231. d9d/tracker/provider/__init__.py +0 -0
  232. d9d/tracker/provider/aim/__init__.py +0 -0
  233. d9d/tracker/provider/aim/config.py +23 -0
  234. d9d/tracker/provider/aim/tracker.py +114 -0
  235. d9d/tracker/provider/null.py +61 -0
  236. d9d-0.1.0.dist-info/METADATA +90 -0
  237. d9d-0.1.0.dist-info/RECORD +238 -0
  238. d9d-0.1.0.dist-info/WHEEL +4 -0
d9d/pipelining/infra/stage/computations.py
@@ -0,0 +1,317 @@
+ import dataclasses
+ from collections.abc import Iterator, Mapping, Sequence
+ from typing import Any, cast
+
+ import torch
+ from torch import nn
+ from torch.autograd.graph import Node
+
+ from .splitgrad import (
+     ParamGroup,
+     stage_backward_full,
+     stage_backward_input,
+     stage_backward_weight,
+ )
+ from .struct_helper import DictFlattener
+
+ # TODO/NOTICE: We WILL NOT disable FSDP's resharding for microbatches since it will modify
+ # TODO/NOTICE: its behavior in an unexpected way. Perhaps we need better FSDP resharding policy handler?
+
+
+ @dataclasses.dataclass(slots=True)
+ class ForwardCache:
+     """
+     Stores the inputs and outputs of a forward pass to be used later in the backward pass.
+     """
+
+     inputs: dict[str, torch.Tensor]
+     outputs: dict[str, torch.Tensor]
+
+
+ class ForwardComputeHandler:
+     """
+     Handles the execution of the forward pass for a pipeline stage module.
+
+     Maintains a cache of inputs and outputs indexed by microbatch ID.
+     """
+
+     def __init__(
+         self,
+         stage_index: int,
+         module: nn.Module
+     ):
+         """
+         Constructs a ForwardComputeHandler object.
+
+         Args:
+             stage_index: Logical index of the stage.
+             module: The PyTorch module representing this stage computation.
+         """
+
+         self._stage_idx = stage_index
+         self._module = module
+
+         self._cache: dict[int, ForwardCache] = {}
+
+     def run(
+         self,
+         microbatch_index: int,
+         inputs: dict[str, torch.Tensor],
+         kwargs: dict[str, Any]
+     ):
+         """
+         Executes the module's forward pass.
+
+         Args:
+             microbatch_index: Identifier for the current microbatch.
+             inputs: Dictionary of input tensors.
+             kwargs: Additional keyword arguments for the module.
+
+         Returns:
+             None. The filtered outputs are cached and can be retrieved via `get_outputs`.
+
+         Raises:
+             RuntimeError: If the forward pass implementation fails.
+         """
+
+         # Compute forward
+         try:
+             output = self._module(**inputs, **kwargs)
+         except Exception as e:
+             raise RuntimeError(f"S{self._stage_idx}B{microbatch_index} failed to run forward") from e
+
+         if not isinstance(output, Mapping):
+             raise ValueError("Currently, pipelined models should output dict[str, torch.Tensor | None]")
+
+         output = {k: v for k, v in output.items() if v is not None}
+
+         self._cache[microbatch_index] = ForwardCache(
+             inputs=inputs,
+             outputs=output
+         )
+
+     def get_outputs(self, microbatch_index: int) -> dict[str, torch.Tensor]:
+         """
+         Retrieves cached outputs for a specific microbatch without removing them.
+
+         Args:
+             microbatch_index: Identifier for the microbatch.
+
+         Returns:
+             Dictionary of output tensors.
+         """
+
+         return self._cache[microbatch_index].outputs
+
+     def pop_inputs_outputs(
+         self, microbatch_index: int
+     ) -> tuple[dict[str, torch.Tensor], dict[str, torch.Tensor]]:
+         """
+         Retrieves and removes the cached inputs and outputs for a specific microbatch.
+
+         Typically called when initiating the backward pass.
+
+         Args:
+             microbatch_index: Identifier for the microbatch.
+
+         Returns:
+             A tuple containing (inputs, outputs).
+         """
+
+         cache = self._cache.pop(microbatch_index)
+         return cache.inputs, cache.outputs
+
+
+ @dataclasses.dataclass(kw_only=True, slots=True)
+ class BackwardCacheInputForWeight:
+     """
+     State preserved after calculating input gradients, pending weight gradient calculation.
+     """
+
+     inputs_grad: dict[str, torch.Tensor]
+     param_groups: list[ParamGroup]
+     ownership_tokens: list[Node]
+
+
+ @dataclasses.dataclass(kw_only=True, slots=True)
+ class BackwardCacheInputForFull:
+     stage_outputs_or_loss: list[torch.Tensor]
+     output_grads: list[torch.Tensor] | None
+     input_values: list[torch.Tensor]
+
+
+ @dataclasses.dataclass(kw_only=True, slots=True)
+ class BackwardCacheFull:
+     """
+     State preserved after calculating weight gradients.
+     """
+
+     inputs_grad: dict[str, torch.Tensor | None]
+
+
+ class BackwardComputeHandler:
+     """
+     Handles the execution of backward passes for a pipeline stage.
+
+     Supports splitting the backward pass into input-gradients and weight-gradients
+     phases, which is necessary for schedules like ZB.
+     """
+
+     def __init__(
+         self,
+         stage_index: int,
+         module: nn.Module
+     ):
+         """
+         Constructs a BackwardComputeHandler object.
+
+         Args:
+             stage_index: Logical index of the stage.
+             module: The PyTorch module to compute gradients for.
+         """
+
+         self._stage_idx = stage_index
+         self._module = module
+
+         self._cache: dict[int, BackwardCacheInputForWeight | BackwardCacheInputForFull | BackwardCacheFull] = {}
+
+     def _parameters_with_grad(self) -> Iterator[nn.Parameter]:
+         return (param for param in self._module.parameters() if param.requires_grad)
+
+     def backward_full(
+         self,
+         microbatch_index: int,
+         inputs: dict[str, torch.Tensor],
+         outputs: dict[str, torch.Tensor],
+         outputs_grad: dict[str, torch.Tensor] | None,
+     ):
+         """
+         Performs a full backward pass (both inputs and weights).
+
+         Args:
+             microbatch_index: Identifier for the microbatch.
+             inputs: The inputs used in the forward pass.
+             outputs: The outputs produced by the forward pass.
+             outputs_grad: Gradients of the loss with respect to the outputs.
+         """
+
+         if microbatch_index in self._cache:
+             raise ValueError(f"S{self._stage_idx}B{microbatch_index} double backward")
+
+         inputs_flattener = DictFlattener(inputs.keys())
+         outputs_flattener = DictFlattener(outputs.keys())
+
+         inputs_grad_linear = stage_backward_full(
+             outputs=outputs_flattener.flatten(outputs),
+             output_grads=outputs_flattener.flatten(outputs_grad) if outputs_grad is not None else None,
+             inputs=inputs_flattener.flatten(inputs)
+         )
+
+         if self._stage_idx != 0:
+             self._cache[microbatch_index] = BackwardCacheFull(
+                 inputs_grad=inputs_flattener.unflatten(inputs_grad_linear)
+             )
+
+     def backward_input(
+         self,
+         microbatch_index: int,
+         inputs: dict[str, torch.Tensor],
+         outputs: dict[str, torch.Tensor],
+         outputs_grad: dict[str, torch.Tensor] | None
+     ):
+         """
+         Performs a partial backward pass to compute gradients with respect to inputs only.
+
+         This prepares the computation state for a subsequent `backward_weight` call.
+
+         Args:
+             microbatch_index: Identifier for the microbatch.
+             inputs: The inputs used in the forward pass.
+             outputs: The outputs produced by the forward pass.
+             outputs_grad: Gradients of the loss with respect to the outputs.
+         """
+
+         if microbatch_index in self._cache:
+             raise ValueError("Double backward pass")
+
+         inputs_flattener = DictFlattener(inputs.keys())
+         outputs_flattener = DictFlattener(outputs.keys())
+
+         if self._stage_idx == 0:
+             self._cache[microbatch_index] = BackwardCacheInputForFull(
+                 stage_outputs_or_loss=outputs_flattener.flatten(outputs),
+                 output_grads=outputs_flattener.flatten(outputs_grad) if outputs_grad is not None else None,
+                 input_values=inputs_flattener.flatten(inputs)
+             )
+         else:
+             results = stage_backward_input(
+                 outputs=outputs_flattener.flatten(outputs),
+                 output_grads=outputs_flattener.flatten(outputs_grad) if outputs_grad is not None else None,
+                 inputs=inputs_flattener.flatten(inputs),
+                 weights=self._parameters_with_grad()
+             )
+
+             self._cache[microbatch_index] = BackwardCacheInputForWeight(
+                 inputs_grad=inputs_flattener.unflatten(cast(Sequence[torch.Tensor], results.input_grads)),
+                 param_groups=results.param_groups,
+                 ownership_tokens=results.grad_ownership_tokens
+             )
+
+     def backward_weight(
+         self,
+         microbatch_index: int
+     ):
+         """
+         Performs a partial backward pass to accumulate gradients into weights.
+
+         Must be preceded by `backward_input` for the same microbatch index.
+
+         Args:
+             microbatch_index: Identifier for the microbatch.
+         """
+
+         if microbatch_index not in self._cache:
+             raise ValueError(f"S{self._stage_idx}BW{microbatch_index} - weight backward with no input backward before")
+
+         prev_cache = self._cache.pop(microbatch_index)
+
+         match prev_cache:
+             case BackwardCacheInputForFull():
+                 stage_backward_full(
+                     outputs=prev_cache.stage_outputs_or_loss,
+                     output_grads=prev_cache.output_grads,
+                     inputs=prev_cache.input_values
+                 )
+             case BackwardCacheInputForWeight():
+                 stage_backward_weight(
+                     weights=self._parameters_with_grad(),
+                     param_groups=prev_cache.param_groups
+                 )
+             case _:
+                 raise ValueError("Previous backward was not input backward")
+
+     def pop_for_sending(self, microbatch_index: int) -> dict[str, torch.Tensor]:
+         """
+         Retrieves the calculated input gradients for a microbatch.
+
+         Args:
+             microbatch_index: Identifier for the microbatch.
+
+         Returns:
+             Dictionary of gradient tensors.
+         """
+         cached = self._cache[microbatch_index]
+
+         match cached:
+             case BackwardCacheFull():
+                 del self._cache[microbatch_index]
+             case BackwardCacheInputForWeight():
+                 pass
+             case _:
+                 raise ValueError("You should call either backward_full or backward_input before popping cached grad")
+
+         for grad_value in cached.inputs_grad.values():
+             if grad_value is None:
+                 raise ValueError("Cannot pop null gradient for sending! Perhaps malformed schedule?")
+
+         return cast(dict[str, torch.Tensor], cached.inputs_grad)
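
The two handlers above give a schedule executor a per-microbatch cache: `run` stores the forward inputs and outputs, `pop_inputs_outputs` hands them back when that microbatch's backward is issued, `pop_for_sending` yields the input gradients to ship to the previous stage, and `backward_weight` finishes the deferred weight gradients. The following sketch is not part of the packaged sources; it drives the handlers by hand for one microbatch on a non-first stage, using a made-up `ToyStage` module and shapes, roughly the way a schedule runtime would.

# Illustrative sketch only -- not shipped in the d9d 0.1.0 wheel. Assumes the
# package is installed and the handlers can be driven outside a real schedule.
import torch
from torch import nn

from d9d.pipelining.infra.stage.computations import (
    BackwardComputeHandler,
    ForwardComputeHandler,
)


class ToyStage(nn.Module):  # hypothetical stage module for the example
    def __init__(self) -> None:
        super().__init__()
        self.proj = nn.Linear(8, 8)

    def forward(self, hidden: torch.Tensor) -> dict[str, torch.Tensor]:
        # Pipelined modules must return a dict[str, Tensor | None].
        return {"hidden": self.proj(hidden)}


stage_idx, mb_idx = 1, 0  # non-first stage, so input grads are cached for sending
module = ToyStage()
fwd = ForwardComputeHandler(stage_index=stage_idx, module=module)
bwd = BackwardComputeHandler(stage_index=stage_idx, module=module)

# Forward: activations arriving from the previous stage must require grad.
inputs = {"hidden": torch.randn(4, 8, requires_grad=True)}
fwd.run(mb_idx, inputs=inputs, kwargs={})

# Backward, split into the two phases used by zero-bubble style schedules.
cached_inputs, cached_outputs = fwd.pop_inputs_outputs(mb_idx)
grads_from_next_stage = {"hidden": torch.ones(4, 8)}

bwd.backward_input(mb_idx, cached_inputs, cached_outputs, grads_from_next_stage)
send_grads = bwd.pop_for_sending(mb_idx)  # dL/d(hidden) for the previous stage
bwd.backward_weight(mb_idx)               # accumulates .grad on module parameters

print(send_grads["hidden"].shape, module.proj.weight.grad is not None)

Note the ordering: `pop_for_sending` is called before `backward_weight`, mirroring how a schedule sends input gradients upstream as early as possible while the weight-gradient work is deferred.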
d9d/pipelining/infra/stage/splitgrad.py
@@ -0,0 +1,377 @@
+ from collections import defaultdict, deque
+ from collections.abc import Callable, Iterator
+ from dataclasses import dataclass
+ from typing import Any, cast
+
+ import torch
+ from torch import nn
+ from torch.autograd.graph import GradientEdge, Node
+
+ from d9d.core.autograd import GLOBAL_GRAD_CONTEXT, GradDirection
+
+
+ def stage_backward_full(
+     outputs: list[torch.Tensor],
+     output_grads: list[torch.Tensor] | None,
+     inputs: list[torch.Tensor]
+ ) -> list[torch.Tensor | None]:
+     """
+     Performs a standard, full backward pass for a pipeline stage.
+
+     This function computes gradients for the inputs based on the gradients
+     received for the outputs.
+
+     Args:
+         outputs: The output tensors of the forward pass.
+         output_grads: The gradients arriving from the next pipeline stage corresponding
+             to `outputs`. If None, an implicit gradient of ones is assumed (scalar loss case).
+         inputs: The input tensors to the forward pass for which gradients are required.
+
+     Returns:
+         A list of gradients corresponding to `inputs`. If an input does not require
+         gradient, its entry will be None.
+     """
+
+     with GLOBAL_GRAD_CONTEXT.with_directions(GradDirection.inputs, GradDirection.weight):
+         torch.autograd.backward(
+             tensors=outputs,
+             grad_tensors=output_grads
+         )
+
+     input_grads = []
+     for input_item in inputs:
+         input_grads.append(input_item.grad)
+         input_item.grad = None
+     return input_grads
+
+
+ @dataclass
+ class ParamGroup:
+     """
+     Represents a group of parameters and their dependency intermediates in the autograd graph.
+
+     This structure is used to manage the split backward pass, identifying which
+     intermediate nodes in the graph allow gradients to flow to specific sets of parameters.
+
+     Attributes:
+         params: Set of autograd Nodes representing the parameters.
+         intermediates: List of autograd Nodes serving as entry points for gradients
+             flowing to these parameters.
+         grads: Storage for captured gradients at the intermediate nodes during
+             the input backward phase.
+     """
+
+     params: set[Node]
+     intermediates: list[Node] | None
+     grads: list[torch.Tensor | None] | None = None
+
+
+ def _get_grad_fn_or_grad_acc(t: torch.Tensor) -> Node | None:
+     if t.requires_grad and t.grad_fn is None:
+         # hack from pytorch codebase to create accumulation op
+         viewed_t = t.view_as(t)
+         grad_fn = viewed_t.grad_fn
+         grad_fn = cast(Node, grad_fn)
+         return grad_fn.next_functions[0][0]
+     else:
+         return t.grad_fn
+
+
+ def _construct_reverse_graph(roots: list[Node]) -> dict[Node, list[Node]]:
+     """
+     Builds a reverse adjacency list (Input -> Output) via BFS from the roots.
+
+     Standard autograd graphs point from Output -> Input (next_functions).
+     This helper provides the reverse mapping to assist in dependency analysis.
+
+     Args:
+         roots: The starting nodes for the graph traversal.
+
+     Returns:
+         A dictionary mapping a node to a list of its dependent (child) nodes.
+     """
+     reverse_graph = defaultdict(list)
+     valid_roots = {x for x in roots if x is not None}
+     to_visit = deque(valid_roots)
+     visited = set(valid_roots)
+
+     while to_visit:
+         current_node = to_visit.popleft()
+         for parent_node, _ in current_node.next_functions:
+             if parent_node is None:
+                 continue
+             reverse_graph[parent_node].append(current_node)
+             if parent_node not in visited:
+                 visited.add(parent_node)
+                 to_visit.append(parent_node)
+
+     return reverse_graph
+
+
+ def _reverse_closure(
+     roots: list[Node], target_nodes: set[Node], reverse_edges_dict: dict[Node, list[Node]]
+ ) -> tuple[set[Node], set[Node]]:
+     """
+     Computes a closure of nodes reachable from roots in the reverse graph.
+
+     Args:
+         roots: Starting nodes.
+         target_nodes: Nodes that act as boundaries/targets for the search.
+         reverse_edges_dict: The reverse graph adjacency list.
+
+     Returns:
+         A tuple containing the set of all closure nodes and the set of visited target nodes.
+     """
+
+     closure: set[Node] = set()
+     visited_target_nodes = set()
+     to_visit: deque[Node] = deque()
+
+     for node in roots:
+         if node is not None and node not in closure:
+             closure.add(node)
+             to_visit.append(node)
+
+     while to_visit:
+         node = to_visit.popleft()
+         reverse_edges = reverse_edges_dict[node]
+         for fn in reverse_edges:
+             if fn in closure or fn is None:
+                 continue
+             if fn in target_nodes:
+                 visited_target_nodes.add(fn)
+                 continue
+             closure.add(fn)
+             to_visit.append(fn)
+
+     return closure, visited_target_nodes
+
+
+ def _get_param_groups(
+     inputs: list[Node], params: list[Node], reverse_edges_dict: dict[Node, list[Node]]
+ ) -> list[ParamGroup]:
+     """
+     Clusters parameters based on their dependencies on inputs.
+
+     This function identifies how gradients propagate from inputs through intermediates
+     to parameters, grouping them to facilitate split backward execution.
+
+     Args:
+         inputs: Gradient functions of the input tensors.
+         params: Gradient functions of the parameter tensors.
+         reverse_edges_dict: The reverse autograd graph.
+
+     Returns:
+         A list of distinct parameter groups.
+     """
+
+     inputs_closure, _ = _reverse_closure(inputs, set(), reverse_edges_dict)
+
+     node_to_group_map: dict[Node, dict[str, set[Node]]] = {}
+
+     for param in params:
+         _, intersected_inputs = _reverse_closure(
+             [param], inputs_closure, reverse_edges_dict
+         )
+
+         current_dict = {
+             "params": {param},
+             "intermediates": intersected_inputs
+         }
+
+         target_dict = None
+         for intermediate_node in intersected_inputs:
+             if intermediate_node in node_to_group_map:
+                 target_dict = node_to_group_map[intermediate_node]
+                 break
+
+         if target_dict is not None:
+             target_dict["params"].update(current_dict["params"])
+             target_dict["intermediates"].update(current_dict["intermediates"])
+             current_dict = target_dict
+
+         for intermediate_node in current_dict["intermediates"]:
+             node_to_group_map[intermediate_node] = current_dict
+
+     # Deduplicate and Convert to Dataclass
+     unique_groups = []
+     seen_ids = set()
+     for group_dict in node_to_group_map.values():
+         if id(group_dict) not in seen_ids:
+             seen_ids.add(id(group_dict))
+             unique_groups.append(ParamGroup(
+                 params=group_dict["params"],
+                 intermediates=list(group_dict["intermediates"])
+             ))
+
+     return unique_groups
+
+
+ def _make_capture_hook(group: ParamGroup, idx: int) -> Callable[[torch.Tensor], None]:
+     def _hook(grad_in: torch.Tensor):
+         # Lazy init gradients list
+         if group.grads is None and group.intermediates is not None:
+             group.grads = [None] * len(group.intermediates)
+
+         if group.grads is not None:
+             group.grads[idx] = grad_in
+
+     return _hook
+
+
+ @dataclass
+ class BackwardInputResult:
+     """
+     Container for the results of the input backward phase.
+
+     Attributes:
+         input_grads: The gradients computed for the input tensors.
+         param_groups: The parameter groups with hooks established to capture
+             weight gradients in the subsequent phase.
+         grad_ownership_tokens: References to tensors keeping the computation
+             graph alive for the weight backward phase.
+     """
+
+     input_grads: list[torch.Tensor | None]
+     param_groups: list[ParamGroup]
+     grad_ownership_tokens: list[Any]
+
+
+ def stage_backward_input(
+     outputs: list[torch.Tensor],
+     output_grads: list[torch.Tensor] | None,
+     inputs: list[torch.Tensor],
+     weights: Iterator[nn.Parameter],
+ ) -> BackwardInputResult:
+     """
+     Performs the first phase of a split backward pass: Input Gradients.
+
+     This function computes the gradients with respect to `inputs` while postponing
+     the computation of gradients with respect to `weights`. It analyzes the
+     autograd graph to identify intermediate nodes where gradients destined for
+     weights split off from the main flow. Hooks are registered at these
+     intermediates to capture gradients for the second phase (`stage_backward_weight`).
+
+     Args:
+         outputs: The output tensors of the forward pass.
+         output_grads: The gradients arriving for the outputs.
+         inputs: The input tensors from the forward pass.
+         weights: An iterator over the model parameters (weights).
+
+     Returns:
+         A result object containing input gradients, prepared parameter groups,
+         and ownership tokens to maintain graph validity.
+     """
+
+     outputs_grad_fn = [grad_fn for x in outputs if (grad_fn := _get_grad_fn_or_grad_acc(x)) is not None]
+     inputs_grad_fn = [grad_fn for x in inputs if (grad_fn := _get_grad_fn_or_grad_acc(x)) is not None]
+     weights_grad_fn = [grad_fn for x in weights if (grad_fn := _get_grad_fn_or_grad_acc(x)) is not None]
+
+     reverse_edges = _construct_reverse_graph(outputs_grad_fn)
+     param_groups = _get_param_groups(inputs_grad_fn, weights_grad_fn, reverse_edges)
+
+     hook_handles = []
+
+     for group in param_groups:
+         if group.intermediates:
+             for i, node in enumerate(group.intermediates):
+                 hook_handles.append(node.register_prehook(_make_capture_hook(group, i)))
+
+     if output_grads is None:
+         output_grads = [torch.ones_like(o) for o in outputs]
+
+     inputs_requiring_grad = [inp for inp in inputs if inp.requires_grad]
+
+     with GLOBAL_GRAD_CONTEXT.with_directions(GradDirection.inputs):
+         torch.autograd.backward(
+             tensors=outputs,
+             grad_tensors=output_grads,
+             inputs=inputs_requiring_grad,
+             retain_graph=True,
+         )
+
+     final_input_grads = []
+
+     # 6. Cleanup
+     for input_item in inputs:
+         final_input_grads.append(input_item.grad)
+         input_item.grad = None
+
+     for handle in hook_handles:
+         handle.remove()
+
+     return BackwardInputResult(
+         input_grads=final_input_grads,
+         param_groups=param_groups,
+         # TODO(max): we can keep only intermediate ownership tokens to both truncate the
+         # TODO(max): graph and do not deallocate C++ stuff
+         grad_ownership_tokens=outputs  # Keep the tensors alive!
+     )
+
+
+ def stage_backward_weight(  # noqa: C901
+     weights: Iterator[nn.Parameter],
+     param_groups: list[ParamGroup],
+     retain_graph: bool = False
+ ) -> tuple[torch.Tensor | None, ...]:
+     """
+     Performs the second phase of a split backward pass: Weight Gradients.
+
+     This function consumes the gradients captured in the `ParamGroup`s during
+     `stage_backward_input` to compute the final gradients for the model weights.
+     It triggers backward passes starting from the intermediate nodes identified previously.
+
+     Args:
+         weights: An iterator over the model parameters to extract gradients for.
+         param_groups: The list of groups containing captured intermediate gradients.
+         retain_graph: Whether to retain the graph after this backward pass.
+
+     Returns:
+         A tuple of gradients corresponding to the provided `weights`.
+     """
+
+     grad_acc_to_weight = {}
+     all_weights = []  # Keep order
+
+     for weight in weights:
+         all_weights.append(weight)
+         grad_acc = _get_grad_fn_or_grad_acc(weight)
+         if grad_acc is not None:
+             grad_acc_to_weight[grad_acc] = weight
+
+     for group in param_groups:
+         valid_edges = []
+         valid_grad_outputs: list[torch.Tensor] = []
+
+         # Ensure we have data
+         if group.grads and group.intermediates:
+             for grads_tuple, intermediate in zip(group.grads, group.intermediates, strict=True):
+                 if grads_tuple is None:
+                     raise ValueError("Trying to do backward_weight with no intermediate grads")
+                 non_none = [g for g in grads_tuple if g is not None]
+                 if len(non_none) > 0:
+                     valid_edges.append(GradientEdge(intermediate, 0))
+                     valid_grad_outputs.append(cast(torch.Tensor, sum(non_none)))
+
+         # Break Cycle: Intermediates
+         group.intermediates = None
+
+         if valid_edges:
+             inputs_for_backward = []
+             for node in group.params:
+                 if node in grad_acc_to_weight:
+                     inputs_for_backward.append(grad_acc_to_weight[node])
+
+             if inputs_for_backward:
+                 with GLOBAL_GRAD_CONTEXT.with_directions(GradDirection.weight):
+                     torch.autograd.backward(
+                         tensors=valid_edges,
+                         grad_tensors=valid_grad_outputs,
+                         retain_graph=retain_graph,
+                         inputs=inputs_for_backward
+                     )
+
+         # Break Cycle: Grads
+         group.grads = None
+
+     return tuple(w.grad for w in all_weights)
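
`stage_backward_input` and `stage_backward_weight` split one logical backward into two calls: the first produces only the input gradients (what the previous stage is waiting for) and captures the gradients arriving at the split-off intermediate nodes inside `ParamGroup`s; the second replays from those intermediates to accumulate parameter gradients, which is what lets zero-bubble style schedules reorder the weight work. Below is a minimal sketch of that contract, not taken from the package; it assumes `d9d` is importable outside a configured training loop and uses a single `nn.Linear` with made-up shapes standing in for a stage.

# Illustrative sketch only -- not shipped in the d9d 0.1.0 wheel.
import torch
from torch import nn

from d9d.pipelining.infra.stage.splitgrad import (
    stage_backward_input,
    stage_backward_weight,
)

layer = nn.Linear(16, 16)
x = torch.randn(2, 16, requires_grad=True)  # activation received from the previous stage
y = layer(x)
dy = torch.ones_like(y)                     # gradient arriving from the next stage

# Phase 1: input gradients only. Parameter grads are not produced yet;
# gradients at the split points are captured inside the returned param groups.
result = stage_backward_input(
    outputs=[y],
    output_grads=[dy],
    inputs=[x],
    weights=iter([layer.weight, layer.bias]),
)
assert result.input_grads[0] is not None
assert layer.weight.grad is None            # weight grads are deferred

# Phase 2 (possibly much later in the schedule): weight gradients.
stage_backward_weight(
    weights=iter([layer.weight, layer.bias]),
    param_groups=result.param_groups,
)
assert layer.weight.grad is not None and layer.bias.grad is not None

Keeping `result` (and with it the `grad_ownership_tokens`) alive between the two calls is what preserves the autograd graph for the deferred weight phase.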