d9d-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238)
  1. d9d/__init__.py +0 -0
  2. d9d/core/__init__.py +0 -0
  3. d9d/core/autograd/__init__.py +7 -0
  4. d9d/core/autograd/grad_context.py +85 -0
  5. d9d/core/dist_context/__init__.py +19 -0
  6. d9d/core/dist_context/configured.py +215 -0
  7. d9d/core/dist_context/device_mesh_domains.py +185 -0
  8. d9d/core/dist_context/log.py +30 -0
  9. d9d/core/dist_context/params.py +113 -0
  10. d9d/core/dist_ops/__init__.py +16 -0
  11. d9d/core/dist_ops/object.py +68 -0
  12. d9d/core/dist_ops/tensor.py +192 -0
  13. d9d/core/protocol/__init__.py +8 -0
  14. d9d/core/protocol/training.py +38 -0
  15. d9d/core/sharding/__init__.py +15 -0
  16. d9d/core/sharding/auto_spec.py +66 -0
  17. d9d/core/sharding/shard.py +154 -0
  18. d9d/core/sharding/spec.py +28 -0
  19. d9d/core/sharding/unshard.py +117 -0
  20. d9d/core/types/__init__.py +12 -0
  21. d9d/core/types/data.py +14 -0
  22. d9d/core/types/pytree.py +26 -0
  23. d9d/dataset/__init__.py +17 -0
  24. d9d/dataset/buffer_sorted.py +143 -0
  25. d9d/dataset/padding.py +79 -0
  26. d9d/dataset/sharded.py +195 -0
  27. d9d/internals/__init__.py +0 -0
  28. d9d/internals/determinism/__init__.py +10 -0
  29. d9d/internals/determinism/seed.py +63 -0
  30. d9d/internals/grad_norm/__init__.py +8 -0
  31. d9d/internals/grad_norm/group.py +87 -0
  32. d9d/internals/grad_norm/norm.py +169 -0
  33. d9d/internals/grad_sync/__init__.py +14 -0
  34. d9d/internals/grad_sync/bucket.py +317 -0
  35. d9d/internals/grad_sync/placement_helper.py +23 -0
  36. d9d/internals/grad_sync/synchronizer.py +257 -0
  37. d9d/internals/pipeline_state/__init__.py +14 -0
  38. d9d/internals/pipeline_state/api.py +45 -0
  39. d9d/internals/pipeline_state/handler.py +111 -0
  40. d9d/internals/pipeline_state/storage.py +236 -0
  41. d9d/internals/profiling/__init__.py +7 -0
  42. d9d/internals/profiling/profile.py +112 -0
  43. d9d/internals/state/__init__.py +6 -0
  44. d9d/internals/state/main_process.py +44 -0
  45. d9d/kernel/__init__.py +0 -0
  46. d9d/kernel/cce/__init__.py +5 -0
  47. d9d/kernel/cce/cce.py +298 -0
  48. d9d/kernel/cce/main.py +282 -0
  49. d9d/kernel/general/__init__.py +5 -0
  50. d9d/kernel/general/get_int_dtype.py +7 -0
  51. d9d/kernel/gmm/__init__.py +5 -0
  52. d9d/kernel/gmm/function.py +78 -0
  53. d9d/kernel/moe/__init__.py +8 -0
  54. d9d/kernel/moe/indices_to_multihot.py +268 -0
  55. d9d/kernel/moe/permute_with_probs.py +1035 -0
  56. d9d/kernel/stochastic/__init__.py +11 -0
  57. d9d/kernel/stochastic/adamw_step.py +204 -0
  58. d9d/kernel/stochastic/copy.py +104 -0
  59. d9d/kernel/stochastic/ops/__init__.py +5 -0
  60. d9d/kernel/stochastic/ops/round.py +22 -0
  61. d9d/kernel/swiglu/__init__.py +5 -0
  62. d9d/kernel/swiglu/function.py +36 -0
  63. d9d/kernel/swiglu/op.py +167 -0
  64. d9d/loop/__init__.py +0 -0
  65. d9d/loop/auto/__init__.py +9 -0
  66. d9d/loop/auto/auto_lr_scheduler.py +46 -0
  67. d9d/loop/auto/auto_optimizer.py +196 -0
  68. d9d/loop/component/__init__.py +35 -0
  69. d9d/loop/component/batch_maths.py +106 -0
  70. d9d/loop/component/checkpointer.py +172 -0
  71. d9d/loop/component/data_loader_factory.py +258 -0
  72. d9d/loop/component/garbage_collector.py +94 -0
  73. d9d/loop/component/gradient_clipper.py +89 -0
  74. d9d/loop/component/gradient_manager.py +149 -0
  75. d9d/loop/component/job_logger.py +146 -0
  76. d9d/loop/component/job_profiler.py +62 -0
  77. d9d/loop/component/loss_computer.py +86 -0
  78. d9d/loop/component/model_stage_exporter.py +37 -0
  79. d9d/loop/component/model_stage_factory.py +261 -0
  80. d9d/loop/component/optimizer_factory.py +88 -0
  81. d9d/loop/component/stepper.py +52 -0
  82. d9d/loop/component/timeout_manager.py +54 -0
  83. d9d/loop/component/train_task_operator.py +152 -0
  84. d9d/loop/config/__init__.py +36 -0
  85. d9d/loop/config/config.py +225 -0
  86. d9d/loop/config/types.py +24 -0
  87. d9d/loop/control/__init__.py +61 -0
  88. d9d/loop/control/dataset_provider.py +58 -0
  89. d9d/loop/control/lr_scheduler_provider.py +47 -0
  90. d9d/loop/control/model_provider.py +162 -0
  91. d9d/loop/control/optimizer_provider.py +45 -0
  92. d9d/loop/control/task.py +304 -0
  93. d9d/loop/run/__init__.py +6 -0
  94. d9d/loop/run/train.py +355 -0
  95. d9d/loop/state.py +143 -0
  96. d9d/lr_scheduler/__init__.py +9 -0
  97. d9d/lr_scheduler/piecewise/__init__.py +18 -0
  98. d9d/lr_scheduler/piecewise/builder.py +152 -0
  99. d9d/lr_scheduler/piecewise/config.py +176 -0
  100. d9d/lr_scheduler/piecewise/curves.py +75 -0
  101. d9d/lr_scheduler/piecewise/engine.py +76 -0
  102. d9d/lr_scheduler/visualizer.py +74 -0
  103. d9d/metric/__init__.py +10 -0
  104. d9d/metric/abc.py +79 -0
  105. d9d/metric/impl/__init__.py +7 -0
  106. d9d/metric/impl/compose.py +54 -0
  107. d9d/metric/impl/mean.py +94 -0
  108. d9d/model_state/__init__.py +0 -0
  109. d9d/model_state/io/__init__.py +21 -0
  110. d9d/model_state/io/dto.py +30 -0
  111. d9d/model_state/io/module_reader.py +75 -0
  112. d9d/model_state/io/module_writer.py +123 -0
  113. d9d/model_state/io/reader.py +125 -0
  114. d9d/model_state/io/writer.py +309 -0
  115. d9d/model_state/mapper/__init__.py +10 -0
  116. d9d/model_state/mapper/abc.py +70 -0
  117. d9d/model_state/mapper/adapters/__init__.py +12 -0
  118. d9d/model_state/mapper/adapters/mapper.py +27 -0
  119. d9d/model_state/mapper/adapters/module.py +22 -0
  120. d9d/model_state/mapper/compose/__init__.py +17 -0
  121. d9d/model_state/mapper/compose/helper.py +22 -0
  122. d9d/model_state/mapper/compose/parallel.py +58 -0
  123. d9d/model_state/mapper/compose/sequential.py +131 -0
  124. d9d/model_state/mapper/compose/shard.py +36 -0
  125. d9d/model_state/mapper/leaf/__init__.py +18 -0
  126. d9d/model_state/mapper/leaf/dtensor.py +56 -0
  127. d9d/model_state/mapper/leaf/identity.py +23 -0
  128. d9d/model_state/mapper/leaf/rename.py +26 -0
  129. d9d/model_state/mapper/leaf/select_child.py +37 -0
  130. d9d/model_state/mapper/leaf/stack.py +29 -0
  131. d9d/module/__init__.py +0 -0
  132. d9d/module/base/__init__.py +7 -0
  133. d9d/module/base/late_init.py +10 -0
  134. d9d/module/block/__init__.py +0 -0
  135. d9d/module/block/attention/__init__.py +7 -0
  136. d9d/module/block/attention/grouped_query.py +139 -0
  137. d9d/module/block/attention/sdpa/__init__.py +5 -0
  138. d9d/module/block/attention/sdpa/flash.py +52 -0
  139. d9d/module/block/embedding/__init__.py +7 -0
  140. d9d/module/block/embedding/shard_token_embedding.py +103 -0
  141. d9d/module/block/ffn/__init__.py +5 -0
  142. d9d/module/block/ffn/swiglu.py +60 -0
  143. d9d/module/block/head/__init__.py +6 -0
  144. d9d/module/block/head/language_modelling.py +87 -0
  145. d9d/module/block/hidden_states_aggregator/__init__.py +12 -0
  146. d9d/module/block/hidden_states_aggregator/base.py +35 -0
  147. d9d/module/block/hidden_states_aggregator/factory.py +48 -0
  148. d9d/module/block/hidden_states_aggregator/mean.py +61 -0
  149. d9d/module/block/hidden_states_aggregator/noop.py +27 -0
  150. d9d/module/block/moe/__init__.py +13 -0
  151. d9d/module/block/moe/communications/__init__.py +11 -0
  152. d9d/module/block/moe/communications/base.py +58 -0
  153. d9d/module/block/moe/communications/deepep.py +300 -0
  154. d9d/module/block/moe/communications/naive.py +68 -0
  155. d9d/module/block/moe/grouped_experts.py +81 -0
  156. d9d/module/block/moe/grouped_linear.py +78 -0
  157. d9d/module/block/moe/layer.py +122 -0
  158. d9d/module/block/moe/router.py +103 -0
  159. d9d/module/block/positional/__init__.py +8 -0
  160. d9d/module/block/positional/rope.py +150 -0
  161. d9d/module/model/__init__.py +0 -0
  162. d9d/module/model/qwen3_moe/__init__.py +16 -0
  163. d9d/module/model/qwen3_moe/decoder_layer.py +110 -0
  164. d9d/module/model/qwen3_moe/model.py +373 -0
  165. d9d/module/model/qwen3_moe/params.py +69 -0
  166. d9d/module/parallelism/__init__.py +0 -0
  167. d9d/module/parallelism/api/__init__.py +18 -0
  168. d9d/module/parallelism/api/expert_parallel.py +36 -0
  169. d9d/module/parallelism/api/fully_sharded.py +43 -0
  170. d9d/module/parallelism/api/hybrid_sharded.py +49 -0
  171. d9d/module/parallelism/api/replicate_parallel.py +33 -0
  172. d9d/module/parallelism/model/__init__.py +0 -0
  173. d9d/module/parallelism/model/qwen3_moe.py +99 -0
  174. d9d/module/parallelism/style/__init__.py +7 -0
  175. d9d/module/parallelism/style/shard_experts.py +60 -0
  176. d9d/module/parallelism/style/to_local.py +86 -0
  177. d9d/optim/__init__.py +0 -0
  178. d9d/optim/stochastic/__init__.py +5 -0
  179. d9d/optim/stochastic/adamw.py +158 -0
  180. d9d/peft/__init__.py +13 -0
  181. d9d/peft/all/__init__.py +12 -0
  182. d9d/peft/all/config.py +31 -0
  183. d9d/peft/all/method.py +76 -0
  184. d9d/peft/applicator.py +47 -0
  185. d9d/peft/base.py +70 -0
  186. d9d/peft/full_tune/__init__.py +11 -0
  187. d9d/peft/full_tune/config.py +20 -0
  188. d9d/peft/full_tune/method.py +46 -0
  189. d9d/peft/lora/__init__.py +15 -0
  190. d9d/peft/lora/config.py +35 -0
  191. d9d/peft/lora/layer.py +177 -0
  192. d9d/peft/lora/method.py +132 -0
  193. d9d/pipelining/__init__.py +0 -0
  194. d9d/pipelining/api/__init__.py +19 -0
  195. d9d/pipelining/api/module.py +149 -0
  196. d9d/pipelining/api/schedule.py +50 -0
  197. d9d/pipelining/api/sharding.py +9 -0
  198. d9d/pipelining/factory/__init__.py +21 -0
  199. d9d/pipelining/factory/config.py +89 -0
  200. d9d/pipelining/factory/factory.py +114 -0
  201. d9d/pipelining/factory/registry.py +82 -0
  202. d9d/pipelining/infra/__init__.py +0 -0
  203. d9d/pipelining/infra/schedule/__init__.py +0 -0
  204. d9d/pipelining/infra/schedule/component/__init__.py +0 -0
  205. d9d/pipelining/infra/schedule/component/program/__init__.py +22 -0
  206. d9d/pipelining/infra/schedule/component/program/base.py +35 -0
  207. d9d/pipelining/infra/schedule/component/program/communications.py +203 -0
  208. d9d/pipelining/infra/schedule/component/program/topology.py +78 -0
  209. d9d/pipelining/infra/schedule/component/runtime/__init__.py +29 -0
  210. d9d/pipelining/infra/schedule/component/runtime/action.py +361 -0
  211. d9d/pipelining/infra/schedule/component/runtime/communications.py +101 -0
  212. d9d/pipelining/infra/schedule/component/runtime/executor.py +113 -0
  213. d9d/pipelining/infra/schedule/component/runtime/loss.py +55 -0
  214. d9d/pipelining/infra/schedule/program/__init__.py +15 -0
  215. d9d/pipelining/infra/schedule/program/bfs.py +86 -0
  216. d9d/pipelining/infra/schedule/program/dualpipev.py +234 -0
  217. d9d/pipelining/infra/schedule/program/interleaved.py +240 -0
  218. d9d/pipelining/infra/schedule/program/zerobubblev.py +227 -0
  219. d9d/pipelining/infra/stage/__init__.py +5 -0
  220. d9d/pipelining/infra/stage/communications.py +274 -0
  221. d9d/pipelining/infra/stage/computations.py +317 -0
  222. d9d/pipelining/infra/stage/splitgrad.py +377 -0
  223. d9d/pipelining/infra/stage/stage.py +321 -0
  224. d9d/pipelining/infra/stage/struct_helper.py +46 -0
  225. d9d/pipelining/training/__init__.py +7 -0
  226. d9d/pipelining/training/optimizer.py +41 -0
  227. d9d/pipelining/training/scheduler.py +34 -0
  228. d9d/tracker/__init__.py +14 -0
  229. d9d/tracker/base.py +124 -0
  230. d9d/tracker/factory.py +57 -0
  231. d9d/tracker/provider/__init__.py +0 -0
  232. d9d/tracker/provider/aim/__init__.py +0 -0
  233. d9d/tracker/provider/aim/config.py +23 -0
  234. d9d/tracker/provider/aim/tracker.py +114 -0
  235. d9d/tracker/provider/null.py +61 -0
  236. d9d-0.1.0.dist-info/METADATA +90 -0
  237. d9d-0.1.0.dist-info/RECORD +238 -0
  238. d9d-0.1.0.dist-info/WHEEL +4 -0
d9d/kernel/cce/main.py ADDED
@@ -0,0 +1,282 @@
+# Copyright (C) 2024 Apple Inc. All Rights Reserved.
+# TODO: currently this implementation diverges only in out_grad contiguity fix
+# TODO: proposed in cce.py (grep FIX) - we should contribute this to main repo
+import platform
+import warnings
+from typing import TYPE_CHECKING, Literal, overload
+
+import torch
+import torch.nn as nn
+
+from cut_cross_entropy.cce_utils import CCEPreset, CCEPresets, LinearCrossEntropyImpl
+from cut_cross_entropy.constants import IGNORE_INDEX
+from cut_cross_entropy.doc import (
+    CCE_OPTS_DOC,
+    DTENSOR_NOTE,
+    IMPL_DOC,
+    LINEAR_CROSS_ENTROPY_DOC,
+    add_doc_end,
+    add_doc_start,
+)
+from cut_cross_entropy.torch_compile import torch_compile_linear_cross_entropy
+from cut_cross_entropy.utils import (
+    CCEWarning,
+    is_torch_greater_or_equal_2_5,
+    is_triton_3_2,
+    maybe_type_as,
+    to_full_tensor,
+)
+from cut_cross_entropy.vocab_parallel import VocabParallelOptions
+
+warnings.filterwarnings("once", category=CCEWarning, module="cut_cross_entropy")
+
+PLATFORM_SYSTEM = platform.system()
+
+if TYPE_CHECKING or PLATFORM_SYSTEM != "Darwin":
+    from .cce import cce_linear_cross_entropy
+
+    LCE_IMPL_DEFAULT = LinearCrossEntropyImpl.CCE
+else:
+    cce_linear_cross_entropy = None
+    LCE_IMPL_DEFAULT = LinearCrossEntropyImpl.TORCH_COMPILE
+
+if TYPE_CHECKING or is_torch_greater_or_equal_2_5():
+    import torch.distributed.tensor
+
+
+is_d_tensor_error_message = (
+    "Received {name} as a torch.distributed.tensor.DTensor. This is not supported. "
+)
+
+
+@overload
+def linear_cross_entropy(
+    e: torch.Tensor,
+    c: torch.Tensor,
+    targets: torch.Tensor,
+    bias: torch.Tensor | None = None,
+    ignore_index: int = IGNORE_INDEX,
+    softcap: float | None = None,
+    reduction: str = "mean",
+    shift: bool | int = 0,
+    return_lse: Literal[False] = False,
+    filter_eps: float | str | None = "auto",
+    accum_e_fp32: bool = False,
+    accum_c_fp32: bool = False,
+    filter_e_grad: bool = True,
+    filter_c_grad: bool = True,
+    impl: str | LinearCrossEntropyImpl = LCE_IMPL_DEFAULT,
+    vocab_parallel_options: VocabParallelOptions | None = None,
+) -> torch.Tensor: ...
+
+
+@overload
+def linear_cross_entropy(
+    e: torch.Tensor,
+    c: torch.Tensor,
+    targets: torch.Tensor,
+    bias: torch.Tensor | None = None,
+    ignore_index: int = IGNORE_INDEX,
+    softcap: float | None = None,
+    reduction: str = "mean",
+    shift: bool | int = 0,
+    return_lse: Literal[True] = True,
+    filter_eps: float | str | None = "auto",
+    accum_e_fp32: bool = False,
+    accum_c_fp32: bool = False,
+    filter_e_grad: bool = True,
+    filter_c_grad: bool = True,
+    impl: str | LinearCrossEntropyImpl = LCE_IMPL_DEFAULT,
+    vocab_parallel_options: VocabParallelOptions | None = None,
+) -> tuple[torch.Tensor, torch.Tensor]: ...
+
+
+@overload
+def linear_cross_entropy(
+    e: torch.Tensor,
+    c: torch.Tensor,
+    targets: torch.Tensor,
+    bias: torch.Tensor | None = None,
+    ignore_index: int = IGNORE_INDEX,
+    softcap: float | None = None,
+    reduction: str = "mean",
+    shift: bool | int = 0,
+    return_lse: bool = False,
+    filter_eps: float | str | None = "auto",
+    accum_e_fp32: bool = False,
+    accum_c_fp32: bool = False,
+    filter_e_grad: bool = True,
+    filter_c_grad: bool = True,
+    impl: str | LinearCrossEntropyImpl = LCE_IMPL_DEFAULT,
+    vocab_parallel_options: VocabParallelOptions | None = None,
+) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: ...
+
+
+@add_doc_start(LINEAR_CROSS_ENTROPY_DOC)
+@add_doc_start(*(doc_str + " Only valid for the cce implementation." for doc_str in CCE_OPTS_DOC))
+@add_doc_start(IMPL_DOC)
+@add_doc_end(DTENSOR_NOTE)
+def linear_cross_entropy(
+    e: torch.Tensor,
+    c: torch.Tensor,
+    targets: torch.Tensor,
+    bias: torch.Tensor | None = None,
+    ignore_index: int = IGNORE_INDEX,
+    softcap: float | None = None,
+    reduction: str = "mean",
+    shift: bool | int = 0,
+    return_lse: bool = False,
+    filter_eps: float | str | None = "auto",
+    accum_e_fp32: bool = False,
+    accum_c_fp32: bool = False,
+    filter_e_grad: bool = True,
+    filter_c_grad: bool = True,
+    impl: str | LinearCrossEntropyImpl = LCE_IMPL_DEFAULT,
+    vocab_parallel_options: VocabParallelOptions | None = None,
+) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
+    """
+    :param vocab_parallel_options: Used to enable vocab parallelism."""
+
+    if is_torch_greater_or_equal_2_5():
+        maybe_tensor_inputs = dict(e=e, targets=targets)
+        for k, v in maybe_tensor_inputs.items():
+            if isinstance(v, torch.distributed.tensor.DTensor):
+                raise ValueError(is_d_tensor_error_message.format(name=k))
+
+        c = maybe_type_as(to_full_tensor(c), e)
+        bias = maybe_type_as(to_full_tensor(bias), e)
+
+    if isinstance(impl, LinearCrossEntropyImpl):
+        impl = impl.name.lower()
+
+    if isinstance(shift, int) and (shift < 0 or shift >= targets.size(-1)):
+        raise ValueError(f"Shift must be in the range [0, {targets.size(-1)}). Got {shift}.")
+
+    if vocab_parallel_options is not None:
+        expected_v_dim_size = vocab_parallel_options.stop - vocab_parallel_options.start
+        if c.size(0) != expected_v_dim_size:
+            raise ValueError(f"Expected c.size(0) to be {expected_v_dim_size}, got {c.size(0)}.")
+
+    if bias is not None and bias.size(0) != c.size(0):
+        raise ValueError(
+            f"Bias has a different number of elements than c. {bias.size(0)} vs. {c.size(0)}."
+        )
+
+    if impl in CCEPresets.names:
+        if platform.system() == "Darwin":
+            raise RuntimeError(
+                "CCE does not support MacOS. Please use torch_compile when running on MacOS instead."
+            )
+
+        if is_triton_3_2():
+            warnings.warn(
+                "There is a known issue with CCE and Triton 3.2 (the version that ships with PyTorch 2.6)"
+                " that can result in incorrect gradients. If possible, please verify that you"
+                " are not impacted by this bug by trying a newer triton version (i.e. by installing PyTorch>2.6).",
+                CCEWarning,
+                stacklevel=2,
+            )
+
+        cce_opts = CCEPresets.build_for_impl(
+            impl,
+            CCEPreset(
+                filter_eps=filter_eps,
+                accum_e_fp32=accum_e_fp32,
+                accum_c_fp32=accum_c_fp32,
+                filter_e_grad=filter_e_grad,
+                filter_c_grad=filter_c_grad,
+            ),
+        )
+
+        assert cce_linear_cross_entropy is not None
+        loss, lse = cce_linear_cross_entropy(
+            e,
+            c,
+            targets,
+            bias,
+            ignore_index,
+            softcap,
+            reduction,
+            shift,
+            **cce_opts,
+            vocab_parallel_options=vocab_parallel_options,
+            return_lse=return_lse,
+        )
+    elif impl == "torch_compile":
+        loss, lse = torch_compile_linear_cross_entropy(
+            e,
+            c,
+            targets,
+            bias,
+            ignore_index,
+            softcap,
+            reduction,
+            shift,
+            vocab_parallel_options=vocab_parallel_options,
+            return_lse=return_lse,
+        )
+    else:
+        raise NotImplementedError(f"{impl} is not implemented.")
+
+    if return_lse:
+        assert lse is not None
+        return loss, lse
+    else:
+        return loss
+
+
+class LinearCrossEntropy(nn.Module):
+    def __init__(
+        self,
+        ignore_index: int = IGNORE_INDEX,
+        softcap: float | None = None,
+        reduction: str = "mean",
+        shift: bool | int = 0,
+        filter_eps: float | str | None = "auto",
+        accum_e_fp32: bool = False,
+        accum_c_fp32: bool = False,
+        filter_e_grad: bool = True,
+        filter_c_grad: bool = True,
+        impl: str | LinearCrossEntropyImpl = LCE_IMPL_DEFAULT,
+        return_lse: bool = False,
+    ):
+        super().__init__()
+        self.ignore_index = ignore_index
+        self.softcap = softcap
+        self.reduction = reduction
+        self.filter_eps = filter_eps
+        self.shift = shift
+
+        self.accum_e_fp32 = accum_e_fp32
+        self.accum_c_fp32 = accum_c_fp32
+
+        self.filter_e_grad = filter_e_grad
+        self.filter_c_grad = filter_c_grad
+
+        self.impl = impl
+        self.return_lse = return_lse
+
+    def forward(
+        self,
+        e: torch.Tensor,
+        c: torch.Tensor,
+        targets: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
+        return linear_cross_entropy(
+            e,
+            c,
+            targets,
+            bias=bias,
+            ignore_index=self.ignore_index,
+            softcap=self.softcap,
+            reduction=self.reduction,
+            shift=self.shift,
+            filter_eps=self.filter_eps,
+            accum_e_fp32=self.accum_e_fp32,
+            accum_c_fp32=self.accum_c_fp32,
+            filter_e_grad=self.filter_e_grad,
+            filter_c_grad=self.filter_c_grad,
+            impl=self.impl,
+            return_lse=self.return_lse,
+        )
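For orientation, a minimal usage sketch of the linear_cross_entropy entry point defined above. The shapes, the bfloat16/CUDA placement and the import path d9d.kernel.cce.main are illustrative assumptions; the diff itself only shows the module source.

import torch
from d9d.kernel.cce.main import linear_cross_entropy

# Assumed toy shapes: 8 tokens, hidden size 128, vocabulary of 1000 classes.
e = torch.randn(8, 128, device="cuda", dtype=torch.bfloat16, requires_grad=True)     # embeddings
c = torch.randn(1000, 128, device="cuda", dtype=torch.bfloat16, requires_grad=True)  # classifier weight
targets = torch.randint(0, 1000, (8,), device="cuda")

# Fused linear + cross-entropy loss, without materialising the full logits matrix.
loss = linear_cross_entropy(e, c, targets, reduction="mean")
loss.backward()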
d9d/kernel/general/__init__.py ADDED
@@ -0,0 +1,5 @@
+from .get_int_dtype import get_int_dtype
+
+__all__ = [
+    "get_int_dtype"
+]
d9d/kernel/general/get_int_dtype.py ADDED
@@ -0,0 +1,7 @@
+import triton
+import triton.language as tl
+
+
+@triton.constexpr_function
+def get_int_dtype(bitwidth: int, signed: bool) -> tl.dtype:
+    return tl.core.get_int_dtype(bitwidth, signed)
d9d/kernel/gmm/__init__.py ADDED
@@ -0,0 +1,5 @@
+from .function import gmm
+
+__all__ = [
+    "gmm"
+]
d9d/kernel/gmm/function.py ADDED
@@ -0,0 +1,78 @@
+from typing import Any
+
+import torch
+from grouped_gemm import backend
+from torch.autograd import Function
+
+from d9d.core.autograd import GLOBAL_GRAD_CONTEXT, GradDirection
+
+
+class GroupedGemm(Function):
+    """
+    Autograd function for Grouped GEMM (Generalized Matrix Multiplication) with explicit gradient control.
+    """
+
+    @staticmethod
+    def forward(
+        ctx: Any,
+        a: torch.Tensor,
+        b: torch.Tensor,
+        batch_sizes: torch.Tensor,
+        a_grad_direction: GradDirection | None,
+        b_grad_direction: GradDirection | None,
+        trans_b: bool
+    ) -> torch.Tensor:
+        ctx.save_for_backward(a, b, batch_sizes)
+        ctx.a_grad_direction = a_grad_direction
+        ctx.b_grad_direction = b_grad_direction
+        ctx.trans_b = trans_b
+        return backend.gmm(a, b, batch_sizes, trans_a=False, trans_b=trans_b)
+
+    @staticmethod
+    def backward(
+        ctx: Any, grad: torch.Tensor
+    ) -> tuple[torch.Tensor | None, torch.Tensor | None, None, None, None, None]:
+        grad = grad.contiguous()
+        a, b, batch_sizes = ctx.saved_tensors
+        trans_b = ctx.trans_b
+
+        compute_a = GLOBAL_GRAD_CONTEXT.check_direction(ctx.a_grad_direction)
+        compute_b = GLOBAL_GRAD_CONTEXT.check_direction(ctx.b_grad_direction)
+
+        a_grad = None
+        if ctx.needs_input_grad[0] and compute_a:
+            a_grad = backend.gmm(
+                grad, b, batch_sizes, trans_a=False, trans_b=not trans_b)
+
+        b_grad = None
+        if ctx.needs_input_grad[1] and compute_b:
+            lhs, rhs = (grad, a) if trans_b else (a, grad)
+            b_grad = backend.gmm(
+                lhs, rhs, batch_sizes, trans_a=True, trans_b=False)
+        return a_grad, b_grad, None, None, None, None
+
+
+def gmm(
+    a: torch.Tensor,
+    b: torch.Tensor,
+    batch_sizes: torch.Tensor,
+    a_grad_direction: GradDirection | None,
+    b_grad_direction: GradDirection | None,
+    trans_b: bool = False
+) -> torch.Tensor:
+    """
+    The Grouped GEMM (Generalized Matrix Multiplication) function with explicit gradient control.
+
+    Args:
+        a: Left-hand side tensor.
+        b: Right-hand side tensor.
+        batch_sizes: Sizes of batches/groups.
+        a_grad_direction: Gradient category for `a` (e.g., `GradDirection.inputs`).
+        b_grad_direction: Gradient category for `b` (e.g., `GradDirection.weight`).
+        trans_b: Whether to transpose `b`.
+
+    Returns:
+        Result of matrix multiplication.
+    """
+
+    return GroupedGemm.apply(a, b, batch_sizes, a_grad_direction, b_grad_direction, trans_b)
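For orientation, a sketch of how the gmm wrapper above might be invoked, assuming the usual grouped_gemm conventions (2D activations against a 3D stacked weight, CPU int64 batch sizes). GradDirection.inputs and GradDirection.weight are the member names suggested by the docstring example; the exact enum and the GLOBAL_GRAD_CONTEXT defaults are not shown in this diff.

import torch

from d9d.core.autograd import GradDirection
from d9d.kernel.gmm import gmm

# Assumed toy setup: two expert groups holding 3 and 5 tokens, hidden 16 -> 32.
a = torch.randn(8, 16, device="cuda", dtype=torch.bfloat16, requires_grad=True)      # (tokens, k)
b = torch.randn(2, 16, 32, device="cuda", dtype=torch.bfloat16, requires_grad=True)  # (groups, k, n)
batch_sizes = torch.tensor([3, 5], dtype=torch.int64)  # assumed to stay on CPU, per grouped_gemm convention

out = gmm(
    a, b, batch_sizes,
    a_grad_direction=GradDirection.inputs,   # assumed member name, mirroring the docstring
    b_grad_direction=GradDirection.weight,   # assumed member name, mirroring the docstring
)
out.sum().backward()  # gradients flow only for directions enabled in GLOBAL_GRAD_CONTEXT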
d9d/kernel/moe/__init__.py ADDED
@@ -0,0 +1,8 @@
+from .indices_to_multihot import fused_indices_to_multihot
+from .permute_with_probs import moe_permute_with_probs, moe_unpermute_mask
+
+__all__ = [
+    "fused_indices_to_multihot",
+    "moe_permute_with_probs",
+    "moe_unpermute_mask"
+]
d9d/kernel/moe/indices_to_multihot.py ADDED
@@ -0,0 +1,268 @@
+# https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/fusions/fused_indices_converter.py
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+
+import math
+
+import torch
+import triton
+import triton.language as tl
+
+
+# Assign a block to a row([1,topk]), generate a local routing map([1,num_of_local_experts])
+@triton.jit
+def _indices_to_multihot_kernel(
+    indices_ptr,
+    probs_in_indices_ptr,
+    multihot_indices_ptr,  # bool
+    probs_in_multihot_ptr,
+    position_map_ptr,
+    num_of_local_experts: tl.constexpr,
+    num_of_local_experts_next_power_of_2: tl.constexpr,
+    topk: tl.constexpr,
+    topk_next_power_of_2: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+):
+    '''
+    Triton kernel for converting indices to multihot representation.
+
+    Input:
+        indices: [num_of_tokens, topk]
+        probs_in_indices: [num_of_tokens, topk]
+    Output:
+        multihot_indices: [num_of_tokens, num_of_local_experts]
+        probs_in_multihot: [num_of_tokens, num_of_local_experts]
+
+    Assume that topk = 2, num_of_local_experts = 4, num_of_tokens = 2,
+    then the kernel can process the following conversion:
+
+    Input Example:
+        indices = [
+            [0, 1],
+            [1, 2]
+        ]
+        probs_in_indices = [
+            [0.1, 0.2],
+            [0.3, 0.4]
+        ]
+    Output Example:
+        multihot_indices = [
+            [1, 1, -1, -1],
+            [-1, 1, 1, -1]
+        ]
+        probs_in_multihot = [
+            [0.1, 0.2, 0.0, 0.0],
+            [0.0, 0.3, 0.4, 0.0]
+        ]
+    '''
+    # Prepare the [0, topk) row
+    topk_row = tl.arange(0, topk_next_power_of_2)
+    topk_row = tl.where(topk_row < topk, topk_row, -1)
+    topk_row_mask = topk_row != -1
+    # Prepare the [0, num_of_local_experts) row
+    num_exp_row = tl.arange(0, num_of_local_experts_next_power_of_2)
+    num_exp_row = tl.where(num_exp_row < num_of_local_experts, num_exp_row, -1)
+    num_exp_row_mask = num_exp_row != -1
+
+    # Load a [1, topk] row from the indices buffer
+    row_idx = tl.program_id(0)
+    indices_row = tl.load(indices_ptr + row_idx * topk + topk_row, mask=topk_row_mask)
+    indices_row = tl.where(topk_row_mask, indices_row, -1)
+    probs_row = tl.load(probs_in_indices_ptr + row_idx * topk + topk_row, mask=topk_row_mask)
+
+    # Get the position of each index in the indices_row, which is saved for backwards
+    position_row = tl.where(indices_row != -1, topk_row, -1)
+    # Mask of the valid indices
+    mask = (indices_row != -1) & (indices_row < num_of_local_experts)
+
+    row_idx_offset = row_idx * num_of_local_experts
+    # Store to initialize
+    tl.store(multihot_indices_ptr + row_idx_offset + num_exp_row, 0, mask=num_exp_row_mask)
+    tl.store(probs_in_multihot_ptr + row_idx_offset + num_exp_row, 0, mask=num_exp_row_mask)
+    tl.store(position_map_ptr + row_idx_offset + num_exp_row, -1, mask=num_exp_row_mask)
+    # Use barrier to make sure the initialization is done
+    tl.debug_barrier()
+    # Store the indices and probs_in_indices
+    tl.store(multihot_indices_ptr + row_idx_offset + indices_row, 1, mask)
+    tl.store(probs_in_multihot_ptr + row_idx_offset + indices_row, probs_row, mask)
+    # Store the position of the position_row for backwards
+    tl.store(position_map_ptr + row_idx_offset + indices_row, position_row, mask)
+
+
+# Assign a block to a row([1,topk]), generate a probs_indices([1,topk])
+@triton.jit
+def _multihot_to_indices_kernel(
+    probs_in_multihot_ptr,
+    position_map_ptr,
+    probs_indices_ptr,
+    num_of_local_experts: tl.constexpr,
+    num_of_local_experts_next_power_of_2: tl.constexpr,
+    topk: tl.constexpr,
+    topk_next_power_of_2: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+):
+    '''
+    Triton kernel for converting multihot representation to indices.
+
+    Input:
+        probs_in_multihot: [num_of_tokens, num_of_local_experts]
+        position_map: [num_of_tokens, num_of_local_experts]
+    Output:
+        probs_indices: [num_of_tokens, topk]
+
+    Assume that topk = 2, num_of_local_experts = 4, num_of_tokens = 2,
+    then the kernel can process the following conversion:
+
+    Input Example:
+        probs_in_multihot = [
+            [0.7, 0.8, 0.0, 0.0],
+            [0.0, 0.1, 0.9, 0.0]
+        ]
+        position_map = [
+            [1, 1, -1, -1],
+            [-1, 1, 1, -1]
+        ]
+    Output Example:
+        probs_indices = [
+            [0.7, 0.8],
+            [0.1, 0.9]
+        ]
+    '''
+    # Prepare the [0, topk) row
+    topk_row = tl.arange(0, topk_next_power_of_2)
+    topk_row = tl.where(topk_row < topk, topk_row, -1)
+    topk_row_mask = topk_row != -1
+    # Prepare the [0, num_of_local_experts) row
+    num_exp_row = tl.arange(0, num_of_local_experts_next_power_of_2)
+    num_exp_row = tl.where(num_exp_row < num_of_local_experts, num_exp_row, -1)
+    num_exp_row_mask = num_exp_row != -1
+
+    # Load a [1, num_of_local_experts] row from the local routing map
+    row_idx = tl.program_id(0)
+    ptr_offset = row_idx * num_of_local_experts + num_exp_row
+    probs_in_multihot_row = tl.load(probs_in_multihot_ptr + ptr_offset, mask=num_exp_row_mask)
+
+    # Get the original position of the valid value in the indices
+    position_map_row = tl.load(position_map_ptr + ptr_offset, mask=num_exp_row_mask)
+    position_map_row = tl.where(num_exp_row_mask, position_map_row, -1)
+    mask = position_map_row != -1
+
+    # Store to initialize
+    tl.store(probs_indices_ptr + row_idx * topk + topk_row, 0, mask=topk_row_mask)
+    # Use barrier to make sure the initialization is done
+    tl.debug_barrier()
+    # Restore the indices and probs_indices
+    tl.store(probs_indices_ptr + row_idx * topk + position_map_row, probs_in_multihot_row, mask)
+
+
+class IndicesToMultihot(torch.autograd.Function):
+    """Convert moe topk indices to multihot representation.
+
+    This class implements a custom forward and backward propagation
+    operation for efficiently converting indices to multihot
+    representation.
+    It is an experimental feature and may change in future versions.
+    """
+
+    @staticmethod
+    def forward(ctx, indices, probs_indices, num_of_local_experts):
+        '''Forward function for IndicesToMultihot
+
+        Convert indices to multihot representation.
+
+        Args:
+            indices: [num_of_tokens, topk]
+            probs_indices: [num_of_tokens, topk]
+            num_of_local_experts: int
+
+        Returns:
+            multihot_indices: [num_of_tokens, num_of_local_experts]
+            probs_in_multihot: [num_of_tokens, num_of_local_experts]
+        '''
+        num_of_tokens = indices.shape[0]
+        assert (
+            indices.shape == probs_indices.shape
+        ), "indices and probs_indices must have the same shape"
+        topk = indices.shape[1]
+        multihot_indices = torch.empty(
+            (num_of_tokens, num_of_local_experts), dtype=torch.bool, device="cuda"
+        )
+        probs_in_multihot = torch.empty(
+            (num_of_tokens, num_of_local_experts), dtype=probs_indices.dtype, device="cuda"
+        )
+        position_map = torch.empty(
+            (num_of_tokens, num_of_local_experts), dtype=torch.int32, device="cuda"
+        )
+        # Compute the next power of 2 for the topk and num_of_local_experts
+        topk_next_power_of_2 = 2 ** int(math.ceil(math.log2(topk)))
+        num_of_local_experts_next_power_of_2 = 2 ** int(math.ceil(math.log2(num_of_local_experts)))
+        grid = (num_of_tokens,)
+        _indices_to_multihot_kernel[grid](
+            indices,
+            probs_indices,
+            multihot_indices,
+            probs_in_multihot,
+            position_map,
+            num_of_local_experts,
+            num_of_local_experts_next_power_of_2,
+            topk,
+            topk_next_power_of_2,
+            BLOCK_SIZE=32,  # use only 1 warp per block
+            num_warps=1,
+        )
+
+        ctx.save_for_backward(position_map)
+        ctx.num_of_tokens = num_of_tokens
+        ctx.num_of_local_experts = num_of_local_experts
+        ctx.topk = topk
+        return multihot_indices, probs_in_multihot
+
+    @staticmethod
+    def backward(ctx, grad_multihot_indices, grad_probs_in_multihot):
+        '''Backward function for IndicesToMultihot
+
+        Convert multihot probs representation to indices.
+        indices is ignored in the backward function.
+
+        Args:
+            grad_multihot_indices: [num_of_tokens, num_of_local_experts]
+            grad_probs_in_multihot: [num_of_tokens, num_of_local_experts]
+
+        Returns:
+            grad_probs_indices: [num_of_tokens, topk]
+        '''
+        position_map = ctx.saved_tensors[0]
+        num_of_tokens = ctx.num_of_tokens
+        num_of_local_experts = ctx.num_of_local_experts
+        topk = ctx.topk
+
+        # Initialize the gradient of the indices and probs_indices
+        grad_probs_indices = torch.empty(
+            (num_of_tokens, topk), dtype=grad_probs_in_multihot.dtype, device="cuda"
+        )
+        # Compute the next power of 2 for the topk and num_of_local_experts
+        topk_next_power_of_2 = 2 ** int(math.ceil(math.log2(topk)))
+        num_of_local_experts_next_power_of_2 = 2 ** int(math.ceil(math.log2(num_of_local_experts)))
+
+        grid = (num_of_tokens,)
+        _multihot_to_indices_kernel[grid](
+            # if the grad_probs_in_multihot is all-one/all-zero,
+            # overlapping stride will cause error without contiguous()
+            grad_probs_in_multihot.contiguous(),
+            position_map,
+            grad_probs_indices,
+            num_of_local_experts,
+            num_of_local_experts_next_power_of_2,
+            topk,
+            topk_next_power_of_2,
+            BLOCK_SIZE=32,  # use only 1 warp per block
+            num_warps=1,
+        )
+        return None, grad_probs_indices, None, None
+
+
+def fused_indices_to_multihot(indices, probs_indices, num_of_local_experts):
+    """Convert moe topk indices to multihot representation.
+
+    This function is an experimental feature and may change in future versions.
+    """
+    return IndicesToMultihot.apply(indices, probs_indices, num_of_local_experts)
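For orientation, a usage sketch of fused_indices_to_multihot that reuses the worked example from the kernel docstring (two tokens, topk = 2, four local experts). CUDA placement is assumed because the autograd function allocates its outputs with device="cuda"; the int32 index dtype is likewise an assumption.

import torch

from d9d.kernel.moe import fused_indices_to_multihot

# Same routing as the docstring example: two tokens, topk = 2, 4 local experts.
indices = torch.tensor([[0, 1], [1, 2]], dtype=torch.int32, device="cuda")
probs = torch.tensor([[0.1, 0.2], [0.3, 0.4]], device="cuda", requires_grad=True)

multihot, probs_multihot = fused_indices_to_multihot(indices, probs, num_of_local_experts=4)
# multihot       -> per-token mask over the 4 local experts
# probs_multihot -> routing probabilities scattered into the matching expert slots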