d9d-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238)
  1. d9d/__init__.py +0 -0
  2. d9d/core/__init__.py +0 -0
  3. d9d/core/autograd/__init__.py +7 -0
  4. d9d/core/autograd/grad_context.py +85 -0
  5. d9d/core/dist_context/__init__.py +19 -0
  6. d9d/core/dist_context/configured.py +215 -0
  7. d9d/core/dist_context/device_mesh_domains.py +185 -0
  8. d9d/core/dist_context/log.py +30 -0
  9. d9d/core/dist_context/params.py +113 -0
  10. d9d/core/dist_ops/__init__.py +16 -0
  11. d9d/core/dist_ops/object.py +68 -0
  12. d9d/core/dist_ops/tensor.py +192 -0
  13. d9d/core/protocol/__init__.py +8 -0
  14. d9d/core/protocol/training.py +38 -0
  15. d9d/core/sharding/__init__.py +15 -0
  16. d9d/core/sharding/auto_spec.py +66 -0
  17. d9d/core/sharding/shard.py +154 -0
  18. d9d/core/sharding/spec.py +28 -0
  19. d9d/core/sharding/unshard.py +117 -0
  20. d9d/core/types/__init__.py +12 -0
  21. d9d/core/types/data.py +14 -0
  22. d9d/core/types/pytree.py +26 -0
  23. d9d/dataset/__init__.py +17 -0
  24. d9d/dataset/buffer_sorted.py +143 -0
  25. d9d/dataset/padding.py +79 -0
  26. d9d/dataset/sharded.py +195 -0
  27. d9d/internals/__init__.py +0 -0
  28. d9d/internals/determinism/__init__.py +10 -0
  29. d9d/internals/determinism/seed.py +63 -0
  30. d9d/internals/grad_norm/__init__.py +8 -0
  31. d9d/internals/grad_norm/group.py +87 -0
  32. d9d/internals/grad_norm/norm.py +169 -0
  33. d9d/internals/grad_sync/__init__.py +14 -0
  34. d9d/internals/grad_sync/bucket.py +317 -0
  35. d9d/internals/grad_sync/placement_helper.py +23 -0
  36. d9d/internals/grad_sync/synchronizer.py +257 -0
  37. d9d/internals/pipeline_state/__init__.py +14 -0
  38. d9d/internals/pipeline_state/api.py +45 -0
  39. d9d/internals/pipeline_state/handler.py +111 -0
  40. d9d/internals/pipeline_state/storage.py +236 -0
  41. d9d/internals/profiling/__init__.py +7 -0
  42. d9d/internals/profiling/profile.py +112 -0
  43. d9d/internals/state/__init__.py +6 -0
  44. d9d/internals/state/main_process.py +44 -0
  45. d9d/kernel/__init__.py +0 -0
  46. d9d/kernel/cce/__init__.py +5 -0
  47. d9d/kernel/cce/cce.py +298 -0
  48. d9d/kernel/cce/main.py +282 -0
  49. d9d/kernel/general/__init__.py +5 -0
  50. d9d/kernel/general/get_int_dtype.py +7 -0
  51. d9d/kernel/gmm/__init__.py +5 -0
  52. d9d/kernel/gmm/function.py +78 -0
  53. d9d/kernel/moe/__init__.py +8 -0
  54. d9d/kernel/moe/indices_to_multihot.py +268 -0
  55. d9d/kernel/moe/permute_with_probs.py +1035 -0
  56. d9d/kernel/stochastic/__init__.py +11 -0
  57. d9d/kernel/stochastic/adamw_step.py +204 -0
  58. d9d/kernel/stochastic/copy.py +104 -0
  59. d9d/kernel/stochastic/ops/__init__.py +5 -0
  60. d9d/kernel/stochastic/ops/round.py +22 -0
  61. d9d/kernel/swiglu/__init__.py +5 -0
  62. d9d/kernel/swiglu/function.py +36 -0
  63. d9d/kernel/swiglu/op.py +167 -0
  64. d9d/loop/__init__.py +0 -0
  65. d9d/loop/auto/__init__.py +9 -0
  66. d9d/loop/auto/auto_lr_scheduler.py +46 -0
  67. d9d/loop/auto/auto_optimizer.py +196 -0
  68. d9d/loop/component/__init__.py +35 -0
  69. d9d/loop/component/batch_maths.py +106 -0
  70. d9d/loop/component/checkpointer.py +172 -0
  71. d9d/loop/component/data_loader_factory.py +258 -0
  72. d9d/loop/component/garbage_collector.py +94 -0
  73. d9d/loop/component/gradient_clipper.py +89 -0
  74. d9d/loop/component/gradient_manager.py +149 -0
  75. d9d/loop/component/job_logger.py +146 -0
  76. d9d/loop/component/job_profiler.py +62 -0
  77. d9d/loop/component/loss_computer.py +86 -0
  78. d9d/loop/component/model_stage_exporter.py +37 -0
  79. d9d/loop/component/model_stage_factory.py +261 -0
  80. d9d/loop/component/optimizer_factory.py +88 -0
  81. d9d/loop/component/stepper.py +52 -0
  82. d9d/loop/component/timeout_manager.py +54 -0
  83. d9d/loop/component/train_task_operator.py +152 -0
  84. d9d/loop/config/__init__.py +36 -0
  85. d9d/loop/config/config.py +225 -0
  86. d9d/loop/config/types.py +24 -0
  87. d9d/loop/control/__init__.py +61 -0
  88. d9d/loop/control/dataset_provider.py +58 -0
  89. d9d/loop/control/lr_scheduler_provider.py +47 -0
  90. d9d/loop/control/model_provider.py +162 -0
  91. d9d/loop/control/optimizer_provider.py +45 -0
  92. d9d/loop/control/task.py +304 -0
  93. d9d/loop/run/__init__.py +6 -0
  94. d9d/loop/run/train.py +355 -0
  95. d9d/loop/state.py +143 -0
  96. d9d/lr_scheduler/__init__.py +9 -0
  97. d9d/lr_scheduler/piecewise/__init__.py +18 -0
  98. d9d/lr_scheduler/piecewise/builder.py +152 -0
  99. d9d/lr_scheduler/piecewise/config.py +176 -0
  100. d9d/lr_scheduler/piecewise/curves.py +75 -0
  101. d9d/lr_scheduler/piecewise/engine.py +76 -0
  102. d9d/lr_scheduler/visualizer.py +74 -0
  103. d9d/metric/__init__.py +10 -0
  104. d9d/metric/abc.py +79 -0
  105. d9d/metric/impl/__init__.py +7 -0
  106. d9d/metric/impl/compose.py +54 -0
  107. d9d/metric/impl/mean.py +94 -0
  108. d9d/model_state/__init__.py +0 -0
  109. d9d/model_state/io/__init__.py +21 -0
  110. d9d/model_state/io/dto.py +30 -0
  111. d9d/model_state/io/module_reader.py +75 -0
  112. d9d/model_state/io/module_writer.py +123 -0
  113. d9d/model_state/io/reader.py +125 -0
  114. d9d/model_state/io/writer.py +309 -0
  115. d9d/model_state/mapper/__init__.py +10 -0
  116. d9d/model_state/mapper/abc.py +70 -0
  117. d9d/model_state/mapper/adapters/__init__.py +12 -0
  118. d9d/model_state/mapper/adapters/mapper.py +27 -0
  119. d9d/model_state/mapper/adapters/module.py +22 -0
  120. d9d/model_state/mapper/compose/__init__.py +17 -0
  121. d9d/model_state/mapper/compose/helper.py +22 -0
  122. d9d/model_state/mapper/compose/parallel.py +58 -0
  123. d9d/model_state/mapper/compose/sequential.py +131 -0
  124. d9d/model_state/mapper/compose/shard.py +36 -0
  125. d9d/model_state/mapper/leaf/__init__.py +18 -0
  126. d9d/model_state/mapper/leaf/dtensor.py +56 -0
  127. d9d/model_state/mapper/leaf/identity.py +23 -0
  128. d9d/model_state/mapper/leaf/rename.py +26 -0
  129. d9d/model_state/mapper/leaf/select_child.py +37 -0
  130. d9d/model_state/mapper/leaf/stack.py +29 -0
  131. d9d/module/__init__.py +0 -0
  132. d9d/module/base/__init__.py +7 -0
  133. d9d/module/base/late_init.py +10 -0
  134. d9d/module/block/__init__.py +0 -0
  135. d9d/module/block/attention/__init__.py +7 -0
  136. d9d/module/block/attention/grouped_query.py +139 -0
  137. d9d/module/block/attention/sdpa/__init__.py +5 -0
  138. d9d/module/block/attention/sdpa/flash.py +52 -0
  139. d9d/module/block/embedding/__init__.py +7 -0
  140. d9d/module/block/embedding/shard_token_embedding.py +103 -0
  141. d9d/module/block/ffn/__init__.py +5 -0
  142. d9d/module/block/ffn/swiglu.py +60 -0
  143. d9d/module/block/head/__init__.py +6 -0
  144. d9d/module/block/head/language_modelling.py +87 -0
  145. d9d/module/block/hidden_states_aggregator/__init__.py +12 -0
  146. d9d/module/block/hidden_states_aggregator/base.py +35 -0
  147. d9d/module/block/hidden_states_aggregator/factory.py +48 -0
  148. d9d/module/block/hidden_states_aggregator/mean.py +61 -0
  149. d9d/module/block/hidden_states_aggregator/noop.py +27 -0
  150. d9d/module/block/moe/__init__.py +13 -0
  151. d9d/module/block/moe/communications/__init__.py +11 -0
  152. d9d/module/block/moe/communications/base.py +58 -0
  153. d9d/module/block/moe/communications/deepep.py +300 -0
  154. d9d/module/block/moe/communications/naive.py +68 -0
  155. d9d/module/block/moe/grouped_experts.py +81 -0
  156. d9d/module/block/moe/grouped_linear.py +78 -0
  157. d9d/module/block/moe/layer.py +122 -0
  158. d9d/module/block/moe/router.py +103 -0
  159. d9d/module/block/positional/__init__.py +8 -0
  160. d9d/module/block/positional/rope.py +150 -0
  161. d9d/module/model/__init__.py +0 -0
  162. d9d/module/model/qwen3_moe/__init__.py +16 -0
  163. d9d/module/model/qwen3_moe/decoder_layer.py +110 -0
  164. d9d/module/model/qwen3_moe/model.py +373 -0
  165. d9d/module/model/qwen3_moe/params.py +69 -0
  166. d9d/module/parallelism/__init__.py +0 -0
  167. d9d/module/parallelism/api/__init__.py +18 -0
  168. d9d/module/parallelism/api/expert_parallel.py +36 -0
  169. d9d/module/parallelism/api/fully_sharded.py +43 -0
  170. d9d/module/parallelism/api/hybrid_sharded.py +49 -0
  171. d9d/module/parallelism/api/replicate_parallel.py +33 -0
  172. d9d/module/parallelism/model/__init__.py +0 -0
  173. d9d/module/parallelism/model/qwen3_moe.py +99 -0
  174. d9d/module/parallelism/style/__init__.py +7 -0
  175. d9d/module/parallelism/style/shard_experts.py +60 -0
  176. d9d/module/parallelism/style/to_local.py +86 -0
  177. d9d/optim/__init__.py +0 -0
  178. d9d/optim/stochastic/__init__.py +5 -0
  179. d9d/optim/stochastic/adamw.py +158 -0
  180. d9d/peft/__init__.py +13 -0
  181. d9d/peft/all/__init__.py +12 -0
  182. d9d/peft/all/config.py +31 -0
  183. d9d/peft/all/method.py +76 -0
  184. d9d/peft/applicator.py +47 -0
  185. d9d/peft/base.py +70 -0
  186. d9d/peft/full_tune/__init__.py +11 -0
  187. d9d/peft/full_tune/config.py +20 -0
  188. d9d/peft/full_tune/method.py +46 -0
  189. d9d/peft/lora/__init__.py +15 -0
  190. d9d/peft/lora/config.py +35 -0
  191. d9d/peft/lora/layer.py +177 -0
  192. d9d/peft/lora/method.py +132 -0
  193. d9d/pipelining/__init__.py +0 -0
  194. d9d/pipelining/api/__init__.py +19 -0
  195. d9d/pipelining/api/module.py +149 -0
  196. d9d/pipelining/api/schedule.py +50 -0
  197. d9d/pipelining/api/sharding.py +9 -0
  198. d9d/pipelining/factory/__init__.py +21 -0
  199. d9d/pipelining/factory/config.py +89 -0
  200. d9d/pipelining/factory/factory.py +114 -0
  201. d9d/pipelining/factory/registry.py +82 -0
  202. d9d/pipelining/infra/__init__.py +0 -0
  203. d9d/pipelining/infra/schedule/__init__.py +0 -0
  204. d9d/pipelining/infra/schedule/component/__init__.py +0 -0
  205. d9d/pipelining/infra/schedule/component/program/__init__.py +22 -0
  206. d9d/pipelining/infra/schedule/component/program/base.py +35 -0
  207. d9d/pipelining/infra/schedule/component/program/communications.py +203 -0
  208. d9d/pipelining/infra/schedule/component/program/topology.py +78 -0
  209. d9d/pipelining/infra/schedule/component/runtime/__init__.py +29 -0
  210. d9d/pipelining/infra/schedule/component/runtime/action.py +361 -0
  211. d9d/pipelining/infra/schedule/component/runtime/communications.py +101 -0
  212. d9d/pipelining/infra/schedule/component/runtime/executor.py +113 -0
  213. d9d/pipelining/infra/schedule/component/runtime/loss.py +55 -0
  214. d9d/pipelining/infra/schedule/program/__init__.py +15 -0
  215. d9d/pipelining/infra/schedule/program/bfs.py +86 -0
  216. d9d/pipelining/infra/schedule/program/dualpipev.py +234 -0
  217. d9d/pipelining/infra/schedule/program/interleaved.py +240 -0
  218. d9d/pipelining/infra/schedule/program/zerobubblev.py +227 -0
  219. d9d/pipelining/infra/stage/__init__.py +5 -0
  220. d9d/pipelining/infra/stage/communications.py +274 -0
  221. d9d/pipelining/infra/stage/computations.py +317 -0
  222. d9d/pipelining/infra/stage/splitgrad.py +377 -0
  223. d9d/pipelining/infra/stage/stage.py +321 -0
  224. d9d/pipelining/infra/stage/struct_helper.py +46 -0
  225. d9d/pipelining/training/__init__.py +7 -0
  226. d9d/pipelining/training/optimizer.py +41 -0
  227. d9d/pipelining/training/scheduler.py +34 -0
  228. d9d/tracker/__init__.py +14 -0
  229. d9d/tracker/base.py +124 -0
  230. d9d/tracker/factory.py +57 -0
  231. d9d/tracker/provider/__init__.py +0 -0
  232. d9d/tracker/provider/aim/__init__.py +0 -0
  233. d9d/tracker/provider/aim/config.py +23 -0
  234. d9d/tracker/provider/aim/tracker.py +114 -0
  235. d9d/tracker/provider/null.py +61 -0
  236. d9d-0.1.0.dist-info/METADATA +90 -0
  237. d9d-0.1.0.dist-info/RECORD +238 -0
  238. d9d-0.1.0.dist-info/WHEEL +4 -0
d9d/module/block/attention/grouped_query.py
@@ -0,0 +1,139 @@
+ import torch
+ from torch import nn
+
+ from d9d.module.base import ModuleLateInit
+ from d9d.module.block.attention.sdpa import FlashSdpa
+ from d9d.module.block.positional import RotaryEmbeddingApplicator
+
+
+ class GroupedQueryAttention(nn.Module, ModuleLateInit):
+     """
+     Implements Grouped Query Attention (GQA) with RoPE and optional QK Normalization.
+
+     This module performs the full attention mechanism pipeline:
+     1. Linear projection to Q, K, V.
+     2. Optional RMS Normalization on Q and K.
+     3. Rotary Positional Embedding (RoPE) application.
+     4. Scaled Dot Product Attention (via FlashAttention).
+     5. Output projection.
+     """
+
+     def __init__(
+         self,
+         hidden_size: int,
+         num_attention_heads: int,
+         num_key_value_heads: int,
+         head_dim: int,
+         qk_norm_eps: float | None,
+         is_causal: bool
+     ):
+         """
+         Constructs the GroupedQueryAttention layer.
+
+         Args:
+             hidden_size: Hidden size.
+             num_attention_heads: Number of Query heads.
+             num_key_value_heads: Number of Key/Value heads. If less than `num_attention_heads`, GQA/MQA is enabled.
+             head_dim: Dimensionality of a single attention head.
+             qk_norm_eps: Epsilon for the RMSNorm applied to Q and K. If None, normalization is disabled.
+             is_causal: Whether to apply a causal mask (auto-regressive constraint).
+         """
+
+         super().__init__()
+
+         self._head_dim = head_dim
+         self._num_key_value_groups = num_attention_heads // num_key_value_heads
+         self._scaling = head_dim ** -0.5
+
+         self.q_proj = nn.Linear(
+             hidden_size, num_attention_heads * head_dim, bias=False
+         )
+
+         self.k_proj = nn.Linear(
+             hidden_size, num_key_value_heads * head_dim, bias=False
+         )
+
+         self.v_proj = nn.Linear(
+             hidden_size, num_key_value_heads * head_dim, bias=False
+         )
+
+         self.o_proj = nn.Linear(
+             num_attention_heads * head_dim, hidden_size, bias=False
+         )
+
+         self.q_norm: nn.RMSNorm | None
+         self.k_norm: nn.RMSNorm | None
+
+         if qk_norm_eps is not None:
+             self.q_norm = nn.RMSNorm(normalized_shape=head_dim,
+                                      eps=qk_norm_eps)
+             self.k_norm = nn.RMSNorm(normalized_shape=head_dim,
+                                      eps=qk_norm_eps)
+         else:
+             self.q_norm = None
+             self.k_norm = None
+
+         self.rope = RotaryEmbeddingApplicator()
+         self.kernel = FlashSdpa()
+         self._is_causal = is_causal
+
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         attention_mask: torch.Tensor | None,
+         position_embeddings: tuple[torch.Tensor, torch.Tensor]
+     ) -> torch.Tensor:
+         """
+         Computes the attention operation.
+
+         Args:
+             hidden_states: Input tensor. Shape: `(batch, seq_len, hidden_size)`.
+             attention_mask: Optional mask associated with the inputs.
+             position_embeddings: Tuple of `(cos, sin)` tensors for RoPE application.
+                 Each tensor should be of shape `(batch, seq_len, head_dim)`.
+
+         Returns:
+             The attention output tensor. Shape: `(batch, seq_len, hidden_size)`.
+         """
+
+         input_shape = hidden_states.shape[:-1]
+         hidden_shape = (*input_shape, -1, self._head_dim)
+
+         query_states = self.q_proj(hidden_states).view(hidden_shape)
+         if self.q_norm is not None:
+             query_states = self.q_norm(query_states)
+         query_states = query_states.transpose(1, 2)
+
+         key_states = self.k_proj(hidden_states).view(hidden_shape)
+         if self.k_norm is not None:
+             key_states = self.k_norm(key_states)
+         key_states = key_states.transpose(1, 2)
+
+         value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+
+         query_states, key_states = self.rope(query_states, key_states, position_embeddings[0], position_embeddings[1])
+
+         outputs = self.kernel(
+             query_states,
+             key_states,
+             value_states,
+             attention_mask=attention_mask,
+             is_causal=self._is_causal,
+             scale=self._scaling
+         )
+
+         outputs = outputs.reshape(*input_shape, -1).contiguous()
+         outputs = self.o_proj(outputs)
+         return outputs
+
+     def reset_parameters(self):
+         """Resets module parameters."""
+
+         self.q_proj.reset_parameters()
+         self.k_proj.reset_parameters()
+         self.v_proj.reset_parameters()
+         self.o_proj.reset_parameters()
+         if self.q_norm is not None:
+             self.q_norm.reset_parameters()
+         if self.k_norm is not None:
+             self.k_norm.reset_parameters()
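
A minimal usage sketch of the layer above. The sizes, the device/dtype choices, and the constant cos/sin tensors are illustrative assumptions, not values from the package; the FlashAttention backend enforced by FlashSdpa generally requires a CUDA device with fp16/bf16 inputs.

    import torch
    from d9d.module.block.attention.grouped_query import GroupedQueryAttention

    attn = GroupedQueryAttention(
        hidden_size=1024,
        num_attention_heads=16,
        num_key_value_heads=4,   # fewer KV heads than Q heads -> grouped-query attention
        head_dim=64,
        qk_norm_eps=1e-6,
        is_causal=True,
    ).cuda().to(torch.bfloat16)

    batch, seq_len = 2, 128
    hidden = torch.randn(batch, seq_len, 1024, device="cuda", dtype=torch.bfloat16)
    # Placeholder RoPE tables; in the real model these come from the rotary embedding module.
    cos = torch.ones(batch, seq_len, 64, device="cuda", dtype=torch.bfloat16)
    sin = torch.zeros(batch, seq_len, 64, device="cuda", dtype=torch.bfloat16)

    out = attn(hidden, attention_mask=None, position_embeddings=(cos, sin))
    # out.shape == (2, 128, 1024)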
d9d/module/block/attention/sdpa/__init__.py
@@ -0,0 +1,5 @@
+ from .flash import FlashSdpa
+
+ __all__ = [
+     "FlashSdpa"
+ ]
d9d/module/block/attention/sdpa/flash.py
@@ -0,0 +1,52 @@
+ import torch
+ import torch.nn.functional as F
+ from torch import nn
+ from torch.nn.attention import SDPBackend, sdpa_kernel
+
+
+ class FlashSdpa(nn.Module):
+     """Executes Scaled Dot Product Attention (SDPA) enforcing the FlashAttention backend."""
+
+     def __init__(self):
+         """
+         Constructs the FlashSdpa object.
+         """
+         super().__init__()
+
+     def forward(
+         self,
+         query_states: torch.Tensor,
+         key_states: torch.Tensor,
+         value_states: torch.Tensor,
+         attention_mask: torch.Tensor | None,
+         is_causal: bool,
+         scale: float
+     ) -> torch.Tensor:
+         """
+         Computes Scaled Dot-Product Attention using FlashAttention.
+
+         Args:
+             query_states: Query tensor. Shape: `(batch, n_q_heads, seq_len, head_dim)`.
+             key_states: Key tensor. Shape: `(batch, n_kv_heads, seq_len, head_dim)`.
+             value_states: Value tensor. Shape: `(batch, n_kv_heads, seq_len, head_dim)`.
+             attention_mask: Optional attention mask (usually not needed for FlashAttn with causal=True).
+             is_causal: If True, applies a causal mask (upper triangular masking).
+             scale: Scaling factor applied to the dot products (usually `1 / sqrt(head_dim)`).
+
+         Returns:
+             The attention output tensor, permuted to channel-last format.
+             Shape: `(batch, seq_len, n_q_heads, head_dim)`.
+         """
+
+         with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
+             results = F.scaled_dot_product_attention(
+                 query_states,
+                 key_states,
+                 value_states,
+                 attn_mask=attention_mask,
+                 dropout_p=0.0,
+                 is_causal=is_causal,
+                 scale=scale,
+                 enable_gqa=query_states.shape[1] != key_states.shape[1]
+             )
+         return results.transpose(1, 2).contiguous()
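
For reference, FlashSdpa can also be called directly; the shapes below are illustrative, and because the FLASH_ATTENTION backend is enforced this only runs on a CUDA device with half-precision tensors.

    import torch
    from d9d.module.block.attention.sdpa import FlashSdpa

    sdpa = FlashSdpa()
    q = torch.randn(2, 16, 128, 64, device="cuda", dtype=torch.bfloat16)  # (batch, n_q_heads, seq_len, head_dim)
    k = torch.randn(2, 4, 128, 64, device="cuda", dtype=torch.bfloat16)   # fewer KV heads -> enable_gqa is set
    v = torch.randn(2, 4, 128, 64, device="cuda", dtype=torch.bfloat16)

    out = sdpa(q, k, v, attention_mask=None, is_causal=True, scale=64 ** -0.5)
    # out.shape == (2, 128, 16, 64): transposed back to (batch, seq_len, n_q_heads, head_dim)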
d9d/module/block/embedding/__init__.py
@@ -0,0 +1,7 @@
+ """Package providing various embedding layer implementations"""
+
+ from .shard_token_embedding import SplitTokenEmbeddings
+
+ __all__ = [
+     "SplitTokenEmbeddings"
+ ]
d9d/module/block/embedding/shard_token_embedding.py
@@ -0,0 +1,103 @@
+ from collections.abc import Mapping, Sequence
+ from typing import cast
+
+ import torch
+ from torch import nn
+
+ from d9d.module.base import ModuleLateInit
+
+
+ def _build_token_start_end_indices(
+     split_vocab_size: dict[str, int], split_order: Sequence[str]
+ ) -> tuple[dict[str, int], dict[str, int]]:
+     offset = 0
+     starts = {}
+     ends = {}
+     for split in split_order:
+         current_size = split_vocab_size[split]
+
+         starts[split] = offset
+         ends[split] = offset + current_size
+
+         offset += current_size
+     return starts, ends
+
+
+ class SplitTokenEmbeddings(nn.Module, ModuleLateInit):
+     """
+     A token embedding layer composed of multiple named, independent embedding tables.
+
+     This class maintains a dictionary of embedding layers, mapping contiguous
+     ranges of global vocabulary indices to specific named splits (e.g., 'orig',
+     'special', 'prompt_prefix'). This is useful for model adaptation strategies where
+     different sets of tokens require different initialization or training behaviors.
+     """
+
+     def __init__(
+         self,
+         split_vocab_size: dict[str, int],
+         split_order: Sequence[str],
+         hidden_size: int
+     ):
+         """
+         Constructs the SplitTokenEmbeddings object.
+
+         Args:
+             split_vocab_size: A dictionary mapping split names to their vocabulary sizes.
+             split_order: A sequence defining the order in which splits are concatenated
+                 to form the global vocabulary. Keys provided here must exist in
+                 split_vocab_size.
+             hidden_size: The dimensionality of the embedding vectors.
+         """
+
+         super().__init__()
+
+         token_embedding = nn.ModuleDict({
+             split_name: nn.Embedding(vocab_size, hidden_size)
+             for split_name, vocab_size in split_vocab_size.items()
+         })
+         self.token_embedding: Mapping[str, nn.Embedding] = cast(Mapping[str, nn.Embedding], token_embedding)
+
+         self._id_start, self._id_end = _build_token_start_end_indices(split_vocab_size, split_order)
+         self._hidden_size = hidden_size
+         self._split_order = split_order
+
+     def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
+         """
+         Retrieves embeddings for the input indices by routing them to the appropriate internal layers.
+
+         Args:
+             input_ids: Tensor of arbitrary shape containing global vocabulary indices.
+
+         Returns:
+             Tensor with the same shape as `input_ids` plus a trailing dimension of size `hidden_size`.
+         """
+
+         output_embeds: torch.Tensor | None = None
+
+         for split_name in self._split_order:
+             start_idx = self._id_start[split_name]
+             end_idx = self._id_end[split_name]
+             layer = self.token_embedding[split_name]
+             mask = (input_ids >= start_idx) & (input_ids < end_idx)
+
+             safe_ids = torch.where(mask, input_ids - start_idx, 0)
+             masked_embed = layer(safe_ids) * mask[..., None]
+
+             if output_embeds is None:
+                 output_embeds = masked_embed
+             else:
+                 output_embeds = output_embeds + masked_embed
+
+         if output_embeds is None:
+             raise ValueError("Embeddings are empty - perhaps no splits were configured")
+
+         return output_embeds
+
+     def reset_parameters(self):
+         """
+         Resets parameters for all registered embedding splits.
+         """
+
+         for layer in self.token_embedding.values():
+             layer.reset_parameters()
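
A small routing sketch for the class above. The split names and sizes are invented for illustration: with split_order=("orig", "special") and sizes 100 and 4, global ids 0-99 are looked up in the 'orig' table, while ids 100-103 land in the 'special' table at local offsets 0-3.

    import torch
    from d9d.module.block.embedding import SplitTokenEmbeddings

    emb = SplitTokenEmbeddings(
        split_vocab_size={"orig": 100, "special": 4},
        split_order=("orig", "special"),
        hidden_size=32,
    )

    input_ids = torch.tensor([[5, 99, 100, 103]])  # the last two ids fall into the 'special' range
    out = emb(input_ids)
    # out.shape == (1, 4, 32); ids 100 and 103 hit the 'special' table at offsets 0 and 3.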
d9d/module/block/ffn/__init__.py
@@ -0,0 +1,5 @@
+ from .swiglu import SwiGLU
+
+ __all__ = [
+     "SwiGLU"
+ ]
d9d/module/block/ffn/swiglu.py
@@ -0,0 +1,60 @@
+ import torch
+ from torch import nn
+
+ from d9d.kernel.swiglu import silu_mul
+ from d9d.module.base import ModuleLateInit
+
+
+ class SwiGLU(nn.Module, ModuleLateInit):
+     """
+     Implements the SwiGLU Feed-Forward Network (FFN).
+
+     This module applies the gated activation function: `down(SiLU(gate(x)) * up(x))`.
+     It corresponds to the standard MLP block used in architectures like LLaMA.
+     """
+
+     def __init__(
+         self,
+         hidden_size: int,
+         intermediate_size: int
+     ):
+         """
+         Constructs a SwiGLU object.
+
+         Args:
+             hidden_size: The hidden dim size.
+             intermediate_size: The intermediate dim size of the FFN.
+         """
+
+         super().__init__()
+         self.gate_proj = nn.Linear(hidden_size, intermediate_size)
+         self.up_proj = nn.Linear(hidden_size, intermediate_size)
+         self.down_proj = nn.Linear(intermediate_size, hidden_size)
+
+     def forward(
+         self,
+         x: torch.Tensor
+     ) -> torch.Tensor:
+         """
+         Applies the SwiGLU FFN to the input.
+
+         Args:
+             x: Input tensor. Shape: `(batch_size, seq_len, hidden_dim)`.
+
+         Returns:
+             Output tensor. Shape: `(batch_size, seq_len, hidden_dim)`.
+         """
+
+         return self.down_proj(
+             silu_mul(
+                 self.gate_proj(x),
+                 self.up_proj(x)
+             )
+         )
+
+     def reset_parameters(self):
+         """Resets module parameters."""
+
+         self.gate_proj.reset_parameters()
+         self.up_proj.reset_parameters()
+         self.down_proj.reset_parameters()
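
The fused silu_mul kernel itself is not part of this excerpt; as an unfused reference matching the docstring's `down(SiLU(gate(x)) * up(x))` description (a sketch, not the kernel), the same computation in plain PyTorch looks like this:

    import torch
    import torch.nn.functional as F
    from torch import nn

    def silu_mul_reference(gate: torch.Tensor, up: torch.Tensor) -> torch.Tensor:
        # Unfused equivalent of the gating that d9d's kernel fuses into one op.
        return F.silu(gate) * up

    hidden_size, intermediate_size = 16, 32
    gate_proj = nn.Linear(hidden_size, intermediate_size)
    up_proj = nn.Linear(hidden_size, intermediate_size)
    down_proj = nn.Linear(intermediate_size, hidden_size)

    x = torch.randn(2, 8, hidden_size)
    out = down_proj(silu_mul_reference(gate_proj(x), up_proj(x)))  # same shape as x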
d9d/module/block/head/__init__.py
@@ -0,0 +1,6 @@
+ from .language_modelling import LM_IGNORE_INDEX, SplitLanguageModellingHead
+
+ __all__ = [
+     "LM_IGNORE_INDEX",
+     "SplitLanguageModellingHead"
+ ]
d9d/module/block/head/language_modelling.py
@@ -0,0 +1,87 @@
+ from collections.abc import Mapping, Sequence
+ from typing import cast
+
+ import torch
+ from torch import nn
+
+ from d9d.kernel.cce import linear_cross_entropy
+ from d9d.module.base import ModuleLateInit
+
+ LM_IGNORE_INDEX = -100
+ """Index ignored by LM head while calculating logps"""
+
+
+ class SplitLanguageModellingHead(nn.Module, ModuleLateInit):
+     """
+     A segmented language modeling head that computes per-token cross-entropy loss values using a composed weight matrix.
+
+     This class maintains separate linear layers for different segments of the vocabulary
+     (e.g., regular vs. special tokens). During the forward pass, it concatenates the
+     weights to form a unified projection matrix and computes the cross-entropy loss
+     efficiently, typically using a fused kernel to avoid materializing full logits.
+
+     The concatenation order of the weights is determined by `split_order`, which ensures
+     consistency with the global vocabulary indices.
+     """
+
+     def __init__(
+         self,
+         split_vocab_size: dict[str, int],
+         split_order: Sequence[str],
+         hidden_size: int
+     ):
+         """
+         Constructs the SplitLanguageModellingHead object.
+
+         Args:
+             split_vocab_size: A dictionary mapping split names to their output vocabulary sizes.
+             split_order: A sequence defining the order in which vocabulary segments should be
+                 concatenated. This determines the mapping of global indices to specific heads.
+             hidden_size: The input dimensionality (hidden state size).
+         """
+
+         super().__init__()
+
+         lm_head = nn.ModuleDict({
+             split_name: nn.Linear(hidden_size, vocab_size, bias=False)
+             for split_name, vocab_size in split_vocab_size.items()
+         })
+
+         self.lm_head: Mapping[str, nn.Linear] = cast(Mapping[str, nn.Linear], lm_head)
+         self._split_order = split_order
+         self._hidden_size = hidden_size
+
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         labels: torch.Tensor
+     ) -> torch.Tensor:
+         """
+         Computes the cross-entropy loss for the given hidden states and labels.
+
+         Args:
+             hidden_states: Input tensor of shape `(B, S, H)`.
+             labels: Target label tensor of shape `(B, S)`. Indices must correspond
+                 to the global vocabulary formed by concatenating splits in `split_order`.
+
+         Returns:
+             A tensor containing per-token loss values (reduction='none'), matching the
+             shape of the labels tensor.
+         """
+
+         lm_head_weight = torch.cat([self.lm_head[split_name].weight for split_name in self._split_order], dim=0)
+
+         losses = linear_cross_entropy(
+             hidden_states,
+             lm_head_weight,
+             labels,
+             ignore_index=LM_IGNORE_INDEX,
+             reduction="none"
+         )
+         return losses
+
+     def reset_parameters(self):
+         """Resets module parameters."""
+
+         for head in self.lm_head.values():
+             head.reset_parameters()
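
The fused linear_cross_entropy call avoids materializing the full logits; an unfused reference of the same computation (illustrative split sizes, not the cce kernel) is:

    import torch
    import torch.nn.functional as F
    from torch import nn

    B, S, H = 2, 16, 64
    heads = {"orig": nn.Linear(H, 100, bias=False), "special": nn.Linear(H, 4, bias=False)}

    hidden_states = torch.randn(B, S, H)
    labels = torch.randint(0, 104, (B, S))
    labels[0, :4] = -100  # positions set to LM_IGNORE_INDEX contribute zero loss

    # Concatenate the split weights in split_order to form the global projection.
    weight = torch.cat([heads["orig"].weight, heads["special"].weight], dim=0)  # (104, H)
    logits = hidden_states @ weight.T  # (B, S, 104) -- materialized here, which the fused kernel avoids
    losses = F.cross_entropy(
        logits.flatten(0, 1), labels.flatten(), ignore_index=-100, reduction="none"
    ).view(B, S)
    # losses has the same shape as labels; ignored positions are exactly 0.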
d9d/module/block/hidden_states_aggregator/__init__.py
@@ -0,0 +1,12 @@
+ """
+ Aggregation utilities for model hidden states.
+ """
+
+ from .base import BaseHiddenStatesAggregator
+ from .factory import HiddenStatesAggregationMode, create_hidden_states_aggregator
+
+ __all__ = [
+     "BaseHiddenStatesAggregator",
+     "HiddenStatesAggregationMode",
+     "create_hidden_states_aggregator"
+ ]
d9d/module/block/hidden_states_aggregator/base.py
@@ -0,0 +1,35 @@
+ import abc
+
+ import torch
+
+
+ class BaseHiddenStatesAggregator(abc.ABC):
+     """Abstract base class for hidden states aggregation strategies.
+
+     This interface defines how hidden states should be collected (added) and
+     how they should be finalized (packed) and combined with optional historical snapshots.
+     """
+
+     @abc.abstractmethod
+     def add_hidden_states(self, hidden_states: torch.Tensor) -> None:
+         """Accumulates a batch of hidden states into the aggregator.
+
+         Args:
+             hidden_states: The tensor containing the hidden states to process.
+         """
+
+     @abc.abstractmethod
+     def pack_with_snapshot(self, snapshot: torch.Tensor | None) -> torch.Tensor | None:
+         """Finalizes the aggregation and combines it with an optional previous snapshot.
+
+         This method typically retrieves the accumulated states, processes them
+         (if not done during addition), and concatenates them with the snapshot.
+
+         Args:
+             snapshot: An optional tensor representing previously aggregated states
+                 to be prepended to the current collection.
+
+         Returns:
+             The combined result of the snapshot and the newly aggregated states,
+             or None if no states were collected.
+         """
d9d/module/block/hidden_states_aggregator/factory.py
@@ -0,0 +1,48 @@
+ from enum import StrEnum
+
+ import torch
+
+ from .base import BaseHiddenStatesAggregator
+ from .mean import HiddenStatesAggregatorMean
+ from .noop import HiddenStatesAggregatorNoOp
+
+
+ class HiddenStatesAggregationMode(StrEnum):
+     """Enumeration of available hidden state aggregation strategies.
+
+     Attributes:
+         no: Performs no aggregation (No-Op).
+         mean: Computes the mean of hidden states, taking a mask into account.
+     """
+
+     no = "no"
+     mean = "mean"
+
+
+ def create_hidden_states_aggregator(
+     mode: HiddenStatesAggregationMode, agg_mask: torch.Tensor | None
+ ) -> BaseHiddenStatesAggregator:
+     """Factory function to create a hidden states aggregator.
+
+     Args:
+         mode: The specific aggregation mode to instantiate.
+         agg_mask: A tensor mask required for specific modes.
+             Can be None if the selected mode does not require masking.
+
+     Returns:
+         An instance of a concrete BaseHiddenStatesAggregator subclass.
+
+     Raises:
+         ValueError: If 'mean' mode is selected but 'agg_mask' is None, or if
+             an unknown mode is provided.
+     """
+
+     match mode:
+         case HiddenStatesAggregationMode.no:
+             return HiddenStatesAggregatorNoOp()
+         case HiddenStatesAggregationMode.mean:
+             if agg_mask is None:
+                 raise ValueError("You have to specify aggregation mask")
+             return HiddenStatesAggregatorMean(agg_mask)
+         case _:
+             raise ValueError("Unknown hidden states aggregation mode")
d9d/module/block/hidden_states_aggregator/mean.py
@@ -0,0 +1,61 @@
+ import torch
+
+ from .base import BaseHiddenStatesAggregator
+
+
+ def _aggregate_hidden_states(
+     hidden_states: torch.Tensor, agg_mask: torch.Tensor
+ ) -> torch.Tensor:
+     orig_dtype = hidden_states.dtype
+     hidden_states = hidden_states.float()
+     num_tokens = agg_mask.sum(dim=1)[:, None]
+     masked_states = hidden_states * agg_mask[:, :, None]
+     averaged_states = masked_states.sum(dim=1) / num_tokens
+     return averaged_states.to(orig_dtype)
+
+
+ class HiddenStatesAggregatorMean(BaseHiddenStatesAggregator):
+     """Aggregator that computes the mean of hidden states using a validity mask."""
+
+     def __init__(self, agg_mask: torch.Tensor) -> None:
+         """Constructs the mean aggregator with the given mask.
+
+         Args:
+             agg_mask: A tensor used to mask out padding or invalid tokens
+                 during average calculation.
+         """
+         self._agg_mask = agg_mask
+         self._collected_states: list[torch.Tensor] = []
+
+     def add_hidden_states(self, hidden_states: torch.Tensor) -> None:
+         """Calculates the masked mean immediately and stores the result.
+
+         Args:
+             hidden_states: The raw hidden states to be averaged and stored.
+         """
+         agg = _aggregate_hidden_states(
+             hidden_states=hidden_states,
+             agg_mask=self._agg_mask
+         )
+         self._collected_states.append(agg)
+
+     def pack_with_snapshot(self, snapshot: torch.Tensor | None) -> torch.Tensor | None:
+         """Stacks the collected masked averages and appends them to the snapshot.
+
+         This operation clears the internal buffer of collected states.
+
+         Args:
+             snapshot: Previous states to prepend.
+
+         Returns:
+             A tensor containing the snapshot followed by the stacked collected states,
+             or None if nothing was collected.
+         """
+         if len(self._collected_states) == 0:
+             return None
+
+         stacked = torch.stack(self._collected_states, dim=0)
+         self._collected_states.clear()
+         if snapshot is not None:
+             stacked = torch.cat([snapshot, stacked], dim=0)
+         return stacked
d9d/module/block/hidden_states_aggregator/noop.py
@@ -0,0 +1,27 @@
+ import torch
+
+ from .base import BaseHiddenStatesAggregator
+
+
+ class HiddenStatesAggregatorNoOp(BaseHiddenStatesAggregator):
+     """Aggregator implementation that performs no operations.
+
+     This acts as a null object for cases where aggregation is disabled in the configuration.
+     """
+
+     def add_hidden_states(self, hidden_states: torch.Tensor) -> None:
+         """Does nothing.
+
+         Args:
+             hidden_states: Ignored.
+         """
+
+     def pack_with_snapshot(self, snapshot: torch.Tensor | None) -> torch.Tensor | None:
+         """Does nothing.
+
+         Args:
+             snapshot: Ignored.
+
+         Returns:
+             None.
+         """
d9d/module/block/moe/__init__.py
@@ -0,0 +1,13 @@
+ """Provides building blocks for Mixture-of-Experts (MoE) architectures."""
+
+ from .grouped_experts import GroupedSwiGLU
+ from .grouped_linear import GroupedLinear
+ from .layer import MoELayer
+ from .router import TopKRouter
+
+ __all__ = [
+     "GroupedLinear",
+     "GroupedSwiGLU",
+     "MoELayer",
+     "TopKRouter"
+ ]
d9d/module/block/moe/communications/__init__.py
@@ -0,0 +1,11 @@
+ """Provides communication strategies for Mixture-of-Experts routing operations."""
+
+ from .base import ExpertCommunicationHandler
+ from .deepep import DeepEpCommunicationHandler
+ from .naive import NoCommunicationHandler
+
+ __all__ = [
+     "DeepEpCommunicationHandler",
+     "ExpertCommunicationHandler",
+     "NoCommunicationHandler"
+ ]