PyPI - cortexnet - Versions diffs - 3.2.1__py3-none-any.whl - Mend

cortexnet 3.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (39) hide show

cortexnet/__init__.py +197 -0
cortexnet/adapter/__init__.py +26 -0
cortexnet/adapter/arch_adapter.py +209 -0
cortexnet/adapter/calibrator.py +244 -0
cortexnet/adapter/inference_adapter.py +272 -0
cortexnet/adapter/model_registry.py +378 -0
cortexnet/adapter/weight_adapter.py +415 -0
cortexnet/adversarial.py +195 -0
cortexnet/attention.py +520 -0
cortexnet/blocks.py +682 -0
cortexnet/cache.py +83 -0
cortexnet/causal_reasoning.py +232 -0
cortexnet/compat.py +245 -0
cortexnet/config.py +234 -0
cortexnet/continual_learning.py +256 -0
cortexnet/cortex_block_lite.py +221 -0
cortexnet/distributed.py +213 -0
cortexnet/graph_reasoning.py +207 -0
cortexnet/hierarchical_memory.py +360 -0
cortexnet/interpretability.py +196 -0
cortexnet/memory.py +179 -0
cortexnet/meta_learning.py +187 -0
cortexnet/model.py +1360 -0
cortexnet/multi_agent.py +241 -0
cortexnet/multimodal.py +278 -0
cortexnet/ops/__init__.py +28 -0
cortexnet/ops/device_manager.py +449 -0
cortexnet/ops/npu_ops.py +243 -0
cortexnet/quantization.py +496 -0
cortexnet/routing.py +335 -0
cortexnet/self_evolution.py +174 -0
cortexnet/ssm.py +340 -0
cortexnet/training_utils.py +204 -0
cortexnet/transformer_baseline.py +157 -0
cortexnet-3.2.1.dist-info/METADATA +114 -0
cortexnet-3.2.1.dist-info/RECORD +39 -0
cortexnet-3.2.1.dist-info/WHEEL +5 -0
cortexnet-3.2.1.dist-info/licenses/LICENSE +201 -0
cortexnet-3.2.1.dist-info/top_level.txt +1 -0

cortexnet/cache.py ADDED Viewed

@@ -0,0 +1,83 @@
+"""
+CortexNet 推理缓存 (CortexNet Inference Cache)
+用于自回归生成时的增量解码，避免每步重算完整序列。
+缓存层级：
+  CortexNetCache
+    └── LayerCache (per block)
+          ├── ssm_state: Tensor          — SSM hidden state
+          ├── memory_state: (Tensor, Tensor) — WorkingMemory (memory, z)
+          └── kv_cache: (Tensor, Tensor, Tensor) — Attention (K, V, indices)
+"""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from typing import List, Optional, Tuple
+import torch
+@dataclass
+class LayerCache:
+    """单层 CortexBlock 的缓存。
+    Attributes:
+        ssm_state: SSM 隐状态 (B, d_inner, N)，可为 None（首次前向）
+        memory_state: WorkingMemory 的 (memory, z)，可为 None
+        kv_cache: Attention 的 (K, V, top_k_indices)，可为 None
+    """
+    ssm_state: Optional[torch.Tensor] = None
+    memory_state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None
+    kv_cache: Optional[Tuple[torch.Tensor, torch.Tensor, torch.Tensor]] = None
+    def as_tuple(self) -> Tuple:
+        """转换为 (ssm_state, memory_state, kv_cache) 元组，兼容旧 API。"""
+        return (self.ssm_state, self.memory_state, self.kv_cache)
+    @staticmethod
+    def from_tuple(t: Tuple) -> LayerCache:
+        """从旧式元组创建。"""
+        if t is None:
+            return LayerCache()
+        return LayerCache(
+            ssm_state=t[0] if len(t) > 0 else None,
+            memory_state=t[1] if len(t) > 1 else None,
+            kv_cache=t[2] if len(t) > 2 else None,
+        )
+@dataclass
+class CortexNetCache:
+    """CortexNet 增量解码缓存（全模型）。
+    Attributes:
+        layers: 每层的 LayerCache
+        position_offset: 已处理的 token 数量
+    """
+    layers: List[LayerCache] = field(default_factory=list)
+    position_offset: int = 0
+    @staticmethod
+    def init_empty(num_layers: int) -> CortexNetCache:
+        """创建空缓存。"""
+        return CortexNetCache(
+            layers=[LayerCache() for _ in range(num_layers)],
+            position_offset=0,
+        )
+    def as_list(self) -> List[Tuple]:
+        """转换为旧式 List[Tuple] 格式，兼容旧 API。"""
+        return [lc.as_tuple() for lc in self.layers]
+    @staticmethod
+    def from_list(lst: List[Tuple]) -> CortexNetCache:
+        """从旧式 List[Tuple] 创建。"""
+        if lst is None:
+            return CortexNetCache()
+        return CortexNetCache(
+            layers=[LayerCache.from_tuple(t) for t in lst],
+        )

cortexnet/causal_reasoning.py ADDED Viewed

@@ -0,0 +1,232 @@
+"""
+因果推理模块 (Causal Reasoning Module)
+核心创新：
+  传统注意力只学习相关性 P(Y|X)，而因果推理学习因果关系 P(Y|do(X))。
+  这使模型不仅知道"什么和什么一起出现"，还知道"什么导致了什么"。
+  ┌─────────────────────────────────────────────────────────────┐
+  │            因果推理 vs 传统注意力                            │
+  ├─────────────────────────────────────────────────────────────┤
+  │                                                             │
+  │  传统注意力:  P(Y|X) — "X 出现时 Y 也出现"                │
+  │    → 只捕获相关性，无法区分因果与混淆                      │
+  │                                                             │
+  │  因果推理:    P(Y|do(X)) — "如果我们主动设置 X，Y 会如何"  │
+  │    → 捕获真正的因果关系，支持反事实推理                    │
+  │                                                             │
+  │  三大组件:                                                  │
+  │  1. 因果强度估计器 — 评估每个 token 的因果重要性           │
+  │  2. 干预注意力 — 基于因果方向选择高因果 token 计算注意力   │
+  │  3. 反事实分支 — "如果这个原因不同，结果会如何？"          │
+  └─────────────────────────────────────────────────────────────┘
+  灵感来源: Pearl's do-calculus, 结构因果模型 (SCM)
+  优化 (v3.2):
+    - InterventionalAttention 使用选择性稀疏注意力替代 O(n²) 全量注意力，
+      仅对因果强度 Top-K 的 token 计算注意力，复杂度降至 O(n·k)
+    - CounterfactualBranch 使用合并的批量线性变换替代 for 循环，
+      单次 matmul 完成所有反事实分支计算
+"""
+import logging
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+logger = logging.getLogger(__name__)
+class CausalStrengthEstimator(nn.Module):
+    """因果强度估计器：评估每个 token 对后续 token 的因果影响力。"""
+    def __init__(self, d_model: int):
+        super().__init__()
+        self.scorer = nn.Sequential(
+            nn.Linear(d_model, d_model // 4),
+            nn.GELU(),
+            nn.Linear(d_model // 4, 1),
+        )
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return torch.sigmoid(self.scorer(x))  # (B, L, 1)
+class InterventionalAttention(nn.Module):
+    """干预注意力：基于因果方向的选择性稀疏注意力。
+    优化: 仅对因果强度 Top-K 的 token 计算注意力，
+    复杂度从 O(n²) 降至 O(n·k)，与系统整体 O(n) 定位一致。
+    与标准注意力的区别:
+      标准: attn(i,j) = softmax(Q_i · K_j / √d)  — 全量 O(n²)
+      干预: attn(i,j) = softmax((Q_i · K_top_j + bias_j) / √d)  — 稀疏 O(n·k)
+    因果偏置让 "因果上更重要" 的 token 获得更多关注。
+    """
+    def __init__(self, d_model: int, num_heads: int = 4, top_k_ratio: float = 0.25,
+                 dropout: float = 0.0):
+        super().__init__()
+        self.num_heads = num_heads
+        self.head_dim = d_model // num_heads
+        self.scale = self.head_dim ** -0.5
+        self.top_k_ratio = top_k_ratio
+        self.q_proj = nn.Linear(d_model, d_model, bias=False)
+        self.k_proj = nn.Linear(d_model, d_model, bias=False)
+        self.v_proj = nn.Linear(d_model, d_model, bias=False)
+        self.o_proj = nn.Linear(d_model, d_model, bias=False)
+        self.attn_dropout = nn.Dropout(dropout)
+    def _compute_k_count(self, seq_len: int) -> int:
+        """根据序列长度计算选择的 token 数量。"""
+        k = max(1, int(seq_len * self.top_k_ratio))
+        return min(k, seq_len)
+    def forward(
+        self, x: torch.Tensor, causal_strength: torch.Tensor
+    ) -> torch.Tensor:
+        B, L, D = x.shape
+        H, hd = self.num_heads, self.head_dim
+        q = self.q_proj(x).view(B, L, H, hd).transpose(1, 2)  # (B, H, L, hd)
+        k = self.k_proj(x).view(B, L, H, hd).transpose(1, 2)
+        v = self.v_proj(x).view(B, L, H, hd).transpose(1, 2)
+        # 确定 top-k 数量
+        k_count = self._compute_k_count(L)
+        if k_count >= L:
+            # 短序列退化为完整注意力（但加上因果偏置）
+            return self._full_attention(q, k, v, causal_strength, B, L, D, H, hd)
+        # ═══ 选择性稀疏: 按因果强度选 top-k token 作为 KV ═══
+        # causal_strength: (B, L, 1)
+        scores = causal_strength.squeeze(-1)  # (B, L)
+        _, top_indices = scores.topk(k_count, dim=-1, sorted=False)  # (B, k)
+        # 收集 top-k 的 K, V
+        top_idx_kv = top_indices.unsqueeze(1).unsqueeze(-1).expand(-1, H, -1, hd)
+        k_sel = k.gather(2, top_idx_kv)  # (B, H, k, hd)
+        v_sel = v.gather(2, top_idx_kv)  # (B, H, k, hd)
+        # 计算稀疏注意力分数
+        attn = (q @ k_sel.transpose(-2, -1)) * self.scale  # (B, H, L, k)
+        # 因果偏置: 对选中的 token 施加因果强度调制
+        causal_bias_sel = scores.gather(1, top_indices)  # (B, k)
+        causal_bias = torch.log(causal_bias_sel + 1e-6).clamp(min=-10)
+        attn = attn + causal_bias.unsqueeze(1).unsqueeze(2)  # (B, 1, 1, k) broadcast
+        # 因果掩码: 只允许 attend 到位置 ≤ 当前 token 的 key
+        positions = torch.arange(L, device=x.device).unsqueeze(1)  # (L, 1)
+        key_positions = top_indices.unsqueeze(1)  # (B, 1, k)
+        causal_mask = positions >= key_positions  # (B, L, k)
+        attn = attn.masked_fill(~causal_mask.unsqueeze(1), float("-inf"))
+        attn = F.softmax(attn, dim=-1)
+        attn = self.attn_dropout(attn)
+        out = (attn @ v_sel).transpose(1, 2).contiguous().view(B, L, D)
+        return self.o_proj(out)
+    def _full_attention(self, q, k, v, causal_strength, B, L, D, H, hd):
+        """短序列退化为完整注意力（带因果偏置）。"""
+        attn = (q @ k.transpose(-2, -1)) * self.scale
+        causal_bias = causal_strength.squeeze(-1).unsqueeze(1).unsqueeze(2)
+        attn = attn + torch.log(causal_bias + 1e-6).clamp(min=-10)
+        causal_mask = torch.tril(torch.ones(L, L, device=q.device))
+        attn = attn.masked_fill(causal_mask.unsqueeze(0).unsqueeze(0) == 0, float("-inf"))
+        attn = F.softmax(attn, dim=-1)
+        attn = self.attn_dropout(attn)
+        out = (attn @ v).transpose(1, 2).contiguous().view(B, L, D)
+        return self.o_proj(out)
+class CounterfactualBranch(nn.Module):
+    """反事实推理分支：探索 "如果原因不同，结果会如何"。
+    优化: 使用合并的权重矩阵 (d_model, num_cf * d_model) 替代
+    K 个独立 nn.Linear 的 for 循环，单次 matmul 完成所有反事实变换。
+    维护 K 个反事实变换，每个代表一种 "假设情景"。
+    通过门控机制软选择最相关的反事实分支。
+    """
+    def __init__(self, d_model: int, num_counterfactuals: int = 4, dropout: float = 0.0):
+        super().__init__()
+        self.num_cf = num_counterfactuals
+        self.d_model = d_model
+        # 合并所有反事实变换为单个矩阵: (d_model, num_cf * d_model)
+        self.merged_transform = nn.Linear(d_model, num_counterfactuals * d_model, bias=False)
+        # 初始化为接近零（初始不改变输入）
+        nn.init.zeros_(self.merged_transform.weight)
+        self.gate = nn.Sequential(
+            nn.Linear(d_model, d_model // 4),
+            nn.GELU(),
+            nn.Linear(d_model // 4, num_counterfactuals),
+        )
+        # 可学习的反事实融合强度
+        self.cf_scale = nn.Parameter(torch.tensor(0.1))
+        self.dropout = nn.Dropout(dropout)
+    def forward(self, x: torch.Tensor, causal_strength: torch.Tensor) -> torch.Tensor:
+        B, L, D = x.shape
+        K = self.num_cf
+        gate_weights = F.softmax(self.gate(x), dim=-1)  # (B, L, K)
+        # 单次 matmul 完成所有 K 个反事实变换
+        all_cf = self.merged_transform(x)  # (B, L, K*D)
+        all_cf = all_cf.view(B, L, K, D)  # (B, L, K, D)
+        # 门控加权融合
+        counterfactual = (all_cf * gate_weights.unsqueeze(-1)).sum(dim=2)  # (B, L, D)
+        counterfactual = self.dropout(counterfactual)
+        # 因果强度高的 token 贡献更多反事实信息
+        return counterfactual * causal_strength * self.cf_scale
+class CausalReasoningModule(nn.Module):
+    """因果推理模块：完整的因果推理流水线。
+    架构:
+      1. 估计每个 token 的因果强度
+      2. 干预注意力（因果方向感知，稀疏 O(n·k)）
+      3. 反事实推理（探索替代情景，批量单次 matmul）
+      4. 融合观察结果和反事实结果
+    Args:
+        d_model: 模型维度
+        num_heads: 干预注意力头数
+        num_counterfactuals: 反事实分支数
+        top_k_ratio: 干预注意力的 top-k 比例
+        dropout: Dropout 比率
+    """
+    def __init__(self, d_model: int, num_heads: int = 4, num_counterfactuals: int = 4,
+                 top_k_ratio: float = 0.25, dropout: float = 0.0):
+        super().__init__()
+        self.causal_estimator = CausalStrengthEstimator(d_model)
+        self.interventional_attn = InterventionalAttention(
+            d_model, num_heads, top_k_ratio=top_k_ratio, dropout=dropout
+        )
+        self.counterfactual = CounterfactualBranch(
+            d_model, num_counterfactuals, dropout=dropout
+        )
+        self.out_proj = nn.Linear(d_model, d_model, bias=False)
+        self.norm = nn.LayerNorm(d_model)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        causal_strength = self.causal_estimator(x)  # (B, L, 1)
+        observational = self.interventional_attn(x, causal_strength)
+        counterfactual = self.counterfactual(x, causal_strength)
+        combined = observational + counterfactual
+        return self.out_proj(self.norm(combined))

cortexnet/compat.py ADDED Viewed

@@ -0,0 +1,245 @@
+"""
+CortexNet 兼容模式组件 (Compatibility Mode Components)
+为开源 LLM (LLaMA/Qwen/Mistral...) 的权重无损迁移提供轻量兼容组件：
+  - _CompatAttention: GQA + KV cache 注意力
+  - _CompatLiteSSM: 低秩 SSM 旁路（默认零影响）
+  - _CompatFusionGate: 轻量两路融合（强偏向 Attention）
+  - _CompatExpert/_CompatMoE: 单专家 FFN 兼容壳
+  - _CompatCortexBlockV3: 完整的 V3 兼容块
+  - _NoOpEvolutionEngine: 占位进化引擎
+这些组件在 compatibility_mode=True 时替代完整 V3 模块使用。
+"""
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from typing import Optional, Tuple
+try:
+    from .config import CortexNetConfig
+    from .blocks import RMSNorm
+    from .attention import precompute_rope_freqs, apply_rope, apply_rope_with_positions
+except ImportError:
+    from cortexnet.config import CortexNetConfig
+    from cortexnet.blocks import RMSNorm
+    from cortexnet.attention import precompute_rope_freqs, apply_rope, apply_rope_with_positions
+class _NoOpEvolutionEngine(nn.Module):
+    """兼容模式下的轻量占位引擎。"""
+    def __init__(self):
+        super().__init__()
+    def get_compute_budget(self, x: torch.Tensor) -> torch.Tensor:
+        return torch.ones(x.size(0), device=x.device, dtype=x.dtype)
+    def get_efficiency_loss(self) -> float:
+        return 0.0
+class _CompatAttention(nn.Module):
+    """与主流 HF 架构对齐的轻量注意力（支持 GQA + KV cache）。"""
+    def __init__(self, config: CortexNetConfig):
+        super().__init__()
+        self.d_model = config.hidden_size
+        self.num_heads = config.num_heads
+        self.num_kv_heads = config.num_kv_heads
+        self.head_dim = self.d_model // self.num_heads
+        kv_dim = self.num_kv_heads * self.head_dim
+        self.q_proj = nn.Linear(self.d_model, self.d_model, bias=False)
+        self.k_proj = nn.Linear(self.d_model, kv_dim, bias=False)
+        self.v_proj = nn.Linear(self.d_model, kv_dim, bias=False)
+        self.o_proj = nn.Linear(self.d_model, self.d_model, bias=False)
+        # Qwen2/3 常见 q_norm/k_norm
+        self.q_norm = RMSNorm(self.head_dim, config.norm_eps)
+        self.k_norm = RMSNorm(self.head_dim, config.norm_eps)
+        self.register_buffer(
+            "rope_freqs",
+            precompute_rope_freqs(self.head_dim, config.max_seq_len, config.rope_theta),
+            persistent=False,
+        )
+    def forward(
+        self,
+        x: torch.Tensor,
+        past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        use_cache: bool = False,
+    ) -> torch.Tensor | Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        B, L, D = x.shape
+        q = self.q_proj(x).view(B, L, self.num_heads, self.head_dim).transpose(1, 2)
+        k = self.k_proj(x).view(B, L, self.num_kv_heads, self.head_dim).transpose(1, 2)
+        v = self.v_proj(x).view(B, L, self.num_kv_heads, self.head_dim).transpose(1, 2)
+        q = self.q_norm(q)
+        k = self.k_norm(k)
+        rope = self.rope_freqs.to(x.device)
+        if past_key_value is None:
+            q = apply_rope(q, rope)
+            k = apply_rope(k, rope)
+        else:
+            past_k, past_v = past_key_value
+            pos_offset = past_k.shape[2]
+            pos = torch.arange(
+                pos_offset, pos_offset + L, device=x.device
+            ).unsqueeze(0).expand(B, -1)
+            q = apply_rope_with_positions(q, rope, pos)
+            k = apply_rope_with_positions(k, rope, pos)
+            k = torch.cat([past_k, k], dim=2)
+            v = torch.cat([past_v, v], dim=2)
+        # GQA: 将 KV 头扩展到 Query 头
+        if self.num_kv_heads != self.num_heads:
+            repeat = self.num_heads // self.num_kv_heads
+            k_attn = k.repeat_interleave(repeat, dim=1)
+            v_attn = v.repeat_interleave(repeat, dim=1)
+        else:
+            k_attn = k
+            v_attn = v
+        attn_out = F.scaled_dot_product_attention(
+            q, k_attn, v_attn,
+            attn_mask=None, dropout_p=0.0,
+            is_causal=(past_key_value is None),
+        )
+        out = self.o_proj(attn_out.transpose(1, 2).contiguous().view(B, L, D))
+        if use_cache:
+            return out, (k, v)
+        return out
+class _CompatLiteSSM(nn.Module):
+    """轻量 SSM 路径（参数开销小，默认零影响，支持后续无感校准启用）。"""
+    def __init__(self, d_model: int, rank: int = 256, skip_threshold: float = 1e-6):
+        super().__init__()
+        self.rank = max(16, min(rank, d_model))
+        self.skip_threshold = float(skip_threshold)
+        self.fast_skip = True
+        self.in_proj = nn.Linear(d_model, self.rank, bias=False)
+        self.out_proj = nn.Linear(self.rank, d_model, bias=False)
+        self.alpha = nn.Parameter(torch.tensor(0.0))
+    def is_effectively_disabled(self) -> bool:
+        return (not self.training) and bool(self.fast_skip)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if self.is_effectively_disabled():
+            return torch.zeros_like(x)
+        return self.alpha * self.out_proj(torch.tanh(self.in_proj(x)))
+class _CompatFusionGate(nn.Module):
+    """轻量融合门：仅融合 SSM/Attention 两路，初始化强偏向 Attention。"""
+    def __init__(
+        self, d_model: int,
+        long_context_threshold: int = 2048,
+        long_context_ssm_ratio: float = 0.35,
+    ):
+        super().__init__()
+        bottleneck = max(d_model // 32, 32)
+        self.long_context_threshold = int(long_context_threshold)
+        self.long_context_ssm_ratio = float(min(max(long_context_ssm_ratio, 0.0), 0.95))
+        self.context_proj = nn.Sequential(
+            nn.Linear(d_model, bottleneck),
+            nn.SiLU(),
+            nn.Linear(bottleneck, 1),
+        )
+        self.attn_bias = nn.Parameter(torch.tensor(10.0))
+        with torch.no_grad():
+            for module in self.context_proj:
+                if isinstance(module, nn.Linear):
+                    module.weight.zero_()
+                    if module.bias is not None:
+                        module.bias.zero_()
+    def forward(self, x_norm, ssm_out, attn_out, *, ssm_enabled=True):
+        if not ssm_enabled:
+            return attn_out
+        context = x_norm.mean(dim=1)
+        gate = torch.sigmoid(self.attn_bias + self.context_proj(context)).unsqueeze(1)
+        if self.long_context_threshold > 0 and x_norm.size(1) >= self.long_context_threshold:
+            max_gate = 1.0 - self.long_context_ssm_ratio
+            gate = torch.clamp(gate, min=0.0, max=max_gate)
+        return gate * attn_out + (1.0 - gate) * ssm_out
+class _CompatExpert(nn.Module):
+    """与源模型 FFN 权重命名对齐的单专家。"""
+    def __init__(self, d_model: int, d_ff: int):
+        super().__init__()
+        self.gate_proj = nn.Linear(d_model, d_ff, bias=False)
+        self.up_proj = nn.Linear(d_model, d_ff, bias=False)
+        self.down_proj = nn.Linear(d_ff, d_model, bias=False)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.down_proj(F.silu(self.gate_proj(x)) * self.up_proj(x))
+class _CompatMoE(nn.Module):
+    """轻量 MoE 兼容壳：保留 experts.0 参数路径，前向等价单 FFN。"""
+    def __init__(self, d_model: int, d_ff: int):
+        super().__init__()
+        self.experts = nn.ModuleList([_CompatExpert(d_model, d_ff)])
+        self.router = nn.Linear(d_model, 1, bias=False)
+        self.aux_loss = torch.tensor(0.0)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        self.aux_loss = x.new_zeros(())
+        return self.experts[0](x)
+class _CompatCortexBlockV3(nn.Module):
+    """V3 兼容块：保留 SSM + 稀疏注意力 + 轻量融合门，去除非核心大模块。"""
+    def __init__(self, config: CortexNetConfig):
+        super().__init__()
+        self.norm1 = RMSNorm(config.hidden_size, config.norm_eps)
+        self.norm2 = RMSNorm(config.hidden_size, config.norm_eps)
+        self.ssm = _CompatLiteSSM(
+            config.hidden_size,
+            rank=getattr(config, "compat_ssm_rank", 256),
+        )
+        self.attention = _CompatAttention(config)
+        self.fusion = _CompatFusionGate(
+            config.hidden_size,
+            long_context_threshold=getattr(config, "fusion_long_context_threshold", 2048),
+            long_context_ssm_ratio=getattr(config, "fusion_long_context_ssm_ratio", 0.35),
+        )
+        self.moe = _CompatMoE(config.hidden_size, config.intermediate_size)
+        self.dropout = nn.Dropout(config.dropout)
+    def forward(self, x, past_cache=None, use_cache=False):
+        residual = x
+        x_norm = self.norm1(x)
+        ssm_enabled = not self.ssm.is_effectively_disabled()
+        ssm_out = self.ssm(x_norm) if ssm_enabled else None
+        if use_cache:
+            attn_out, new_cache = self.attention(x_norm, past_key_value=past_cache, use_cache=True)
+        else:
+            attn_out = self.attention(x_norm, past_key_value=past_cache, use_cache=False)
+            new_cache = None
+        fused = self.fusion(x_norm, ssm_out, attn_out, ssm_enabled=(ssm_enabled and ssm_out is not None)) if ssm_enabled and ssm_out is not None else attn_out
+        x = residual + self.dropout(fused)
+        residual = x
+        x = residual + self.dropout(self.moe(self.norm2(x)))
+        if use_cache:
+            return x, new_cache
+        return x
+    def get_aux_loss(self):
+        return self.moe.aux_loss