PyPI - cortexnet - Versions diffs - 3.2.1__py3-none-any.whl - Mend

cortexnet 3.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (39) hide show

cortexnet/__init__.py +197 -0
cortexnet/adapter/__init__.py +26 -0
cortexnet/adapter/arch_adapter.py +209 -0
cortexnet/adapter/calibrator.py +244 -0
cortexnet/adapter/inference_adapter.py +272 -0
cortexnet/adapter/model_registry.py +378 -0
cortexnet/adapter/weight_adapter.py +415 -0
cortexnet/adversarial.py +195 -0
cortexnet/attention.py +520 -0
cortexnet/blocks.py +682 -0
cortexnet/cache.py +83 -0
cortexnet/causal_reasoning.py +232 -0
cortexnet/compat.py +245 -0
cortexnet/config.py +234 -0
cortexnet/continual_learning.py +256 -0
cortexnet/cortex_block_lite.py +221 -0
cortexnet/distributed.py +213 -0
cortexnet/graph_reasoning.py +207 -0
cortexnet/hierarchical_memory.py +360 -0
cortexnet/interpretability.py +196 -0
cortexnet/memory.py +179 -0
cortexnet/meta_learning.py +187 -0
cortexnet/model.py +1360 -0
cortexnet/multi_agent.py +241 -0
cortexnet/multimodal.py +278 -0
cortexnet/ops/__init__.py +28 -0
cortexnet/ops/device_manager.py +449 -0
cortexnet/ops/npu_ops.py +243 -0
cortexnet/quantization.py +496 -0
cortexnet/routing.py +335 -0
cortexnet/self_evolution.py +174 -0
cortexnet/ssm.py +340 -0
cortexnet/training_utils.py +204 -0
cortexnet/transformer_baseline.py +157 -0
cortexnet-3.2.1.dist-info/METADATA +114 -0
cortexnet-3.2.1.dist-info/RECORD +39 -0
cortexnet-3.2.1.dist-info/WHEEL +5 -0
cortexnet-3.2.1.dist-info/licenses/LICENSE +201 -0
cortexnet-3.2.1.dist-info/top_level.txt +1 -0

cortexnet/ssm.py ADDED Viewed

@@ -0,0 +1,340 @@
"""Multi-Scale State Space Module (MSSM).

Key idea:
  Several parallel SSM channels run at different time scales so the model can
  capture patterns ranging from local token interactions to long-range
  dependencies at the same time.
  - Fast scales: local, fine-grained patterns (e.g. syntactic structure)
  - Slow scales: long-range, global patterns (e.g. topic, context)
  Computational complexity: linear in sequence length.

Theory:
  Based on the continuous-time dynamics of a state space model (SSM):
    dh/dt = A·h + B·x    (state equation)
    y = C·h              (observation equation)
  Discretized as:
    h_t = Ā·h_{t-1} + B̄·x_t    where Ā = exp(Δ·A), B̄ = Δ·B
  Scales differ in how the eigenvalues of A are initialized:
    the A matrix of scale i is initialized at 2^i times the base frequency.

Optimizations (v3.2):
  - Triton custom-kernel hook: when triton is importable the Triton code path
    is selected for CUDA inputs, otherwise the PyTorch chunked scan is used.
  - logging support
"""
# NOTE: the docstring must be the first statement of the module (ahead of the
# __future__ import); otherwise it is a plain expression and __doc__ is None.
from __future__ import annotations

import logging
import math
from typing import Optional, Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F

logger = logging.getLogger(__name__)

# Triton availability probe: the flag stays False unless both imports succeed.
_TRITON_AVAILABLE = False
try:
    import triton
    import triton.language as tl
except ImportError:
    # Triton is optional; its absence is not an error.
    pass
else:
    _ = (triton, tl)  # reference the names so linters do not flag the imports
    _TRITON_AVAILABLE = True
    logger.info("Triton available: SSM will use custom GPU kernels")
class MultiScaleSSM(nn.Module):
    """Multi-scale selective state space module.

    The ``d_inner`` channels are split into ``num_scales`` equal groups. Group
    ``i`` initializes its ``A`` rates to ``(1..state_size) * 2**i``, so every
    group starts with a different memory decay rate.

    Data flow::

        input -> Linear(d_model, 2 * d_inner) -> split into x, z
        x -> selective scan (multi-scale A) -> y
        z -> SiLU -> gate
        (y + D * x) * gate -> Linear(d_inner, d_model) -> * sigmoid(output_gate)

    Args:
        d_model: Input/output feature size.
        num_scales: Number of time scales; must divide ``d_model * expand_factor``.
        state_size: Size ``N`` of the SSM state vector per channel.
        expand_factor: Multiplier giving the inner width ``d_inner``.
    """

    def __init__(
        self,
        d_model: int,
        num_scales: int = 4,
        state_size: int = 16,
        expand_factor: int = 2,
    ):
        super().__init__()
        self.d_model = d_model
        self.d_inner = d_model * expand_factor
        self.num_scales = num_scales
        self.state_size = state_size
        self.d_per_scale = self.d_inner // num_scales
        assert self.d_inner % num_scales == 0, (
            f"d_inner ({self.d_inner}) 必须能被 num_scales ({num_scales}) 整除"
        )

        # Input projection: one half feeds the SSM, the other half is the gate.
        self.in_proj = nn.Linear(d_model, self.d_inner * 2, bias=False)

        # A is stored in log space; forward() uses -exp(A_log), which is
        # always negative. Scale i uses the rates (1..state_size) * 2**i.
        base_freqs = torch.arange(1, state_size + 1, dtype=torch.float32)
        A = torch.cat(
            [
                (base_freqs * (2.0 ** i)).unsqueeze(0).expand(self.d_per_scale, -1)
                for i in range(num_scales)
            ],
            dim=0,
        )  # (d_inner, state_size)
        self.A_log = nn.Parameter(torch.log(A))

        # Input-dependent B and C (the selective part of the SSM).
        self.B_proj = nn.Linear(self.d_inner, state_size, bias=False)
        self.C_proj = nn.Linear(self.d_inner, state_size, bias=False)

        # Input-dependent step size dt.
        self.dt_proj = nn.Linear(self.d_inner, self.d_inner, bias=True)
        # The bias is log(u) with u log-uniform in [0.001, 0.1]; forward()
        # applies softplus, and softplus(log(u)) = log(1 + u) ~= u for small u.
        dt_init = torch.exp(
            torch.rand(self.d_inner) * (math.log(0.1) - math.log(0.001))
            + math.log(0.001)
        )
        with torch.no_grad():
            self.dt_proj.bias.copy_(dt_init.log())

        # Per-channel skip connection from the SSM input to its output.
        self.D = nn.Parameter(torch.ones(self.d_inner))
        # Scalar output gate; zero init gives sigmoid(0) = 0.5.
        self.output_gate = nn.Parameter(torch.zeros(1))
        # Output projection back to d_model.
        self.out_proj = nn.Linear(self.d_inner, d_model, bias=False)

    def forward(
        self,
        x: torch.Tensor,
        past_state: Optional[torch.Tensor] = None,
        use_cache: bool = False,
    ) -> torch.Tensor | Tuple[torch.Tensor, torch.Tensor]:
        """Run the module over a sequence.

        Args:
            x: Input of shape (batch, seq_len, d_model).
            past_state: Optional initial SSM state of shape
                (batch, d_inner, state_size) for incremental decoding.
            use_cache: When True, the final SSM state is returned as well.

        Returns:
            ``output`` of shape (batch, seq_len, d_model), or
            ``(output, new_state)`` when ``use_cache`` is True, where
            ``new_state`` has shape (batch, d_inner, state_size).
        """
        _, seq_len, _ = x.shape  # also enforces a 3-D input
        input_dtype = x.dtype

        # Input projection followed by the SSM / gate split.
        x_ssm, z = self.in_proj(x).chunk(2, dim=-1)  # each (B, L, d_inner)

        # exp() is evaluated in float32, then cast back to the input dtype.
        A = -torch.exp(self.A_log.float()).to(input_dtype)  # (d_inner, N)
        B_mat = self.B_proj(x_ssm)  # (B, L, N)
        C_mat = self.C_proj(x_ssm)  # (B, L, N)
        dt = F.softplus(self.dt_proj(x_ssm))  # (B, L, d_inner), positive

        # Scan dispatch: Triton path for CUDA inputs when triton is importable,
        # chunked scan for other multi-token inputs, sequential scan for L <= 1.
        if seq_len > 1:
            if _TRITON_AVAILABLE and x_ssm.is_cuda:
                y, new_state = self._triton_scan(
                    x_ssm, A, B_mat, C_mat, dt,
                    past_state=past_state,
                    use_cache=use_cache,
                )
            else:
                y, new_state = self._chunk_parallel_scan(
                    x_ssm, A, B_mat, C_mat, dt,
                    chunk_size=min(max(16, seq_len), 64),
                    past_state=past_state,
                    use_cache=use_cache,
                )
        else:
            y, new_state = self._selective_scan(
                x_ssm, A, B_mat, C_mat, dt,
                past_state=past_state,
                use_cache=use_cache,
            )

        # Make sure y has the input dtype before mixing it with x_ssm.
        y = y.to(input_dtype)
        # Skip connection.
        y = y + x_ssm * self.D.to(input_dtype).unsqueeze(0).unsqueeze(0)
        # Gated output.
        y = y * F.silu(z)
        out = self.out_proj(y) * torch.sigmoid(self.output_gate.to(input_dtype))

        if use_cache and new_state is not None:
            return out, new_state
        return out

    def _selective_scan(
        self,
        x: torch.Tensor,
        A: torch.Tensor,
        B: torch.Tensor,
        C: torch.Tensor,
        dt: torch.Tensor,
        past_state: Optional[torch.Tensor] = None,
        use_cache: bool = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Sequential selective scan in plain PyTorch.

        Args:
            x: (batch, L, d_inner) SSM input.
            A: (d_inner, N) continuous-time rates.
            B: (batch, L, N) input matrix per step.
            C: (batch, L, N) output matrix per step.
            dt: (batch, L, d_inner) step sizes.
            past_state: Optional (batch, d_inner, N) initial state.
            use_cache: Whether to return the final state.

        Returns:
            ``(y, new_state)`` with ``y`` of shape (batch, L, d_inner) in the
            dtype of ``x``; ``new_state`` is the final (batch, d_inner, N)
            state when ``use_cache`` is True, otherwise None.
        """
        batch, L, d_inner = x.shape
        N = A.shape[1]
        orig_dtype = x.dtype

        # All arithmetic is done in float32 to avoid float16 overflow.
        x, A, B, C, dt = x.float(), A.float(), B.float(), C.float(), dt.float()

        # Discretized parameters; the clamp guards exp() against overflow.
        A_bar = torch.exp((dt.unsqueeze(-1) * A.unsqueeze(0).unsqueeze(0)).clamp(max=20))
        B_bar = dt.unsqueeze(-1) * B.unsqueeze(2)

        if past_state is not None:
            h = past_state.float()
        else:
            h = torch.zeros(batch, d_inner, N, device=x.device, dtype=torch.float32)

        outputs = []
        for t in range(L):
            h = A_bar[:, t] * h + B_bar[:, t] * x[:, t].unsqueeze(-1)
            outputs.append((h * C[:, t].unsqueeze(1)).sum(-1))

        y = torch.stack(outputs, dim=1).to(orig_dtype)
        new_state = h.to(orig_dtype) if use_cache else None
        return y, new_state

    @staticmethod
    def _affine_prefix_scan(
        decay: torch.Tensor,
        drive: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Inclusive scan of ``h_t = decay_t * h_{t-1} + drive_t`` along dim 1.

        Starts from ``h_{-1} = 0`` and uses doubling: round ``k`` composes
        position ``t`` with position ``t - 2**k``. Only products and sums are
        involved, so nothing is divided by a possibly underflowed decay.

        Returns:
            ``(cum_decay, h)`` where ``cum_decay[:, t]`` is the product of
            ``decay[:, 0..t]`` and ``h[:, t]`` is the state after step ``t``.
        """
        length = decay.shape[1]
        offset = 1
        while offset < length:
            # Positions below `offset` already cover their whole prefix.
            tail_decay = decay[:, offset:]
            new_tail_drive = tail_decay * drive[:, :-offset] + drive[:, offset:]
            new_tail_decay = tail_decay * decay[:, :-offset]
            drive = torch.cat([drive[:, :offset], new_tail_drive], dim=1)
            decay = torch.cat([decay[:, :offset], new_tail_decay], dim=1)
            offset *= 2
        return decay, drive

    def _chunk_parallel_scan(
        self,
        x: torch.Tensor,
        A: torch.Tensor,
        B: torch.Tensor,
        C: torch.Tensor,
        dt: torch.Tensor,
        chunk_size: int = 64,
        past_state: Optional[torch.Tensor] = None,
        use_cache: bool = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Chunked scan: parallel inside a chunk, sequential across chunks.

        Computes the same recurrence as ``_selective_scan``. Inside a chunk
        the states come from ``_affine_prefix_scan``; the state at the end of
        one chunk seeds the next.

        Args:
            x, A, B, C, dt, past_state, use_cache: As in ``_selective_scan``.
            chunk_size: Number of time steps processed per chunk.

        Returns:
            ``(y, new_state)`` as in ``_selective_scan``.
        """
        batch, L, d_inner = x.shape
        N = A.shape[1]
        orig_dtype = x.dtype

        # All arithmetic is done in float32 to avoid float16 overflow.
        x, A, B, C, dt = x.float(), A.float(), B.float(), C.float(), dt.float()

        A_bar = torch.exp((dt.unsqueeze(-1) * A.unsqueeze(0).unsqueeze(0)).clamp(max=20))
        B_bar = dt.unsqueeze(-1) * B.unsqueeze(2)
        drive = B_bar * x.unsqueeze(-1)  # (batch, L, d_inner, N)

        if past_state is not None:
            h = past_state.float()
        else:
            h = torch.zeros(batch, d_inner, N, device=x.device, dtype=torch.float32)

        num_chunks = (L + chunk_size - 1) // chunk_size
        all_outputs = []
        for c in range(num_chunks):
            s = c * chunk_size
            e = min(s + chunk_size, L)
            cum_decay, h_from_inputs = self._affine_prefix_scan(
                A_bar[:, s:e], drive[:, s:e]
            )
            # Incoming state decayed to each step, plus the in-chunk inputs.
            h_all = cum_decay * h.unsqueeze(1) + h_from_inputs
            all_outputs.append((h_all * C[:, s:e].unsqueeze(2)).sum(-1))
            h = h_all[:, -1]

        y = torch.cat(all_outputs, dim=1).to(orig_dtype)
        new_state = h.to(orig_dtype) if use_cache else None
        return y, new_state

    def _triton_scan(
        self,
        x: torch.Tensor,
        A: torch.Tensor,
        B: torch.Tensor,
        C: torch.Tensor,
        dt: torch.Tensor,
        past_state: Optional[torch.Tensor] = None,
        use_cache: bool = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Entry point for the Triton-accelerated selective scan.

        Selected by ``forward`` when triton is importable and the input is on
        CUDA. This is currently an interface placeholder: it logs a debug
        message and delegates to ``_chunk_parallel_scan``.

        TODO: implement a native Triton kernel body.
        """
        logger.debug("Using Triton scan path (delegating to chunk_parallel)")
        return self._chunk_parallel_scan(
            x, A, B, C, dt,
            chunk_size=min(max(16, x.shape[1]), 64),
            past_state=past_state,
            use_cache=use_cache,
        )

cortexnet/training_utils.py ADDED Viewed

@@ -0,0 +1,204 @@
+"""
+CortexNet 训练工具 (Training Utilities)
+提供梯度监控、种子设置、设备选择等训练辅助功能。
+"""
+from __future__ import annotations
+import random
+from typing import Dict
+import torch
+import torch.nn as nn
+import numpy as np
class GradientMonitor:
    """Record per-parameter gradient statistics during backward passes.

    A tensor hook is registered on every parameter of ``model`` that requires
    grad. Each time a gradient is produced for a parameter, its mean, std,
    max, min and norm are stored under the parameter's name, replacing the
    previous entry.

    Usage:
        monitor = GradientMonitor(model)
        loss.backward()
        stats = monitor.get_stats()
    """

    def __init__(self, model: nn.Module):
        self.model = model
        self._hooks = []
        self._grad_stats: Dict[str, Dict[str, float]] = {}
        self._register_hooks()

    def _register_hooks(self):
        """Attach one recording hook to each trainable parameter."""
        for name, param in self.model.named_parameters():
            if not param.requires_grad:
                continue
            # `n=name` binds the current name; a plain closure would see only
            # the last name of the loop. The hook returns None, so the
            # gradient itself is left untouched.
            handle = param.register_hook(
                lambda grad, n=name: self._record_grad(n, grad)
            )
            self._hooks.append(handle)

    def _record_grad(self, name: str, grad: torch.Tensor):
        """Store summary statistics of ``grad`` under ``name``."""
        grad = grad.detach()
        # The sample standard deviation of a single value is undefined (NaN);
        # report 0.0 for single-element gradients instead.
        std = grad.std().item() if grad.numel() > 1 else 0.0
        self._grad_stats[name] = {
            "mean": grad.mean().item(),
            "std": std,
            "max": grad.max().item(),
            "min": grad.min().item(),
            "norm": grad.norm().item(),
        }

    def get_stats(self) -> Dict[str, Dict[str, float]]:
        """Return a shallow copy of the most recently recorded statistics."""
        return dict(self._grad_stats)

    def get_summary(self) -> Dict[str, float]:
        """Return aggregate gradient-norm statistics, or ``{}`` if none exist."""
        if not self._grad_stats:
            return {}
        norms = [entry["norm"] for entry in self._grad_stats.values()]
        return {
            "grad_norm_mean": sum(norms) / len(norms),
            "grad_norm_max": max(norms),
            "grad_norm_min": min(norms),
            "num_params_tracked": len(norms),
        }

    def remove_hooks(self):
        """Remove every hook registered by this monitor."""
        for handle in self._hooks:
            handle.remove()
        self._hooks.clear()

    def __del__(self):
        # __init__ may have failed before `_hooks` was assigned; do not raise
        # from a finalizer in that case.
        if getattr(self, "_hooks", None):
            self.remove_hooks()
def check_gradients_finite(model: nn.Module) -> bool:
    """Report whether every existing gradient of ``model`` is finite.

    Args:
        model: Module whose parameter gradients are inspected.

    Returns:
        True when no parameter gradient contains NaN or Inf (parameters
        without a gradient are ignored), False otherwise.
    """
    return all(
        bool(torch.isfinite(param.grad).all())
        for param in model.parameters()
        if param.grad is not None
    )
def set_seed(seed: int = 42):
    """Seed Python, NumPy and PyTorch random number generators.

    When CUDA is available, all CUDA generators are seeded too and cuDNN is
    switched to deterministic mode with benchmarking disabled.

    Args:
        seed: Seed value. NumPy's legacy seeding only accepts values in
            ``[0, 2**32)``, so the seed is reduced modulo ``2**32`` for NumPy
            only; ``random`` and ``torch`` receive it unchanged.
    """
    random.seed(seed)
    # Without the modulo, a negative or >= 2**32 seed raises ValueError here,
    # after `random` has already been seeded.
    np.random.seed(seed % (2 ** 32))
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
def get_best_device() -> torch.device:
    """Pick the preferred available compute device.

    Preference order: CUDA GPU, then Apple MPS, then CPU.

    Returns:
        A ``torch.device`` for the first available backend in that order.
    """
    mps_backend = getattr(torch.backends, "mps", None)
    if torch.cuda.is_available():
        device_name = "cuda"
    elif mps_backend is not None and mps_backend.is_available():
        device_name = "mps"
    else:
        device_name = "cpu"
    return torch.device(device_name)
def create_optimizer_and_scheduler(
    model: torch.nn.Module,
    lr: float = 3e-4,
    weight_decay: float = 0.01,
    warmup_steps: int = 500,
    total_steps: int = 10000,
    min_lr_ratio: float = 0.1,
    betas: tuple = (0.9, 0.95),
):
    """Build an AdamW optimizer with linear warmup and cosine decay.

    The schedule multiplies ``lr`` by a factor that rises linearly from 0 to 1
    over ``warmup_steps``, then follows a half cosine down to ``min_lr_ratio``
    at ``total_steps`` and stays there for any later step.

    Args:
        model: Module whose trainable parameters are optimized.
        lr: Peak learning rate.
        weight_decay: Weight decay for the decayed parameter group.
        warmup_steps: Number of linear warmup steps.
        total_steps: Step at which the cosine decay reaches its floor.
        min_lr_ratio: Final learning rate as a fraction of ``lr``.
        betas: Adam beta coefficients.

    Returns:
        An ``(optimizer, scheduler)`` tuple. Param group 0 holds the decayed
        parameters, group 1 the parameters exempt from weight decay.
    """
    import math

    # Parameters that are 1-D, or whose name contains "bias" or "norm"
    # (case-insensitive), are exempt from weight decay.
    decay_params = []
    no_decay_params = []
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        if param.ndim == 1 or "bias" in name or "norm" in name.lower():
            no_decay_params.append(param)
        else:
            decay_params.append(param)

    param_groups = [
        {"params": decay_params, "weight_decay": weight_decay},
        {"params": no_decay_params, "weight_decay": 0.0},
    ]
    optimizer = torch.optim.AdamW(param_groups, lr=lr, betas=betas)

    decay_span = max(total_steps - warmup_steps, 1)

    def lr_lambda(step):
        if step < warmup_steps:
            return step / max(warmup_steps, 1)
        # Clamp to 1.0: past total_steps an unclamped cosine would climb back
        # up toward the peak learning rate.
        progress = min((step - warmup_steps) / decay_span, 1.0)
        cosine = 0.5 * (1 + math.cos(math.pi * progress))
        return min_lr_ratio + (1 - min_lr_ratio) * cosine

    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)
    return optimizer, scheduler
def safe_clip_grad_norm_(
    model: torch.nn.Module,
    max_norm: float = 1.0,
    norm_type: float = 2.0,
) -> float:
    """Clip gradients after neutralizing non-finite ones.

    Any parameter gradient that contains a NaN or Inf entry is zeroed in full
    (the whole tensor, not just the offending entries). The remaining
    gradients are then clipped in place with
    ``torch.nn.utils.clip_grad_norm_``.

    Args:
        model: Module whose gradients are clipped in place.
        max_norm: Upper bound for the total gradient norm.
        norm_type: Order of the norm (default: L2).

    Returns:
        The total gradient norm before clipping, measured after non-finite
        gradients were replaced by zeros.
    """
    params = list(model.parameters())
    for param in params:
        grad = param.grad
        if grad is not None and not torch.isfinite(grad).all():
            grad.zero_()  # replace the corrupted gradient with zeros
    total_norm = torch.nn.utils.clip_grad_norm_(
        params, max_norm, norm_type=norm_type,
    )
    return float(total_norm)

cortexnet/transformer_baseline.py ADDED Viewed

@@ -0,0 +1,157 @@
+"""
+Transformer 基线模型 (Transformer Baseline)
+标准 Transformer 语言模型，用于与 CortexNet 进行公平对比。
+使用 Pre-LN (Pre-LayerNorm) 结构 + RoPE 位置编码。
+"""
+from __future__ import annotations
+from typing import Optional, Dict
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from .blocks import RMSNorm
+from .attention import precompute_rope_freqs, apply_rope
class TransformerBlock(nn.Module):
    """Pre-norm Transformer decoder block with RoPE attention and a SwiGLU FFN.

    Args:
        d_model: Model width; must be divisible by ``num_heads``.
        num_heads: Number of attention heads.
        d_ff: Hidden width of the feed-forward network.
        max_seq_len: Length passed to ``precompute_rope_freqs``.
        dropout: Dropout probability applied to both residual branches.
        rope_theta: Base passed to ``precompute_rope_freqs``.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``num_heads``.
    """

    def __init__(self, d_model: int, num_heads: int, d_ff: int,
                 max_seq_len: int = 8192, dropout: float = 0.0,
                 rope_theta: float = 10000.0):
        super().__init__()
        # Floor division would silently drop channels and make the reshape in
        # forward() fail with an opaque size error; reject the config here.
        if d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.norm1 = RMSNorm(d_model)
        self.norm2 = RMSNorm(d_model)

        # Self-attention projections.
        self.q_proj = nn.Linear(d_model, d_model, bias=False)
        self.k_proj = nn.Linear(d_model, d_model, bias=False)
        self.v_proj = nn.Linear(d_model, d_model, bias=False)
        self.o_proj = nn.Linear(d_model, d_model, bias=False)

        # Feed-forward network (SwiGLU).
        self.gate_proj = nn.Linear(d_model, d_ff, bias=False)
        self.up_proj = nn.Linear(d_model, d_ff, bias=False)
        self.down_proj = nn.Linear(d_ff, d_model, bias=False)
        self.dropout = nn.Dropout(dropout)

        # RoPE table; non-persistent, so it is not written to the state dict.
        self.register_buffer(
            "rope_freqs",
            precompute_rope_freqs(self.head_dim, max_seq_len, rope_theta),
            persistent=False,
        )

    def _split_heads(self, t: torch.Tensor) -> torch.Tensor:
        """Reshape (B, L, d_model) to (B, num_heads, L, head_dim)."""
        batch, seq_len, _ = t.shape
        return t.view(batch, seq_len, self.num_heads, self.head_dim).transpose(1, 2)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply causal self-attention and the FFN, each with a residual.

        Args:
            x: Input of shape (batch, seq_len, d_model).

        Returns:
            Tensor of the same shape as ``x``.
        """
        B, L, D = x.shape

        # Attention branch (pre-norm).
        x_norm = self.norm1(x)
        q = apply_rope(self._split_heads(self.q_proj(x_norm)), self.rope_freqs)
        k = apply_rope(self._split_heads(self.k_proj(x_norm)), self.rope_freqs)
        v = self._split_heads(self.v_proj(x_norm))
        attn_out = F.scaled_dot_product_attention(q, k, v, is_causal=True)
        attn_out = attn_out.transpose(1, 2).contiguous().view(B, L, D)
        x = x + self.dropout(self.o_proj(attn_out))

        # SwiGLU feed-forward branch (pre-norm).
        x_norm = self.norm2(x)
        ffn_out = self.down_proj(F.silu(self.gate_proj(x_norm)) * self.up_proj(x_norm))
        return x + self.dropout(ffn_out)
class TransformerLM(nn.Module):
    """Plain Transformer language model used as a comparison baseline.

    The output head shares its weight matrix with the token embedding.

    Args:
        vocab_size: Vocabulary size.
        d_model: Model width.
        num_layers: Number of Transformer blocks.
        num_heads: Number of attention heads per block.
        d_ff: Hidden width of each block's feed-forward network.
        max_seq_len: Maximum sequence length handed to each block.
        dropout: Dropout probability.
        rope_theta: RoPE base handed to each block.
    """

    def __init__(
        self,
        vocab_size: int = 32000,
        d_model: int = 512,
        num_layers: int = 4,
        num_heads: int = 8,
        d_ff: int = 1024,
        max_seq_len: int = 8192,
        dropout: float = 0.0,
        rope_theta: float = 10000.0,
    ):
        super().__init__()
        self.vocab_size = vocab_size
        self.max_seq_len = max_seq_len

        self.embed = nn.Embedding(vocab_size, d_model)
        self.embed_dropout = nn.Dropout(dropout)

        layers = []
        for _ in range(num_layers):
            layers.append(
                TransformerBlock(
                    d_model, num_heads, d_ff, max_seq_len, dropout, rope_theta
                )
            )
        self.blocks = nn.ModuleList(layers)

        self.final_norm = RMSNorm(d_model)
        self.lm_head = nn.Linear(d_model, vocab_size, bias=False)
        # Weight tying: the output head reuses the embedding matrix.
        self.lm_head.weight = self.embed.weight

        self.apply(self._init_weights)

    def _init_weights(self, module: nn.Module):
        """Draw Linear/Embedding weights from N(0, 0.02) and zero Linear biases."""
        is_linear = isinstance(module, nn.Linear)
        if is_linear or isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
        if is_linear and module.bias is not None:
            torch.nn.init.zeros_(module.bias)

    def forward(
        self,
        input_ids: torch.Tensor,
        labels: Optional[torch.Tensor] = None,
    ) -> Dict[str, torch.Tensor]:
        """Compute logits and, when labels are given, the next-token loss.

        Args:
            input_ids: Token ids of shape (batch, seq_len).
            labels: Optional target ids of shape (batch, seq_len); positions
                equal to -100 are ignored by the loss.

        Returns:
            A dict with ``"logits"`` of shape (batch, seq_len, vocab_size)
            and, if ``labels`` was provided, a scalar ``"loss"``.
        """
        _batch, _seq_len = input_ids.shape  # enforces a 2-D input

        hidden = self.embed_dropout(self.embed(input_ids))
        for layer in self.blocks:
            hidden = layer(hidden)
        logits = self.lm_head(self.final_norm(hidden))

        output = {"logits": logits}
        if labels is None:
            return output

        # Position t predicts token t + 1: drop the last logit and the first label.
        predictions = logits[:, :-1, :].contiguous()
        targets = labels[:, 1:].contiguous()
        output["loss"] = F.cross_entropy(
            predictions.view(-1, self.vocab_size),
            targets.view(-1),
            ignore_index=-100,
        )
        return output