PyPI - cortexnet - Versions diffs - 3.2.1__py3-none-any.whl - Mend

cortexnet 3.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (39) hide show

cortexnet/__init__.py +197 -0
cortexnet/adapter/__init__.py +26 -0
cortexnet/adapter/arch_adapter.py +209 -0
cortexnet/adapter/calibrator.py +244 -0
cortexnet/adapter/inference_adapter.py +272 -0
cortexnet/adapter/model_registry.py +378 -0
cortexnet/adapter/weight_adapter.py +415 -0
cortexnet/adversarial.py +195 -0
cortexnet/attention.py +520 -0
cortexnet/blocks.py +682 -0
cortexnet/cache.py +83 -0
cortexnet/causal_reasoning.py +232 -0
cortexnet/compat.py +245 -0
cortexnet/config.py +234 -0
cortexnet/continual_learning.py +256 -0
cortexnet/cortex_block_lite.py +221 -0
cortexnet/distributed.py +213 -0
cortexnet/graph_reasoning.py +207 -0
cortexnet/hierarchical_memory.py +360 -0
cortexnet/interpretability.py +196 -0
cortexnet/memory.py +179 -0
cortexnet/meta_learning.py +187 -0
cortexnet/model.py +1360 -0
cortexnet/multi_agent.py +241 -0
cortexnet/multimodal.py +278 -0
cortexnet/ops/__init__.py +28 -0
cortexnet/ops/device_manager.py +449 -0
cortexnet/ops/npu_ops.py +243 -0
cortexnet/quantization.py +496 -0
cortexnet/routing.py +335 -0
cortexnet/self_evolution.py +174 -0
cortexnet/ssm.py +340 -0
cortexnet/training_utils.py +204 -0
cortexnet/transformer_baseline.py +157 -0
cortexnet-3.2.1.dist-info/METADATA +114 -0
cortexnet-3.2.1.dist-info/RECORD +39 -0
cortexnet-3.2.1.dist-info/WHEEL +5 -0
cortexnet-3.2.1.dist-info/licenses/LICENSE +201 -0
cortexnet-3.2.1.dist-info/top_level.txt +1 -0

cortexnet/__init__.py ADDED Viewed

@@ -0,0 +1,197 @@
+"""CortexNet public package API."""
+from __future__ import annotations
+from .config import CortexNetConfig, TrainingConfig
+from .model import CortexNet, CortexNetBase, CortexNetV2 as _CortexNetV2, CortexNetV3 as _CortexNetV3
+from .blocks import CortexBlock, CortexBlockV2 as _CortexBlockV2, CortexBlockV3 as _CortexBlockV3, RMSNorm, AdaptiveFusionGate
+from .ssm import MultiScaleSSM
+from .attention import SelectiveSparseAttention
+from .memory import SynapticMemory
+from .routing import MixtureOfExperts, ExpertFFN, CollaborativeMoE
+from .cache import CortexNetCache
+from .transformer_baseline import TransformerLM
+from .quantization import quantize_dynamic, QuantizationWrapper
+from .hierarchical_memory import (
+    HierarchicalMemorySystem,
+    WorkingMemory,
+    EpisodicMemory,
+    SemanticMemory,
+    MemoryController,
+)
+from .graph_reasoning import GraphReasoningModule
+from .meta_learning import MetaLearningAdapter, TaskAdaptiveController, ContextEncoder
+from .multimodal import MultiModalEncoder, PatchEmbedding, AudioEmbedding, CrossModalFusion
+from .continual_learning import (
+    ElasticWeightConsolidation,
+    ProgressiveMemoryReplay,
+    ContinualLearningManager,
+)
+from .interpretability import ThoughtFlowMonitor
+from .causal_reasoning import CausalReasoningModule, InterventionalAttention, CounterfactualBranch
+from .self_evolution import SelfEvolutionEngine, DynamicPathController, ComputeBudgetAllocator
+from .multi_agent import MultiAgentSystem, SpecialistAgent, AgentCoordinator, SharedMessageBoard
+from .adversarial import AdversarialShield, AdversarialTrainer, InputShield, FeatureShield
+from .training_utils import GradientMonitor, check_gradients_finite, set_seed, get_best_device
+from .adapter import (
+    WeightAdapter,
+    ArchitectureAdapter,
+    InferenceAdapter,
+    LightweightCalibrator,
+    ModelRegistry,
+    detect_model_type,
+    get_cortexnet_config,
+)
+from .ops import (
+    DeviceManager,
+    get_best_device_info,
+    is_npu_available,
+    is_mlu_available,
+    get_device_type,
+    resolve_device_string,
+    resolve_dtype_for_device,
+    NPUOperators,
+    get_operators,
+)
+from .distributed import (
+    setup_distributed,
+    cleanup_distributed,
+    wrap_fsdp,
+    wrap_ddp,
+    get_rank,
+    get_world_size,
+    is_main_process,
+)
+_HAS_DATA = False  # set to True below only when the optional `.data` import succeeds
+try:  # optional dataset/tokenizer helpers; an ImportError here is tolerated
+    from .data import (
+        SimpleTokenizer,
+        MiniMindTokenizer,
+        TextCorpusDataset,
+        StreamingDataset,
+        ConversationDataset,
+        PretrainDataset,
+        CodeCompletionDataset,
+        CodeGenerationDataset,
+        download_wikitext2,
+        download_minimind_data,
+    )
+    _DATA_EXPORTS = (  # tuple referencing every optional name imported above
+        SimpleTokenizer,
+        MiniMindTokenizer,
+        TextCorpusDataset,
+        StreamingDataset,
+        ConversationDataset,
+        PretrainDataset,
+        CodeCompletionDataset,
+        CodeGenerationDataset,
+        download_wikitext2,
+        download_minimind_data,
+    )
+    _HAS_DATA = True
+except ImportError:  # `.data` (or something it imports) is unavailable
+    _DATA_EXPORTS = ()  # no optional exports in this case
+    pass  # NOTE(review): redundant after the assignment above
+# Legacy compatibility aliases (not part of the default public API list).
+CortexNetV2 = _CortexNetV2  # importable by name, but deliberately absent from __all__
+CortexNetV3 = _CortexNetV3
+CortexBlockV2 = _CortexBlockV2
+CortexBlockV3 = _CortexBlockV3
+__version__ = "3.2.1"  # NOTE(review): presumably must track the wheel version — confirm
+__all__ = [  # names exported by `from cortexnet import *`
+    "CortexNet",  # from .model
+    "CortexNetBase",
+    "CortexNetConfig",  # from .config
+    "TrainingConfig",
+    "CortexBlock",  # from .blocks
+    "RMSNorm",
+    "AdaptiveFusionGate",
+    "MultiScaleSSM",  # from .ssm
+    "SelectiveSparseAttention",  # from .attention
+    "SynapticMemory",  # from .memory
+    "MixtureOfExperts",  # from .routing
+    "ExpertFFN",
+    "CollaborativeMoE",
+    "CortexNetCache",  # from .cache
+    "TransformerLM",  # from .transformer_baseline
+    "quantize_dynamic",  # from .quantization
+    "QuantizationWrapper",
+    "HierarchicalMemorySystem",  # from .hierarchical_memory
+    "WorkingMemory",
+    "EpisodicMemory",
+    "SemanticMemory",
+    "MemoryController",
+    "GraphReasoningModule",  # from .graph_reasoning
+    "MetaLearningAdapter",  # from .meta_learning
+    "TaskAdaptiveController",
+    "ContextEncoder",
+    "MultiModalEncoder",  # from .multimodal
+    "PatchEmbedding",
+    "AudioEmbedding",
+    "CrossModalFusion",
+    "ElasticWeightConsolidation",  # from .continual_learning
+    "ProgressiveMemoryReplay",
+    "ContinualLearningManager",
+    "ThoughtFlowMonitor",  # from .interpretability
+    "CausalReasoningModule",  # from .causal_reasoning
+    "InterventionalAttention",
+    "CounterfactualBranch",
+    "SelfEvolutionEngine",  # from .self_evolution
+    "DynamicPathController",
+    "ComputeBudgetAllocator",
+    "MultiAgentSystem",  # from .multi_agent
+    "SpecialistAgent",
+    "AgentCoordinator",
+    "SharedMessageBoard",
+    "AdversarialShield",  # from .adversarial
+    "AdversarialTrainer",
+    "InputShield",
+    "FeatureShield",
+    "GradientMonitor",  # from .training_utils
+    "check_gradients_finite",
+    "set_seed",
+    "get_best_device",
+    "WeightAdapter",  # from .adapter
+    "ArchitectureAdapter",
+    "InferenceAdapter",
+    "LightweightCalibrator",
+    "ModelRegistry",
+    "detect_model_type",
+    "get_cortexnet_config",
+    "DeviceManager",  # from .ops
+    "get_best_device_info",
+    "is_npu_available",
+    "is_mlu_available",
+    "get_device_type",
+    "resolve_device_string",
+    "resolve_dtype_for_device",
+    "NPUOperators",
+    "get_operators",
+    "setup_distributed",  # from .distributed
+    "cleanup_distributed",
+    "wrap_fsdp",
+    "wrap_ddp",
+    "get_rank",
+    "get_world_size",
+    "is_main_process",
+]
+if _HAS_DATA:  # optional `.data` helpers are exported only when their import succeeded
+    __all__.extend(
+        [
+            "SimpleTokenizer",
+            "MiniMindTokenizer",
+            "TextCorpusDataset",
+            "StreamingDataset",
+            "ConversationDataset",
+            "PretrainDataset",
+            "CodeCompletionDataset",
+            "CodeGenerationDataset",
+            "download_wikitext2",
+            "download_minimind_data",
+        ]
+    )

cortexnet/adapter/__init__.py ADDED Viewed

@@ -0,0 +1,26 @@
+"""
+CortexNet 适配器子包 (Adapter Sub-package)
+提供开源大模型到 CortexNet 的自动适配能力：
+- 模型识别与注册
+- 权重映射
+- 架构适配
+- 推理接口统一
+- 轻量校准
+"""
+from .model_registry import ModelRegistry, detect_model_type, get_cortexnet_config
+from .weight_adapter import WeightAdapter
+from .arch_adapter import ArchitectureAdapter
+from .inference_adapter import InferenceAdapter
+from .calibrator import LightweightCalibrator
+__all__ = [  # public names of the adapter sub-package
+    "ModelRegistry",  # from .model_registry
+    "detect_model_type",
+    "get_cortexnet_config",
+    "WeightAdapter",  # from .weight_adapter
+    "ArchitectureAdapter",  # from .arch_adapter
+    "InferenceAdapter",  # from .inference_adapter
+    "LightweightCalibrator",  # from .calibrator
+]

cortexnet/adapter/arch_adapter.py ADDED Viewed

@@ -0,0 +1,209 @@
+"""
+架构适配层 (Architecture Adapter)
+核心功能：
+  根据源模型的架构特性，自动调整 CortexNet 内部模块的参数和行为，
+  使其最大程度保留原模型的能力。
+适配项：
+  1. 稀疏注意力 top-k / 滑动窗口
+  2. SSM 多尺度参数缩放
+  3. 动态门控阈值（训推一致性）
+  4. 位置编码参数（RoPE scaling / ALiBi）
+  5. MoE 路由初始化
+"""
+from __future__ import annotations
+import logging
+from typing import Any
+import torch
+import torch.nn as nn
+logger = logging.getLogger(__name__)
+class ArchitectureAdapter:
+    """架构适配器。
+    在模型初始化后，根据源模型类型自动调整 CortexNet 模块参数。
+    """
+    def __init__(self, model_type: str, config: Any):
+        self.model_type = model_type
+        self.config = config
+    def adapt(self, cortex_model: nn.Module) -> nn.Module:
+        """对 CortexNet 模型执行全面架构适配。
+        Args:
+            cortex_model: 已初始化的 CortexNet 模型
+        Returns:
+            适配后的模型（原地修改）
+        """
+        logger.info(f"Starting architecture adaptation for model type: {self.model_type}")
+        self._adapt_attention(cortex_model)
+        self._adapt_ssm(cortex_model)
+        self._adapt_position_encoding(cortex_model)
+        self._adapt_dynamic_gating(cortex_model)
+        self._adapt_moe(cortex_model)
+        self._adapt_normalization(cortex_model)
+        logger.info("Architecture adaptation complete.")
+        return cortex_model
+    def _adapt_attention(self, model: nn.Module):
+        """适配注意力模块。"""
+        preserve_pretrained_behavior = bool(getattr(self.config, "source_model_path", None))
+        for block in model.blocks:
+            attn = block.attention
+            # 预训练权重迁移优先保真：Lite 稀疏注意力退化为全注意力，避免随机重要性打分破坏输出。
+            if preserve_pretrained_behavior and hasattr(attn, "top_k_ratio"):
+                attn.top_k_ratio = 1.0
+            # 滑动窗口（Mistral/Mistral-derived）
+            if self.config.sliding_window_size > 0:
+                if hasattr(attn, 'sliding_window_size'):
+                    attn.sliding_window_size = self.config.sliding_window_size
+                logger.debug(
+                    f"Set sliding window size to {self.config.sliding_window_size}"
+                )
+            # 根据模型上下文窗口长度调整 top-k
+            if (not preserve_pretrained_behavior) and self.config.max_seq_len > 8192:
+                # 长上下文模型：降低 top-k 比例以节省显存
+                new_ratio = max(0.1, 0.25 * (8192 / self.config.max_seq_len))
+                if hasattr(attn, 'top_k_ratio'):
+                    attn.top_k_ratio = new_ratio
+                logger.debug(
+                    f"Adjusted top_k_ratio to {new_ratio:.3f} "
+                    f"for max_seq_len={self.config.max_seq_len}"
+                )
+            # Lite 三路径中 SSM/Memory 无预训练权重时，先强偏向 Attention 作为稳定基线。
+            if preserve_pretrained_behavior and hasattr(block, "fusion_weights"):
+                with torch.no_grad():
+                    block.fusion_weights.copy_(
+                        torch.tensor(
+                            [-8.0, 8.0, -8.0],
+                            device=block.fusion_weights.device,
+                            dtype=block.fusion_weights.dtype,
+                        )
+                    )
+    def _adapt_ssm(self, model: nn.Module):
+        """适配 SSM 多尺度参数。"""
+        for block in model.blocks:
+            ssm = block.ssm
+            # 根据模型隐藏维度调整 SSM 内部扩展
+            if self.config.hidden_size >= 4096:
+                # 大模型：更多尺度捕获不同粒度的依赖
+                if hasattr(ssm, 'num_scales'):
+                    target_scales = min(8, self.config.hidden_size // 512)
+                    logger.debug(
+                        f"SSM scales adjusted for large model: {target_scales}"
+                    )
+    def _adapt_position_encoding(self, model: nn.Module):
+        """适配位置编码参数。"""
+        for block in model.blocks:
+            attn = block.attention
+            # RoPE scaling（动态 NTK、线性插值等）
+            if self.config.rope_scaling:
+                scaling_type = self.config.rope_scaling.get("type", "linear")
+                scaling_factor = self.config.rope_scaling.get("factor", 1.0)
+                if hasattr(attn, 'rope_scaling_factor'):
+                    attn.rope_scaling_factor = scaling_factor
+                    attn.rope_scaling_type = scaling_type
+                logger.debug(
+                    f"Applied RoPE scaling: type={scaling_type}, factor={scaling_factor}"
+                )
+            # 更新 RoPE theta
+            if hasattr(attn, 'rope_theta'):
+                attn.rope_theta = self.config.rope_theta
+    def _adapt_dynamic_gating(self, model: nn.Module):
+        """优化动态门控：从硬二值化改为软裁剪，解决训推不一致。
+        CortexNet V3 的 DynamicPathController 默认使用硬裁剪（eval 时 >0 → 1.0）。
+        这里改为软阈值（sigmoid 连续值），保证训练和推理行为一致。
+        """
+        _SOFT_THRESHOLD = 0.1  # 软裁剪阈值
+        for block in model.blocks:
+            if hasattr(block, 'path_controller'):
+                controller = block.path_controller
+                # Monkey-patch forward 方法，使推理时也使用 sigmoid（非硬裁剪）
+                original_forward = controller.forward
+                def _soft_forward(self_ctrl, x, _orig=original_forward, _thresh=_SOFT_THRESHOLD):
+                    context = x.mean(dim=1)
+                    logits = self_ctrl.gate_net(context)
+                    if self_ctrl.training:
+                        noise = torch.zeros_like(logits).uniform_(1e-4, 1 - 1e-4)
+                        noise = (torch.log(noise) - torch.log(1 - noise)).clamp(-10, 10)
+                        gates = torch.sigmoid(
+                            (logits + noise) / max(self_ctrl.temperature, 0.1)
+                        )
+                    else:
+                        # 软裁剪：sigmoid 输出，而非硬二值
+                        gates = torch.sigmoid(logits)
+                        # 低于阈值的路径置零（节省计算但保持连续性）
+                        gates = gates * (gates > _thresh).float()
+                    return gates
+                import types
+                controller.forward = types.MethodType(_soft_forward, controller)
+        logger.debug(f"Applied soft gating threshold: {_SOFT_THRESHOLD}")
+    def _adapt_moe(self, model: nn.Module):
+        """适配 MoE 路由。
+        对于非 MoE 原模型（如 LLaMA），需要将原模型的 FFN 权重
+        复制到 CortexNet MoE 的第一个专家，并初始化路由器偏向该专家。
+        Lite 模式下无 MoE，自动跳过。
+        """
+        for block in model.blocks:
+            if not hasattr(block, 'moe'):
+                continue  # Lite 模式使用 FFN，无需适配
+            moe = block.moe
+            # 检查是否已经有映射过的权重（experts.0 有非零权重）
+            expert_0 = moe.experts[0] if hasattr(moe, 'experts') else None
+            if expert_0 is None:
+                continue
+            # 初始化路由器偏置，使其倾向于激活 expert 0
+            if hasattr(moe, 'router') and hasattr(moe.router, 'weight'):
+                router = moe.router
+                with torch.no_grad():
+                    # 对 expert 0 的路由权重加大偏置
+                    if router.weight.shape[0] > 0:
+                        router.weight.data[0] += 0.5
+        logger.debug("MoE router initialized with bias toward adapted expert")
+    def _adapt_normalization(self, model: nn.Module):
+        """确保归一化层类型与 CortexNet 一致。"""
+        # CortexNet 使用 RMSNorm，如果源模型使用 LayerNorm 则参数已在 WeightAdapter 中处理
+        # 这里只做最终验证
+        for name, module in model.named_modules():
+            if isinstance(module, nn.LayerNorm):
+                logger.warning(
+                    f"Found unexpected LayerNorm at {name}. "
+                    f"CortexNet expects RMSNorm. This may cause subtle differences."
+                )

cortexnet/adapter/calibrator.py ADDED Viewed

@@ -0,0 +1,244 @@
+"""
+轻量校准器 (Lightweight Calibrator)
+核心功能：
+  使用极少量数据（~100 样本，1 epoch）微调 CortexNet 的适配层参数，
+  使模型在适配后的行为与原生模型尽可能一致。
+关键设计：
+  1. 仅优化 <1% 的参数（融合门控 + MoE 路由层）
+  2. 冻结所有核心权重（Q/K/V/FFN 等）
+  3. 仅使用自回归交叉熵损失（无辅助损失）
+  4. 校准结果缓存复用
+"""
+from __future__ import annotations
+import os
+import json
+import logging
+import hashlib
+from typing import Dict, List, Optional
+import torch
+import torch.nn as nn
+logger = logging.getLogger(__name__)
+class LightweightCalibrator:
+    """轻量校准器。
+    Args:
+        cortex_model: CortexNet 模型
+        model_type: 源模型类型
+        cache_dir: 校准参数缓存目录
+    """
+    def __init__(
+        self,
+        cortex_model: nn.Module,
+        model_type: str = "default",
+        cache_dir: Optional[str] = None,
+    ):
+        self.model = cortex_model
+        self.model_type = model_type
+        self.cache_dir = cache_dir or os.path.join(
+            os.path.expanduser("~"), ".cache", "cortexnet", "calibration"
+        )
+        os.makedirs(self.cache_dir, exist_ok=True)
+    def calibrate(
+        self,
+        calibration_data: Optional[List[Dict[str, torch.Tensor]]] = None,
+        n_samples: int = 100,
+        lr: float = 1e-5,
+        use_cache: bool = True,
+    ) -> nn.Module:
+        """执行轻量校准。
+        Args:
+            calibration_data: 校准数据列表，每个元素包含 "input_ids"
+            n_samples: 校准样本数（如未提供数据则自动生成）
+            lr: 学习率
+            use_cache: 是否使用/保存缓存
+        Returns:
+            校准后的模型
+        """
+        # 尝试加载缓存
+        cache_key = self._get_cache_key()
+        if use_cache and self._load_cached(cache_key):
+            logger.info("Loaded calibration parameters from cache.")
+            return self.model
+        # 准备校准数据
+        if calibration_data is None:
+            calibration_data = self._generate_calibration_data(n_samples)
+        if not calibration_data:
+            logger.warning("No calibration data available. Skipping calibration.")
+            return self.model
+        # 冻结核心权重，仅解冻适配层
+        trainable_params = self._freeze_core_weights()
+        if not trainable_params:
+            logger.info("No trainable adaptation parameters found. Skipping calibration.")
+            return self.model
+        trainable_count = sum(p.numel() for p in trainable_params)
+        total_count = sum(p.numel() for p in self.model.parameters())
+        logger.info(
+            f"Calibration: optimizing {trainable_count:,} / {total_count:,} params "
+            f"({100 * trainable_count / max(total_count, 1):.2f}%)"
+        )
+        # 优化器
+        optimizer = torch.optim.AdamW(trainable_params, lr=lr, weight_decay=0.01)
+        loss_fn = nn.CrossEntropyLoss(ignore_index=-100)
+        # 1 epoch 快速校准
+        self.model.train()
+        total_loss = 0.0
+        n_steps = 0
+        for batch in calibration_data[:n_samples]:
+            input_ids = batch["input_ids"]
+            if input_ids.dim() == 1:
+                input_ids = input_ids.unsqueeze(0)
+            device = next(self.model.parameters()).device
+            input_ids = input_ids.to(device)
+            labels = input_ids.clone()
+            # 清零梯度（在 forward 前，确保无残留计算图引用）
+            optimizer.zero_grad(set_to_none=True)
+            try:
+                # 前向（eval 模式避免 aux_loss 图残留，手动开启梯度）
+                self.model.eval()
+                with torch.enable_grad():
+                    output = self.model(input_ids)
+                    logits = output["logits"]
+                    # 移位交叉熵
+                    shift_logits = logits[:, :-1, :].contiguous()
+                    shift_labels = labels[:, 1:].contiguous()
+                    loss = loss_fn(
+                        shift_logits.view(-1, shift_logits.size(-1)),
+                        shift_labels.view(-1),
+                    )
+                    # 跳过 NaN/Inf loss（保护训练稳定性）
+                    if not loss.isfinite():
+                        continue
+                    loss.backward()
+                torch.nn.utils.clip_grad_norm_(trainable_params, max_norm=1.0)
+                optimizer.step()
+                total_loss += loss.detach().item()
+                n_steps += 1
+            except RuntimeError:
+                # 自动恢复：清除可能残留的梯度
+                optimizer.zero_grad(set_to_none=True)
+                continue
+        avg_loss = total_loss / max(n_steps, 1)
+        logger.info(f"Calibration complete: {n_steps} steps, avg loss={avg_loss:.4f}")
+        # 保存缓存
+        if use_cache:
+            self._save_cached(cache_key)
+        # 恢复参数梯度状态
+        for param in self.model.parameters():
+            param.requires_grad = True
+        self.model.eval()
+        return self.model
+    def _freeze_core_weights(self) -> List[nn.Parameter]:
+        """冻结核心权重，返回可训练的适配层参数。"""
+        # 需要优化的参数名模式（适配层）
+        trainable_patterns = [
+            "fusion",          # 融合门控
+            "router",          # MoE 路由器
+            "gate_net",        # 动态路径控制器
+            "meta_adapter",    # 元学习适配器
+            "task_controller", # 任务控制器
+            "path_controller", # 路径控制器
+        ]
+        trainable_params = []
+        for name, param in self.model.named_parameters():
+            is_trainable = any(pat in name for pat in trainable_patterns)
+            param.requires_grad = is_trainable
+            if is_trainable:
+                trainable_params.append(param)
+        return trainable_params
+    def _generate_calibration_data(
+        self,
+        n_samples: int,
+    ) -> List[Dict[str, torch.Tensor]]:
+        """生成合成校准数据（当无真实数据时使用）。
+        使用随机 token 序列作为校准数据。
+        虽然不如真实文本效果好，但足以校准门控/路由参数。
+        """
+        vocab_size = getattr(self.model.config, 'vocab_size', 32000)
+        max_len = min(getattr(self.model.config, 'max_seq_len', 512), 256)
+        data = []
+        for _ in range(n_samples):
+            ids = torch.randint(1, vocab_size, (max_len,), dtype=torch.long)
+            data.append({"input_ids": ids})
+        logger.info(f"Generated {n_samples} synthetic calibration samples")
+        return data
+    def _get_cache_key(self) -> str:
+        """生成校准缓存的唯一键。"""
+        config_str = json.dumps({
+            "model_type": self.model_type,
+            "hidden_size": getattr(self.model.config, 'hidden_size', 0),
+            "num_layers": getattr(self.model.config, 'num_layers', 0),
+            "vocab_size": getattr(self.model.config, 'vocab_size', 0),
+        }, sort_keys=True)
+        return hashlib.md5(config_str.encode()).hexdigest()[:12]
+    def _save_cached(self, cache_key: str):
+        """保存校准参数到缓存。"""
+        cache_path = os.path.join(self.cache_dir, f"calibration_{cache_key}.pt")
+        # 仅保存适配层参数
+        adapt_state = {
+            name: param.data.clone()
+            for name, param in self.model.named_parameters()
+            if any(pat in name for pat in [
+                "fusion", "router", "gate_net",
+                "meta_adapter", "task_controller", "path_controller",
+            ])
+        }
+        torch.save(adapt_state, cache_path)
+        logger.info(f"Saved calibration cache: {cache_path}")
+    def _load_cached(self, cache_key: str) -> bool:
+        """从缓存加载校准参数。"""
+        cache_path = os.path.join(self.cache_dir, f"calibration_{cache_key}.pt")
+        if not os.path.exists(cache_path):
+            return False
+        try:
+            adapt_state = torch.load(cache_path, map_location="cpu", weights_only=True)
+            model_state = self.model.state_dict()
+            for name, param in adapt_state.items():
+                if name in model_state and model_state[name].shape == param.shape:
+                    model_state[name] = param
+            self.model.load_state_dict(model_state, strict=False)
+            return True
+        except Exception as e:
+            logger.warning(f"Failed to load calibration cache: {e}")
+            return False