PyPI - cortexnet - Versions diffs - 3.2.1__py3-none-any.whl - Mend

cortexnet 3.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (39) hide show

cortexnet/__init__.py +197 -0
cortexnet/adapter/__init__.py +26 -0
cortexnet/adapter/arch_adapter.py +209 -0
cortexnet/adapter/calibrator.py +244 -0
cortexnet/adapter/inference_adapter.py +272 -0
cortexnet/adapter/model_registry.py +378 -0
cortexnet/adapter/weight_adapter.py +415 -0
cortexnet/adversarial.py +195 -0
cortexnet/attention.py +520 -0
cortexnet/blocks.py +682 -0
cortexnet/cache.py +83 -0
cortexnet/causal_reasoning.py +232 -0
cortexnet/compat.py +245 -0
cortexnet/config.py +234 -0
cortexnet/continual_learning.py +256 -0
cortexnet/cortex_block_lite.py +221 -0
cortexnet/distributed.py +213 -0
cortexnet/graph_reasoning.py +207 -0
cortexnet/hierarchical_memory.py +360 -0
cortexnet/interpretability.py +196 -0
cortexnet/memory.py +179 -0
cortexnet/meta_learning.py +187 -0
cortexnet/model.py +1360 -0
cortexnet/multi_agent.py +241 -0
cortexnet/multimodal.py +278 -0
cortexnet/ops/__init__.py +28 -0
cortexnet/ops/device_manager.py +449 -0
cortexnet/ops/npu_ops.py +243 -0
cortexnet/quantization.py +496 -0
cortexnet/routing.py +335 -0
cortexnet/self_evolution.py +174 -0
cortexnet/ssm.py +340 -0
cortexnet/training_utils.py +204 -0
cortexnet/transformer_baseline.py +157 -0
cortexnet-3.2.1.dist-info/METADATA +114 -0
cortexnet-3.2.1.dist-info/RECORD +39 -0
cortexnet-3.2.1.dist-info/WHEEL +5 -0
cortexnet-3.2.1.dist-info/licenses/LICENSE +201 -0
cortexnet-3.2.1.dist-info/top_level.txt +1 -0

cortexnet/hierarchical_memory.py ADDED Viewed

@@ -0,0 +1,360 @@
+"""
+Hierarchical Memory System.
+
+A three-tier memory design inspired by human cognitive architecture.
+
+Working memory (``WorkingMemory``)
+  * fast weights, updated token by token
+  * high resolution, small capacity
+  * decays quickly and captures the immediate context
+  * analogous to prefrontal working memory
+
+Episodic memory (``EpisodicMemory``)
+  * learnable memory slots
+  * accessed through cross-attention
+  * medium capacity, meant for compressed context snapshots
+  * analogous to hippocampal episodic memory
+
+Semantic memory (``SemanticMemory``)
+  * global learnable knowledge bank, intended to be shared by all layers
+  * updated slowly, through training
+  * large capacity, meant for abstract general knowledge
+  * analogous to cortical long-term semantic memory
+
+Memory controller (``MemoryController``)
+  * decides dynamically how much each tier contributes
+  * learns which memory strategy suits which context
+  * fuses the three tier outputs with learned gates
+
+Design highlights:
+  1. The three tiers differ in time scale and capacity.
+  2. The controller balances the tiers dynamically.
+  3. Selective forgetting keeps the working memory from overloading.
+  4. Working memory supplies immediate context, semantic memory long-term
+     knowledge.
+"""
+# The future import must come AFTER the docstring: a string that follows
+# another statement is a plain expression and leaves ``__doc__`` unset.
+from __future__ import annotations
+from typing import Optional, Tuple
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+class WorkingMemory(nn.Module):
+    """Fast-weight working memory with a selective forget gate.
+
+    Key/value associations are accumulated in a fast-weight matrix ``M`` and
+    a normaliser ``z``. Each token is written first and read afterwards, so
+    it always sees its own key/value pair::
+
+        f_t = sigmoid(W_f x_t)                        # per-dimension forget gate
+        M_t = f_t * M_{t-1} + phi(k_t)^T v_t          # write
+        z_t = f_t * z_{t-1} + phi(k_t)
+        o_t = phi(q_t) M_t / (phi(q_t) z_t + 1e-6)    # read
+
+    where ``phi(u) = elu(u) + 1`` and the query is additionally scaled by
+    ``memory_dim ** -0.5``.
+
+    The recurrence has two implementations: a parallel one used when no cache
+    is involved and a token-by-token one used for incremental decoding. Both
+    read *after* writing, so cached and uncached calls produce the same
+    output (up to the clamp noted in ``_parallel_forward``).
+
+    Args:
+        d_model: Size of the last dimension of the input and of the output.
+        memory_dim: Size of the query/key projections (rows of ``M``).
+    """
+
+    def __init__(self, d_model: int, memory_dim: int = 64):
+        super().__init__()
+        self.memory_dim = memory_dim
+        self.scale = memory_dim ** -0.5
+        self.q_proj = nn.Linear(d_model, memory_dim, bias=False)
+        self.k_proj = nn.Linear(d_model, memory_dim, bias=False)
+        self.v_proj = nn.Linear(d_model, d_model, bias=False)
+        # Selective forget gate: every memory dimension decays independently.
+        self.forget_gate = nn.Linear(d_model, memory_dim)
+        self.out_proj = nn.Linear(d_model, d_model, bias=False)
+        self.norm = nn.LayerNorm(d_model)
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        past_memory: Optional[torch.Tensor] = None,
+        past_z: Optional[torch.Tensor] = None,
+        use_cache: bool = False,
+    ) -> torch.Tensor | Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Run the working memory over ``x``.
+
+        Args:
+            x: Input of shape ``(B, L, d_model)``.
+            past_memory: Cached ``M`` of shape ``(B, memory_dim, d_model)``.
+            past_z: Cached ``z`` of shape ``(B, memory_dim, 1)``.
+            use_cache: When True, also return the updated ``(M, z)``.
+
+        Returns:
+            The output ``(B, L, d_model)``, or ``(output, M, z)`` when
+            ``use_cache`` is True.
+        """
+        # Any cache involvement needs the explicit state, hence the
+        # token-by-token path; otherwise use the loop-free parallel path.
+        if use_cache or past_memory is not None:
+            return self._sequential_forward(x, past_memory, past_z, use_cache)
+        return self._parallel_forward(x)
+
+    def _parallel_forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Parallel linear attention with a per-position forget gate.
+
+        Position-dependent decay is handled with a log-space cumsum::
+
+            log_cum[t]  = sum_{i<=t} log(forget[i])
+            decay(t, s) = exp(log_cum[t] - log_cum[s])
+
+        NOTE(review): ``log_cum`` is clamped to ``[-20, 20]``. Every term is
+        <= 0, so the running sum only decreases; once it reaches -20 both
+        exponentials stop changing and later positions no longer decay
+        relative to each other. ``_sequential_forward`` has no such clamp,
+        so the two paths can diverge on long sequences.
+        """
+        q = F.elu(self.q_proj(x), alpha=1.0) + 1  # (B, L, mem)
+        k = F.elu(self.k_proj(x), alpha=1.0) + 1
+        v = self.v_proj(x)                          # (B, L, D)
+        q = q * self.scale
+        forget = torch.sigmoid(self.forget_gate(x))  # (B, L, mem)
+        log_f = torch.log(forget.clamp(min=1e-6))   # (B, L, mem)
+        log_f_cum = torch.cumsum(log_f, dim=1).clamp(-20, 20)  # overflow guard
+        exp_neg = torch.exp(-log_f_cum)  # (B, L, mem), removes decay up to s
+        exp_pos = torch.exp(log_f_cum)   # (B, L, mem), applies decay up to t
+        # M_t = sum_{s<=t} decay(t, s) * k_s (outer) v_s, all positions at once
+        kv = k.unsqueeze(-1) * v.unsqueeze(-2)       # (B, L, mem, D)
+        weighted_kv = exp_neg.unsqueeze(-1) * kv
+        cum_kv = torch.cumsum(weighted_kv, dim=1)
+        M = exp_pos.unsqueeze(-1) * cum_kv            # (B, L, mem, D)
+        # z_t = sum_{s<=t} decay(t, s) * k_s
+        weighted_k = exp_neg * k
+        cum_k = torch.cumsum(weighted_k, dim=1)
+        z = exp_pos * cum_k                            # (B, L, mem)
+        numerator = torch.einsum('blm,blmd->bld', q, M)
+        denominator = torch.einsum('blm,blm->bl', q, z).unsqueeze(-1) + 1e-6
+        return self.out_proj(self.norm(numerator / denominator))
+
+    def _sequential_forward(self, x, past_memory, past_z, use_cache):
+        """Token-by-token implementation used for incremental caching.
+
+        Each step first folds the current key/value into the state and then
+        reads with the current query, which is exactly what the cumulative
+        sums in ``_parallel_forward`` compute. The returned state is the
+        state after the last token.
+        """
+        B, L, D = x.shape
+        q = F.elu(self.q_proj(x), alpha=1.0) + 1
+        k = F.elu(self.k_proj(x), alpha=1.0) + 1
+        v = self.v_proj(x)
+        q = q * self.scale
+        forget = torch.sigmoid(self.forget_gate(x))
+        memory = (
+            past_memory if past_memory is not None
+            else torch.zeros(B, self.memory_dim, D, device=x.device, dtype=x.dtype)
+        )
+        z = (
+            past_z if past_z is not None
+            else torch.zeros(B, self.memory_dim, 1, device=x.device, dtype=x.dtype)
+        )
+        outputs = []
+        for t in range(L):
+            q_t, k_t, v_t = q[:, t], k[:, t], v[:, t]
+            f_t = forget[:, t].unsqueeze(-1)  # (B, mem, 1)
+            # Write before reading so the token attends to itself, as in the
+            # parallel path (reading first made the first output all zeros).
+            memory = f_t * memory + torch.bmm(k_t.unsqueeze(2), v_t.unsqueeze(1))
+            z = f_t * z + k_t.unsqueeze(2)
+            num = torch.bmm(q_t.unsqueeze(1), memory).squeeze(1)  # (B, D)
+            den = torch.bmm(q_t.unsqueeze(1), z).squeeze(1)       # (B, 1)
+            outputs.append(num / (den + 1e-6))
+        out = self.out_proj(self.norm(torch.stack(outputs, dim=1)))
+        if use_cache:
+            return out, memory, z
+        return out
+class EpisodicMemory(nn.Module):
+    """Episodic memory: learnable slots read through cross-attention.
+
+    A fixed number of slot vectors is stored as a parameter. ``forward``
+    lets every input token query the slots with multi-head cross-attention
+    (input -> queries, slots -> keys/values). The slots themselves change
+    only through gradient updates.
+
+    ``update_gate`` is created but not used by ``forward``; it is kept so
+    that the parameter names in existing checkpoints stay valid.
+
+    Args:
+        d_model: Model dimension.
+        num_slots: Number of memory slots.
+        num_heads: Number of cross-attention heads.
+
+    Raises:
+        ValueError: If ``num_heads`` is not positive or does not divide
+            ``d_model``.
+    """
+
+    def __init__(self, d_model: int, num_slots: int = 32, num_heads: int = 4):
+        super().__init__()
+        # Fail at construction time instead of inside ``.view`` in forward.
+        if num_heads <= 0 or d_model % num_heads != 0:
+            raise ValueError(
+                f"d_model ({d_model}) must be divisible by a positive "
+                f"num_heads (got {num_heads})"
+            )
+        self.num_slots = num_slots
+        self.num_heads = num_heads
+        self.head_dim = d_model // num_heads
+        # Learnable memory slots.
+        self.memory_slots = nn.Parameter(
+            torch.randn(1, num_slots, d_model) * 0.02
+        )
+        # Cross-attention projections.
+        self.q_proj = nn.Linear(d_model, d_model, bias=False)
+        self.k_proj = nn.Linear(d_model, d_model, bias=False)
+        self.v_proj = nn.Linear(d_model, d_model, bias=False)
+        self.out_proj = nn.Linear(d_model, d_model, bias=False)
+        # Memory update gate (currently unused, see class docstring).
+        self.update_gate = nn.Sequential(
+            nn.Linear(d_model * 2, d_model),
+            nn.Sigmoid(),
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Retrieve from the slots for every token of ``x``.
+
+        Args:
+            x: Input of shape ``(B, L, d_model)``.
+
+        Returns:
+            Tensor of shape ``(B, L, d_model)``.
+        """
+        B, L, D = x.shape
+        memory = self.memory_slots.expand(B, -1, -1)  # (B, slots, D)
+        # Cross-attention: input queries -> memory keys/values,
+        # each reshaped to (B, heads, tokens, head_dim).
+        q = self.q_proj(x).view(B, L, self.num_heads, self.head_dim).transpose(1, 2)
+        k = self.k_proj(memory).view(B, self.num_slots, self.num_heads, self.head_dim).transpose(1, 2)
+        v = self.v_proj(memory).view(B, self.num_slots, self.num_heads, self.head_dim).transpose(1, 2)
+        # Not causal: every token may attend to every slot.
+        out = F.scaled_dot_product_attention(q, k, v, dropout_p=0.0, is_causal=False)
+        out = out.transpose(1, 2).contiguous().view(B, L, D)
+        return self.out_proj(out)
+class SemanticMemory(nn.Module):
+    """Semantic memory: a global bank of learnable knowledge vectors.
+
+    ``forward`` projects the input into queries, attends over the knowledge
+    bank with single-head attention (the bank serves as both keys and
+    values), blends the retrieved vectors with the input through a sigmoid
+    gate and applies an output projection.
+
+    ``memory_to_input`` is registered as a parameter but is not used by
+    ``forward``.
+
+    Args:
+        d_model: Model dimension.
+        num_slots: Number of knowledge vectors.
+    """
+
+    def __init__(self, d_model: int, num_slots: int = 64):
+        super().__init__()
+        self.num_slots = num_slots
+        # Global knowledge vectors, small random init.
+        initial_bank = torch.randn(1, num_slots, d_model) * 0.02
+        self.knowledge = nn.Parameter(initial_bank)
+        # Projections (creation order kept: it fixes init and key order).
+        self.input_to_memory = nn.Linear(d_model, d_model, bias=False)
+        self.memory_to_input = nn.Linear(d_model, d_model, bias=False)
+        self.gate = nn.Linear(d_model, d_model)
+        self.out_proj = nn.Linear(d_model, d_model, bias=False)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Return the gated mix of retrieved knowledge and ``x``.
+
+        Args:
+            x: Input of shape ``(B, L, d_model)``.
+
+        Returns:
+            Tensor of shape ``(B, L, d_model)``.
+        """
+        batch, _, _ = x.shape
+        # A singleton head axis turns the bank into (B, 1, slots, D).
+        bank = self.knowledge.expand(batch, -1, -1).unsqueeze(1)
+        queries = self.input_to_memory(x).unsqueeze(1)  # (B, 1, L, D)
+        attended = F.scaled_dot_product_attention(
+            queries, bank, bank, dropout_p=0.0, is_causal=False,
+        )
+        retrieved = attended.squeeze(1)  # (B, L, D)
+        # Per-feature gate between retrieved knowledge and the raw input.
+        mix = torch.sigmoid(self.gate(x))
+        blended = mix * retrieved + (1 - mix) * x
+        return self.out_proj(blended)
+class MemoryController(nn.Module):
+    """Memory controller: input-dependent fusion of the three memory tiers.
+
+    A small bottleneck MLP maps every input position to three logits; their
+    softmax gives the mixing weights for the working, episodic and semantic
+    outputs (in that order).
+    """
+
+    def __init__(self, d_model: int):
+        super().__init__()
+        hidden = max(d_model // 4, 32)
+        layers = [
+            nn.Linear(d_model, hidden),
+            nn.SiLU(),
+            nn.Linear(hidden, 3),  # one logit per memory tier
+        ]
+        self.controller = nn.Sequential(*layers)
+
+    def forward(
+        self,
+        working_out: torch.Tensor,
+        episodic_out: torch.Tensor,
+        semantic_out: torch.Tensor,
+        x: torch.Tensor,
+    ) -> torch.Tensor:
+        """Blend the three tier outputs with weights computed from ``x``.
+
+        Args:
+            working_out: Working-memory output, ``(B, L, D)``.
+            episodic_out: Episodic-memory output, ``(B, L, D)``.
+            semantic_out: Semantic-memory output, ``(B, L, D)``.
+            x: Input the weights are computed from, ``(B, L, D)``.
+
+        Returns:
+            The weighted sum of the three outputs, ``(B, L, D)``.
+        """
+        tier_weights = self.controller(x).softmax(dim=-1)  # (B, L, 3)
+        tiers = torch.stack(
+            (working_out, episodic_out, semantic_out), dim=-1
+        )  # (B, L, D, 3)
+        # Broadcast each position's three weights over the feature axis.
+        return torch.sum(tiers * tier_weights[..., None, :], dim=-1)
+class HierarchicalMemorySystem(nn.Module):
+    """Hierarchical memory system: one interface over all tiers.
+
+    Owns a working, an episodic and a semantic memory plus the controller
+    that fuses their outputs.
+
+    Args:
+        d_model: Model dimension.
+        working_dim: Working-memory dimension.
+        episodic_slots: Number of episodic memory slots.
+        semantic_slots: Number of semantic memory slots.
+        num_heads: Number of attention heads of the episodic memory.
+    """
+
+    def __init__(
+        self,
+        d_model: int,
+        working_dim: int = 64,
+        episodic_slots: int = 32,
+        semantic_slots: int = 64,
+        num_heads: int = 4,
+    ):
+        super().__init__()
+        self.working = WorkingMemory(d_model, working_dim)
+        self.episodic = EpisodicMemory(d_model, episodic_slots, num_heads)
+        self.semantic = SemanticMemory(d_model, semantic_slots)
+        self.controller = MemoryController(d_model)
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        past_working_memory: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        use_cache: bool = False,
+    ) -> torch.Tensor | Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
+        """Run all three tiers on ``x`` and fuse their outputs.
+
+        Args:
+            x: Input of shape ``(batch, seq_len, d_model)``.
+            past_working_memory: ``(past_memory, past_z)``; only the working
+                memory carries state between calls.
+            use_cache: When True, return ``(output, (new_memory, new_z))``.
+
+        Returns:
+            The fused output, or the fused output together with the updated
+            working-memory state when ``use_cache`` is True.
+        """
+        past_mem, past_z = (
+            past_working_memory
+            if past_working_memory is not None
+            else (None, None)
+        )
+        working_result = self.working(
+            x, past_memory=past_mem, past_z=past_z, use_cache=use_cache
+        )
+        if use_cache:
+            # With caching the working memory also hands back its state.
+            w_out, new_mem, new_z = working_result
+        else:
+            w_out = working_result
+        fused = self.controller(w_out, self.episodic(x), self.semantic(x), x)
+        if not use_cache:
+            return fused
+        return fused, (new_mem, new_z)

cortexnet/interpretability.py ADDED Viewed

@@ -0,0 +1,196 @@
+"""
+可解释性与监控系统 (Interpretability & Monitoring System)
+核心创新：
+  实时追踪模型内部的"思维流"，让模型的决策过程透明可解释。
+  ┌─────────────────────────────────────────────────────────┐
+  │              CortexNet 思维流监控                        │
+  ├─────────────────────────────────────────────────────────┤
+  │                                                         │
+  │  ┌─── 路径利用率分析 ──────────────────────────────┐   │
+  │  │  追踪 SSM / Attention / Memory / GNN 的使用比例  │   │
+  │  │  了解模型在不同层、不同 token 上偏好哪条路径     │   │
+  │  └──────────────────────────────────────────────────┘   │
+  │                                                         │
+  │  ┌─── 专家路由可视化 ──────────────────────────────┐   │
+  │  │  追踪哪些 token 被路由到哪些专家                 │   │
+  │  │  检测专家负载均衡和专业化程度                     │   │
+  │  └──────────────────────────────────────────────────┘   │
+  │                                                         │
+  │  ┌─── 注意力重要性分析 ────────────────────────────┐   │
+  │  │  追踪哪些 token 被选为"重要 token"               │   │
+  │  │  可视化稀疏注意力的选择模式                       │   │
+  │  └──────────────────────────────────────────────────┘   │
+  │                                                         │
+  │  ┌─── 记忆系统状态 ───────────────────────────────┐    │
+  │  │  监控工作记忆/情景记忆/语义记忆的活跃度          │   │
+  │  │  追踪记忆控制器的分配策略                         │   │
+  │  └──────────────────────────────────────────────────┘   │
+  └─────────────────────────────────────────────────────────┘
+"""
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from typing import Dict, List, Any
+from collections import defaultdict
+class ThoughtFlowMonitor:
+    """Record fusion-gate and MoE-router outputs through forward hooks.
+
+    Hooks are attached to ``block.fusion.gate`` and ``block.moe.router`` for
+    every entry of ``model.blocks`` (missing attributes are skipped), so the
+    model code itself needs no changes. For each layer only the tensors of
+    the most recent forward call are kept.
+
+    Usage::
+
+        monitor = ThoughtFlowMonitor(model)
+        monitor.start_monitoring()
+        output = model(input_ids)
+        report = monitor.get_report()
+        monitor.stop_monitoring()
+
+    or, so that the hooks are removed even if the forward pass raises::
+
+        with ThoughtFlowMonitor(model) as monitor:
+            output = model(input_ids)
+        report = monitor.get_report()
+    """
+
+    def __init__(self, model: nn.Module):
+        self.model = model
+        self.hooks: List[Any] = []
+        self.data: Dict[str, Any] = defaultdict(dict)
+        self._monitoring = False
+
+    def __enter__(self) -> "ThoughtFlowMonitor":
+        self.start_monitoring()
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback) -> None:
+        # Captured data is kept; only the hooks are removed.
+        self.stop_monitoring()
+
+    def start_monitoring(self):
+        """Start monitoring: drop old hooks and data, register new hooks."""
+        self.stop_monitoring()
+        self.data = defaultdict(dict)
+        self._monitoring = True
+        self._register_hooks()
+
+    def stop_monitoring(self):
+        """Stop monitoring by removing every registered hook."""
+        for hook in self.hooks:
+            hook.remove()
+        self.hooks = []
+        self._monitoring = False
+
+    def _register_hooks(self):
+        """Attach forward hooks to the gate and router of every block."""
+        blocks = getattr(self.model, "blocks", None) or ()
+        for layer_idx, block in enumerate(blocks):
+            # Fusion gate output.
+            gate_module = getattr(getattr(block, "fusion", None), "gate", None)
+            if gate_module is not None:
+                self.hooks.append(
+                    gate_module.register_forward_hook(
+                        self._make_fusion_hook(layer_idx)
+                    )
+                )
+            # MoE routing decision.
+            router = getattr(getattr(block, "moe", None), "router", None)
+            if router is not None:
+                self.hooks.append(
+                    router.register_forward_hook(
+                        self._make_routing_hook(layer_idx)
+                    )
+                )
+
+    def _make_fusion_hook(self, layer_idx: int):
+        def hook(module, inputs, output):
+            # The gate output is softmax-normalised over its last dimension
+            # and stored on the CPU; non-tensor outputs are ignored.
+            if isinstance(output, torch.Tensor):
+                weights = F.softmax(output.detach(), dim=-1)
+                self.data["fusion_weights"][layer_idx] = weights.cpu()
+        return hook
+
+    def _make_routing_hook(self, layer_idx: int):
+        def hook(module, inputs, output):
+            # The router output is treated as logits over its last dimension.
+            if isinstance(output, torch.Tensor):
+                probs = F.softmax(output.detach(), dim=-1)
+                self.data["routing_probs"][layer_idx] = probs.cpu()
+        return hook
+
+    @staticmethod
+    def _mean_over_leading_dims(values: torch.Tensor) -> torch.Tensor:
+        """Average over every dimension except the last one.
+
+        Works for ``(N, C)`` as well as ``(B, L, C)`` captures, so the
+        analyses do not depend on whether the hooked module flattens
+        batch and sequence.
+        """
+        return values.reshape(-1, values.shape[-1]).mean(dim=0)
+
+    def get_report(self) -> Dict[str, Any]:
+        """Build the report from whatever the hooks have captured so far."""
+        report = {}
+        # Pathway utilisation.
+        if "fusion_weights" in self.data:
+            report["pathway_utilization"] = (
+                self._analyze_pathway_utilization()
+            )
+        # Expert load.
+        if "routing_probs" in self.data:
+            report["expert_load"] = self._analyze_expert_load()
+        return report
+
+    def _analyze_pathway_utilization(self) -> Dict[str, List[float]]:
+        """Return, per pathway name, the mean gate weight of every layer.
+
+        Gate channels are matched to the names by index; channels beyond
+        the four known names are ignored.
+        """
+        path_names = ["SSM", "Attention", "Memory", "GraphReasoning"]
+        utilization = {name: [] for name in path_names}
+        for layer_idx in sorted(self.data["fusion_weights"]):
+            weights = self.data["fusion_weights"][layer_idx]
+            mean_weights = self._mean_over_leading_dims(weights)  # (num_paths,)
+            for name, value in zip(path_names, mean_weights.tolist()):
+                utilization[name].append(value)
+        return utilization
+
+    def _analyze_expert_load(self) -> Dict[str, Any]:
+        """Return per-layer statistics of the mean routing probability."""
+        expert_loads = {}
+        for layer_idx in sorted(self.data["routing_probs"]):
+            probs = self.data["routing_probs"][layer_idx]
+            mean_probs = self._mean_over_leading_dims(probs)  # (num_experts,)
+            highest = mean_probs.max().item()
+            lowest = mean_probs.min().item()
+            expert_loads[f"layer_{layer_idx}"] = {
+                "mean_prob": mean_probs.tolist(),
+                "max_prob": highest,
+                "min_prob": lowest,
+                # 1.0 means perfectly even load; guard the division.
+                "balance_ratio": lowest / highest if highest > 0 else 0.0,
+            }
+        return expert_loads
+
+    def print_summary(self):
+        """Print a human-readable summary of the current report."""
+        report = self.get_report()
+        print("\n  ╔═══ CortexNet 思维流报告 ═══╗\n")
+        if "pathway_utilization" in report:
+            print("  ◆ 路径利用率（各层平均）:")
+            for path, values in report["pathway_utilization"].items():
+                if values:
+                    avg = sum(values) / len(values)
+                    bar = "█" * int(avg * 40)
+                    print(f"    {path:>14s}: {avg:.1%} {bar}")
+        if "expert_load" in report:
+            print("\n  ◆ 专家负载均衡:")
+            for layer, stats in report["expert_load"].items():
+                balance = stats.get("balance_ratio", 0)
+                status = "✓ 均衡" if balance > 0.5 else "⚠ 不均衡"
+                print(
+                    f"    {layer}: balance={balance:.2f} {status}"
+                )
+        print("\n  ╚═══════════════════════════╝")