npm - @lorrylurui/code-intelligence-mcp - Versions diffs - 2.0.5 → 2.0.6 - Mend

@lorrylurui/code-intelligence-mcp 2.0.5 → 2.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

package/README.md +103 -0
package/dist/config/env.js +9 -0
package/dist/config/tuning.js +114 -0
package/dist/db/schema.js +37 -0
package/dist/indexer/chunkText.js +164 -0
package/dist/indexer/embedText.js +2 -2
package/dist/repositories/chunkRepository.js +181 -0
package/dist/repositories/symbolRepository.js +4 -5
package/dist/server/createServer.js +5 -1
package/dist/services/contextAssembler.js +150 -0
package/dist/services/ranking.js +37 -39
package/dist/services/recommendationService.js +325 -104
package/dist/tools/queryDocs.js +113 -0
package/dist/tools/searchSymbols.js +3 -2
package/dist/types/chunk.js +1 -0
package/package.json +1 -1

package/dist/repositories/symbolRepository.js CHANGED Viewed

@@ -1,9 +1,8 @@
 import { env } from '../config/env.js';
 import { getPool } from '../db/postgres.js';
+import { SYMBOL_SIMILARITY_THRESHOLD, SYMBOL_TOP_K } from '../config/tuning.js';
 import { createEmbeddingClient } from '../services/embeddingClient.js';
 import { SEARCHABLE_STATUS } from '../config/symbolStatus.js';
-const SIMILARITY_THRESHOLD = 0;
-const TOP_K = 20;
 const inMemorySymbols = [
     {
         id: 1,
@@ -188,7 +187,7 @@ export class SymbolRepository {
      * 不再需要在 Node 拉取全量向量做内存计算。
      */
     async searchSemanticHits(query, opts) {
-        console.error('[code-intelligence-mcp] repository.searchSemanticHits.start query=%s type=%s table=%s limit=%s threshold=%s searchableStatus=%s hasPool=%s', query, opts?.type ?? '', env.symbolsTable, String(opts?.limit ?? TOP_K), String(SIMILARITY_THRESHOLD), String(SEARCHABLE_STATUS), String(Boolean(this.pool)));
+        console.error('[code-intelligence-mcp] repository.searchSemanticHits.start query=%s type=%s table=%s limit=%s threshold=%s searchableStatus=%s hasPool=%s', query, opts?.type ?? '', env.symbolsTable, String(opts?.limit ?? SYMBOL_TOP_K), String(SYMBOL_SIMILARITY_THRESHOLD), String(SEARCHABLE_STATUS), String(Boolean(this.pool)));
         if (!env.embeddingServiceUrl) {
             console.error('[code-intelligence-mcp] repository.searchSemanticHits.error missingEmbeddingServiceUrl');
             throw new Error('语义检索需配置 EMBEDDING_SERVICE_URL 并启动嵌入服务');
@@ -197,7 +196,7 @@ export class SymbolRepository {
             console.error('[code-intelligence-mcp] repository.searchSemanticHits.noPool returnEmpty');
             return [];
         }
-        const limit = opts?.limit ?? TOP_K;
+        const limit = opts?.limit ?? SYMBOL_TOP_K;
         const client = createEmbeddingClient(env.embeddingServiceUrl);
         const [queryVec] = await client.embed([query.trim()]);
         if (!queryVec?.length) {
@@ -226,7 +225,7 @@ export class SymbolRepository {
             symbol: mapRow(r),
             similarity: Number(r.similarity),
         }));
-        const passed = mapped.filter((x) => x.similarity >= SIMILARITY_THRESHOLD);
+        const passed = mapped.filter((x) => x.similarity >= SYMBOL_SIMILARITY_THRESHOLD);
         console.error('[code-intelligence-mcp] repository.searchSemanticHits.db table=%s rawRows=%s passedThreshold=%s topRaw=%s', env.symbolsTable, String(rows.length), String(passed.length), JSON.stringify(mapped.slice(0, 5).map((x) => ({
             id: x.symbol.id,
             name: x.symbol.name,

package/dist/server/createServer.js CHANGED Viewed

@@ -8,6 +8,7 @@ import { createSearchByStructureTool } from '../tools/searchByStructure.js';
 import { createIncUsageTool } from '../tools/incUsage.js';
 import { RecommendationService } from '../services/recommendationService.js';
 import { createRecommendComponentTool } from '../tools/recommendComponent.js';
+import { createQueryDocsTool } from '../tools/queryDocs.js';
 export function createServer() {
     console.error('[code-intelligence-mcp] createServer.init');
     const server = new McpServer({
@@ -39,6 +40,9 @@ export function createServer() {
     console.error('[code-intelligence-mcp] tool.registered %s', recommendComponentTool.name);
     registerReusableCodeAdvisorPrompt(server);
     console.error('[code-intelligence-mcp] prompt.registered reusable-code-advisor');
-    console.error('[code-intelligence-mcp] createServer.ready toolCount=6 promptCount=1');
+    const queryDocsTool = createQueryDocsTool();
+    server.tool(queryDocsTool.name, queryDocsTool.description, queryDocsTool.inputSchema, queryDocsTool.handler);
+    console.error('[code-intelligence-mcp] tool.registered %s', queryDocsTool.name);
+    console.error('[code-intelligence-mcp] createServer.ready toolCount=7 promptCount=1');
     return server;
 }

package/dist/services/contextAssembler.js ADDED Viewed

@@ -0,0 +1,150 @@
+/**
+ * contextAssembler: RAG 上下文组装器。
+ *
+ * 完整流程：
+ *   topK hits
+ *     → 邻块扩展（getAdjacentChunks）：补全被截断的边界上下文
+ *     → 去重（path + chunk_index）：避免重复块被重复计费
+ *     → 相关性排序：命中块靠前，纯邻块靠后
+ *     → 字符预算截断：超出 CONTEXT_MAX_CHARS 时丢弃末尾块
+ *     → 文本渲染：拼成可直接注入 prompt 的 contextText
+ *
+ * 为什么需要邻块扩展？
+ *   chunk 切分时按结构和字符数截断，单个 chunk 可能只包含一段话的前半句。
+ *   取前后邻块（radius=1 即各一块）可以在不大幅增加 token 成本的前提下
+ *   把被截断的上下文还原，显著降低 LLM 产生"幻觉引用"的概率。
+ *
+ * 为什么要字符预算？
+ *   大多数 LLM 有 context window 限制。超出预算不仅导致截断错误，
+ *   还会因为过长的无关文本降低模型对真正相关段落的注意力权重（"lost in the middle"问题）。
+ *   控制预算 = 控制召回精度。
+ */
+import { CONTEXT_ADJACENT_RADIUS, CONTEXT_MAX_CHARS, CONTEXT_MAX_CHUNKS, } from '../config/tuning.js';
+/**
+ * Render a single chunk as a readable text block whose first line is a
+ * bracketed header carrying the chunk's source metadata.
+ *
+ * Example output (the header labels are emitted in Chinese at runtime):
+ *   [来源: qa-doc/topK.md · 第2块/共5块 · 相似度 0.87]
+ *   The topK parameter controls how many results are returned; the default is...
+ *
+ * The header always contains the path and the 1-based position
+ * (`chunkIndex + 1` out of `chunkCount`). The similarity segment, formatted
+ * to two decimals, is appended only when `chunk.similarity` is neither null
+ * nor undefined. Header segments are joined with ' · '.
+ *
+ * @param chunk Object read for `path`, `chunkIndex`, `chunkCount`, `content`
+ *              and the optional numeric `similarity`.
+ * @returns The header line, a newline, then `chunk.content`.
+ */
+function renderChunk(chunk) {
+    const parts = [`来源: ${chunk.path}`];
+    parts.push(`第${chunk.chunkIndex + 1}块/共${chunk.chunkCount}块`);
+    if (chunk.similarity != null) {
+        parts.push(`相似度 ${chunk.similarity.toFixed(2)}`);
+    }
+    const header = `[${parts.join(' · ')}]`;
+    return `${header}\n${chunk.content}`;
+}
+export class ContextAssembler {
+    repo;
+    constructor(repo) {
+        this.repo = repo;
+    }
+    /**
+     * Assemble the RAG context from a list of search hits.
+     *
+     * Pipeline: adjacent-chunk expansion, de-duplication, ordering, budget
+     * truncation, then text rendering (rendered chunks are joined with
+     * "\n\n---\n\n").
+     *
+     * @param hits                  topK results from ChunkRepository.searchSemantic() (expected to be sorted by similarity, descending)
+     * @param opts.maxChars         overrides CONTEXT_MAX_CHARS, so the budget can be adjusted at run time
+     * @param opts.adjacentRadius   overrides CONTEXT_ADJACENT_RADIUS; 0 disables adjacent-chunk expansion
+     * @param opts.maxChunks        overrides CONTEXT_MAX_CHUNKS
+     * @returns An object with `chunks` (the selected chunks), `contextText`
+     *          (their rendered text), `hitCount` (`hits.length`),
+     *          `totalChunks` (number of selected chunks) and `truncated`
+     *          (true when the budget step dropped at least one chunk).
+     */
+    async assemble(hits, opts) {
+        const maxChars = opts?.maxChars ?? CONTEXT_MAX_CHARS;
+        const radius = opts?.adjacentRadius ?? CONTEXT_ADJACENT_RADIUS;
+        const maxChunks = opts?.maxChunks ?? CONTEXT_MAX_CHUNKS;
+        const hitCount = hits.length;
+        // ── Step 1: adjacent-chunk expansion ─────────────────────────────────
+        // Fetch the neighbours of every hit in parallel to restore context cut off at chunk boundaries.
+        // Neighbours are expected to carry no similarity (depends on getAdjacentChunks — confirm), so sorting puts them after the hits.
+        const expanded = await this.expandWithAdjacentChunks(hits, radius);
+        // ── Step 2: de-duplication ───────────────────────────────────────────
+        // Expansions of different hits can overlap; de-duplicate on path + chunkIndex, keeping the first occurrence
+        // (hits come first, so their similarity is kept; a neighbour that duplicates a hit is dropped).
+        const deduped = deduplicateChunks(expanded);
+        // ── Step 3: ordering ─────────────────────────────────────────────────
+        // Chunks with a similarity (hits) precede those without (pure neighbours); hits are ordered by similarity, descending.
+        const sorted = sortChunks(deduped);
+        // ── Step 4: character-budget truncation ──────────────────────────────
+        const { selected, truncated } = applyBudget(sorted, maxChars, maxChunks);
+        // ── Step 5: text rendering ───────────────────────────────────────────
+        const contextText = selected.map(renderChunk).join('\n\n---\n\n');
+        return {
+            chunks: selected,
+            contextText,
+            hitCount,
+            totalChunks: selected.length,
+            truncated,
+        };
+    }
+    /**
+     * Fetch the adjacent chunks of every hit in parallel and return a flat
+     * list made of the hits followed by all fetched neighbours. The list may
+     * contain duplicates; the caller is expected to de-duplicate it.
+     *
+     * When `radius` is 0 or negative, or `hits` is empty, no repository call
+     * is made and a shallow copy of `hits` is returned.
+     *
+     * Because the lookups go through Promise.all, a single rejected
+     * `getAdjacentChunks` call rejects the whole expansion.
+     *
+     * @param hits   Hit chunks; `path` and `chunkIndex` are read from each.
+     * @param radius Passed through to `repo.getAdjacentChunks` as its third argument.
+     * @returns A new array: the original hits, then each neighbour group in hit order.
+     */
+    async expandWithAdjacentChunks(hits, radius) {
+        if (radius <= 0 || hits.length === 0)
+            return [...hits];
+        // Fetch in parallel so latency is not multiplied by N sequential lookups.
+        const adjacentGroups = await Promise.all(hits.map((hit) => this.repo.getAdjacentChunks(hit.path, hit.chunkIndex, radius)));
+        // Hits first, neighbours appended after them (so later de-duplication keeps the hit copy and its similarity).
+        const result = [...hits];
+        for (const group of adjacentGroups) {
+            result.push(...group);
+        }
+        return result;
+    }
+}
+/**
+ * Remove duplicate chunks, using the string `${path}::${chunkIndex}` as the
+ * identity key and keeping the first occurrence of each key. Since callers
+ * place hits before neighbours, the hit copy (with its similarity) is the one
+ * that is kept.
+ *
+ * @param chunks Chunks to filter; `path` and `chunkIndex` are read from each.
+ * @returns A new array in the original relative order; the input is not modified.
+ */
+function deduplicateChunks(chunks) {
+    const seen = new Set();
+    const result = [];
+    for (const chunk of chunks) {
+        const key = `${chunk.path}::${chunk.chunkIndex}`;
+        if (!seen.has(key)) {
+            seen.add(key);
+            result.push(chunk);
+        }
+    }
+    return result;
+}
+/**
+ * Order chunks for context assembly. Sorts a copy; the input array is left
+ * untouched.
+ *
+ * Ordering rules:
+ * 1. Chunks with a similarity (hits) come before chunks whose similarity is
+ *    null or undefined (pure neighbours).
+ * 2. Among hits, higher similarity comes first.
+ * 3. Among pure neighbours, order is by `path` (via localeCompare) and then
+ *    by ascending `chunkIndex`, which keeps chunks of one document in
+ *    reading order.
+ *
+ * @param chunks Chunks to order.
+ * @returns A new, sorted array.
+ */
+function sortChunks(chunks) {
+    return [...chunks].sort((a, b) => {
+        const aHasSim = a.similarity != null;
+        const bHasSim = b.similarity != null;
+        if (aHasSim && !bHasSim)
+            return -1;
+        if (!aHasSim && bHasSim)
+            return 1;
+        if (aHasSim && bHasSim)
+            return (b.similarity ?? 0) - (a.similarity ?? 0);
+        // Pure neighbours: order by path, then index, to follow document order.
+        const pathCmp = a.path.localeCompare(b.path);
+        return pathCmp !== 0 ? pathCmp : a.chunkIndex - b.chunkIndex;
+    });
+}
+/**
+ * Take a prefix of an already-sorted chunk list that fits both a character
+ * budget and a chunk-count cap.
+ *
+ * Chunks are visited in order and their rendered length
+ * (`renderChunk(chunk).length`, i.e. header plus content) is accumulated.
+ * Selection stops, with `truncated: true`, as soon as either
+ *   - `maxChunks` chunks are already selected and another chunk remains, or
+ *   - adding the next chunk would push the total above `maxChars`.
+ * That chunk and everything after it are dropped.
+ *
+ * The first chunk is always accepted, even when it alone exceeds `maxChars`,
+ * so a non-empty input yields a non-empty selection whenever `maxChunks` is
+ * at least 1.
+ *
+ * NOTE(review): only rendered chunk lengths are counted. The
+ * '\n\n---\n\n' separators that assemble() inserts between chunks are not
+ * part of the budget, so the final contextText can be longer than maxChars.
+ *
+ * @param chunks    Chunks in priority order.
+ * @param maxChars  Upper bound on the summed rendered length.
+ * @param maxChunks Upper bound on the number of selected chunks.
+ * @returns `{ selected, truncated }`; `truncated` is false only when every
+ *          input chunk was selected.
+ */
+function applyBudget(chunks, maxChars, maxChunks) {
+    const selected = [];
+    let totalChars = 0;
+    for (const chunk of chunks) {
+        if (selected.length >= maxChunks) {
+            return { selected, truncated: true };
+        }
+        const chunkChars = renderChunk(chunk).length;
+        if (totalChars + chunkChars > maxChars && selected.length > 0) {
+            return { selected, truncated: true };
+        }
+        selected.push(chunk);
+        totalChars += chunkChars;
+    }
+    return { selected, truncated: false };
+}

package/dist/services/ranking.js CHANGED Viewed

@@ -1,3 +1,4 @@
+import { CALLEE_MATCH_SCORE_MAX, CALLEE_MATCH_SCORE_PER_MATCH, COMMON_PATH_SCORE_NO, COMMON_PATH_SCORE_YES, RANK_WEIGHTS, RECENCY_SCORE_DEFAULT, RECENCY_SCORE_OLDEST, RECENCY_SCORE_TIERS, SEMANTIC_REASON_THRESHOLD_HIGH, SEMANTIC_REASON_THRESHOLD_MED, TEXT_MATCH_SCORES, TOKEN_OVERLAP_TIERS, USAGE_REASON_THRESHOLD_HIGH, USAGE_SCORE_LOG_DIVISOR, } from '../config/tuning.js';
 function clamp01(value) {
     if (value < 0)
         return 0;
@@ -31,12 +32,11 @@ function tokenOverlapScore(query, symbol) {
         .toLowerCase();
     const matched = queryTokens.filter((token) => text.includes(token)).length;
     const overlapRatio = matched / queryTokens.length;
-    if (matched >= 4 && overlapRatio >= 0.45)
-        return 0.78;
-    if (matched >= 3 && overlapRatio >= 0.3)
-        return 0.68;
-    if (matched >= 2 && overlapRatio >= 0.18)
-        return 0.56;
+    for (const tier of TOKEN_OVERLAP_TIERS) {
+        if (matched >= tier.minMatches && overlapRatio >= tier.minRatio) {
+            return tier.score;
+        }
+    }
     return 0;
 }
 function textMatchScore(query, symbol) {
@@ -48,34 +48,36 @@ function textMatchScore(query, symbol) {
     if (name === q)
         return { score: 1, matchedBy: 'exact_name' };
     if (name.includes(q))
-        return { score: 0.85, matchedBy: 'name_contains' };
+        return {
+            score: TEXT_MATCH_SCORES.nameContains,
+            matchedBy: 'name_contains',
+        };
     if (description.includes(q))
-        return { score: 0.65, matchedBy: 'description_contains' };
+        return {
+            score: TEXT_MATCH_SCORES.descriptionContains,
+            matchedBy: 'description_contains',
+        };
     const overlapScore = tokenOverlapScore(query, symbol);
     if (overlapScore > 0)
         return { score: overlapScore, matchedBy: 'token_overlap' };
-    return { score: 0.2, matchedBy: 'weak' };
+    return { score: TEXT_MATCH_SCORES.weak, matchedBy: 'weak' };
 }
 function usageScore(usageCount) {
     // log scale to avoid very large usage monopolizing ranking.
-    return clamp01(Math.log10(usageCount + 1) / 3);
+    return clamp01(Math.log10(usageCount + 1) / USAGE_SCORE_LOG_DIVISOR);
 }
 function recencyScore(createdAt) {
     if (!createdAt)
-        return 0.4;
+        return RECENCY_SCORE_DEFAULT;
     const ts = new Date(createdAt).getTime();
     if (Number.isNaN(ts))
-        return 0.4;
+        return RECENCY_SCORE_DEFAULT;
     const days = (Date.now() - ts) / (1000 * 60 * 60 * 24);
-    if (days <= 7)
-        return 1;
-    if (days <= 30)
-        return 0.8;
-    if (days <= 90)
-        return 0.6;
-    if (days <= 180)
-        return 0.4;
-    return 0.25;
+    for (const tier of RECENCY_SCORE_TIERS) {
+        if (days <= tier.maxDays)
+            return tier.score;
+    }
+    return RECENCY_SCORE_OLDEST;
 }
 function daysSinceCreated(createdAt) {
     if (!createdAt)
@@ -87,14 +89,10 @@ function daysSinceCreated(createdAt) {
 }
 function commonPathScore(path) {
     const lower = path.toLowerCase();
-    return lower.includes('/common/') || lower.includes('/shared/') ? 1 : 0.35;
+    return lower.includes('/common/') || lower.includes('/shared/')
+        ? COMMON_PATH_SCORE_YES
+        : COMMON_PATH_SCORE_NO;
 }
-const RANK_WEIGHTS = {
-    textMatch: 0.5,
-    usage: 0.3,
-    recency: 0.1,
-    commonPath: 0.1,
-};
 /**
  * Phase 5：以向量余弦相似度作为主文本维度，再叠加 usage / recency / common 和 calleeNames 匹配度。
  * calleeNames 作为结构信息独立信号，不污染纯语义向量。
@@ -113,7 +111,7 @@ export function rankSemanticHits(hits, query) {
             const queryLower = query.toLowerCase();
             const matchedCallees = calleeNames.filter((callee) => queryLower.includes(callee.toLowerCase())).length;
             if (matchedCallees > 0) {
-                calleeMatchScore = Math.min(matchedCallees * 0.05, 0.2);
+                calleeMatchScore = Math.min(matchedCallees * CALLEE_MATCH_SCORE_PER_MATCH, CALLEE_MATCH_SCORE_MAX);
             }
         }
         const score = textScore * RANK_WEIGHTS.textMatch +
@@ -122,13 +120,13 @@ export function rankSemanticHits(hits, query) {
             common * RANK_WEIGHTS.commonPath +
             calleeMatchScore;
         const reasonParts = [];
-        if (textScore >= 0.55)
+        if (textScore >= SEMANTIC_REASON_THRESHOLD_HIGH)
             reasonParts.push('语义相似度高');
-        else if (textScore >= 0.4)
+        else if (textScore >= SEMANTIC_REASON_THRESHOLD_MED)
             reasonParts.push('语义相关');
-        if (usage >= 0.6)
+        if (usage >= USAGE_REASON_THRESHOLD_HIGH)
             reasonParts.push('使用频率高');
-        if (common >= 1)
+        if (common >= COMMON_PATH_SCORE_YES)
             reasonParts.push('位于 shared/common 路径');
         if (calleeMatchScore > 0)
             reasonParts.push('函数调用关系匹配');
@@ -152,7 +150,7 @@ export function rankSemanticHits(hits, query) {
                 },
                 commonPath: {
                     score: Number(common.toFixed(3)),
-                    isCommonPath: common >= 1,
+                    isCommonPath: common >= COMMON_PATH_SCORE_YES,
                 },
                 weights: RANK_WEIGHTS,
                 summary: reasonParts.join(' + '),
@@ -173,15 +171,15 @@ export function rankSymbols(query, symbols) {
             recency * RANK_WEIGHTS.recency +
             common * RANK_WEIGHTS.commonPath;
         const reasonParts = [];
-        if (text.score >= 0.85)
+        if (text.score >= TEXT_MATCH_SCORES.nameContains)
             reasonParts.push('文本匹配度高');
-        else if (text.score >= 0.65)
+        else if (text.score >= TEXT_MATCH_SCORES.descriptionContains)
             reasonParts.push('描述命中');
         else if (text.matchedBy === 'token_overlap')
             reasonParts.push('关键词片段高度重合');
-        if (usage >= 0.6)
+        if (usage >= USAGE_REASON_THRESHOLD_HIGH)
             reasonParts.push('使用频率高');
-        if (common >= 1)
+        if (common >= COMMON_PATH_SCORE_YES)
             reasonParts.push('位于 shared/common 路径');
         if (reasonParts.length === 0)
             reasonParts.push('综合相关性较好');
@@ -203,7 +201,7 @@ export function rankSymbols(query, symbols) {
                 },
                 commonPath: {
                     score: Number(common.toFixed(3)),
-                    isCommonPath: common >= 1,
+                    isCommonPath: common >= COMMON_PATH_SCORE_YES,
                 },
                 weights: RANK_WEIGHTS,
                 summary: reasonParts.join(' + '),