@lorrylurui/code-intelligence-mcp 2.0.5 → 2.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,9 +1,8 @@
1
1
  import { env } from '../config/env.js';
2
2
  import { getPool } from '../db/postgres.js';
3
+ import { SYMBOL_SIMILARITY_THRESHOLD, SYMBOL_TOP_K } from '../config/tuning.js';
3
4
  import { createEmbeddingClient } from '../services/embeddingClient.js';
4
5
  import { SEARCHABLE_STATUS } from '../config/symbolStatus.js';
5
- const SIMILARITY_THRESHOLD = 0;
6
- const TOP_K = 20;
7
6
  const inMemorySymbols = [
8
7
  {
9
8
  id: 1,
@@ -188,7 +187,7 @@ export class SymbolRepository {
188
187
  * 不再需要在 Node 拉取全量向量做内存计算。
189
188
  */
190
189
  async searchSemanticHits(query, opts) {
191
- console.error('[code-intelligence-mcp] repository.searchSemanticHits.start query=%s type=%s table=%s limit=%s threshold=%s searchableStatus=%s hasPool=%s', query, opts?.type ?? '', env.symbolsTable, String(opts?.limit ?? TOP_K), String(SIMILARITY_THRESHOLD), String(SEARCHABLE_STATUS), String(Boolean(this.pool)));
190
+ console.error('[code-intelligence-mcp] repository.searchSemanticHits.start query=%s type=%s table=%s limit=%s threshold=%s searchableStatus=%s hasPool=%s', query, opts?.type ?? '', env.symbolsTable, String(opts?.limit ?? SYMBOL_TOP_K), String(SYMBOL_SIMILARITY_THRESHOLD), String(SEARCHABLE_STATUS), String(Boolean(this.pool)));
192
191
  if (!env.embeddingServiceUrl) {
193
192
  console.error('[code-intelligence-mcp] repository.searchSemanticHits.error missingEmbeddingServiceUrl');
194
193
  throw new Error('语义检索需配置 EMBEDDING_SERVICE_URL 并启动嵌入服务');
@@ -197,7 +196,7 @@ export class SymbolRepository {
197
196
  console.error('[code-intelligence-mcp] repository.searchSemanticHits.noPool returnEmpty');
198
197
  return [];
199
198
  }
200
- const limit = opts?.limit ?? TOP_K;
199
+ const limit = opts?.limit ?? SYMBOL_TOP_K;
201
200
  const client = createEmbeddingClient(env.embeddingServiceUrl);
202
201
  const [queryVec] = await client.embed([query.trim()]);
203
202
  if (!queryVec?.length) {
@@ -226,7 +225,7 @@ export class SymbolRepository {
226
225
  symbol: mapRow(r),
227
226
  similarity: Number(r.similarity),
228
227
  }));
229
- const passed = mapped.filter((x) => x.similarity >= SIMILARITY_THRESHOLD);
228
+ const passed = mapped.filter((x) => x.similarity >= SYMBOL_SIMILARITY_THRESHOLD);
230
229
  console.error('[code-intelligence-mcp] repository.searchSemanticHits.db table=%s rawRows=%s passedThreshold=%s topRaw=%s', env.symbolsTable, String(rows.length), String(passed.length), JSON.stringify(mapped.slice(0, 5).map((x) => ({
231
230
  id: x.symbol.id,
232
231
  name: x.symbol.name,
@@ -8,6 +8,7 @@ import { createSearchByStructureTool } from '../tools/searchByStructure.js';
8
8
  import { createIncUsageTool } from '../tools/incUsage.js';
9
9
  import { RecommendationService } from '../services/recommendationService.js';
10
10
  import { createRecommendComponentTool } from '../tools/recommendComponent.js';
11
+ import { createQueryDocsTool } from '../tools/queryDocs.js';
11
12
  export function createServer() {
12
13
  console.error('[code-intelligence-mcp] createServer.init');
13
14
  const server = new McpServer({
@@ -39,6 +40,9 @@ export function createServer() {
39
40
  console.error('[code-intelligence-mcp] tool.registered %s', recommendComponentTool.name);
40
41
  registerReusableCodeAdvisorPrompt(server);
41
42
  console.error('[code-intelligence-mcp] prompt.registered reusable-code-advisor');
42
- console.error('[code-intelligence-mcp] createServer.ready toolCount=6 promptCount=1');
43
+ const queryDocsTool = createQueryDocsTool();
44
+ server.tool(queryDocsTool.name, queryDocsTool.description, queryDocsTool.inputSchema, queryDocsTool.handler);
45
+ console.error('[code-intelligence-mcp] tool.registered %s', queryDocsTool.name);
46
+ console.error('[code-intelligence-mcp] createServer.ready toolCount=7 promptCount=1');
43
47
  return server;
44
48
  }
@@ -0,0 +1,150 @@
1
+ /**
2
+ * contextAssembler: RAG 上下文组装器。
3
+ *
4
+ * 完整流程:
5
+ * topK hits
6
+ * → 邻块扩展(getAdjacentChunks):补全被截断的边界上下文
7
+ * → 去重(path + chunk_index):避免重复块被重复计费
8
+ * → 相关性排序:命中块靠前,纯邻块靠后
9
+ * → 字符预算截断:超出 CONTEXT_MAX_CHARS 时丢弃末尾块
10
+ * → 文本渲染:拼成可直接注入 prompt 的 contextText
11
+ *
12
+ * 为什么需要邻块扩展?
13
+ * chunk 切分时按结构和字符数截断,单个 chunk 可能只包含一段话的前半句。
14
+ * 取前后邻块(radius=1 即各一块)可以在不大幅增加 token 成本的前提下
15
+ * 把被截断的上下文还原,显著降低 LLM 产生"幻觉引用"的概率。
16
+ *
17
+ * 为什么要字符预算?
18
+ * 大多数 LLM 有 context window 限制。超出预算不仅导致截断错误,
19
+ * 还会因为过长的无关文本降低模型对真正相关段落的注意力权重("lost in the middle"问题)。
20
+ * 控制预算 = 控制召回精度。
21
+ */
22
+ import { CONTEXT_ADJACENT_RADIUS, CONTEXT_MAX_CHARS, CONTEXT_MAX_CHUNKS, } from '../config/tuning.js';
23
/**
 * Render a single chunk as a readable text block preceded by a one-line
 * header carrying its source metadata.
 *
 * Example output:
 *   [来源: qa-doc/topK.md · 第2块/共5块 · 相似度 0.87]
 *   topK controls how many results are returned ...
 *
 * The header degrades gracefully instead of throwing or printing junk:
 *   - the "/共N块" part is omitted when `chunkCount` is null/undefined;
 *   - the similarity part is omitted when `similarity` is null/undefined or
 *     not a finite number; a numeric string is parsed so it cannot crash
 *     `toFixed`.
 *
 * @param {object} chunk
 * @param {string} chunk.path        Source path shown in the header.
 * @param {number} chunk.chunkIndex  Zero-based index; displayed one-based.
 * @param {number} [chunk.chunkCount] Total number of chunks for the source.
 * @param {number|string|null} [chunk.similarity] Similarity score, if any.
 * @param {string} chunk.content     Chunk body, emitted verbatim.
 * @returns {string} `[header]\ncontent`
 */
function renderChunk(chunk) {
    const parts = [`来源: ${chunk.path}`];
    const position = `第${chunk.chunkIndex + 1}块`;
    parts.push(chunk.chunkCount == null ? position : `${position}/共${chunk.chunkCount}块`);
    // Accept numeric strings so a non-number similarity cannot crash toFixed.
    const similarity = typeof chunk.similarity === 'string'
        ? Number.parseFloat(chunk.similarity)
        : chunk.similarity;
    if (typeof similarity === 'number' && Number.isFinite(similarity)) {
        parts.push(`相似度 ${similarity.toFixed(2)}`);
    }
    const header = `[${parts.join(' · ')}]`;
    return `${header}\n${chunk.content}`;
}
39
export class ContextAssembler {
    repo;
    /**
     * @param repo Object exposing `getAdjacentChunks(path, chunkIndex, radius)`,
     *             which resolves to the chunks surrounding the given one.
     */
    constructor(repo) {
        this.repo = repo;
    }
    /**
     * Assemble the RAG context from a list of semantic hits.
     *
     * Pipeline: neighbour expansion -> de-duplication -> sorting ->
     * budget truncation -> text rendering.
     *
     * @param hits Top-K search hits; each needs at least `path` and `chunkIndex`.
     * @param opts.maxChars Overrides CONTEXT_MAX_CHARS when not null/undefined.
     * @param opts.adjacentRadius Overrides CONTEXT_ADJACENT_RADIUS; 0 disables
     *                            neighbour expansion.
     * @param opts.maxChunks Overrides CONTEXT_MAX_CHUNKS when not null/undefined.
     * @returns `{ chunks, contextText, hitCount, totalChunks, truncated }`, where
     *          `hitCount` is the number of input hits and `totalChunks` is the
     *          number of chunks that survived the budget.
     */
    async assemble(hits, opts) {
        const budgetChars = opts?.maxChars ?? CONTEXT_MAX_CHARS;
        const neighbourRadius = opts?.adjacentRadius ?? CONTEXT_ADJACENT_RADIUS;
        const chunkCap = opts?.maxChunks ?? CONTEXT_MAX_CHUNKS;
        const hitCount = hits.length;
        // Steps 1-3: pull in neighbours, drop overlapping copies, then rank.
        const withNeighbours = await this.expandWithAdjacentChunks(hits, neighbourRadius);
        const ranked = sortChunks(deduplicateChunks(withNeighbours));
        // Step 4: keep only what fits the character / chunk-count budget.
        const budgeted = applyBudget(ranked, budgetChars, chunkCap);
        // Step 5: render every kept chunk and join them with a visual rule.
        const renderedBlocks = budgeted.selected.map((chunk) => renderChunk(chunk));
        return {
            chunks: budgeted.selected,
            contextText: renderedBlocks.join('\n\n---\n\n'),
            hitCount,
            totalChunks: budgeted.selected.length,
            truncated: budgeted.truncated,
        };
    }
    /**
     * Return the hits followed by the neighbours of every hit as one flat list.
     * Duplicates are expected and are left for the de-duplication step.
     * When `radius <= 0` or there are no hits, no repository call is made and
     * a shallow copy of `hits` is returned.
     */
    async expandWithAdjacentChunks(hits, radius) {
        const skipLookup = radius <= 0 || hits.length === 0;
        if (skipLookup) {
            return [...hits];
        }
        // Start every lookup before awaiting so they run concurrently.
        const lookups = hits.map((hit) => this.repo.getAdjacentChunks(hit.path, hit.chunkIndex, radius));
        const neighbourGroups = await Promise.all(lookups);
        // Hits stay at the front so later first-wins de-duplication keeps them.
        return neighbourGroups.reduce((flat, group) => {
            flat.push(...group);
            return flat;
        }, [...hits]);
    }
}
97
/**
 * Remove duplicate chunks, keyed by `${path}::${chunkIndex}`.
 *
 * The first occurrence of a key fixes its position in the output. If that
 * first copy carries no `similarity` and a later copy does, the scored copy
 * replaces it in place. A hit's similarity is therefore never lost, whatever
 * order the caller concatenated hits and neighbour chunks in.
 *
 * @param {Array<{path: string, chunkIndex: number, similarity?: number|null}>} chunks
 * @returns {Array} New array with one entry per distinct key; the input is
 *                  not mutated.
 */
function deduplicateChunks(chunks) {
    const indexByKey = new Map();
    const result = [];
    for (const chunk of chunks) {
        const key = `${chunk.path}::${chunk.chunkIndex}`;
        const existingIndex = indexByKey.get(key);
        if (existingIndex === undefined) {
            indexByKey.set(key, result.length);
            result.push(chunk);
        }
        else if (result[existingIndex].similarity == null && chunk.similarity != null) {
            // Upgrade an unscored neighbour copy to the scored hit copy.
            result[existingIndex] = chunk;
        }
    }
    return result;
}
110
/**
 * Return a sorted copy of `chunks`; the input array is left untouched.
 *
 * Ordering:
 *   1. chunks with a `similarity` value (hits) come before chunks without
 *      one (pure neighbours);
 *   2. hits are ordered by `similarity`, highest first;
 *   3. neighbours are ordered by `path` (via `localeCompare`) and then by
 *      ascending `chunkIndex`, which keeps them in document order.
 */
function sortChunks(chunks) {
    const isHit = (chunk) => chunk.similarity != null;
    const compare = (left, right) => {
        const leftIsHit = isHit(left);
        const rightIsHit = isHit(right);
        if (leftIsHit !== rightIsHit) {
            return leftIsHit ? -1 : 1;
        }
        if (leftIsHit) {
            // Both are hits: higher similarity sorts first.
            return right.similarity - left.similarity;
        }
        // Both are neighbours: fall back to document order.
        const byPath = left.path.localeCompare(right.path);
        if (byPath !== 0) {
            return byPath;
        }
        return left.chunkIndex - right.chunkIndex;
    };
    const copy = [...chunks];
    copy.sort(compare);
    return copy;
}
131
+ /**
132
+ * 从排好序的 chunk 列表中按字符预算和数量上限截取子集。
133
+ * 按顺序累加字符数,第一个超出预算的 chunk 及之后的全部丢弃。
134
+ */
135
+ function applyBudget(chunks, maxChars, maxChunks) {
136
+ const selected = [];
137
+ let totalChars = 0;
138
+ for (const chunk of chunks) {
139
+ if (selected.length >= maxChunks) {
140
+ return { selected, truncated: true };
141
+ }
142
+ const chunkChars = renderChunk(chunk).length;
143
+ if (totalChars + chunkChars > maxChars && selected.length > 0) {
144
+ return { selected, truncated: true };
145
+ }
146
+ selected.push(chunk);
147
+ totalChars += chunkChars;
148
+ }
149
+ return { selected, truncated: false };
150
+ }
@@ -1,3 +1,4 @@
1
+ import { CALLEE_MATCH_SCORE_MAX, CALLEE_MATCH_SCORE_PER_MATCH, COMMON_PATH_SCORE_NO, COMMON_PATH_SCORE_YES, RANK_WEIGHTS, RECENCY_SCORE_DEFAULT, RECENCY_SCORE_OLDEST, RECENCY_SCORE_TIERS, SEMANTIC_REASON_THRESHOLD_HIGH, SEMANTIC_REASON_THRESHOLD_MED, TEXT_MATCH_SCORES, TOKEN_OVERLAP_TIERS, USAGE_REASON_THRESHOLD_HIGH, USAGE_SCORE_LOG_DIVISOR, } from '../config/tuning.js';
1
2
  function clamp01(value) {
2
3
  if (value < 0)
3
4
  return 0;
@@ -31,12 +32,11 @@ function tokenOverlapScore(query, symbol) {
31
32
  .toLowerCase();
32
33
  const matched = queryTokens.filter((token) => text.includes(token)).length;
33
34
  const overlapRatio = matched / queryTokens.length;
34
- if (matched >= 4 && overlapRatio >= 0.45)
35
- return 0.78;
36
- if (matched >= 3 && overlapRatio >= 0.3)
37
- return 0.68;
38
- if (matched >= 2 && overlapRatio >= 0.18)
39
- return 0.56;
35
+ for (const tier of TOKEN_OVERLAP_TIERS) {
36
+ if (matched >= tier.minMatches && overlapRatio >= tier.minRatio) {
37
+ return tier.score;
38
+ }
39
+ }
40
40
  return 0;
41
41
  }
42
42
  function textMatchScore(query, symbol) {
@@ -48,34 +48,36 @@ function textMatchScore(query, symbol) {
48
48
  if (name === q)
49
49
  return { score: 1, matchedBy: 'exact_name' };
50
50
  if (name.includes(q))
51
- return { score: 0.85, matchedBy: 'name_contains' };
51
+ return {
52
+ score: TEXT_MATCH_SCORES.nameContains,
53
+ matchedBy: 'name_contains',
54
+ };
52
55
  if (description.includes(q))
53
- return { score: 0.65, matchedBy: 'description_contains' };
56
+ return {
57
+ score: TEXT_MATCH_SCORES.descriptionContains,
58
+ matchedBy: 'description_contains',
59
+ };
54
60
  const overlapScore = tokenOverlapScore(query, symbol);
55
61
  if (overlapScore > 0)
56
62
  return { score: overlapScore, matchedBy: 'token_overlap' };
57
- return { score: 0.2, matchedBy: 'weak' };
63
+ return { score: TEXT_MATCH_SCORES.weak, matchedBy: 'weak' };
58
64
  }
59
65
  function usageScore(usageCount) {
60
66
  // log scale to avoid very large usage monopolizing ranking.
61
- return clamp01(Math.log10(usageCount + 1) / 3);
67
+ return clamp01(Math.log10(usageCount + 1) / USAGE_SCORE_LOG_DIVISOR);
62
68
  }
63
69
  function recencyScore(createdAt) {
64
70
  if (!createdAt)
65
- return 0.4;
71
+ return RECENCY_SCORE_DEFAULT;
66
72
  const ts = new Date(createdAt).getTime();
67
73
  if (Number.isNaN(ts))
68
- return 0.4;
74
+ return RECENCY_SCORE_DEFAULT;
69
75
  const days = (Date.now() - ts) / (1000 * 60 * 60 * 24);
70
- if (days <= 7)
71
- return 1;
72
- if (days <= 30)
73
- return 0.8;
74
- if (days <= 90)
75
- return 0.6;
76
- if (days <= 180)
77
- return 0.4;
78
- return 0.25;
76
+ for (const tier of RECENCY_SCORE_TIERS) {
77
+ if (days <= tier.maxDays)
78
+ return tier.score;
79
+ }
80
+ return RECENCY_SCORE_OLDEST;
79
81
  }
80
82
  function daysSinceCreated(createdAt) {
81
83
  if (!createdAt)
@@ -87,14 +89,10 @@ function daysSinceCreated(createdAt) {
87
89
  }
88
90
  function commonPathScore(path) {
89
91
  const lower = path.toLowerCase();
90
- return lower.includes('/common/') || lower.includes('/shared/') ? 1 : 0.35;
92
+ return lower.includes('/common/') || lower.includes('/shared/')
93
+ ? COMMON_PATH_SCORE_YES
94
+ : COMMON_PATH_SCORE_NO;
91
95
  }
92
- const RANK_WEIGHTS = {
93
- textMatch: 0.5,
94
- usage: 0.3,
95
- recency: 0.1,
96
- commonPath: 0.1,
97
- };
98
96
  /**
99
97
  * Phase 5:以向量余弦相似度作为主文本维度,再叠加 usage / recency / common 和 calleeNames 匹配度。
100
98
  * calleeNames 作为结构信息独立信号,不污染纯语义向量。
@@ -113,7 +111,7 @@ export function rankSemanticHits(hits, query) {
113
111
  const queryLower = query.toLowerCase();
114
112
  const matchedCallees = calleeNames.filter((callee) => queryLower.includes(callee.toLowerCase())).length;
115
113
  if (matchedCallees > 0) {
116
- calleeMatchScore = Math.min(matchedCallees * 0.05, 0.2);
114
+ calleeMatchScore = Math.min(matchedCallees * CALLEE_MATCH_SCORE_PER_MATCH, CALLEE_MATCH_SCORE_MAX);
117
115
  }
118
116
  }
119
117
  const score = textScore * RANK_WEIGHTS.textMatch +
@@ -122,13 +120,13 @@ export function rankSemanticHits(hits, query) {
122
120
  common * RANK_WEIGHTS.commonPath +
123
121
  calleeMatchScore;
124
122
  const reasonParts = [];
125
- if (textScore >= 0.55)
123
+ if (textScore >= SEMANTIC_REASON_THRESHOLD_HIGH)
126
124
  reasonParts.push('语义相似度高');
127
- else if (textScore >= 0.4)
125
+ else if (textScore >= SEMANTIC_REASON_THRESHOLD_MED)
128
126
  reasonParts.push('语义相关');
129
- if (usage >= 0.6)
127
+ if (usage >= USAGE_REASON_THRESHOLD_HIGH)
130
128
  reasonParts.push('使用频率高');
131
- if (common >= 1)
129
+ if (common >= COMMON_PATH_SCORE_YES)
132
130
  reasonParts.push('位于 shared/common 路径');
133
131
  if (calleeMatchScore > 0)
134
132
  reasonParts.push('函数调用关系匹配');
@@ -152,7 +150,7 @@ export function rankSemanticHits(hits, query) {
152
150
  },
153
151
  commonPath: {
154
152
  score: Number(common.toFixed(3)),
155
- isCommonPath: common >= 1,
153
+ isCommonPath: common >= COMMON_PATH_SCORE_YES,
156
154
  },
157
155
  weights: RANK_WEIGHTS,
158
156
  summary: reasonParts.join(' + '),
@@ -173,15 +171,15 @@ export function rankSymbols(query, symbols) {
173
171
  recency * RANK_WEIGHTS.recency +
174
172
  common * RANK_WEIGHTS.commonPath;
175
173
  const reasonParts = [];
176
- if (text.score >= 0.85)
174
+ if (text.score >= TEXT_MATCH_SCORES.nameContains)
177
175
  reasonParts.push('文本匹配度高');
178
- else if (text.score >= 0.65)
176
+ else if (text.score >= TEXT_MATCH_SCORES.descriptionContains)
179
177
  reasonParts.push('描述命中');
180
178
  else if (text.matchedBy === 'token_overlap')
181
179
  reasonParts.push('关键词片段高度重合');
182
- if (usage >= 0.6)
180
+ if (usage >= USAGE_REASON_THRESHOLD_HIGH)
183
181
  reasonParts.push('使用频率高');
184
- if (common >= 1)
182
+ if (common >= COMMON_PATH_SCORE_YES)
185
183
  reasonParts.push('位于 shared/common 路径');
186
184
  if (reasonParts.length === 0)
187
185
  reasonParts.push('综合相关性较好');
@@ -203,7 +201,7 @@ export function rankSymbols(query, symbols) {
203
201
  },
204
202
  commonPath: {
205
203
  score: Number(common.toFixed(3)),
206
- isCommonPath: common >= 1,
204
+ isCommonPath: common >= COMMON_PATH_SCORE_YES,
207
205
  },
208
206
  weights: RANK_WEIGHTS,
209
207
  summary: reasonParts.join(' + '),