@lorrylurui/code-intelligence-mcp 2.0.4 → 2.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +41 -0
- package/dist/config/env.js +9 -0
- package/dist/config/tuning.js +114 -0
- package/dist/db/schema.js +37 -0
- package/dist/index.js +1 -0
- package/dist/indexer/babelParser.js +2 -1
- package/dist/indexer/chunkText.js +164 -0
- package/dist/indexer/embedText.js +2 -2
- package/dist/indexer/indexProject.js +193 -22
- package/dist/indexer/jsAstNormalizer.js +36 -6
- package/dist/prompts/reusableCodeAdvisorPrompt.js +63 -34
- package/dist/repositories/chunkRepository.js +181 -0
- package/dist/repositories/symbolRepository.js +108 -15
- package/dist/server/createServer.js +16 -0
- package/dist/services/contextAssembler.js +150 -0
- package/dist/services/ranking.js +109 -58
- package/dist/services/recommendationService.js +515 -46
- package/dist/services/reindex.js +25 -0
- package/dist/tools/getSymbolDetail.js +2 -1
- package/dist/tools/queryDocs.js +113 -0
- package/dist/tools/recommendComponent.js +86 -10
- package/dist/tools/searchByStructure.js +2 -1
- package/dist/tools/searchSymbols.js +57 -21
- package/dist/types/chunk.js +1 -0
- package/dist/workers/embeddingWorker.js +0 -1
- package/package.json +1 -1
|
@@ -1,9 +1,8 @@
|
|
|
1
1
|
import { env } from '../config/env.js';
|
|
2
2
|
import { getPool } from '../db/postgres.js';
|
|
3
|
+
import { SYMBOL_SIMILARITY_THRESHOLD, SYMBOL_TOP_K } from '../config/tuning.js';
|
|
3
4
|
import { createEmbeddingClient } from '../services/embeddingClient.js';
|
|
4
5
|
import { SEARCHABLE_STATUS } from '../config/symbolStatus.js';
|
|
5
|
-
const SIMILARITY_THRESHOLD = 0.5;
|
|
6
|
-
const TOP_K = 20;
|
|
7
6
|
const inMemorySymbols = [
|
|
8
7
|
{
|
|
9
8
|
id: 1,
|
|
@@ -82,21 +81,58 @@ function getMetaArray(meta, key) {
|
|
|
82
81
|
return [];
|
|
83
82
|
return value.filter((v) => typeof v === 'string');
|
|
84
83
|
}
|
|
84
|
+
/**
 * Breaks a free-text query into a de-duplicated list of search tokens.
 *
 * Two kinds of tokens are produced, in this order:
 *   1. Lower-cased runs of `[a-z0-9_]` that are at least two characters long.
 *   2. Overlapping two-character slices (bigrams) of every run of two or more
 *      characters in the U+4E00–U+9FFF range, taken from the original query.
 *
 * @param {string} query Raw query text.
 * @returns {string[]} Unique tokens in first-seen order.
 */
function extractSearchTokens(query) {
    const asciiWords = (query.trim().toLowerCase().match(/[a-z0-9_]+/g) ?? [])
        .filter((word) => word.length >= 2);
    // A run of N ideographs yields N - 1 sliding bigrams.
    const cjkBigrams = (query.match(/[\u4e00-\u9fff]{2,}/g) ?? []).flatMap((run) =>
        Array.from({ length: run.length - 1 }, (_, start) => run.slice(start, start + 2)));
    return Array.from(new Set([...asciiWords, ...cjkBigrams]));
}
|
|
99
|
+
/**
 * Flattens a symbol into one lower-cased string for substring matching.
 *
 * The string is the symbol's name, path, description (empty when nullish) and
 * the JSON serialisation of its meta (`{}` when nullish), joined by single spaces.
 *
 * @param {{name: string, path: string, description?: string|null, meta?: object|null}} symbol
 * @returns {string} Lower-cased searchable text.
 */
function buildSearchText(symbol) {
    const { name, path, description, meta } = symbol;
    const metaJson = JSON.stringify(meta ?? {});
    const fields = [name, path, description ?? '', metaJson];
    return fields.join(' ').toLowerCase();
}
|
|
109
|
+
/**
 * Counts how many of the given tokens occur as substrings of `text`.
 *
 * Each token is lower-cased before the lookup; `text` is used as passed in.
 *
 * @param {string} text Text to search in.
 * @param {string[]} tokens Tokens to look for.
 * @returns {number} Number of tokens found in `text`.
 */
function countTokenMatches(text, tokens) {
    const found = tokens.filter((token) => text.includes(token.toLowerCase()));
    return found.length;
}
|
|
85
112
|
export class SymbolRepository {
|
|
86
113
|
pool;
|
|
87
114
|
constructor() {
|
|
88
115
|
this.pool = getPool();
|
|
89
116
|
}
|
|
90
117
|
async search(query, type) {
|
|
118
|
+
console.error('[code-intelligence-mcp] repository.search.start query=%s type=%s table=%s searchableStatus=%s hasPool=%s', query, type ?? '', env.symbolsTable, String(SEARCHABLE_STATUS), String(Boolean(this.pool)));
|
|
91
119
|
if (!this.pool) {
|
|
92
120
|
const q = query.toLowerCase();
|
|
93
|
-
|
|
121
|
+
const tokens = extractSearchTokens(query);
|
|
122
|
+
const matched = inMemorySymbols.filter((s) => {
|
|
94
123
|
const typeOk = type ? s.type === type : true;
|
|
124
|
+
const text = buildSearchText(s);
|
|
95
125
|
return (typeOk &&
|
|
96
|
-
(
|
|
97
|
-
(s.description ?? '').toLowerCase().includes(q)));
|
|
126
|
+
(text.includes(q) || countTokenMatches(text, tokens) >= 2));
|
|
98
127
|
});
|
|
128
|
+
console.error('[code-intelligence-mcp] repository.search.memory count=%s top=%s', String(matched.length), JSON.stringify(matched.slice(0, 3).map((s) => ({
|
|
129
|
+
id: s.id,
|
|
130
|
+
name: s.name,
|
|
131
|
+
path: s.path,
|
|
132
|
+
}))));
|
|
133
|
+
return matched;
|
|
99
134
|
}
|
|
135
|
+
const tokens = extractSearchTokens(query);
|
|
100
136
|
const params = [
|
|
101
137
|
`%${query}%`,
|
|
102
138
|
SEARCHABLE_STATUS,
|
|
@@ -104,15 +140,46 @@ export class SymbolRepository {
|
|
|
104
140
|
let sql = `
|
|
105
141
|
SELECT id, name, type, category, path, description, content, meta::text AS meta, usage_count, created_at
|
|
106
142
|
FROM ${env.symbolsTable}
|
|
107
|
-
|
|
143
|
+
WHERE (
|
|
144
|
+
name ILIKE $1 OR
|
|
145
|
+
description ILIKE $1 OR
|
|
146
|
+
path ILIKE $1 OR
|
|
147
|
+
meta::text ILIKE $1
|
|
148
|
+
)
|
|
108
149
|
AND status = $2
|
|
109
150
|
`;
|
|
151
|
+
if (tokens.length) {
|
|
152
|
+
const tokenClauses = tokens.map((token) => {
|
|
153
|
+
// 每个query token都要在name/description/path/meta中至少匹配一次才算匹配,来提升搜索的准确度,避免单个token过于泛匹配导致的排名干扰
|
|
154
|
+
params.push(`%${token}%`);
|
|
155
|
+
const index = params.length;
|
|
156
|
+
return `name ILIKE $${index} OR description ILIKE $${index} OR path ILIKE $${index} OR meta::text ILIKE $${index}`;
|
|
157
|
+
});
|
|
158
|
+
sql = `
|
|
159
|
+
SELECT id, name, type, category, path, description, content, meta::text AS meta, usage_count, created_at
|
|
160
|
+
FROM ${env.symbolsTable}
|
|
161
|
+
WHERE (
|
|
162
|
+
name ILIKE $1 OR
|
|
163
|
+
description ILIKE $1 OR
|
|
164
|
+
path ILIKE $1 OR
|
|
165
|
+
meta::text ILIKE $1 OR
|
|
166
|
+
(${tokenClauses.join(' OR ')})
|
|
167
|
+
)
|
|
168
|
+
AND status = $2
|
|
169
|
+
`;
|
|
170
|
+
}
|
|
110
171
|
if (type) {
|
|
111
172
|
params.push(type);
|
|
112
173
|
sql += ` AND type = $${params.length}`;
|
|
113
174
|
}
|
|
114
175
|
sql += ' ORDER BY usage_count DESC LIMIT 20';
|
|
115
176
|
const { rows } = await this.pool.query(sql, params);
|
|
177
|
+
console.error('[code-intelligence-mcp] repository.search.db table=%s rows=%s top=%s note=name/description only', env.symbolsTable, String(rows.length), JSON.stringify(rows.slice(0, 3).map((r) => ({
|
|
178
|
+
id: r.id,
|
|
179
|
+
name: r.name,
|
|
180
|
+
path: r.path,
|
|
181
|
+
type: r.type,
|
|
182
|
+
}))));
|
|
116
183
|
return rows.map((r) => mapRow(r));
|
|
117
184
|
}
|
|
118
185
|
/**
|
|
@@ -120,13 +187,16 @@ export class SymbolRepository {
|
|
|
120
187
|
* 不再需要在 Node 拉取全量向量做内存计算。
|
|
121
188
|
*/
|
|
122
189
|
async searchSemanticHits(query, opts) {
|
|
190
|
+
console.error('[code-intelligence-mcp] repository.searchSemanticHits.start query=%s type=%s table=%s limit=%s threshold=%s searchableStatus=%s hasPool=%s', query, opts?.type ?? '', env.symbolsTable, String(opts?.limit ?? SYMBOL_TOP_K), String(SYMBOL_SIMILARITY_THRESHOLD), String(SEARCHABLE_STATUS), String(Boolean(this.pool)));
|
|
123
191
|
if (!env.embeddingServiceUrl) {
|
|
192
|
+
console.error('[code-intelligence-mcp] repository.searchSemanticHits.error missingEmbeddingServiceUrl');
|
|
124
193
|
throw new Error('语义检索需配置 EMBEDDING_SERVICE_URL 并启动嵌入服务');
|
|
125
194
|
}
|
|
126
195
|
if (!this.pool) {
|
|
196
|
+
console.error('[code-intelligence-mcp] repository.searchSemanticHits.noPool returnEmpty');
|
|
127
197
|
return [];
|
|
128
198
|
}
|
|
129
|
-
const limit = opts?.limit ??
|
|
199
|
+
const limit = opts?.limit ?? SYMBOL_TOP_K;
|
|
130
200
|
const client = createEmbeddingClient(env.embeddingServiceUrl);
|
|
131
201
|
const [queryVec] = await client.embed([query.trim()]);
|
|
132
202
|
if (!queryVec?.length) {
|
|
@@ -151,15 +221,26 @@ export class SymbolRepository {
|
|
|
151
221
|
params.push(limit * 2); // 多取一倍以便 SIMILARITY_THRESHOLD 过滤后仍有足量结果
|
|
152
222
|
sql += ` ORDER BY embedding <=> $1::vector LIMIT $${params.length}`;
|
|
153
223
|
const { rows } = await this.pool.query(sql, params);
|
|
154
|
-
|
|
155
|
-
.map((r) => ({
|
|
224
|
+
const mapped = rows.map((r) => ({
|
|
156
225
|
symbol: mapRow(r),
|
|
157
226
|
similarity: Number(r.similarity),
|
|
227
|
+
}));
|
|
228
|
+
const passed = mapped.filter((x) => x.similarity >= SYMBOL_SIMILARITY_THRESHOLD);
|
|
229
|
+
console.error('[code-intelligence-mcp] repository.searchSemanticHits.db table=%s rawRows=%s passedThreshold=%s topRaw=%s', env.symbolsTable, String(rows.length), String(passed.length), JSON.stringify(mapped.slice(0, 5).map((x) => ({
|
|
230
|
+
id: x.symbol.id,
|
|
231
|
+
name: x.symbol.name,
|
|
232
|
+
path: x.symbol.path,
|
|
233
|
+
similarity: Number(x.similarity.toFixed(4)),
|
|
234
|
+
}))));
|
|
235
|
+
return passed
|
|
236
|
+
.map((r) => ({
|
|
237
|
+
symbol: r.symbol,
|
|
238
|
+
similarity: r.similarity,
|
|
158
239
|
}))
|
|
159
|
-
.filter((x) => x.similarity >= SIMILARITY_THRESHOLD)
|
|
160
240
|
.slice(0, limit);
|
|
161
241
|
}
|
|
162
242
|
async getByName(name) {
|
|
243
|
+
console.error('[code-intelligence-mcp] repository.getByName.start name=%s table=%s hasPool=%s', name, env.symbolsTable, String(Boolean(this.pool)));
|
|
163
244
|
if (!this.pool) {
|
|
164
245
|
return (inMemorySymbols.find((s) => s.name.toLowerCase() === name.toLowerCase()) ?? null);
|
|
165
246
|
}
|
|
@@ -169,6 +250,7 @@ export class SymbolRepository {
|
|
|
169
250
|
WHERE name = $1
|
|
170
251
|
LIMIT 1
|
|
171
252
|
`, [name]);
|
|
253
|
+
console.error('[code-intelligence-mcp] repository.getByName.db table=%s rows=%s', env.symbolsTable, String(rows.length));
|
|
172
254
|
if (rows.length === 0) {
|
|
173
255
|
return null;
|
|
174
256
|
}
|
|
@@ -191,6 +273,7 @@ export class SymbolRepository {
|
|
|
191
273
|
return result.rowCount !== null && result.rowCount > 0;
|
|
192
274
|
}
|
|
193
275
|
async searchByStructure(fields, opts) {
|
|
276
|
+
console.error('[code-intelligence-mcp] repository.searchByStructure.start fields=%s type=%s category=%s table=%s limit=%s hasPool=%s', JSON.stringify(fields), opts?.type ?? '', opts?.category ?? '', env.symbolsTable, String(opts?.limit ?? 20), String(Boolean(this.pool)));
|
|
194
277
|
const normalized = fields.map((f) => f.trim()).filter(Boolean);
|
|
195
278
|
if (normalized.length === 0)
|
|
196
279
|
return [];
|
|
@@ -215,7 +298,13 @@ export class SymbolRepository {
|
|
|
215
298
|
return normalized.every((field) => propPool.includes(field.toLowerCase()));
|
|
216
299
|
};
|
|
217
300
|
if (!this.pool) {
|
|
218
|
-
|
|
301
|
+
const matched = inMemorySymbols.filter(matchesAll).slice(0, limit);
|
|
302
|
+
console.error('[code-intelligence-mcp] repository.searchByStructure.memory matched=%s top=%s', String(matched.length), JSON.stringify(matched.slice(0, 3).map((s) => ({
|
|
303
|
+
id: s.id,
|
|
304
|
+
name: s.name,
|
|
305
|
+
path: s.path,
|
|
306
|
+
}))));
|
|
307
|
+
return matched;
|
|
219
308
|
}
|
|
220
309
|
const params = [];
|
|
221
310
|
let sql = `
|
|
@@ -234,9 +323,13 @@ export class SymbolRepository {
|
|
|
234
323
|
params.push(Math.max(limit * 5, 50));
|
|
235
324
|
sql += ` ORDER BY usage_count DESC LIMIT $${params.length}`;
|
|
236
325
|
const { rows } = await this.pool.query(sql, params);
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
.
|
|
326
|
+
const mapped = rows.map((r) => mapRow(r));
|
|
327
|
+
const filtered = mapped.filter(matchesAll).slice(0, limit);
|
|
328
|
+
console.error('[code-intelligence-mcp] repository.searchByStructure.db table=%s scanned=%s matched=%s top=%s', env.symbolsTable, String(rows.length), String(filtered.length), JSON.stringify(filtered.slice(0, 3).map((s) => ({
|
|
329
|
+
id: s.id,
|
|
330
|
+
name: s.name,
|
|
331
|
+
path: s.path,
|
|
332
|
+
}))));
|
|
333
|
+
return filtered;
|
|
241
334
|
}
|
|
242
335
|
}
|
|
@@ -8,25 +8,41 @@ import { createSearchByStructureTool } from '../tools/searchByStructure.js';
|
|
|
8
8
|
import { createIncUsageTool } from '../tools/incUsage.js';
|
|
9
9
|
import { RecommendationService } from '../services/recommendationService.js';
|
|
10
10
|
import { createRecommendComponentTool } from '../tools/recommendComponent.js';
|
|
11
|
+
import { createQueryDocsTool } from '../tools/queryDocs.js';
|
|
11
12
|
/**
 * Builds the MCP server instance and registers every tool and prompt on it.
 *
 * Each construction and registration step is logged with `console.error`.
 * The tool and prompt counts in the final "ready" log line are taken from the
 * registrations that actually ran, and the server name/version in the
 * "created" log line come from the same values passed to `McpServer`, so the
 * log output cannot drift from what was registered.
 *
 * @returns {McpServer} The configured server.
 */
export function createServer() {
    const serverName = 'code-intelligence-mcp';
    const serverVersion = '0.1.0';
    let toolCount = 0;
    let promptCount = 0;
    console.error('[code-intelligence-mcp] createServer.init');
    const server = new McpServer({
        name: serverName,
        version: serverVersion,
    });
    console.error('[code-intelligence-mcp] mcpServer.created name=%s version=%s', serverName, serverVersion);
    const repository = new SymbolRepository();
    console.error('[code-intelligence-mcp] repository.created');
    const recommendationService = new RecommendationService(repository);
    console.error('[code-intelligence-mcp] recommendationService.created');
    // Registers one tool definition, counts it and logs its name.
    const registerTool = (tool) => {
        server.tool(tool.name, tool.description, tool.inputSchema, tool.handler);
        toolCount += 1;
        console.error('[code-intelligence-mcp] tool.registered %s', tool.name);
    };
    // Each factory runs immediately before its own registration, in the original order.
    registerTool(createSearchSymbolsTool(repository));
    registerTool(createGetSymbolDetailTool(repository));
    registerTool(createSearchByStructureTool(repository));
    registerTool(createReindexTool());
    registerTool(createIncUsageTool(repository));
    registerTool(createRecommendComponentTool(recommendationService));
    registerReusableCodeAdvisorPrompt(server);
    promptCount += 1;
    console.error('[code-intelligence-mcp] prompt.registered reusable-code-advisor');
    registerTool(createQueryDocsTool());
    console.error('[code-intelligence-mcp] createServer.ready toolCount=%s promptCount=%s', String(toolCount), String(promptCount));
    return server;
}
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* contextAssembler: RAG 上下文组装器。
|
|
3
|
+
*
|
|
4
|
+
* 完整流程:
|
|
5
|
+
* topK hits
|
|
6
|
+
* → 邻块扩展(getAdjacentChunks):补全被截断的边界上下文
|
|
7
|
+
* → 去重(path + chunk_index):避免重复块被重复计费
|
|
8
|
+
* → 相关性排序:命中块靠前,纯邻块靠后
|
|
9
|
+
* → 字符预算截断:超出 CONTEXT_MAX_CHARS 时丢弃末尾块
|
|
10
|
+
* → 文本渲染:拼成可直接注入 prompt 的 contextText
|
|
11
|
+
*
|
|
12
|
+
* 为什么需要邻块扩展?
|
|
13
|
+
* chunk 切分时按结构和字符数截断,单个 chunk 可能只包含一段话的前半句。
|
|
14
|
+
* 取前后邻块(radius=1 即各一块)可以在不大幅增加 token 成本的前提下
|
|
15
|
+
* 把被截断的上下文还原,显著降低 LLM 产生"幻觉引用"的概率。
|
|
16
|
+
*
|
|
17
|
+
* 为什么要字符预算?
|
|
18
|
+
* 大多数 LLM 有 context window 限制。超出预算不仅导致截断错误,
|
|
19
|
+
* 还会因为过长的无关文本降低模型对真正相关段落的注意力权重("lost in the middle"问题)。
|
|
20
|
+
* 控制预算 = 控制召回精度。
|
|
21
|
+
*/
|
|
22
|
+
import { CONTEXT_ADJACENT_RADIUS, CONTEXT_MAX_CHARS, CONTEXT_MAX_CHUNKS, } from '../config/tuning.js';
|
|
23
|
+
/**
 * Renders one chunk as a text block: a bracketed source header on the first
 * line, followed by the chunk content.
 *
 * The header lists the chunk path, its 1-based position out of `chunkCount`,
 * and, only when `similarity` is neither null nor undefined, the similarity
 * rounded to two decimals.
 *
 * Example output:
 *   [来源: qa-doc/topK.md · 第2块/共5块 · 相似度 0.87]
 *   ...chunk content...
 *
 * @param {{path: string, chunkIndex: number, chunkCount: number, similarity?: number|null, content: string}} chunk
 * @returns {string} Header line plus content, separated by a newline.
 */
function renderChunk(chunk) {
    const { path, chunkIndex, chunkCount, similarity, content } = chunk;
    const similarityField = similarity != null ? [`相似度 ${similarity.toFixed(2)}`] : [];
    const headerFields = [`来源: ${path}`, `第${chunkIndex + 1}块/共${chunkCount}块`, ...similarityField];
    return `[${headerFields.join(' · ')}]\n${content}`;
}
|
|
39
|
+
/**
 * Turns semantic-search hits into a prompt-ready context string.
 *
 * Pipeline: expand hits with adjacent chunks, de-duplicate, sort, cut to the
 * character/chunk budget, then render the survivors to text.
 */
export class ContextAssembler {
    repo;
    /**
     * @param repo Object exposing `getAdjacentChunks(path, chunkIndex, radius)`.
     */
    constructor(repo) {
        this.repo = repo;
    }
    /**
     * Assembles the context for a set of hits.
     *
     * @param hits Search hits; each needs `path` and `chunkIndex`.
     * @param opts.maxChars Overrides CONTEXT_MAX_CHARS when not nullish.
     * @param opts.adjacentRadius Overrides CONTEXT_ADJACENT_RADIUS when not nullish; 0 disables expansion.
     * @param opts.maxChunks Overrides CONTEXT_MAX_CHUNKS when not nullish.
     * @returns Selected chunks, rendered text, the original hit count, the
     *   number of selected chunks, and whether the budget cut anything off.
     */
    async assemble(hits, opts) {
        const charBudget = opts?.maxChars ?? CONTEXT_MAX_CHARS;
        const adjacentRadius = opts?.adjacentRadius ?? CONTEXT_ADJACENT_RADIUS;
        const chunkBudget = opts?.maxChunks ?? CONTEXT_MAX_CHUNKS;
        // Captured before any awaiting so it reflects the hits as they were passed in.
        const hitCount = hits.length;
        const candidates = await this.expandWithAdjacentChunks(hits, adjacentRadius);
        // Hits precede their neighbours in `candidates`, so de-duplication keeps
        // the copy that carries a similarity score.
        const ordered = sortChunks(deduplicateChunks(candidates));
        const { selected: chunks, truncated } = applyBudget(ordered, charBudget, chunkBudget);
        return {
            chunks,
            contextText: chunks.map(renderChunk).join('\n\n---\n\n'),
            hitCount,
            totalChunks: chunks.length,
            truncated,
        };
    }
    /**
     * Returns the hits followed by the adjacent chunks of every hit, as one flat
     * list that may contain duplicates.
     *
     * When `radius <= 0` or there are no hits, the repository is not queried and
     * a shallow copy of `hits` is returned. Otherwise all lookups are started
     * together and awaited with `Promise.all`.
     */
    async expandWithAdjacentChunks(hits, radius) {
        const shouldExpand = !(radius <= 0) && hits.length !== 0;
        if (!shouldExpand) {
            return [...hits];
        }
        const lookups = hits.map(({ path, chunkIndex }) => this.repo.getAdjacentChunks(path, chunkIndex, radius));
        const neighbourGroups = await Promise.all(lookups);
        return [...hits, ...neighbourGroups.flatMap((group) => [...group])];
    }
}
|
|
97
|
+
/**
 * Removes duplicate chunks, where two chunks are duplicates when they share
 * the same `path` and `chunkIndex`. The first occurrence is the one kept, and
 * the relative order of the kept chunks is unchanged.
 *
 * @param chunks Chunks, possibly containing duplicates.
 * @returns {Array} New array with one chunk per `path` + `chunkIndex` pair.
 */
function deduplicateChunks(chunks) {
    // Map keeps insertion order, so values() comes back in first-seen order.
    const firstByKey = new Map();
    for (const chunk of chunks) {
        const key = `${chunk.path}::${chunk.chunkIndex}`;
        if (!firstByKey.has(key)) {
            firstByKey.set(key, chunk);
        }
    }
    return [...firstByKey.values()];
}
|
|
110
|
+
/**
 * Returns a sorted copy of `chunks`; the input array is not modified.
 *
 * Ordering:
 *   1. Chunks with a non-nullish `similarity` come before chunks without one.
 *   2. Chunks with a similarity are ordered by similarity, highest first.
 *   3. Chunks without a similarity are ordered by `path` (localeCompare) and
 *      then by ascending `chunkIndex`.
 *
 * @param chunks Chunks to order.
 * @returns {Array} New, sorted array.
 */
function sortChunks(chunks) {
    const hasSimilarity = (chunk) => chunk.similarity != null;
    const compare = (left, right) => {
        const leftScored = hasSimilarity(left);
        const rightScored = hasSimilarity(right);
        if (leftScored !== rightScored) {
            return leftScored ? -1 : 1;
        }
        if (leftScored) {
            return right.similarity - left.similarity;
        }
        // Neither chunk has a score: fall back to document order.
        return left.path.localeCompare(right.path) || left.chunkIndex - right.chunkIndex;
    };
    return Array.from(chunks).sort(compare);
}
|
|
131
|
+
/**
 * Takes chunks from the front of an already-ordered list until a limit is hit.
 *
 * Selection stops, with `truncated: true`, at the first chunk that would
 * either exceed `maxChunks` selected chunks or push the running total of
 * rendered characters above `maxChars`. That chunk and everything after it
 * are dropped. The character limit is not applied to the very first chunk, so
 * a single oversized chunk is still returned.
 *
 * The size of a chunk is the length of its `renderChunk` output.
 *
 * @param chunks Ordered candidate chunks.
 * @param {number} maxChars Character budget for the rendered chunks.
 * @param {number} maxChunks Maximum number of chunks to select.
 * @returns {{selected: Array, truncated: boolean}}
 */
function applyBudget(chunks, maxChars, maxChunks) {
    const kept = [];
    let usedChars = 0;
    let truncated = false;
    for (const candidate of chunks) {
        const countExhausted = kept.length >= maxChunks;
        // Only render when the count limit still allows another chunk.
        const cost = countExhausted ? 0 : renderChunk(candidate).length;
        const overCharBudget = kept.length > 0 && usedChars + cost > maxChars;
        if (countExhausted || overCharBudget) {
            truncated = true;
            break;
        }
        kept.push(candidate);
        usedChars += cost;
    }
    return { selected: kept, truncated };
}
|
package/dist/services/ranking.js
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import { CALLEE_MATCH_SCORE_MAX, CALLEE_MATCH_SCORE_PER_MATCH, COMMON_PATH_SCORE_NO, COMMON_PATH_SCORE_YES, RANK_WEIGHTS, RECENCY_SCORE_DEFAULT, RECENCY_SCORE_OLDEST, RECENCY_SCORE_TIERS, SEMANTIC_REASON_THRESHOLD_HIGH, SEMANTIC_REASON_THRESHOLD_MED, TEXT_MATCH_SCORES, TOKEN_OVERLAP_TIERS, USAGE_REASON_THRESHOLD_HIGH, USAGE_SCORE_LOG_DIVISOR, } from '../config/tuning.js';
|
|
1
2
|
function clamp01(value) {
|
|
2
3
|
if (value < 0)
|
|
3
4
|
return 0;
|
|
@@ -5,40 +6,78 @@ function clamp01(value) {
|
|
|
5
6
|
return 1;
|
|
6
7
|
return value;
|
|
7
8
|
}
|
|
9
|
+
/**
 * Splits text into unique matching tokens.
 *
 * Lower-cased runs of `[a-z0-9_]` with at least two characters come first,
 * followed by the overlapping two-character slices of every run of two or
 * more characters in the U+4E00–U+9FFF range.
 *
 * Example: 'useDebounceInput组件' yields ['usedebounceinput', '组件'];
 * '输入组件' yields ['输入', '入组', '组件'].
 *
 * @param {string} text Text to tokenize.
 * @returns {string[]} Unique tokens in first-seen order.
 */
function extractTextTokens(text) {
    const words = (text.trim().toLowerCase().match(/[a-z0-9_]+/g) ?? [])
        .filter((word) => word.length >= 2);
    const bigrams = [];
    for (const run of text.match(/[\u4e00-\u9fff]{2,}/g) ?? []) {
        // Slide a two-character window across the run.
        for (let start = 0; start + 2 <= run.length; start += 1) {
            bigrams.push(run.slice(start, start + 2));
        }
    }
    return [...new Set(words.concat(bigrams))];
}
|
|
25
|
+
/**
 * Scores how many query tokens appear in a symbol's name, description or path.
 *
 * The query is tokenized with `extractTextTokens`. Each token is looked up as
 * a substring of the lower-cased "name description path" text. The score is
 * that of the first entry in TOKEN_OVERLAP_TIERS whose `minMatches` and
 * `minRatio` are both met by the match count and the matched/total ratio.
 *
 * @param {string} query Raw query text.
 * @param {{name: string, description?: string|null, path: string}} symbol
 * @returns {number} The matching tier's score, or 0 when the query has no
 *   tokens or no tier is satisfied.
 */
function tokenOverlapScore(query, symbol) {
    const queryTokens = extractTextTokens(query);
    if (queryTokens.length === 0) {
        return 0;
    }
    const { name, description, path } = symbol;
    const haystack = [name, description ?? '', path].join(' ').toLowerCase();
    const hitCount = queryTokens.reduce((hits, token) => (haystack.includes(token) ? hits + 1 : hits), 0);
    const hitRatio = hitCount / queryTokens.length;
    // Tiers are checked in their declared order; the first satisfied tier wins.
    const tier = TOKEN_OVERLAP_TIERS.find(({ minMatches, minRatio }) => hitCount >= minMatches && hitRatio >= minRatio);
    return tier ? tier.score : 0;
}
|
|
8
42
|
function textMatchScore(query, symbol) {
|
|
9
43
|
const q = query.trim().toLowerCase();
|
|
10
44
|
if (!q)
|
|
11
|
-
return { score: 0, matchedBy:
|
|
45
|
+
return { score: 0, matchedBy: 'weak' };
|
|
12
46
|
const name = symbol.name.toLowerCase();
|
|
13
|
-
const description = (symbol.description ??
|
|
47
|
+
const description = (symbol.description ?? '').toLowerCase();
|
|
14
48
|
if (name === q)
|
|
15
|
-
return { score: 1, matchedBy:
|
|
49
|
+
return { score: 1, matchedBy: 'exact_name' };
|
|
16
50
|
if (name.includes(q))
|
|
17
|
-
return {
|
|
51
|
+
return {
|
|
52
|
+
score: TEXT_MATCH_SCORES.nameContains,
|
|
53
|
+
matchedBy: 'name_contains',
|
|
54
|
+
};
|
|
18
55
|
if (description.includes(q))
|
|
19
|
-
return {
|
|
20
|
-
|
|
56
|
+
return {
|
|
57
|
+
score: TEXT_MATCH_SCORES.descriptionContains,
|
|
58
|
+
matchedBy: 'description_contains',
|
|
59
|
+
};
|
|
60
|
+
const overlapScore = tokenOverlapScore(query, symbol);
|
|
61
|
+
if (overlapScore > 0)
|
|
62
|
+
return { score: overlapScore, matchedBy: 'token_overlap' };
|
|
63
|
+
return { score: TEXT_MATCH_SCORES.weak, matchedBy: 'weak' };
|
|
21
64
|
}
|
|
22
65
|
function usageScore(usageCount) {
|
|
23
66
|
// log scale to avoid very large usage monopolizing ranking.
|
|
24
|
-
return clamp01(Math.log10(usageCount + 1) /
|
|
67
|
+
return clamp01(Math.log10(usageCount + 1) / USAGE_SCORE_LOG_DIVISOR);
|
|
25
68
|
}
|
|
26
69
|
function recencyScore(createdAt) {
|
|
27
70
|
if (!createdAt)
|
|
28
|
-
return
|
|
71
|
+
return RECENCY_SCORE_DEFAULT;
|
|
29
72
|
const ts = new Date(createdAt).getTime();
|
|
30
73
|
if (Number.isNaN(ts))
|
|
31
|
-
return
|
|
74
|
+
return RECENCY_SCORE_DEFAULT;
|
|
32
75
|
const days = (Date.now() - ts) / (1000 * 60 * 60 * 24);
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
return 0.6;
|
|
39
|
-
if (days <= 180)
|
|
40
|
-
return 0.4;
|
|
41
|
-
return 0.25;
|
|
76
|
+
for (const tier of RECENCY_SCORE_TIERS) {
|
|
77
|
+
if (days <= tier.maxDays)
|
|
78
|
+
return tier.score;
|
|
79
|
+
}
|
|
80
|
+
return RECENCY_SCORE_OLDEST;
|
|
42
81
|
}
|
|
43
82
|
function daysSinceCreated(createdAt) {
|
|
44
83
|
if (!createdAt)
|
|
@@ -50,62 +89,72 @@ function daysSinceCreated(createdAt) {
|
|
|
50
89
|
}
|
|
51
90
|
function commonPathScore(path) {
|
|
52
91
|
const lower = path.toLowerCase();
|
|
53
|
-
return lower.includes(
|
|
92
|
+
return lower.includes('/common/') || lower.includes('/shared/')
|
|
93
|
+
? COMMON_PATH_SCORE_YES
|
|
94
|
+
: COMMON_PATH_SCORE_NO;
|
|
54
95
|
}
|
|
55
|
-
const RANK_WEIGHTS = {
|
|
56
|
-
textMatch: 0.5,
|
|
57
|
-
usage: 0.3,
|
|
58
|
-
recency: 0.1,
|
|
59
|
-
commonPath: 0.1
|
|
60
|
-
};
|
|
61
96
|
/**
|
|
62
|
-
* Phase 5:以向量余弦相似度作为主文本维度,再叠加 usage / recency / common
|
|
97
|
+
* Phase 5:以向量余弦相似度作为主文本维度,再叠加 usage / recency / common 和 calleeNames 匹配度。
|
|
98
|
+
* calleeNames 作为结构信息独立信号,不污染纯语义向量。
|
|
63
99
|
*/
|
|
64
|
-
export function rankSemanticHits(hits) {
|
|
100
|
+
export function rankSemanticHits(hits, query) {
|
|
65
101
|
return hits
|
|
66
102
|
.map(({ symbol, similarity }) => {
|
|
67
103
|
const textScore = clamp01(similarity);
|
|
68
104
|
const usage = usageScore(symbol.usageCount);
|
|
69
105
|
const recency = recencyScore(symbol.createdAt);
|
|
70
106
|
const common = commonPathScore(symbol.path);
|
|
107
|
+
// ✨ 新增:calleeNames 作为独立信号
|
|
108
|
+
let calleeMatchScore = 0;
|
|
109
|
+
if (query && Array.isArray(symbol.meta?.calleeNames)) {
|
|
110
|
+
const calleeNames = symbol.meta.calleeNames;
|
|
111
|
+
const queryLower = query.toLowerCase();
|
|
112
|
+
const matchedCallees = calleeNames.filter((callee) => queryLower.includes(callee.toLowerCase())).length;
|
|
113
|
+
if (matchedCallees > 0) {
|
|
114
|
+
calleeMatchScore = Math.min(matchedCallees * CALLEE_MATCH_SCORE_PER_MATCH, CALLEE_MATCH_SCORE_MAX);
|
|
115
|
+
}
|
|
116
|
+
}
|
|
71
117
|
const score = textScore * RANK_WEIGHTS.textMatch +
|
|
72
118
|
usage * RANK_WEIGHTS.usage +
|
|
73
119
|
recency * RANK_WEIGHTS.recency +
|
|
74
|
-
common * RANK_WEIGHTS.commonPath
|
|
120
|
+
common * RANK_WEIGHTS.commonPath +
|
|
121
|
+
calleeMatchScore;
|
|
75
122
|
const reasonParts = [];
|
|
76
|
-
if (textScore >=
|
|
77
|
-
reasonParts.push(
|
|
78
|
-
else if (textScore >=
|
|
79
|
-
reasonParts.push(
|
|
80
|
-
if (usage >=
|
|
81
|
-
reasonParts.push(
|
|
82
|
-
if (common >=
|
|
83
|
-
reasonParts.push(
|
|
123
|
+
if (textScore >= SEMANTIC_REASON_THRESHOLD_HIGH)
|
|
124
|
+
reasonParts.push('语义相似度高');
|
|
125
|
+
else if (textScore >= SEMANTIC_REASON_THRESHOLD_MED)
|
|
126
|
+
reasonParts.push('语义相关');
|
|
127
|
+
if (usage >= USAGE_REASON_THRESHOLD_HIGH)
|
|
128
|
+
reasonParts.push('使用频率高');
|
|
129
|
+
if (common >= COMMON_PATH_SCORE_YES)
|
|
130
|
+
reasonParts.push('位于 shared/common 路径');
|
|
131
|
+
if (calleeMatchScore > 0)
|
|
132
|
+
reasonParts.push('函数调用关系匹配');
|
|
84
133
|
if (reasonParts.length === 0)
|
|
85
|
-
reasonParts.push(
|
|
134
|
+
reasonParts.push('综合相关性较好');
|
|
86
135
|
return {
|
|
87
136
|
symbol,
|
|
88
137
|
score: Number(score.toFixed(3)),
|
|
89
138
|
reason: {
|
|
90
139
|
textMatch: {
|
|
91
140
|
score: Number(textScore.toFixed(3)),
|
|
92
|
-
matchedBy:
|
|
141
|
+
matchedBy: 'semantic',
|
|
93
142
|
},
|
|
94
143
|
usage: {
|
|
95
144
|
score: Number(usage.toFixed(3)),
|
|
96
|
-
usageCount: symbol.usageCount
|
|
145
|
+
usageCount: symbol.usageCount,
|
|
97
146
|
},
|
|
98
147
|
recency: {
|
|
99
148
|
score: Number(recency.toFixed(3)),
|
|
100
|
-
daysSinceCreated: daysSinceCreated(symbol.createdAt)
|
|
149
|
+
daysSinceCreated: daysSinceCreated(symbol.createdAt),
|
|
101
150
|
},
|
|
102
151
|
commonPath: {
|
|
103
152
|
score: Number(common.toFixed(3)),
|
|
104
|
-
isCommonPath: common >=
|
|
153
|
+
isCommonPath: common >= COMMON_PATH_SCORE_YES,
|
|
105
154
|
},
|
|
106
155
|
weights: RANK_WEIGHTS,
|
|
107
|
-
summary: reasonParts.join(
|
|
108
|
-
}
|
|
156
|
+
summary: reasonParts.join(' + '),
|
|
157
|
+
},
|
|
109
158
|
};
|
|
110
159
|
})
|
|
111
160
|
.sort((a, b) => b.score - a.score);
|
|
@@ -122,39 +171,41 @@ export function rankSymbols(query, symbols) {
|
|
|
122
171
|
recency * RANK_WEIGHTS.recency +
|
|
123
172
|
common * RANK_WEIGHTS.commonPath;
|
|
124
173
|
const reasonParts = [];
|
|
125
|
-
if (text.score >=
|
|
126
|
-
reasonParts.push(
|
|
127
|
-
else if (text.score >=
|
|
128
|
-
reasonParts.push(
|
|
129
|
-
if (
|
|
130
|
-
reasonParts.push(
|
|
131
|
-
if (
|
|
132
|
-
reasonParts.push(
|
|
174
|
+
if (text.score >= TEXT_MATCH_SCORES.nameContains)
|
|
175
|
+
reasonParts.push('文本匹配度高');
|
|
176
|
+
else if (text.score >= TEXT_MATCH_SCORES.descriptionContains)
|
|
177
|
+
reasonParts.push('描述命中');
|
|
178
|
+
else if (text.matchedBy === 'token_overlap')
|
|
179
|
+
reasonParts.push('关键词片段高度重合');
|
|
180
|
+
if (usage >= USAGE_REASON_THRESHOLD_HIGH)
|
|
181
|
+
reasonParts.push('使用频率高');
|
|
182
|
+
if (common >= COMMON_PATH_SCORE_YES)
|
|
183
|
+
reasonParts.push('位于 shared/common 路径');
|
|
133
184
|
if (reasonParts.length === 0)
|
|
134
|
-
reasonParts.push(
|
|
185
|
+
reasonParts.push('综合相关性较好');
|
|
135
186
|
return {
|
|
136
187
|
symbol,
|
|
137
188
|
score: Number(score.toFixed(3)),
|
|
138
189
|
reason: {
|
|
139
190
|
textMatch: {
|
|
140
191
|
score: Number(text.score.toFixed(3)),
|
|
141
|
-
matchedBy: text.matchedBy
|
|
192
|
+
matchedBy: text.matchedBy,
|
|
142
193
|
},
|
|
143
194
|
usage: {
|
|
144
195
|
score: Number(usage.toFixed(3)),
|
|
145
|
-
usageCount: symbol.usageCount
|
|
196
|
+
usageCount: symbol.usageCount,
|
|
146
197
|
},
|
|
147
198
|
recency: {
|
|
148
199
|
score: Number(recency.toFixed(3)),
|
|
149
|
-
daysSinceCreated: daysSinceCreated(symbol.createdAt)
|
|
200
|
+
daysSinceCreated: daysSinceCreated(symbol.createdAt),
|
|
150
201
|
},
|
|
151
202
|
commonPath: {
|
|
152
203
|
score: Number(common.toFixed(3)),
|
|
153
|
-
isCommonPath: common >=
|
|
204
|
+
isCommonPath: common >= COMMON_PATH_SCORE_YES,
|
|
154
205
|
},
|
|
155
206
|
weights: RANK_WEIGHTS,
|
|
156
|
-
summary: reasonParts.join(
|
|
157
|
-
}
|
|
207
|
+
summary: reasonParts.join(' + '),
|
|
208
|
+
},
|
|
158
209
|
};
|
|
159
210
|
})
|
|
160
211
|
.sort((a, b) => b.score - a.score);
|