@lorrylurui/code-intelligence-mcp 2.0.9 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -0
- package/dist/cli/eval-analyze-cli.js +16 -10
- package/dist/cli/eval-recommendation-cli.js +37 -36
- package/package.json +1 -1
package/README.md
CHANGED
package/dist/cli/eval-analyze-cli.js
CHANGED
|
@@ -78,6 +78,9 @@ async function loadResults(filePath) {
|
|
|
78
78
|
return results;
|
|
79
79
|
}
|
|
80
80
|
// ─── 工具函数 ─────────────────────────────────────────────────────────────────
|
|
81
|
+
/**
|
|
82
|
+
* 计算平均值,空数组时返回 0。
|
|
83
|
+
*/
|
|
81
84
|
function avg(nums) {
|
|
82
85
|
if (nums.length === 0)
|
|
83
86
|
return 0;
|
|
@@ -100,12 +103,17 @@ function recallByTag(results, tag) {
|
|
|
100
103
|
count: tagged.length,
|
|
101
104
|
};
|
|
102
105
|
}
|
|
106
|
+
/**
|
|
107
|
+
* 计算各项指标的平均值,返回一个汇总对象。
|
|
108
|
+
*/
|
|
103
109
|
function computeMetrics(positive, negative) {
|
|
104
110
|
return {
|
|
105
111
|
recallMain: avg(positive.map((r) => r.recallMain ?? 0)),
|
|
106
112
|
recall50: avg(positive.map((r) => r.recall50 ?? 0)),
|
|
107
|
-
|
|
108
|
-
|
|
113
|
+
firstHitScore: avg(positive.map((r) => r.firstHitScore ?? 0)),
|
|
114
|
+
rankingQuality: avg(positive.map((r) => r.rankingQuality ?? 0)),
|
|
115
|
+
coverage: positive.filter((r) => (r.recallMain ?? 0) > 0).length /
|
|
116
|
+
(positive.length || 1),
|
|
109
117
|
top1Acc: positive.filter((r) => r.top1Correct === true).length /
|
|
110
118
|
(positive.length || 1),
|
|
111
119
|
fpRate: negative.filter((r) => r.falsePositive).length /
|
|
@@ -120,7 +128,7 @@ async function analyze() {
|
|
|
120
128
|
const positive = results.filter((r) => !r.isNegativeSample);
|
|
121
129
|
const negative = results.filter((r) => r.isNegativeSample);
|
|
122
130
|
const metrics = computeMetrics(positive, negative);
|
|
123
|
-
//
|
|
131
|
+
// 如果不传 --baseline,baseMetrics 就是 undefined,delta() 函数返回空字符串,指标后面不显示涨跌。
|
|
124
132
|
let baseMetrics;
|
|
125
133
|
if (BASELINE_PATH && fs.existsSync(BASELINE_PATH)) {
|
|
126
134
|
const baseResults = await loadResults(BASELINE_PATH);
|
|
@@ -164,12 +172,10 @@ async function analyze() {
|
|
|
164
172
|
console.log(sep);
|
|
165
173
|
// ── 关键指标 ──
|
|
166
174
|
console.log('\n关键指标\n');
|
|
167
|
-
console.log(` Recall@10:
|
|
168
|
-
console.log(`
|
|
169
|
-
console.log(`
|
|
170
|
-
console.log(`
|
|
171
|
-
console.log(` Top1 Acc: ${pct(metrics.top1Acc).padStart(7)}${delta(metrics.top1Acc, baseMetrics?.top1Acc)}`);
|
|
172
|
-
console.log(` False Positive: ${pct(metrics.fpRate).padStart(7)}${delta(metrics.fpRate, baseMetrics?.fpRate)}`);
|
|
175
|
+
console.log(` 召回率(Recall@10): ${pct(metrics.recallMain).padStart(7)}${delta(metrics.recallMain, baseMetrics?.recallMain)}`);
|
|
176
|
+
console.log(` 首位命中分(MRR@10): ${pct(metrics.firstHitScore).padStart(7)}${delta(metrics.firstHitScore, baseMetrics?.firstHitScore)}`);
|
|
177
|
+
console.log(` 首条准确率(Top-1): ${pct(metrics.top1Acc).padStart(7)}${delta(metrics.top1Acc, baseMetrics?.top1Acc)}`);
|
|
178
|
+
console.log(` 误触率(FP): ${pct(metrics.fpRate).padStart(7)}${delta(metrics.fpRate, baseMetrics?.fpRate)}`);
|
|
173
179
|
console.log(`\n 总 query 数:${results.length}(正例 ${positive.length},负例 ${negative.length})`);
|
|
174
180
|
// ── 分组 Recall ──
|
|
175
181
|
console.log('\n' + sub);
|
|
@@ -216,7 +222,7 @@ async function analyze() {
|
|
|
216
222
|
` 中文 Recall@10 = ${pct(zhStat.recall)},英文 = ${pct(enStat.recall)},差距 ${pct(enStat.recall - zhStat.recall)}\n` +
|
|
217
223
|
` 零召回中文 query 示例:${zhZero.join('、')}`);
|
|
218
224
|
}
|
|
219
|
-
// 发现2
|
|
225
|
+
// 发现2:函数类类型推断
|
|
220
226
|
const funcStat = tagRecalls.get('function');
|
|
221
227
|
if (funcStat && funcStat.recall < THRESHOLDS.FUNC_RECALL_LOW) {
|
|
222
228
|
findings.push(`函数类 query 召回偏低(Recall@10 = ${pct(funcStat.recall)})\n` +
|
|
package/dist/cli/eval-recommendation-cli.js
CHANGED
|
@@ -23,7 +23,9 @@ function getArg(flag, fallback) {
|
|
|
23
23
|
const QUERY_SET_PATH = getArg('--query-set', 'offline_eval/query_set.jsonl');
|
|
24
24
|
const OUTPUT_DIR = getArg('--output', 'offline_eval/results');
|
|
25
25
|
const TOP_K_MAIN = Number(getArg('--limit', '10')); // Recall@K_MAIN / MRR@K / nDCG@K
|
|
26
|
-
const TOP_K_WIDE = 50; // Recall@50(宽口径)
|
|
26
|
+
const RECALL_WIDE_K = 50; // 宽口径召回深度(用于 Recall@50),不是测试集数量
|
|
27
|
+
const REL_RELEVANT_MIN = 1; // rel >= 1 计入相关结果
|
|
28
|
+
const REL_PRIMARY = 2; // rel = 2 表示主答案/最高相关度
|
|
27
29
|
// ─── 指标计算 ─────────────────────────────────────────────────────────────────
|
|
28
30
|
/**
|
|
29
31
|
* 覆盖率 Recall@K:前 K 条结果中命中的相关条目占全部相关条目的比例。
|
|
@@ -31,11 +33,12 @@ const TOP_K_WIDE = 50; // Recall@50(宽口径)
|
|
|
31
33
|
* 负例(expected 全为 rel=0)视为完全命中,返回 1。
|
|
32
34
|
*/
|
|
33
35
|
function recallAtK(returnedNames, expected, k) {
|
|
34
|
-
const relevant = expected.filter((e) => e.rel >=
|
|
36
|
+
const relevant = expected.filter((e) => e.rel >= REL_RELEVANT_MIN);
|
|
35
37
|
if (relevant.length === 0)
|
|
36
38
|
return 1;
|
|
37
39
|
const topK = returnedNames.slice(0, k);
|
|
38
40
|
const hits = relevant.filter((e) => topK.includes(e.name));
|
|
41
|
+
// 召回率@k = 真实召回的 / 所有相关的
|
|
39
42
|
return hits.length / relevant.length;
|
|
40
43
|
}
|
|
41
44
|
/**
|
|
@@ -43,9 +46,10 @@ function recallAtK(returnedNames, expected, k) {
|
|
|
43
46
|
* 衡量「最佳结果排多靠前」;未命中则返回 0。
|
|
44
47
|
*/
|
|
45
48
|
function mrrAtK(returnedNames, expected, k) {
|
|
46
|
-
const relevantNames = new Set(expected.filter((e) => e.rel >=
|
|
49
|
+
const relevantNames = new Set(expected.filter((e) => e.rel >= REL_RELEVANT_MIN).map((e) => e.name));
|
|
47
50
|
const topK = returnedNames.slice(0, k);
|
|
48
51
|
for (let i = 0; i < topK.length; i++) {
|
|
52
|
+
// 有一个命中的 就返回对应的 MRR 分数,越靠前分数越高;如果都没命中,最后返回 0。
|
|
49
53
|
if (relevantNames.has(topK[i]))
|
|
50
54
|
return 1 / (i + 1);
|
|
51
55
|
}
|
|
@@ -72,9 +76,11 @@ function ndcgAtK(returnedNames, expected, k) {
|
|
|
72
76
|
}, 0);
|
|
73
77
|
return idcg === 0 ? 1 : dcg / idcg;
|
|
74
78
|
}
|
|
75
|
-
|
|
79
|
+
/**
|
|
80
|
+
* 返回失败阶段原因数组(无 ID 时按名称降级处理)
|
|
81
|
+
*/
|
|
76
82
|
function classifyFailuresFromTrace(expected, returnedNames, evalTrace, idByName) {
|
|
77
|
-
const relevant = expected.filter((e) => e.rel >=
|
|
83
|
+
const relevant = expected.filter((e) => e.rel >= REL_RELEVANT_MIN);
|
|
78
84
|
const failures = [];
|
|
79
85
|
for (const exp of relevant) {
|
|
80
86
|
if (returnedNames.includes(exp.name))
|
|
@@ -123,8 +129,10 @@ function printSummary(results, kMain, baseline) {
|
|
|
123
129
|
const negative = results.filter((r) => r.isNegativeSample);
|
|
124
130
|
const recallMain = avg(positive.map((r) => r.recallMain ?? 0));
|
|
125
131
|
const recall50 = avg(positive.map((r) => r.recall50 ?? 0));
|
|
126
|
-
const
|
|
127
|
-
const
|
|
132
|
+
const firstHitScore = avg(positive.map((r) => r.firstHitScore ?? 0));
|
|
133
|
+
// const rankingQuality = avg(positive.map((r) => r.rankingQuality ?? 0));
|
|
134
|
+
const coverage = positive.filter((r) => (r.recallMain ?? 0) > 0).length /
|
|
135
|
+
(positive.length || 1);
|
|
128
136
|
const top1Acc = positive.filter((r) => r.top1Correct === true).length /
|
|
129
137
|
(positive.length || 1);
|
|
130
138
|
const fpRate = negative.filter((r) => r.falsePositive).length / (negative.length || 1);
|
|
@@ -141,12 +149,10 @@ function printSummary(results, kMain, baseline) {
|
|
|
141
149
|
console.log('='.repeat(60));
|
|
142
150
|
console.log(`Queries total: ${results.length} (positive: ${positive.length}, negative: ${negative.length})`);
|
|
143
151
|
console.log('');
|
|
144
|
-
console.log(
|
|
145
|
-
console.log(
|
|
146
|
-
console.log(
|
|
147
|
-
console.log(
|
|
148
|
-
console.log(`Top1 Acc: ${formatPct(top1Acc)}${diff('top1Acc', top1Acc)}`);
|
|
149
|
-
console.log(`False Pos: ${formatPct(fpRate)} (negative samples incorrectly returned results)`);
|
|
152
|
+
console.log(`召回率(Recall@${kMain}): ${formatPct(recallMain)}${diff('recallMain', recallMain)}`);
|
|
153
|
+
console.log(`首位命中分(MRR@${kMain}): ${formatPct(firstHitScore)}${diff('firstHitScore', firstHitScore)}`);
|
|
154
|
+
console.log(`首条准确率(Top-1): ${formatPct(top1Acc)}${diff('top1Acc', top1Acc)}`);
|
|
155
|
+
console.log(`误触率(FP): ${formatPct(fpRate)} (负例被错误推荐)`);
|
|
150
156
|
console.log('');
|
|
151
157
|
// ── Failure breakdown ──
|
|
152
158
|
const allFailures = positive.flatMap((r) => r.failures);
|
|
@@ -192,6 +198,8 @@ async function loadQuerySet(filePath) {
|
|
|
192
198
|
const trimmed = line.trim();
|
|
193
199
|
if (!trimmed)
|
|
194
200
|
continue;
|
|
201
|
+
if (trimmed.startsWith('#') || trimmed.startsWith('//'))
|
|
202
|
+
continue;
|
|
195
203
|
cases.push(JSON.parse(trimmed));
|
|
196
204
|
}
|
|
197
205
|
return cases;
|
|
@@ -199,47 +207,43 @@ async function loadQuerySet(filePath) {
|
|
|
199
207
|
async function runEval() {
|
|
200
208
|
console.log(`Loading query set: ${QUERY_SET_PATH}`);
|
|
201
209
|
const cases = await loadQuerySet(QUERY_SET_PATH);
|
|
202
|
-
console.log(`Loaded ${cases.length} queries. Running eval with limit=${TOP_K_MAIN}/${
|
|
210
|
+
console.log(`Loaded ${cases.length} queries. Running eval with limit=${TOP_K_MAIN}/${RECALL_WIDE_K}...\n`);
|
|
203
211
|
const repository = new SymbolRepository();
|
|
204
212
|
const service = new RecommendationService(repository);
|
|
205
213
|
const results = [];
|
|
206
214
|
for (const queryCase of cases) {
|
|
207
215
|
const isNegative = queryCase.expected.length === 0;
|
|
208
|
-
// Run with wide limit (Recall@50)
|
|
209
216
|
const wideResult = await service.recommendComponent({
|
|
210
217
|
...queryCase.input,
|
|
211
|
-
limit: TOP_K_WIDE,
|
|
218
|
+
limit: RECALL_WIDE_K,
|
|
212
219
|
evalMode: true,
|
|
213
220
|
});
|
|
214
221
|
const wideNames = [
|
|
215
222
|
...(wideResult.recommended ? [wideResult.recommended.name] : []),
|
|
216
223
|
...wideResult.alternatives.map((a) => a.name),
|
|
217
224
|
];
|
|
218
|
-
// Run with main limit for MRR/nDCG (or reuse wide result slice)
|
|
219
225
|
const mainNames = wideNames.slice(0, TOP_K_MAIN);
|
|
220
|
-
// Build id map from returned results
|
|
221
226
|
const allReturned = [
|
|
222
227
|
...(wideResult.recommended ? [wideResult.recommended] : []),
|
|
223
228
|
...wideResult.alternatives,
|
|
224
229
|
];
|
|
225
230
|
const idByName = buildIdMapFromResult(wideResult.recommended, wideResult.alternatives);
|
|
226
|
-
// Metrics (skip for negative samples)
|
|
227
231
|
const recallMain = isNegative
|
|
228
232
|
? null
|
|
229
233
|
: recallAtK(mainNames, queryCase.expected, TOP_K_MAIN);
|
|
230
234
|
const recall50 = isNegative
|
|
231
235
|
? null
|
|
232
|
-
: recallAtK(wideNames, queryCase.expected, TOP_K_WIDE);
|
|
233
|
-
const
|
|
236
|
+
: recallAtK(wideNames, queryCase.expected, RECALL_WIDE_K);
|
|
237
|
+
const firstHitRank = isNegative
|
|
234
238
|
? null
|
|
235
239
|
: mrrAtK(mainNames, queryCase.expected, TOP_K_MAIN);
|
|
236
|
-
const
|
|
237
|
-
|
|
238
|
-
|
|
240
|
+
// const rankingQuality = isNegative
|
|
241
|
+
// ? null
|
|
242
|
+
// : ndcgAtK(mainNames, queryCase.expected, TOP_K_MAIN);
|
|
239
243
|
const top1Correct = isNegative
|
|
240
244
|
? null
|
|
241
|
-
: queryCase.expected.some((e) => e.rel ===
|
|
242
|
-
|
|
245
|
+
: queryCase.expected.some((e) => e.rel === REL_PRIMARY &&
|
|
246
|
+
wideResult.recommended?.name === e.name);
|
|
243
247
|
const failures = isNegative
|
|
244
248
|
? []
|
|
245
249
|
: classifyFailuresFromTrace(queryCase.expected, wideNames, wideResult.evalTrace, idByName);
|
|
@@ -250,8 +254,8 @@ async function runEval() {
|
|
|
250
254
|
tags: queryCase.tags,
|
|
251
255
|
recallMain,
|
|
252
256
|
recall50,
|
|
253
|
-
|
|
254
|
-
|
|
257
|
+
firstHitScore: firstHitRank,
|
|
258
|
+
// rankingQuality,
|
|
255
259
|
top1Correct,
|
|
256
260
|
returnedNames: mainNames,
|
|
257
261
|
failures,
|
|
@@ -259,19 +263,16 @@ async function runEval() {
|
|
|
259
263
|
falsePositive,
|
|
260
264
|
};
|
|
261
265
|
results.push(qr);
|
|
262
|
-
// Progress
|
|
263
266
|
const status = isNegative
|
|
264
267
|
? falsePositive
|
|
265
|
-
? '✗
|
|
266
|
-
: '✓
|
|
268
|
+
? '✗ False Positive)' // 负例,但系统返回了结果 → 误触发(False Positive)
|
|
269
|
+
: '✓ True Negative' // 负例,系统正确返回空 → 真负例(True Negative)
|
|
267
270
|
: recallMain === 1
|
|
268
|
-
? `✓ R@${TOP_K_MAIN}=1.0
|
|
269
|
-
: `✗ R@${TOP_K_MAIN}=${(recallMain ?? 0).toFixed(2)}
|
|
271
|
+
? `✓ R@${TOP_K_MAIN}=1.0 完全召回`
|
|
272
|
+
: `✗ R@${TOP_K_MAIN}=${(recallMain ?? 0).toFixed(2)} 不完全召回`;
|
|
270
273
|
console.log(` [${queryCase.id}] ${queryCase.input.query.slice(0, 40).padEnd(40)} ${status}`);
|
|
271
274
|
}
|
|
272
|
-
// Print summary
|
|
273
275
|
printSummary(results, TOP_K_MAIN, null);
|
|
274
|
-
// Write JSONL report
|
|
275
276
|
if (OUTPUT_DIR) {
|
|
276
277
|
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
|
|
277
278
|
const dateStr = new Date().toISOString().slice(0, 10);
|
|
@@ -280,7 +281,7 @@ async function runEval() {
|
|
|
280
281
|
fs.writeFileSync(outPath, lines + '\n', 'utf8');
|
|
281
282
|
console.log(`Report written to: ${outPath}`);
|
|
282
283
|
}
|
|
283
|
-
//
|
|
284
|
+
// 如果有正例查询完全没有召回任何相关结果,视为严重问题,输出警告并退出非 0 状态码以示 CI 失败。
|
|
284
285
|
const zeroRecall = results.filter((r) => !r.isNegativeSample && r.recallMain === 0);
|
|
285
286
|
if (zeroRecall.length > 0) {
|
|
286
287
|
console.log(`\nWARN: ${zeroRecall.length} positive queries have Recall@${TOP_K_MAIN}=0:`);
|