@lorrylurui/code-intelligence-mcp 2.0.9 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -52,6 +52,8 @@ INDEX_GLOB=xxx/\*\*/\_.{js,jsx,ts,tsx}
52
52
  npx tsx src/cli/eval-recommendation-cli.ts
53
53
  # 或指定 limit
54
54
  npx tsx src/cli/eval-recommendation-cli.ts --limit 10
55
+
56
+ npm run eval
55
57
  ```
56
58
 
57
59
  ## 5)分析离线测评结果
@@ -78,6 +78,9 @@ async function loadResults(filePath) {
78
78
  return results;
79
79
  }
80
80
  // ─── 工具函数 ─────────────────────────────────────────────────────────────────
81
+ /**
82
+ * 计算平均值,空数组时返回 0。
83
+ */
81
84
  function avg(nums) {
82
85
  if (nums.length === 0)
83
86
  return 0;
@@ -100,12 +103,17 @@ function recallByTag(results, tag) {
100
103
  count: tagged.length,
101
104
  };
102
105
  }
106
+ /**
107
+ * 计算各项指标的平均值,返回一个汇总对象。
108
+ */
103
109
  function computeMetrics(positive, negative) {
104
110
  return {
105
111
  recallMain: avg(positive.map((r) => r.recallMain ?? 0)),
106
112
  recall50: avg(positive.map((r) => r.recall50 ?? 0)),
107
- mrr: avg(positive.map((r) => r.mrrMain ?? 0)),
108
- ndcg: avg(positive.map((r) => r.ndcgMain ?? 0)),
113
+ firstHitScore: avg(positive.map((r) => r.firstHitScore ?? 0)),
114
+ rankingQuality: avg(positive.map((r) => r.rankingQuality ?? 0)),
115
+ coverage: positive.filter((r) => (r.recallMain ?? 0) > 0).length /
116
+ (positive.length || 1),
109
117
  top1Acc: positive.filter((r) => r.top1Correct === true).length /
110
118
  (positive.length || 1),
111
119
  fpRate: negative.filter((r) => r.falsePositive).length /
@@ -120,7 +128,7 @@ async function analyze() {
120
128
  const positive = results.filter((r) => !r.isNegativeSample);
121
129
  const negative = results.filter((r) => r.isNegativeSample);
122
130
  const metrics = computeMetrics(positive, negative);
123
- // Baseline(用于 delta 对比)
131
+ // 如果不传 --baseline,baseMetrics 就是 undefined,delta() 函数返回空字符串,指标后面不显示涨跌。
124
132
  let baseMetrics;
125
133
  if (BASELINE_PATH && fs.existsSync(BASELINE_PATH)) {
126
134
  const baseResults = await loadResults(BASELINE_PATH);
@@ -164,12 +172,10 @@ async function analyze() {
164
172
  console.log(sep);
165
173
  // ── 关键指标 ──
166
174
  console.log('\n关键指标\n');
167
- console.log(` Recall@10: ${pct(metrics.recallMain).padStart(7)}${delta(metrics.recallMain, baseMetrics?.recallMain)}`);
168
- console.log(` Recall@50: ${pct(metrics.recall50).padStart(7)}${delta(metrics.recall50, baseMetrics?.recall50)}`);
169
- console.log(` MRR@10: ${pct(metrics.mrr).padStart(7)}${delta(metrics.mrr, baseMetrics?.mrr)}`);
170
- console.log(` nDCG@10: ${pct(metrics.ndcg).padStart(7)}${delta(metrics.ndcg, baseMetrics?.ndcg)}`);
171
- console.log(` Top1 Acc: ${pct(metrics.top1Acc).padStart(7)}${delta(metrics.top1Acc, baseMetrics?.top1Acc)}`);
172
- console.log(` False Positive: ${pct(metrics.fpRate).padStart(7)}${delta(metrics.fpRate, baseMetrics?.fpRate)}`);
175
+ console.log(` 召回率(Recall@10): ${pct(metrics.recallMain).padStart(7)}${delta(metrics.recallMain, baseMetrics?.recallMain)}`);
176
+ console.log(` 首位命中分(MRR@10): ${pct(metrics.firstHitScore).padStart(7)}${delta(metrics.firstHitScore, baseMetrics?.firstHitScore)}`);
177
+ console.log(` 首条准确率(Top-1): ${pct(metrics.top1Acc).padStart(7)}${delta(metrics.top1Acc, baseMetrics?.top1Acc)}`);
178
+ console.log(` 误触率(FP): ${pct(metrics.fpRate).padStart(7)}${delta(metrics.fpRate, baseMetrics?.fpRate)}`);
173
179
  console.log(`\n 总 query 数:${results.length}(正例 ${positive.length},负例 ${negative.length})`);
174
180
  // ── 分组 Recall ──
175
181
  console.log('\n' + sub);
@@ -216,7 +222,7 @@ async function analyze() {
216
222
  ` 中文 Recall@10 = ${pct(zhStat.recall)},英文 = ${pct(enStat.recall)},差距 ${pct(enStat.recall - zhStat.recall)}\n` +
217
223
  ` 零召回中文 query 示例:${zhZero.join('、')}`);
218
224
  }
219
- // 发现2:函数类符号类型推断
225
+ // 发现2:函数类类型推断
220
226
  const funcStat = tagRecalls.get('function');
221
227
  if (funcStat && funcStat.recall < THRESHOLDS.FUNC_RECALL_LOW) {
222
228
  findings.push(`函数类 query 召回偏低(Recall@10 = ${pct(funcStat.recall)})\n` +
@@ -23,7 +23,9 @@ function getArg(flag, fallback) {
23
23
  const QUERY_SET_PATH = getArg('--query-set', 'offline_eval/query_set.jsonl');
24
24
  const OUTPUT_DIR = getArg('--output', 'offline_eval/results');
25
25
  const TOP_K_MAIN = Number(getArg('--limit', '10')); // Recall@K_MAIN / MRR@K / nDCG@K
26
- const TOP_K_WIDE = 50; // Recall@50(宽口径)
26
+ const RECALL_WIDE_K = 50; // 宽口径召回深度(用于 Recall@50),不是测试集数量
27
+ const REL_RELEVANT_MIN = 1; // rel >= 1 计入相关结果
28
+ const REL_PRIMARY = 2; // rel = 2 表示主答案/最高相关度
27
29
  // ─── 指标计算 ─────────────────────────────────────────────────────────────────
28
30
  /**
29
31
  * 覆盖率 Recall@K:前 K 条结果中命中的相关条目占全部相关条目的比例。
@@ -31,11 +33,12 @@ const TOP_K_WIDE = 50; // Recall@50(宽口径)
31
33
  * 负例(expected 全为 rel=0)视为完全命中,返回 1。
32
34
  */
33
35
  function recallAtK(returnedNames, expected, k) {
34
- const relevant = expected.filter((e) => e.rel >= 1);
36
+ const relevant = expected.filter((e) => e.rel >= REL_RELEVANT_MIN);
35
37
  if (relevant.length === 0)
36
38
  return 1;
37
39
  const topK = returnedNames.slice(0, k);
38
40
  const hits = relevant.filter((e) => topK.includes(e.name));
41
+ // 召回率@k = 真实召回的 / 所有相关的
39
42
  return hits.length / relevant.length;
40
43
  }
41
44
  /**
@@ -43,9 +46,10 @@ function recallAtK(returnedNames, expected, k) {
43
46
  * 衡量「最佳结果排多靠前」;未命中则返回 0。
44
47
  */
45
48
  function mrrAtK(returnedNames, expected, k) {
46
- const relevantNames = new Set(expected.filter((e) => e.rel >= 1).map((e) => e.name));
49
+ const relevantNames = new Set(expected.filter((e) => e.rel >= REL_RELEVANT_MIN).map((e) => e.name));
47
50
  const topK = returnedNames.slice(0, k);
48
51
  for (let i = 0; i < topK.length; i++) {
52
+ // 有一个命中的 就返回对应的 MRR 分数,越靠前分数越高;如果都没命中,最后返回 0。
49
53
  if (relevantNames.has(topK[i]))
50
54
  return 1 / (i + 1);
51
55
  }
@@ -72,9 +76,11 @@ function ndcgAtK(returnedNames, expected, k) {
72
76
  }, 0);
73
77
  return idcg === 0 ? 1 : dcg / idcg;
74
78
  }
75
- // ─── 失败分类(无 ID 时按名称降级处理) ─────────────────────────────────────
79
+ /**
80
+ * 返回失败阶段原因数组(无 ID 时按名称降级处理)
81
+ */
76
82
  function classifyFailuresFromTrace(expected, returnedNames, evalTrace, idByName) {
77
- const relevant = expected.filter((e) => e.rel >= 1);
83
+ const relevant = expected.filter((e) => e.rel >= REL_RELEVANT_MIN);
78
84
  const failures = [];
79
85
  for (const exp of relevant) {
80
86
  if (returnedNames.includes(exp.name))
@@ -123,8 +129,10 @@ function printSummary(results, kMain, baseline) {
123
129
  const negative = results.filter((r) => r.isNegativeSample);
124
130
  const recallMain = avg(positive.map((r) => r.recallMain ?? 0));
125
131
  const recall50 = avg(positive.map((r) => r.recall50 ?? 0));
126
- const mrr = avg(positive.map((r) => r.mrrMain ?? 0));
127
- const ndcg = avg(positive.map((r) => r.ndcgMain ?? 0));
132
+ const firstHitScore = avg(positive.map((r) => r.firstHitScore ?? 0));
133
+ // const rankingQuality = avg(positive.map((r) => r.rankingQuality ?? 0));
134
+ const coverage = positive.filter((r) => (r.recallMain ?? 0) > 0).length /
135
+ (positive.length || 1);
128
136
  const top1Acc = positive.filter((r) => r.top1Correct === true).length /
129
137
  (positive.length || 1);
130
138
  const fpRate = negative.filter((r) => r.falsePositive).length / (negative.length || 1);
@@ -141,12 +149,10 @@ function printSummary(results, kMain, baseline) {
141
149
  console.log('='.repeat(60));
142
150
  console.log(`Queries total: ${results.length} (positive: ${positive.length}, negative: ${negative.length})`);
143
151
  console.log('');
144
- console.log(`Recall@${kMain}: ${formatPct(recallMain)}${diff('recallMain', recallMain)}`);
145
- console.log(`Recall@50: ${formatPct(recall50)}${diff('recall50', recall50)}`);
146
- console.log(`MRR@${kMain}: ${formatPct(mrr)}${diff('mrr', mrr)}`);
147
- console.log(`nDCG@${kMain}: ${formatPct(ndcg)}${diff('ndcg', ndcg)}`);
148
- console.log(`Top1 Acc: ${formatPct(top1Acc)}${diff('top1Acc', top1Acc)}`);
149
- console.log(`False Pos: ${formatPct(fpRate)} (negative samples incorrectly returned results)`);
152
+ console.log(`召回率(Recall@${kMain}): ${formatPct(recallMain)}${diff('recallMain', recallMain)}`);
153
+ console.log(`首位命中分(MRR@${kMain}): ${formatPct(firstHitScore)}${diff('firstHitScore', firstHitScore)}`);
154
+ console.log(`首条准确率(Top-1): ${formatPct(top1Acc)}${diff('top1Acc', top1Acc)}`);
155
+ console.log(`误触率(FP): ${formatPct(fpRate)} (负例被错误推荐)`);
150
156
  console.log('');
151
157
  // ── Failure breakdown ──
152
158
  const allFailures = positive.flatMap((r) => r.failures);
@@ -192,6 +198,8 @@ async function loadQuerySet(filePath) {
192
198
  const trimmed = line.trim();
193
199
  if (!trimmed)
194
200
  continue;
201
+ if (trimmed.startsWith('#') || trimmed.startsWith('//'))
202
+ continue;
195
203
  cases.push(JSON.parse(trimmed));
196
204
  }
197
205
  return cases;
@@ -199,47 +207,43 @@ async function loadQuerySet(filePath) {
199
207
  async function runEval() {
200
208
  console.log(`Loading query set: ${QUERY_SET_PATH}`);
201
209
  const cases = await loadQuerySet(QUERY_SET_PATH);
202
- console.log(`Loaded ${cases.length} queries. Running eval with limit=${TOP_K_MAIN}/${TOP_K_WIDE}...\n`);
210
+ console.log(`Loaded ${cases.length} queries. Running eval with limit=${TOP_K_MAIN}/${RECALL_WIDE_K}...\n`);
203
211
  const repository = new SymbolRepository();
204
212
  const service = new RecommendationService(repository);
205
213
  const results = [];
206
214
  for (const queryCase of cases) {
207
215
  const isNegative = queryCase.expected.length === 0;
208
- // Run with wide limit (Recall@50)
209
216
  const wideResult = await service.recommendComponent({
210
217
  ...queryCase.input,
211
- limit: TOP_K_WIDE,
218
+ limit: RECALL_WIDE_K,
212
219
  evalMode: true,
213
220
  });
214
221
  const wideNames = [
215
222
  ...(wideResult.recommended ? [wideResult.recommended.name] : []),
216
223
  ...wideResult.alternatives.map((a) => a.name),
217
224
  ];
218
- // Run with main limit for MRR/nDCG (or reuse wide result slice)
219
225
  const mainNames = wideNames.slice(0, TOP_K_MAIN);
220
- // Build id map from returned results
221
226
  const allReturned = [
222
227
  ...(wideResult.recommended ? [wideResult.recommended] : []),
223
228
  ...wideResult.alternatives,
224
229
  ];
225
230
  const idByName = buildIdMapFromResult(wideResult.recommended, wideResult.alternatives);
226
- // Metrics (skip for negative samples)
227
231
  const recallMain = isNegative
228
232
  ? null
229
233
  : recallAtK(mainNames, queryCase.expected, TOP_K_MAIN);
230
234
  const recall50 = isNegative
231
235
  ? null
232
- : recallAtK(wideNames, queryCase.expected, TOP_K_WIDE);
233
- const mrrMain = isNegative
236
+ : recallAtK(wideNames, queryCase.expected, RECALL_WIDE_K);
237
+ const firstHitRank = isNegative
234
238
  ? null
235
239
  : mrrAtK(mainNames, queryCase.expected, TOP_K_MAIN);
236
- const ndcgMain = isNegative
237
- ? null
238
- : ndcgAtK(mainNames, queryCase.expected, TOP_K_MAIN);
240
+ // const rankingQuality = isNegative
241
+ // ? null
242
+ // : ndcgAtK(mainNames, queryCase.expected, TOP_K_MAIN);
239
243
  const top1Correct = isNegative
240
244
  ? null
241
- : queryCase.expected.some((e) => e.rel === 2 && wideResult.recommended?.name === e.name);
242
- // Failure classification
245
+ : queryCase.expected.some((e) => e.rel === REL_PRIMARY &&
246
+ wideResult.recommended?.name === e.name);
243
247
  const failures = isNegative
244
248
  ? []
245
249
  : classifyFailuresFromTrace(queryCase.expected, wideNames, wideResult.evalTrace, idByName);
@@ -250,8 +254,8 @@ async function runEval() {
250
254
  tags: queryCase.tags,
251
255
  recallMain,
252
256
  recall50,
253
- mrrMain,
254
- ndcgMain,
257
+ firstHitScore: firstHitRank,
258
+ // rankingQuality,
255
259
  top1Correct,
256
260
  returnedNames: mainNames,
257
261
  failures,
@@ -259,19 +263,16 @@ async function runEval() {
259
263
  falsePositive,
260
264
  };
261
265
  results.push(qr);
262
- // Progress
263
266
  const status = isNegative
264
267
  ? falsePositive
265
- ? '✗ FP'
266
- : '✓ TN'
268
+ ? '✗ False Positive)' // 负例,但系统返回了结果 → 误触发(False Positive)
269
+ : '✓ True Negative' // 负例,系统正确返回空 → 真负例(True Negative)
267
270
  : recallMain === 1
268
- ? `✓ R@${TOP_K_MAIN}=1.0`
269
- : `✗ R@${TOP_K_MAIN}=${(recallMain ?? 0).toFixed(2)}`;
271
+ ? `✓ R@${TOP_K_MAIN}=1.0 完全召回`
272
+ : `✗ R@${TOP_K_MAIN}=${(recallMain ?? 0).toFixed(2)} 不完全召回`;
270
273
  console.log(` [${queryCase.id}] ${queryCase.input.query.slice(0, 40).padEnd(40)} ${status}`);
271
274
  }
272
- // Print summary
273
275
  printSummary(results, TOP_K_MAIN, null);
274
- // Write JSONL report
275
276
  if (OUTPUT_DIR) {
276
277
  fs.mkdirSync(OUTPUT_DIR, { recursive: true });
277
278
  const dateStr = new Date().toISOString().slice(0, 10);
@@ -280,7 +281,7 @@ async function runEval() {
280
281
  fs.writeFileSync(outPath, lines + '\n', 'utf8');
281
282
  console.log(`Report written to: ${outPath}`);
282
283
  }
283
- // Exit with non-zero if any positive query has recall=0
284
+ // 如果有正例查询完全没有召回任何相关结果,视为严重问题,输出警告并退出非 0 状态码以示 CI 失败。
284
285
  const zeroRecall = results.filter((r) => !r.isNegativeSample && r.recallMain === 0);
285
286
  if (zeroRecall.length > 0) {
286
287
  console.log(`\nWARN: ${zeroRecall.length} positive queries have Recall@${TOP_K_MAIN}=0:`);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@lorrylurui/code-intelligence-mcp",
3
- "version": "2.0.9",
3
+ "version": "2.1.0",
4
4
  "private": false,
5
5
  "description": "MCP server 提供仓库内可复用代码块(ts/tsx/js/jsx/css/less)的索引和查询能力,支持基于代码上下文的智能推荐。",
6
6
  "type": "module",