@lorrylurui/code-intelligence-mcp 1.2.0 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,7 +7,7 @@ const REUSABLE_CODE_ADVISOR_MARKDOWN = `# 可复用代码推荐
7
7
 
8
8
  当用户需要可复用代码或实现类需求时,按顺序执行:
9
9
 
10
- 1. 调用 search_symbols 检索候选,type 根据用户需求传(component/util/selector/type
10
+ 1. 调用 search_symbols 检索候选,type 根据用户需求传(component/function/hook/class/type/interface);描述功能意图时设置 semantic=true
11
11
  2. 如果用户指定了结构过滤条件(props/params/properties/hooks),额外调用 search_by_structure 做结构匹配
12
12
  3. 先 search_symbols(limit=20) 拉候选,再对 Top 3 调用 get_symbol_detail 做深度判断
13
13
  4. 若仅凭签名/摘要无法判断,对最相关的若干候选调用 get_symbol_detail 获取详情
@@ -16,9 +16,25 @@ const REUSABLE_CODE_ADVISOR_MARKDOWN = `# 可复用代码推荐
16
16
  - **API 是否简单**、入参是否合适
17
17
  - **依赖与副作用**风险
18
18
  - **复用安全性**(稳定性、耦合度、是否便于扩展)
19
- 6. 给出**唯一首选**推荐,并说明理由,同时使用 **AskUserQuestion **工具,提供两个选项:
19
+ 6. 给出**唯一首选**推荐,并说明理由,同时使用 **AskUserQuestion** 工具,提供两个选项:
20
20
  - 采纳推荐
21
21
  - 取消
22
+ 7. 用户选择"采纳推荐"后,立即调用 inc_usage 工具记录该行为(symbolId 从搜索结果的 id 字段获取),不要遗漏此步骤。
23
+
24
+ ## 不适用场景
25
+
26
+ 以下情况不要调用搜索工具:
27
+ - 用户只是问代码如何写(概念性问题),不需要检索已有实现
28
+ - 用户明确说"新建一个"、"自己实现"、"不用已有的"
29
+ - 查询过于通用(如只说"utils"),先与用户确认具体需求再搜索
30
+
31
+ ## 搜索结果判断
32
+
33
+ 根据 semanticSimilarity 决定推荐置信度:
34
+ - **> 0.85**:高置信度,可直接推荐
35
+ - **0.6 – 0.85**:中等置信度,需结合 description 和 get_symbol_detail 综合判断
36
+ - **< 0.6**:低置信度,说明可能无合适实现,明确告知用户
37
+ - **空结果**:明确说"未找到已有实现",不要凭空推荐
22
38
 
23
39
  ## 回复结构
24
40
 
@@ -30,7 +46,7 @@ const REUSABLE_CODE_ADVISOR_MARKDOWN = `# 可复用代码推荐
30
46
  - **理由:** 1~3 条要点
31
47
  - **其他候选:** 简要列出及取舍(同步标注副作用)
32
48
  - **用法提示:** 结合用户场景的最小集成说明
33
- - **是否采纳:** 展示两个选项: 选项1.采纳推荐 选项2.取消。等待用户确认
49
+ - **是否采纳:** 展示两个选项:选项1. 采纳推荐 选项2. 取消。等待用户确认
34
50
 
35
51
  ## 约束
36
52
 
@@ -38,12 +54,6 @@ const REUSABLE_CODE_ADVISOR_MARKDOWN = `# 可复用代码推荐
38
54
  - 若无合适代码块,明确说明,并给出最接近的选项及差距。
39
55
  - 推理简洁,面向落地实现。
40
56
 
41
- ## 使用反馈
42
-
43
- 当选择‘采纳推荐’必须调用 inc_usage 工具记录采纳行为,调用格式如下:
44
- “inc_usage({ symbolId: <选中的代码块 id> })”
45
- 其中 symbolId 从 search_symbols 或 search_by_structure 返回结果的 id 字段获取。这条记录会用于后续排序优化。
46
-
47
57
  ## 更多示例
48
58
 
49
59
  与仓库内 \`.cursor/skills/reusable-code-advisor/examples.md\` 中的示例一致(在 Cursor 或本地打开该文件查看)。
@@ -1,10 +1,9 @@
1
1
  import { env } from '../config/env.js';
2
- import { getMySqlPool } from '../db/mysql.js';
2
+ import { getPool } from '../db/postgres.js';
3
3
  import { createEmbeddingClient } from '../services/embeddingClient.js';
4
- import { cosineSimilarity } from '../services/vectorMath.js';
5
4
  import { SEARCHABLE_STATUS } from '../config/symbolStatus.js';
6
- const THREADHOLD_SIMILARITY_BEFORE_RANKED = 0.5;
7
- const TOP_K_FOR_RANKING = 100; // 进入复杂排序的候选数上限(语义相似度初筛后保留的结果数,过大会增加排序成本)
5
+ const SIMILARITY_THRESHOLD = 0.5;
6
+ const TOP_K = 20;
8
7
  const inMemorySymbols = [
9
8
  {
10
9
  id: 1,
@@ -42,6 +41,7 @@ function parseEmbedding(raw) {
42
41
  }
43
42
  if (typeof raw === 'string') {
44
43
  try {
44
+ // pgvector 返回 "[x1,x2,...]",恰好是合法 JSON 数组
45
45
  const j = JSON.parse(raw);
46
46
  if (!Array.isArray(j))
47
47
  return null;
@@ -85,7 +85,7 @@ function getMetaArray(meta, key) {
85
85
  export class SymbolRepository {
86
86
  pool;
87
87
  constructor() {
88
- this.pool = getMySqlPool();
88
+ this.pool = getPool();
89
89
  }
90
90
  async search(query, type) {
91
91
  if (!this.pool) {
@@ -97,31 +97,28 @@ export class SymbolRepository {
97
97
  (s.description ?? '').toLowerCase().includes(q)));
98
98
  });
99
99
  }
100
- const params = [`%${query}%`];
100
+ const params = [
101
+ `%${query}%`,
102
+ SEARCHABLE_STATUS,
103
+ ];
101
104
  let sql = `
102
- SELECT id, name, type, category, path, description, content, CAST(meta AS CHAR) AS meta, usage_count, created_at
103
- FROM ${env.mysqlSymbolsTable}
104
- WHERE (name LIKE ? OR description LIKE ?)
105
- AND status = ${SEARCHABLE_STATUS}
105
+ SELECT id, name, type, category, path, description, content, meta::text AS meta, usage_count, created_at
106
+ FROM ${env.symbolsTable}
107
+ WHERE (name ILIKE $1 OR description ILIKE $1)
108
+ AND status = $2
106
109
  `;
107
- params.push(`%${query}%`);
108
110
  if (type) {
109
- sql += ' AND type = ?';
110
111
  params.push(type);
112
+ sql += ` AND type = $${params.length}`;
111
113
  }
112
114
  sql += ' ORDER BY usage_count DESC LIMIT 20';
113
- const [rows] = await this.pool.query(sql, params);
115
+ const { rows } = await this.pool.query(sql, params);
114
116
  return rows.map((r) => mapRow(r));
115
117
  }
116
118
  /**
117
- * Phase 5:对自然语言查询做向量检索,启用分桶采样策略,返回代码
118
- 块与余弦相似度。
119
- * 分桶策略:
120
- * - 第一层:按 category 占比计算每个分类应采样条数(保底10条)
121
- * - 第二层:每个 path 子桶内乱序后采样 Math.max(5,
122
- floor(catLimit / pathCount)) 条
123
- * 最终选择topK,进入排序
124
- */
119
+ * 语义向量检索:将 query 嵌入后用 pgvector <=> 运算符(cosine distance)在数据库内完成相似度排序。
120
+ * 不再需要在 Node 拉取全量向量做内存计算。
121
+ */
125
122
  async searchSemanticHits(query, opts) {
126
123
  if (!env.embeddingServiceUrl) {
127
124
  throw new Error('语义检索需配置 EMBEDDING_SERVICE_URL 并启动嵌入服务');
@@ -129,111 +126,47 @@ export class SymbolRepository {
129
126
  if (!this.pool) {
130
127
  return [];
131
128
  }
132
- const candidateLimit = opts?.candidateLimit ?? 3000;
133
- const limit = opts?.limit ?? TOP_K_FOR_RANKING;
134
- const type = opts?.type;
129
+ const limit = opts?.limit ?? TOP_K;
135
130
  const client = createEmbeddingClient(env.embeddingServiceUrl);
136
131
  const [queryVec] = await client.embed([query.trim()]);
137
132
  if (!queryVec?.length) {
138
133
  throw new Error('查询向量为空');
139
134
  }
140
- // 查询足够的数据以支持分桶采样(3倍候选数以覆盖各桶)
141
- const fetchLimit = candidateLimit * 3;
135
+ // pgvector 向量字面量格式:[x1,x2,...]
136
+ const vecLiteral = `[${queryVec.join(',')}]`;
137
+ const params = [vecLiteral, SEARCHABLE_STATUS];
138
+ // 1 - cosine_distance = cosine_similarity;多取一倍候选后在应用层过阈值
142
139
  let sql = `
143
- SELECT id, name, type, category, path, description, content, CAST(meta AS CHAR) AS meta, usage_count, created_at, embedding
144
- FROM ${env.mysqlSymbolsTable}
140
+ SELECT id, name, type, category, path, description, content, meta::text AS meta,
141
+ usage_count, created_at,
142
+ 1 - (embedding <=> $1::vector) AS similarity
143
+ FROM ${env.symbolsTable}
145
144
  WHERE embedding IS NOT NULL
146
- AND status = ${SEARCHABLE_STATUS}
145
+ AND status = $2
147
146
  `;
148
- const params = [];
149
- if (type) {
150
- sql += ' AND type = ?';
151
- params.push(type);
147
+ if (opts?.type) {
148
+ params.push(opts.type);
149
+ sql += ` AND type = $${params.length}`;
152
150
  }
153
- sql += ' DESC LIMIT ?';
154
- params.push(fetchLimit);
155
- const [rows] = await this.pool.query(sql, params);
156
- const withVec = rows
157
- .map((r) => mapRow(r, { includeEmbedding: true }))
158
- .filter((s) => s.embedding && s.embedding.length === queryVec.length);
159
- // 分桶采样:按 category + path 两层分桶
160
- const sampled = this.bucketSampling(withVec, candidateLimit);
161
- return sampled
162
- .map((s) => {
163
- const sim = cosineSimilarity(queryVec, s.embedding);
164
- const { embedding: _, ...rest } = s;
165
- return { symbol: rest, similarity: sim };
166
- })
167
- .filter((x) => x.similarity >= THREADHOLD_SIMILARITY_BEFORE_RANKED) // 初筛阈值,过滤掉明显不相关的结果
168
- .sort((a, b) => b.similarity - a.similarity)
151
+ params.push(limit * 2); // 多取一倍以便 SIMILARITY_THRESHOLD 过滤后仍有足量结果
152
+ sql += ` ORDER BY embedding <=> $1::vector LIMIT $${params.length}`;
153
+ const { rows } = await this.pool.query(sql, params);
154
+ return rows
155
+ .map((r) => ({
156
+ symbol: mapRow(r),
157
+ similarity: Number(r.similarity),
158
+ }))
159
+ .filter((x) => x.similarity >= SIMILARITY_THRESHOLD)
169
160
  .slice(0, limit);
170
161
  }
171
- /**
172
- * 分桶采样核心逻辑
173
- * - 第一层:按 category 占比计算每个分类应采样条数(保底10条)
174
- * - 第二层:每个 path 子桶内乱序后采样 Math.max(5,
175
- floor(catLimit / pathCount)) 条
176
- */
177
- bucketSampling(symbols, limit) {
178
- if (symbols.length === 0)
179
- return [];
180
- // 按 category 分组
181
- const categoryGroups = new Map();
182
- for (const s of symbols) {
183
- const cat = s.category ?? '__null__';
184
- if (!categoryGroups.has(cat)) {
185
- categoryGroups.set(cat, []);
186
- }
187
- categoryGroups.get(cat).push(s);
188
- }
189
- const total = symbols.length;
190
- const sampled = [];
191
- // 第一层:按 category 占比计算采样数,保底10条
192
- for (const [, catSymbols] of categoryGroups) {
193
- const catCount = catSymbols.length;
194
- const catRatio = catCount / total;
195
- const catLimit = Math.max(10, Math.floor(limit * catRatio));
196
- // 按 path 分组(提取目录部分)
197
- const pathGroups = new Map();
198
- for (const s of catSymbols) {
199
- const dir = s.path.includes('/')
200
- ? s.path.slice(0, s.path.lastIndexOf('/'))
201
- : '__root__';
202
- if (!pathGroups.has(dir)) {
203
- pathGroups.set(dir, []);
204
- }
205
- pathGroups.get(dir).push(s);
206
- }
207
- const pathCount = pathGroups.size;
208
- const perPathSample = Math.max(5, Math.floor(catLimit / pathCount));
209
- // 第二层:每个 path 子桶内乱序后采样
210
- for (const pathSymbols of pathGroups.values()) {
211
- // 原地乱序(Fisher-Yates)
212
- for (let i = pathSymbols.length - 1; i > 0; i--) {
213
- const j = Math.floor(Math.random() * (i + 1));
214
- [pathSymbols[i], pathSymbols[j]] = [
215
- pathSymbols[j],
216
- pathSymbols[i],
217
- ];
218
- }
219
- const pathSampleCount = Math.min(perPathSample, pathSymbols.length);
220
- sampled.push(...pathSymbols.slice(0, pathSampleCount));
221
- if (sampled.length >= limit)
222
- break;
223
- }
224
- if (sampled.length >= limit)
225
- break;
226
- }
227
- return sampled.slice(0, limit);
228
- }
229
162
  async getByName(name) {
230
163
  if (!this.pool) {
231
164
  return (inMemorySymbols.find((s) => s.name.toLowerCase() === name.toLowerCase()) ?? null);
232
165
  }
233
- const [rows] = await this.pool.query(`
234
- SELECT id, name, type, category, path, description, content, CAST(meta AS CHAR) AS meta, usage_count, created_at
235
- FROM ${env.mysqlSymbolsTable}
236
- WHERE name = ?
166
+ const { rows } = await this.pool.query(`
167
+ SELECT id, name, type, category, path, description, content, meta::text AS meta, usage_count, created_at
168
+ FROM ${env.symbolsTable}
169
+ WHERE name = $1
237
170
  LIMIT 1
238
171
  `, [name]);
239
172
  if (rows.length === 0) {
@@ -254,8 +187,8 @@ export class SymbolRepository {
254
187
  }
255
188
  return false;
256
189
  }
257
- const [result] = await this.pool.query(`UPDATE ${env.mysqlSymbolsTable} SET usage_count = usage_count + 1 WHERE id = ?`, [symbolId]);
258
- return result.affectedRows > 0;
190
+ const result = await this.pool.query(`UPDATE ${env.symbolsTable} SET usage_count = usage_count + 1 WHERE id = $1`, [symbolId]);
191
+ return result.rowCount !== null && result.rowCount > 0;
259
192
  }
260
193
  async searchByStructure(fields, opts) {
261
194
  const normalized = fields.map((f) => f.trim()).filter(Boolean);
@@ -286,21 +219,21 @@ export class SymbolRepository {
286
219
  }
287
220
  const params = [];
288
221
  let sql = `
289
- SELECT id, name, type, category, path, description, content, CAST(meta AS CHAR) AS meta, usage_count, created_at
290
- FROM ${env.mysqlSymbolsTable}
222
+ SELECT id, name, type, category, path, description, content, meta::text AS meta, usage_count, created_at
223
+ FROM ${env.symbolsTable}
291
224
  WHERE 1 = 1
292
225
  `;
293
226
  if (type) {
294
- sql += ' AND type = ?';
295
227
  params.push(type);
228
+ sql += ` AND type = $${params.length}`;
296
229
  }
297
230
  if (category) {
298
- sql += ' AND category LIKE ?';
299
231
  params.push(`%${category}%`);
232
+ sql += ` AND category ILIKE $${params.length}`;
300
233
  }
301
- sql += ' ORDER BY usage_count DESC LIMIT ?';
302
234
  params.push(Math.max(limit * 5, 50));
303
- const [rows] = await this.pool.query(sql, params);
235
+ sql += ` ORDER BY usage_count DESC LIMIT $${params.length}`;
236
+ const { rows } = await this.pool.query(sql, params);
304
237
  return rows
305
238
  .map((r) => mapRow(r))
306
239
  .filter(matchesAll)
@@ -1,13 +1,16 @@
1
1
  /**
2
2
  * BullMQ embedding 队列 producer。
3
3
  *
4
- * 设计要点:
5
- * - jobId = semanticHash 相同语义模板自动去重,N 个符号相同 hash 只入队一次
6
- * - CI 流程只负责 enqueue,worker 异步消费,CI 不阻塞
7
- * - 调用方在进程退出前需调用 closeEmbeddingQueue() 释放连接
4
+ * 去重策略:
5
+ * - 同一 CI run 内:ci-index.ts new Set(hashes) 去重后再入队,Redis 层无需 jobId 去重
6
+ * - 跨 CI run 的向量缓存:由 worker 查询 DB(status=online AND semantic_hash=?)决定是否调用 embedding API
7
+ * - 不使用 jobId,避免 BullMQ completed 状态残留导致后续 run 任务被跳过
8
+ *
9
+ * CI 流程只负责 enqueue,worker 异步消费,CI 不阻塞。
10
+ * 调用方在进程退出前需调用 closeEmbeddingQueue() 释放连接。
8
11
  */
9
12
  import { Queue } from 'bullmq';
10
- import Redis from 'ioredis';
13
+ import { Redis } from 'ioredis';
11
14
  import { env } from '../config/env.js';
12
15
  let _queue = null;
13
16
  let _connection = null;
@@ -21,31 +24,29 @@ function getQueue() {
21
24
  }
22
25
  return _queue;
23
26
  }
24
- /**
25
- * 将一个 semanticHash 对应的 embedding 任务入队。
26
- * 若队列中已存在相同 jobId(semanticHash)的待处理任务,BullMQ 自动忽略重复请求。
27
- */
28
- export async function enqueueEmbedding(semanticHash) {
29
- await getQueue().add('embed', { semanticHash }, {
30
- jobId: semanticHash, // 去重键:相同 hash 幂等
27
+ /** 单个 semanticHash 入队 */
28
+ export async function enqueueEmbedding(semanticHash, symbolsTable) {
29
+ await getQueue().add('embed', { semanticHash, symbolsTable: symbolsTable ?? env.symbolsTable }, {
31
30
  attempts: 5,
32
31
  backoff: { type: 'exponential', delay: 5_000 },
33
32
  });
34
33
  }
35
- /** 批量入队,适合全量扫描场景 */
36
- export async function enqueueEmbeddingBatch(semanticHashes) {
34
+ /**
35
+ * 批量入队(同一 CI run 内已由调用方 new Set 去重)。
36
+ * worker 消费时查 DB 决定是否真正调 embedding API。
37
+ */
38
+ export async function enqueueEmbeddingBatch(semanticHashes, symbolsTable) {
39
+ const table = symbolsTable ?? env.symbolsTable;
37
40
  const queue = getQueue();
38
41
  const jobs = semanticHashes.map((hash) => ({
39
42
  name: 'embed',
40
- data: { semanticHash: hash },
43
+ data: { semanticHash: hash, symbolsTable: table },
41
44
  opts: {
42
- jobId: hash,
43
45
  attempts: 5,
44
46
  backoff: { type: 'exponential', delay: 5_000 },
45
47
  },
46
48
  }));
47
- // BullMQ addBulk 会跳过已存在 jobId 的任务
48
- await queue.addBulkJobs(jobs);
49
+ await queue.addBulk(jobs);
49
50
  }
50
51
  /** 进程退出前关闭连接(CI 脚本必须调用,否则进程挂起) */
51
52
  export async function closeEmbeddingQueue() {
@@ -1,64 +1,108 @@
1
- import { resolve } from 'node:path';
2
- import { loadProjectDotenv } from '../config/env.js';
3
- import { getMySqlPool } from '../db/mysql.js';
4
- import { indexedRowToEmbedText } from '../indexer/embedText.js';
5
- import { indexProject } from '../indexer/indexProject.js';
1
+ import { resolve, join } from 'node:path';
2
+ import { readFileSync } from 'node:fs';
3
+ import fg from 'fast-glob';
4
+ import { env } from '../config/env.js';
5
+ import { getPool } from '../db/postgres.js';
6
+ import { getAllTableSQLs } from '../db/schema.js';
7
+ import { indexProject, DEFAULT_IGNORE } from '../indexer/indexProject.js';
6
8
  import { upsertSymbols } from '../indexer/persistSymbols.js';
7
- import { initCategoryEmbeddings, resolveCategory, } from '../indexer/categoryClassifier.js';
8
- import { createEmbeddingClient, embedAll, } from '../services/embeddingClient.js';
9
+ import { computeFileHash } from '../indexer/tsAstNormalizer.js';
10
+ import { getRelativePathForDisplay } from '../indexer/heuristics.js';
11
+ import { enqueueEmbeddingBatch, closeEmbeddingQueue, } from '../services/embeddingQueue.js';
12
+ import { SYMBOL_STATUS } from '../config/symbolStatus.js';
9
13
  export async function runReindex(options = {}) {
10
14
  const projectRoot = resolve(options.projectRoot ?? process.cwd());
11
- const { dryRun = false } = options;
12
- // 1️⃣ 加载第三方 .env:只覆盖未定义的变量,保留 MCP Server 自身配置
13
- loadProjectDotenv(projectRoot);
14
- // 2️⃣ 打印生效的环境变量(便于调试)
15
- console.error(`[reindex] projectRoot=${projectRoot}, dryRun=${dryRun}, ` +
16
- `MYSQL_HOST=${process.env.MYSQL_HOST}`);
17
- // 3️⃣ 只有需要写入数据库时才检查 MySQL 并建立连接
18
- const embeddingServiceUrl = process.env.EMBEDDING_SERVICE_URL;
19
- if (!dryRun && embeddingServiceUrl) {
20
- // 初始化 category embeddings
21
- await initCategoryEmbeddings();
22
- }
15
+ const { dryRun = false, forceRebuild = false } = options;
16
+ console.error(`[reindex] projectRoot=${projectRoot}, dryRun=${dryRun}, forceRebuild=${forceRebuild}, PG_URL=${process.env.PG_URL ? '(set)' : '(not set)'}, SYMBOLS_TABLE=${env.symbolsTable}`);
23
17
  let pool = null;
24
18
  if (!dryRun) {
25
- pool = getMySqlPool();
26
- await pool.query('SELECT 1'); // 测试连接
27
- console.error('[reindex] MySQL connection successful');
19
+ pool = getPool();
20
+ await pool.query('SELECT 1');
21
+ console.error('[reindex] PostgreSQL connection successful');
22
+ // 确保 extension + table + indexes 存在(幂等,多租户表名安全)
23
+ for (const sql of getAllTableSQLs()) {
24
+ await pool.query(sql);
25
+ }
26
+ console.error(`[reindex] schema ready: ${env.symbolsTable}`);
27
+ }
28
+ // ─── 1. glob 解析出全量文件列表(绝对路径)──────────────────────────
29
+ const ignore = [...DEFAULT_IGNORE, ...(options.ignore ?? [])];
30
+ const patterns = (options.globPatterns ?? ['src/**/*.{ts,tsx}']).map((p) => p.startsWith('/') ? p : join(projectRoot, p).replace(/\\/g, '/'));
31
+ const allFiles = await fg(patterns, {
32
+ absolute: true,
33
+ ignore,
34
+ onlyFiles: true,
35
+ dot: false,
36
+ });
37
+ console.error(`[reindex] glob found ${allFiles.length} file(s)`);
38
+ // ─── 2. file_hash 过滤:跳过 AST 未变的文件(CPU 优化)────────────────
39
+ // forceRebuild 时跳过此过滤,file_hash 不可复用(模板/模型变更时相同文件产出不同 content)
40
+ let filesToIndex = allFiles;
41
+ let skippedFiles = 0;
42
+ if (!forceRebuild && pool && allFiles.length > 0) {
43
+ // 计算所有文件当前 hash
44
+ const currentFileHashes = new Map(); // relPath → hash
45
+ for (const absPath of allFiles) {
46
+ const content = readFileSync(absPath, 'utf-8');
47
+ const relPath = getRelativePathForDisplay(projectRoot, absPath);
48
+ currentFileHashes.set(relPath, computeFileHash(content));
49
+ }
50
+ // 一次性批量查 DB 已有的 file_hash
51
+ const relPaths = [...currentFileHashes.keys()];
52
+ const { rows: dbRows } = await pool.query(`SELECT DISTINCT path, file_hash FROM ${env.symbolsTable}
53
+ WHERE path = ANY($1) AND file_hash IS NOT NULL`, [relPaths]);
54
+ const dbFileHash = new Map(dbRows.map((r) => [r.path, r.file_hash]));
55
+ filesToIndex = allFiles.filter((absPath) => {
56
+ const relPath = getRelativePathForDisplay(projectRoot, absPath);
57
+ return currentFileHashes.get(relPath) !== dbFileHash.get(relPath);
58
+ });
59
+ skippedFiles = allFiles.length - filesToIndex.length;
60
+ console.error(`[reindex] file_hash: ${skippedFiles} unchanged (skipped), ${filesToIndex.length} changed (to parse)`);
28
61
  }
29
- let rows = await indexProject({
62
+ else if (forceRebuild) {
63
+ console.error(`[reindex] forceRebuild=true, skipping file_hash filter — parsing all ${allFiles.length} file(s)`);
64
+ }
65
+ if (filesToIndex.length === 0) {
66
+ console.error('[reindex] all files unchanged, nothing to do');
67
+ return {
68
+ projectRoot,
69
+ extractedCount: 0,
70
+ skippedFiles,
71
+ enqueuedCount: 0,
72
+ upserted: false,
73
+ };
74
+ }
75
+ // ─── 3. 只对变更文件做 AST 解析 ──────────────────────────────────
76
+ const rows = await indexProject({
30
77
  projectRoot,
31
- globPatterns: options.globPatterns,
32
- ignore: options.ignore,
78
+ globPatterns: filesToIndex,
33
79
  });
34
- console.error(`[reindex] extracted ${rows.length} symbol(s) from ${projectRoot}`);
35
- let embeddingsComputed = false;
36
- let embeddingPayload;
37
- if (!options.dryRun && rows.length > 0 && embeddingServiceUrl) {
38
- try {
39
- const client = createEmbeddingClient(embeddingServiceUrl);
40
- // 先实现ts语义模板,js保留原逻辑
41
- const texts = rows.map((row) => row.semantic_hash ?? indexedRowToEmbedText(row));
42
- const vecs = await embedAll(client, texts);
43
- console.error('==vecs', vecs?.length);
44
- // 生成category
45
- rows = await resolveCategory(rows, vecs);
46
- embeddingPayload = vecs;
47
- embeddingsComputed = true;
80
+ console.error(`[reindex] extracted ${rows.length} symbol(s) from ${filesToIndex.length} changed file(s)`);
81
+ // ─── 4. 写库(全部 pending)→ 入队,worker 异步处理 embedding + category ──
82
+ const nullPayload = rows.map(() => null);
83
+ const pendingHashes = [
84
+ ...new Set(rows.map((r) => r.semantic_hash).filter(Boolean)),
85
+ ];
86
+ if (!dryRun) {
87
+ // forceRebuild:先清空 DB 中已有的 embedding,使 worker cache check 必然 miss
88
+ if (forceRebuild && pendingHashes.length > 0) {
89
+ await pool.query(`UPDATE ${env.symbolsTable}
90
+ SET embedding = NULL, status = $1
91
+ WHERE semantic_hash = ANY($2)`, [SYMBOL_STATUS.PENDING, pendingHashes]);
92
+ console.error(`[reindex] forceRebuild: cleared embeddings for ${pendingHashes.length} semantic_hash(es)`);
48
93
  }
49
- catch (err) {
50
- console.error('[reindex] embedding skipped (service error):', err);
51
- embeddingPayload = rows.map(() => null);
94
+ await upsertSymbols(pool, rows, nullPayload);
95
+ if (pendingHashes.length > 0) {
96
+ await enqueueEmbeddingBatch(pendingHashes, env.symbolsTable);
97
+ console.error(`[reindex] enqueued ${pendingHashes.length} semantic_hash(es) → worker will handle embedding asynchronously`);
52
98
  }
99
+ await closeEmbeddingQueue();
53
100
  }
54
- if (!options.dryRun) {
55
- await upsertSymbols(pool, rows, embeddingPayload);
56
- }
57
- console.error('===out', JSON.stringify(rows));
58
101
  return {
59
102
  projectRoot,
60
103
  extractedCount: rows.length,
61
- upserted: !options.dryRun,
62
- embeddingsComputed,
104
+ skippedFiles,
105
+ enqueuedCount: pendingHashes.length,
106
+ upserted: !dryRun,
63
107
  };
64
108
  }
@@ -5,7 +5,9 @@ export const getSymbolDetailInput = z.object({
5
5
  export function createGetSymbolDetailTool(repository) {
6
6
  return {
7
7
  name: 'get_symbol_detail',
8
- description: '按名称获取单个代码块的完整详情。',
8
+ description: '获取单个代码块的完整详情(含源码、参数类型、调用关系、副作用)。\n' +
9
+ '仅在以下情况调用:search_symbols 返回的摘要信息不足以判断是否适用(如签名模糊、副作用不明确)。\n' +
10
+ '通常对 top 1-3 候选调用,不要对所有结果批量调用。',
9
11
  inputSchema: getSymbolDetailInput.shape,
10
12
  handler: async (input) => {
11
13
  const symbol = await repository.getByName(input.name);
@@ -6,7 +6,9 @@ export const incUsageInput = z.object({
6
6
  export function createIncUsageTool(repository) {
7
7
  return {
8
8
  name: 'inc_usage',
9
- description: '当开发者采纳了某个推荐代码块时,调用此工具记录。usage_count +1,用于后续排序优化。',
9
+ description: '在用户明确确认"采纳推荐"后调用,记录复用行为用于排序优化(usage_count +1)。\n' +
10
+ '注意:仅在用户主动确认采纳时调用,不要在推荐后自动调用。\n' +
11
+ 'symbolId 从 search_symbols 或 search_by_structure 返回结果的 id 字段获取。',
10
12
  inputSchema: incUsageInput.shape,
11
13
  handler: async (input) => {
12
14
  const success = await repository.incUsage(input.symbolId);
@@ -15,7 +17,10 @@ export function createIncUsageTool(repository) {
15
17
  content: [
16
18
  {
17
19
  type: 'text',
18
- text: JSON.stringify({ error: '未找到该代码块', symbolId: input.symbolId }, null, 2),
20
+ text: JSON.stringify({
21
+ error: '未找到该代码块',
22
+ symbolId: input.symbolId,
23
+ }, null, 2),
19
24
  },
20
25
  ],
21
26
  };
@@ -24,7 +29,11 @@ export function createIncUsageTool(repository) {
24
29
  content: [
25
30
  {
26
31
  type: 'text',
27
- text: JSON.stringify({ ok: true, symbolId: input.symbolId, message: 'usage_count 已 +1' }, null, 2),
32
+ text: JSON.stringify({
33
+ ok: true,
34
+ symbolId: input.symbolId,
35
+ message: 'usage_count 已 +1',
36
+ }, null, 2),
28
37
  },
29
38
  ],
30
39
  };
@@ -9,7 +9,9 @@ export const reindexInput = z.object({
9
9
  export function createReindexTool() {
10
10
  return {
11
11
  name: 'reindex',
12
- description: '重建源码代码块索引并写入 MySQL;设置 dryRun=true 时仅预览抽取数量,不落库、不调用嵌入服务。若配置 EMBEDDING_SERVICE_URL,非 dryRun 时会写入向量列。',
12
+ description: '⚠️ 高成本操作(耗时可能超过数分钟),仅在用户明确要求"重建索引"时调用,不要因搜索结果不佳而自动调用。\n' +
13
+ '重建源码代码块索引并写入 MySQL。设置 dryRun=true 时仅预览抽取数量,不落库。\n' +
14
+ '写入后 embedding 由后台 worker 异步处理,队列清空后打印完成信号。',
13
15
  inputSchema: reindexInput.shape,
14
16
  handler: async (input) => {
15
17
  const startedAt = Date.now();
@@ -13,7 +13,9 @@ export const searchByStructureInput = z.object({
13
13
  export function createSearchByStructureTool(repository) {
14
14
  return {
15
15
  name: 'search_by_structure',
16
- description: '通过结构化字段(如 props/params/properties/hooks)搜索代码块,适用于 API 形态的查询。',
16
+ description: '按代码块的结构字段(props/params/hooks)检索,适合已知接口形态时使用。\n' +
17
+ '示例:需要一个接受 value、onChange、error 三个 prop 的输入组件 → fields: ["value", "onChange", "error"], type: "component"\n' +
18
+ '与 search_symbols 配合:先语义检索候选,再用本工具做 API 结构过滤以精确匹配。',
17
19
  inputSchema: searchByStructureInput.shape,
18
20
  handler: async (input) => {
19
21
  const rows = await repository.searchByStructure(input.fields, {
@@ -10,12 +10,16 @@ export const searchSymbolsInput = z.object({
10
10
  semantic: z.boolean().optional().default(false),
11
11
  limit: z.number().int().min(1).max(100).optional().default(20),
12
12
  });
13
- const THREADHOLD_SIMILARITY_FOR_FINAL = 0.6;
13
+ const SCORE_THRESHOLD_FOR_FINAL = 0.45; // 综合排序分阈值(语义相似度占50%权重,原始0.5相似度 ≈ 综合0.35起)
14
14
  const TOP_K_FOR_FINAL_RESULTS = 20; // 结果上限,返回相似度高的,保证数据质量
15
15
  export function createSearchSymbolsTool(repository) {
16
16
  return {
17
17
  name: 'search_symbols',
18
- description: '通过关键词和可选的类型搜索代码块。设置 semantic=true 可进行自然语言/意图式搜索(此功能需要 embedding 服务 + 已索引的向量)。',
18
+ description: '搜索项目中已有的可复用代码块(函数、组件、Hook、类型等)。在生成新代码之前必须先调用本工具,确认是否已有实现。\n' +
19
+ '- 有明确名称时(如 "useDebounce"):semantic=false(默认),直接关键词检索\n' +
20
+ '- 描述功能意图时(如 "防抖"、"处理表单提交"):semantic=true,进行语义检索(需 embedding 服务已就绪)\n' +
21
+ '- 不确定 type 时省略该参数,不要猜测\n' +
22
+ '- 返回结果含 semanticSimilarity 字段:>0.85 高置信度可直接推荐,0.6-0.85 需结合 description 判断,<0.6 说明可能无合适实现',
19
23
  inputSchema: searchSymbolsInput.shape,
20
24
  handler: async (input) => {
21
25
  if (input.semantic) {
@@ -38,8 +42,7 @@ export function createSearchSymbolsTool(repository) {
38
42
  reasonDetail: item.reason,
39
43
  semanticSimilarity: Number((simById.get(item.symbol.id) ?? 0).toFixed(4)),
40
44
  }))
41
- .filter((x) => x.semanticSimilarity >=
42
- THREADHOLD_SIMILARITY_FOR_FINAL) // 阈值过滤,去掉明显不相关的结果
45
+ .filter((x) => x.score >= SCORE_THRESHOLD_FOR_FINAL) // 基于综合排序分过滤,保留 usage/recency 高的结果
43
46
  .slice(0, TOP_K_FOR_FINAL_RESULTS)
44
47
  : hits.map((h) => ({
45
48
  id: h.symbol.id,
@@ -73,7 +76,7 @@ export function createSearchSymbolsTool(repository) {
73
76
  reason: item.reason.summary,
74
77
  reasonDetail: item.reason,
75
78
  }))
76
- .filter((x) => x.score >= THREADHOLD_SIMILARITY_FOR_FINAL) // 阈值过滤,去掉明显不相关的结果
79
+ .filter((x) => x.score >= SCORE_THRESHOLD_FOR_FINAL) // 基于综合排序分过滤
77
80
  .slice(0, TOP_K_FOR_FINAL_RESULTS)
78
81
  : rows.map((r) => ({
79
82
  id: r.id,