@lorrylurui/code-intelligence-mcp 1.2.0 → 2.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -3
- package/dist/cli/ci-index-cli.js +40 -21
- package/dist/cli/ci-index.js +23 -27
- package/dist/cli/embedding-worker-cli.js +4 -4
- package/dist/cli/index-codebase-cli.js +64 -0
- package/dist/config/env.js +53 -77
- package/dist/db/postgres.js +13 -0
- package/dist/db/schema.js +40 -25
- package/dist/indexer/categoryClassifier.js +3 -3
- package/dist/indexer/indexProject.js +1 -1
- package/dist/indexer/persistSymbols.js +47 -26
- package/dist/prompts/reusableCodeAdvisorPrompt.js +19 -9
- package/dist/repositories/symbolRepository.js +52 -119
- package/dist/services/embeddingQueue.js +19 -18
- package/dist/services/reindex.js +92 -48
- package/dist/tools/getSymbolDetail.js +3 -1
- package/dist/tools/incUsage.js +12 -3
- package/dist/tools/reindex.js +3 -1
- package/dist/tools/searchByStructure.js +3 -1
- package/dist/tools/searchSymbols.js +8 -5
- package/dist/workers/embeddingWorker.js +86 -41
- package/package.json +4 -2
|
@@ -1,10 +1,12 @@
|
|
|
1
1
|
import { env } from '../config/env.js';
|
|
2
|
-
import {
|
|
2
|
+
import { getAllTableSQLs } from '../db/schema.js';
|
|
3
|
+
import { SYMBOL_STATUS } from '../config/symbolStatus.js';
|
|
3
4
|
/**
|
|
4
5
|
* 依赖表上 `(path, name)` 唯一键:新行插入,已存在则更新类型/描述/内容与 meta;**不**修改 `usage_count`。
|
|
5
6
|
* @param rows 来自 `indexProject`;空数组时立即返回,不开启事务。
|
|
6
7
|
* @param embeddings 与 `rows` 等长;某项为 `null` 表示本行不更新已有 `embedding`(新行则写入 NULL)。
|
|
7
|
-
*
|
|
8
|
+
* - 有值 → status 置为 online(2)
|
|
9
|
+
* - null → 新行写 pending(1),已有行保持原 status
|
|
8
10
|
*/
|
|
9
11
|
export async function upsertSymbols(pool, rows, embeddings) {
|
|
10
12
|
if (rows.length === 0)
|
|
@@ -12,30 +14,48 @@ export async function upsertSymbols(pool, rows, embeddings) {
|
|
|
12
14
|
if (embeddings && embeddings.length !== rows.length) {
|
|
13
15
|
throw new Error('upsertSymbols: embeddings length must match rows');
|
|
14
16
|
}
|
|
15
|
-
const actor = process.env.GITHUB_USERNAME?.trim() || '
|
|
16
|
-
await pool.
|
|
17
|
-
const sql = `
|
|
18
|
-
INSERT INTO ${env.mysqlSymbolsTable} (name, type, category, path, description, content, meta, insert_user, updated_user, embedding, semantic_hash, file_hash)
|
|
19
|
-
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
|
20
|
-
ON DUPLICATE KEY UPDATE
|
|
21
|
-
type = VALUES(type),
|
|
22
|
-
category = VALUES(category),
|
|
23
|
-
description = VALUES(description),
|
|
24
|
-
content = VALUES(content),
|
|
25
|
-
meta = VALUES(meta),
|
|
26
|
-
updated_user = VALUES(updated_user),
|
|
27
|
-
embedding = CASE WHEN VALUES(embedding) IS NOT NULL THEN VALUES(embedding) ELSE embedding END,
|
|
28
|
-
semantic_hash = VALUES(semantic_hash),
|
|
29
|
-
file_hash = VALUES(file_hash)
|
|
30
|
-
`;
|
|
31
|
-
const conn = await pool.getConnection();
|
|
17
|
+
const actor = process.env.GITHUB_USERNAME?.trim() || 'system';
|
|
18
|
+
const client = await pool.connect();
|
|
32
19
|
try {
|
|
33
|
-
|
|
20
|
+
// 确保 extension + 表 + 基础索引存在
|
|
21
|
+
for (const sql of getAllTableSQLs()) {
|
|
22
|
+
await client.query(sql);
|
|
23
|
+
}
|
|
24
|
+
await client.query('BEGIN');
|
|
25
|
+
const t = env.symbolsTable;
|
|
26
|
+
const sql = `
|
|
27
|
+
INSERT INTO ${t}
|
|
28
|
+
(name, type, category, path, description, content, meta,
|
|
29
|
+
insert_user, updated_user, embedding, semantic_hash, file_hash, status)
|
|
30
|
+
VALUES ($1, $2, $3, $4, $5, $6, $7::jsonb, $8, $9, $10::vector, $11, $12, $13)
|
|
31
|
+
ON CONFLICT (path, name) DO UPDATE SET
|
|
32
|
+
type = EXCLUDED.type,
|
|
33
|
+
category = EXCLUDED.category,
|
|
34
|
+
description = EXCLUDED.description,
|
|
35
|
+
content = EXCLUDED.content,
|
|
36
|
+
meta = EXCLUDED.meta,
|
|
37
|
+
updated_user = EXCLUDED.updated_user,
|
|
38
|
+
embedding = CASE
|
|
39
|
+
WHEN EXCLUDED.embedding IS NOT NULL THEN EXCLUDED.embedding
|
|
40
|
+
WHEN EXCLUDED.semantic_hash != ${t}.semantic_hash THEN NULL
|
|
41
|
+
ELSE ${t}.embedding
|
|
42
|
+
END,
|
|
43
|
+
semantic_hash = EXCLUDED.semantic_hash,
|
|
44
|
+
file_hash = EXCLUDED.file_hash,
|
|
45
|
+
status = CASE
|
|
46
|
+
WHEN EXCLUDED.embedding IS NOT NULL THEN ${SYMBOL_STATUS.ONLINE}
|
|
47
|
+
WHEN EXCLUDED.semantic_hash != ${t}.semantic_hash THEN ${SYMBOL_STATUS.PENDING}
|
|
48
|
+
ELSE ${t}.status
|
|
49
|
+
END,
|
|
50
|
+
updated_at = NOW()
|
|
51
|
+
`;
|
|
34
52
|
for (let i = 0; i < rows.length; i++) {
|
|
35
53
|
const r = rows[i];
|
|
36
54
|
const emb = embeddings?.[i];
|
|
37
|
-
|
|
38
|
-
|
|
55
|
+
// pgvector 接受 "[x1,x2,...]" 格式字符串
|
|
56
|
+
const vecStr = emb != null ? `[${emb.join(',')}]` : null;
|
|
57
|
+
const statusVal = vecStr !== null ? SYMBOL_STATUS.ONLINE : SYMBOL_STATUS.PENDING;
|
|
58
|
+
await client.query(sql, [
|
|
39
59
|
r.name,
|
|
40
60
|
r.type,
|
|
41
61
|
r.category,
|
|
@@ -45,18 +65,19 @@ export async function upsertSymbols(pool, rows, embeddings) {
|
|
|
45
65
|
JSON.stringify(r.meta),
|
|
46
66
|
actor,
|
|
47
67
|
actor,
|
|
48
|
-
|
|
68
|
+
vecStr, // $10 → cast as vector, null 时写 NULL
|
|
49
69
|
r.semantic_hash,
|
|
50
70
|
r.file_hash,
|
|
71
|
+
statusVal,
|
|
51
72
|
]);
|
|
52
73
|
}
|
|
53
|
-
await
|
|
74
|
+
await client.query('COMMIT');
|
|
54
75
|
}
|
|
55
76
|
catch (e) {
|
|
56
|
-
await
|
|
77
|
+
await client.query('ROLLBACK');
|
|
57
78
|
throw e;
|
|
58
79
|
}
|
|
59
80
|
finally {
|
|
60
|
-
|
|
81
|
+
client.release();
|
|
61
82
|
}
|
|
62
83
|
}
|
|
@@ -7,7 +7,7 @@ const REUSABLE_CODE_ADVISOR_MARKDOWN = `# 可复用代码推荐
|
|
|
7
7
|
|
|
8
8
|
当用户需要可复用代码或实现类需求时,按顺序执行:
|
|
9
9
|
|
|
10
|
-
1. 调用 search_symbols 检索候选,type 根据用户需求传(component/
|
|
10
|
+
1. 调用 search_symbols 检索候选,type 根据用户需求传(component/function/hook/class/type/interface);描述功能意图时设置 semantic=true
|
|
11
11
|
2. 如果用户指定了结构过滤条件(props/params/properties/hooks),额外调用 search_by_structure 做结构匹配
|
|
12
12
|
3. 先 search_symbols(limit=20) 拉候选,再对 Top 3 调用 get_symbol_detail 做深度判断
|
|
13
13
|
4. 若仅凭签名/摘要无法判断,对最相关的若干候选调用 get_symbol_detail 获取详情
|
|
@@ -16,9 +16,25 @@ const REUSABLE_CODE_ADVISOR_MARKDOWN = `# 可复用代码推荐
|
|
|
16
16
|
- **API 是否简单**、入参是否合适
|
|
17
17
|
- **依赖与副作用**风险
|
|
18
18
|
- **复用安全性**(稳定性、耦合度、是否便于扩展)
|
|
19
|
-
6. 给出**唯一首选**推荐,并说明理由,同时使用 **AskUserQuestion
|
|
19
|
+
6. 给出**唯一首选**推荐,并说明理由,同时使用 **AskUserQuestion** 工具,提供两个选项:
|
|
20
20
|
- 采纳推荐
|
|
21
21
|
- 取消
|
|
22
|
+
7. 用户选择"采纳推荐"后,立即调用 inc_usage 工具记录该行为(symbolId 从搜索结果的 id 字段获取),不要遗漏此步骤。
|
|
23
|
+
|
|
24
|
+
## 不适用场景
|
|
25
|
+
|
|
26
|
+
以下情况不要调用搜索工具:
|
|
27
|
+
- 用户只是问代码如何写(概念性问题),不需要检索已有实现
|
|
28
|
+
- 用户明确说"新建一个"、"自己实现"、"不用已有的"
|
|
29
|
+
- 查询过于通用(如只说"utils"),先与用户确认具体需求再搜索
|
|
30
|
+
|
|
31
|
+
## 搜索结果判断
|
|
32
|
+
|
|
33
|
+
根据 semanticSimilarity 决定推荐置信度:
|
|
34
|
+
- **> 0.85**:高置信度,可直接推荐
|
|
35
|
+
- **0.6 – 0.85**:中等置信度,需结合 description 和 get_symbol_detail 综合判断
|
|
36
|
+
- **< 0.6**:低置信度,说明可能无合适实现,明确告知用户
|
|
37
|
+
- **空结果**:明确说"未找到已有实现",不要凭空推荐
|
|
22
38
|
|
|
23
39
|
## 回复结构
|
|
24
40
|
|
|
@@ -30,7 +46,7 @@ const REUSABLE_CODE_ADVISOR_MARKDOWN = `# 可复用代码推荐
|
|
|
30
46
|
- **理由:** 1~3 条要点
|
|
31
47
|
- **其他候选:** 简要列出及取舍(同步标注副作用)
|
|
32
48
|
- **用法提示:** 结合用户场景的最小集成说明
|
|
33
|
-
- **是否采纳:**
|
|
49
|
+
- **是否采纳:** 展示两个选项:选项1. 采纳推荐 选项2. 取消。等待用户确认
|
|
34
50
|
|
|
35
51
|
## 约束
|
|
36
52
|
|
|
@@ -38,12 +54,6 @@ const REUSABLE_CODE_ADVISOR_MARKDOWN = `# 可复用代码推荐
|
|
|
38
54
|
- 若无合适代码块,明确说明,并给出最接近的选项及差距。
|
|
39
55
|
- 推理简洁,面向落地实现。
|
|
40
56
|
|
|
41
|
-
## 使用反馈
|
|
42
|
-
|
|
43
|
-
当选择‘采纳推荐’必须调用 inc_usage 工具记录采纳行为,调用格式如下:
|
|
44
|
-
“inc_usage({ symbolId: <选中的代码块 id> })”
|
|
45
|
-
其中 symbolId 从 search_symbols 或 search_by_structure 返回结果的 id 字段获取。这条记录会用于后续排序优化。
|
|
46
|
-
|
|
47
57
|
## 更多示例
|
|
48
58
|
|
|
49
59
|
与仓库内 \`.cursor/skills/reusable-code-advisor/examples.md\` 中的示例一致(在 Cursor 或本地打开该文件查看)。
|
|
@@ -1,10 +1,9 @@
|
|
|
1
1
|
import { env } from '../config/env.js';
|
|
2
|
-
import {
|
|
2
|
+
import { getPool } from '../db/postgres.js';
|
|
3
3
|
import { createEmbeddingClient } from '../services/embeddingClient.js';
|
|
4
|
-
import { cosineSimilarity } from '../services/vectorMath.js';
|
|
5
4
|
import { SEARCHABLE_STATUS } from '../config/symbolStatus.js';
|
|
6
|
-
const
|
|
7
|
-
const
|
|
5
|
+
const SIMILARITY_THRESHOLD = 0.5;
|
|
6
|
+
const TOP_K = 20;
|
|
8
7
|
const inMemorySymbols = [
|
|
9
8
|
{
|
|
10
9
|
id: 1,
|
|
@@ -42,6 +41,7 @@ function parseEmbedding(raw) {
|
|
|
42
41
|
}
|
|
43
42
|
if (typeof raw === 'string') {
|
|
44
43
|
try {
|
|
44
|
+
// pgvector 返回 "[x1,x2,...]",恰好是合法 JSON 数组
|
|
45
45
|
const j = JSON.parse(raw);
|
|
46
46
|
if (!Array.isArray(j))
|
|
47
47
|
return null;
|
|
@@ -85,7 +85,7 @@ function getMetaArray(meta, key) {
|
|
|
85
85
|
export class SymbolRepository {
|
|
86
86
|
pool;
|
|
87
87
|
constructor() {
|
|
88
|
-
this.pool =
|
|
88
|
+
this.pool = getPool();
|
|
89
89
|
}
|
|
90
90
|
async search(query, type) {
|
|
91
91
|
if (!this.pool) {
|
|
@@ -97,31 +97,28 @@ export class SymbolRepository {
|
|
|
97
97
|
(s.description ?? '').toLowerCase().includes(q)));
|
|
98
98
|
});
|
|
99
99
|
}
|
|
100
|
-
const params = [
|
|
100
|
+
const params = [
|
|
101
|
+
`%${query}%`,
|
|
102
|
+
SEARCHABLE_STATUS,
|
|
103
|
+
];
|
|
101
104
|
let sql = `
|
|
102
|
-
SELECT id, name, type, category, path, description, content,
|
|
103
|
-
FROM ${env.
|
|
104
|
-
WHERE (name
|
|
105
|
-
AND status = $
|
|
105
|
+
SELECT id, name, type, category, path, description, content, meta::text AS meta, usage_count, created_at
|
|
106
|
+
FROM ${env.symbolsTable}
|
|
107
|
+
WHERE (name ILIKE $1 OR description ILIKE $1)
|
|
108
|
+
AND status = $2
|
|
106
109
|
`;
|
|
107
|
-
params.push(`%${query}%`);
|
|
108
110
|
if (type) {
|
|
109
|
-
sql += ' AND type = ?';
|
|
110
111
|
params.push(type);
|
|
112
|
+
sql += ` AND type = $${params.length}`;
|
|
111
113
|
}
|
|
112
114
|
sql += ' ORDER BY usage_count DESC LIMIT 20';
|
|
113
|
-
const
|
|
115
|
+
const { rows } = await this.pool.query(sql, params);
|
|
114
116
|
return rows.map((r) => mapRow(r));
|
|
115
117
|
}
|
|
116
118
|
/**
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
* - 第一层:按 category 占比计算每个分类应采样条数(保底10条)
|
|
121
|
-
* - 第二层:每个 path 子桶内乱序后采样 Math.max(5,
|
|
122
|
-
floor(catLimit / pathCount)) 条
|
|
123
|
-
* 最终选择topK,进入排序
|
|
124
|
-
*/
|
|
119
|
+
* 语义向量检索:将 query 嵌入后用 pgvector <=> 运算符(cosine distance)在数据库内完成相似度排序。
|
|
120
|
+
* 不再需要在 Node 拉取全量向量做内存计算。
|
|
121
|
+
*/
|
|
125
122
|
async searchSemanticHits(query, opts) {
|
|
126
123
|
if (!env.embeddingServiceUrl) {
|
|
127
124
|
throw new Error('语义检索需配置 EMBEDDING_SERVICE_URL 并启动嵌入服务');
|
|
@@ -129,111 +126,47 @@ export class SymbolRepository {
|
|
|
129
126
|
if (!this.pool) {
|
|
130
127
|
return [];
|
|
131
128
|
}
|
|
132
|
-
const
|
|
133
|
-
const limit = opts?.limit ?? TOP_K_FOR_RANKING;
|
|
134
|
-
const type = opts?.type;
|
|
129
|
+
const limit = opts?.limit ?? TOP_K;
|
|
135
130
|
const client = createEmbeddingClient(env.embeddingServiceUrl);
|
|
136
131
|
const [queryVec] = await client.embed([query.trim()]);
|
|
137
132
|
if (!queryVec?.length) {
|
|
138
133
|
throw new Error('查询向量为空');
|
|
139
134
|
}
|
|
140
|
-
//
|
|
141
|
-
const
|
|
135
|
+
// pgvector 向量字面量格式:[x1,x2,...]
|
|
136
|
+
const vecLiteral = `[${queryVec.join(',')}]`;
|
|
137
|
+
const params = [vecLiteral, SEARCHABLE_STATUS];
|
|
138
|
+
// 1 - cosine_distance = cosine_similarity;多取一倍候选后在应用层过阈值
|
|
142
139
|
let sql = `
|
|
143
|
-
SELECT id, name, type, category, path, description, content,
|
|
144
|
-
|
|
140
|
+
SELECT id, name, type, category, path, description, content, meta::text AS meta,
|
|
141
|
+
usage_count, created_at,
|
|
142
|
+
1 - (embedding <=> $1::vector) AS similarity
|
|
143
|
+
FROM ${env.symbolsTable}
|
|
145
144
|
WHERE embedding IS NOT NULL
|
|
146
|
-
AND status = $
|
|
145
|
+
AND status = $2
|
|
147
146
|
`;
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
sql +=
|
|
151
|
-
params.push(type);
|
|
147
|
+
if (opts?.type) {
|
|
148
|
+
params.push(opts.type);
|
|
149
|
+
sql += ` AND type = $${params.length}`;
|
|
152
150
|
}
|
|
153
|
-
|
|
154
|
-
params.
|
|
155
|
-
const
|
|
156
|
-
|
|
157
|
-
.map((r) =>
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
.map((s) => {
|
|
163
|
-
const sim = cosineSimilarity(queryVec, s.embedding);
|
|
164
|
-
const { embedding: _, ...rest } = s;
|
|
165
|
-
return { symbol: rest, similarity: sim };
|
|
166
|
-
})
|
|
167
|
-
.filter((x) => x.similarity >= THREADHOLD_SIMILARITY_BEFORE_RANKED) // 初筛阈值,过滤掉明显不相关的结果
|
|
168
|
-
.sort((a, b) => b.similarity - a.similarity)
|
|
151
|
+
params.push(limit * 2); // 多取一倍以便 SIMILARITY_THRESHOLD 过滤后仍有足量结果
|
|
152
|
+
sql += ` ORDER BY embedding <=> $1::vector LIMIT $${params.length}`;
|
|
153
|
+
const { rows } = await this.pool.query(sql, params);
|
|
154
|
+
return rows
|
|
155
|
+
.map((r) => ({
|
|
156
|
+
symbol: mapRow(r),
|
|
157
|
+
similarity: Number(r.similarity),
|
|
158
|
+
}))
|
|
159
|
+
.filter((x) => x.similarity >= SIMILARITY_THRESHOLD)
|
|
169
160
|
.slice(0, limit);
|
|
170
161
|
}
|
|
171
|
-
/**
|
|
172
|
-
* 分桶采样核心逻辑
|
|
173
|
-
* - 第一层:按 category 占比计算每个分类应采样条数(保底10条)
|
|
174
|
-
* - 第二层:每个 path 子桶内乱序后采样 Math.max(5,
|
|
175
|
-
floor(catLimit / pathCount)) 条
|
|
176
|
-
*/
|
|
177
|
-
bucketSampling(symbols, limit) {
|
|
178
|
-
if (symbols.length === 0)
|
|
179
|
-
return [];
|
|
180
|
-
// 按 category 分组
|
|
181
|
-
const categoryGroups = new Map();
|
|
182
|
-
for (const s of symbols) {
|
|
183
|
-
const cat = s.category ?? '__null__';
|
|
184
|
-
if (!categoryGroups.has(cat)) {
|
|
185
|
-
categoryGroups.set(cat, []);
|
|
186
|
-
}
|
|
187
|
-
categoryGroups.get(cat).push(s);
|
|
188
|
-
}
|
|
189
|
-
const total = symbols.length;
|
|
190
|
-
const sampled = [];
|
|
191
|
-
// 第一层:按 category 占比计算采样数,保底10条
|
|
192
|
-
for (const [, catSymbols] of categoryGroups) {
|
|
193
|
-
const catCount = catSymbols.length;
|
|
194
|
-
const catRatio = catCount / total;
|
|
195
|
-
const catLimit = Math.max(10, Math.floor(limit * catRatio));
|
|
196
|
-
// 按 path 分组(提取目录部分)
|
|
197
|
-
const pathGroups = new Map();
|
|
198
|
-
for (const s of catSymbols) {
|
|
199
|
-
const dir = s.path.includes('/')
|
|
200
|
-
? s.path.slice(0, s.path.lastIndexOf('/'))
|
|
201
|
-
: '__root__';
|
|
202
|
-
if (!pathGroups.has(dir)) {
|
|
203
|
-
pathGroups.set(dir, []);
|
|
204
|
-
}
|
|
205
|
-
pathGroups.get(dir).push(s);
|
|
206
|
-
}
|
|
207
|
-
const pathCount = pathGroups.size;
|
|
208
|
-
const perPathSample = Math.max(5, Math.floor(catLimit / pathCount));
|
|
209
|
-
// 第二层:每个 path 子桶内乱序后采样
|
|
210
|
-
for (const pathSymbols of pathGroups.values()) {
|
|
211
|
-
// 原地乱序(Fisher-Yates)
|
|
212
|
-
for (let i = pathSymbols.length - 1; i > 0; i--) {
|
|
213
|
-
const j = Math.floor(Math.random() * (i + 1));
|
|
214
|
-
[pathSymbols[i], pathSymbols[j]] = [
|
|
215
|
-
pathSymbols[j],
|
|
216
|
-
pathSymbols[i],
|
|
217
|
-
];
|
|
218
|
-
}
|
|
219
|
-
const pathSampleCount = Math.min(perPathSample, pathSymbols.length);
|
|
220
|
-
sampled.push(...pathSymbols.slice(0, pathSampleCount));
|
|
221
|
-
if (sampled.length >= limit)
|
|
222
|
-
break;
|
|
223
|
-
}
|
|
224
|
-
if (sampled.length >= limit)
|
|
225
|
-
break;
|
|
226
|
-
}
|
|
227
|
-
return sampled.slice(0, limit);
|
|
228
|
-
}
|
|
229
162
|
async getByName(name) {
|
|
230
163
|
if (!this.pool) {
|
|
231
164
|
return (inMemorySymbols.find((s) => s.name.toLowerCase() === name.toLowerCase()) ?? null);
|
|
232
165
|
}
|
|
233
|
-
const
|
|
234
|
-
SELECT id, name, type, category, path, description, content,
|
|
235
|
-
FROM ${env.
|
|
236
|
-
WHERE name =
|
|
166
|
+
const { rows } = await this.pool.query(`
|
|
167
|
+
SELECT id, name, type, category, path, description, content, meta::text AS meta, usage_count, created_at
|
|
168
|
+
FROM ${env.symbolsTable}
|
|
169
|
+
WHERE name = $1
|
|
237
170
|
LIMIT 1
|
|
238
171
|
`, [name]);
|
|
239
172
|
if (rows.length === 0) {
|
|
@@ -254,8 +187,8 @@ export class SymbolRepository {
|
|
|
254
187
|
}
|
|
255
188
|
return false;
|
|
256
189
|
}
|
|
257
|
-
const
|
|
258
|
-
return result.
|
|
190
|
+
const result = await this.pool.query(`UPDATE ${env.symbolsTable} SET usage_count = usage_count + 1 WHERE id = $1`, [symbolId]);
|
|
191
|
+
return result.rowCount !== null && result.rowCount > 0;
|
|
259
192
|
}
|
|
260
193
|
async searchByStructure(fields, opts) {
|
|
261
194
|
const normalized = fields.map((f) => f.trim()).filter(Boolean);
|
|
@@ -286,21 +219,21 @@ export class SymbolRepository {
|
|
|
286
219
|
}
|
|
287
220
|
const params = [];
|
|
288
221
|
let sql = `
|
|
289
|
-
SELECT id, name, type, category, path, description, content,
|
|
290
|
-
FROM ${env.
|
|
222
|
+
SELECT id, name, type, category, path, description, content, meta::text AS meta, usage_count, created_at
|
|
223
|
+
FROM ${env.symbolsTable}
|
|
291
224
|
WHERE 1 = 1
|
|
292
225
|
`;
|
|
293
226
|
if (type) {
|
|
294
|
-
sql += ' AND type = ?';
|
|
295
227
|
params.push(type);
|
|
228
|
+
sql += ` AND type = $${params.length}`;
|
|
296
229
|
}
|
|
297
230
|
if (category) {
|
|
298
|
-
sql += ' AND category LIKE ?';
|
|
299
231
|
params.push(`%${category}%`);
|
|
232
|
+
sql += ` AND category ILIKE $${params.length}`;
|
|
300
233
|
}
|
|
301
|
-
sql += ' ORDER BY usage_count DESC LIMIT ?';
|
|
302
234
|
params.push(Math.max(limit * 5, 50));
|
|
303
|
-
|
|
235
|
+
sql += ` ORDER BY usage_count DESC LIMIT $${params.length}`;
|
|
236
|
+
const { rows } = await this.pool.query(sql, params);
|
|
304
237
|
return rows
|
|
305
238
|
.map((r) => mapRow(r))
|
|
306
239
|
.filter(matchesAll)
|
|
@@ -1,13 +1,16 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* BullMQ embedding 队列 producer。
|
|
3
3
|
*
|
|
4
|
-
*
|
|
5
|
-
* -
|
|
6
|
-
* - CI
|
|
7
|
-
* -
|
|
4
|
+
* 去重策略:
|
|
5
|
+
* - 同一 CI run 内:ci-index.ts 用 new Set(hashes) 去重后再入队,Redis 层无需 jobId 去重
|
|
6
|
+
* - 跨 CI run 的向量缓存:由 worker 查询 DB(status=online AND semantic_hash=?)决定是否调 API
|
|
7
|
+
* - 不使用 jobId,避免 BullMQ completed 状态残留导致后续 run 任务被跳过
|
|
8
|
+
*
|
|
9
|
+
* CI 流程只负责 enqueue,worker 异步消费,CI 不阻塞。
|
|
10
|
+
* 调用方在进程退出前需调用 closeEmbeddingQueue() 释放连接。
|
|
8
11
|
*/
|
|
9
12
|
import { Queue } from 'bullmq';
|
|
10
|
-
import Redis from 'ioredis';
|
|
13
|
+
import { Redis } from 'ioredis';
|
|
11
14
|
import { env } from '../config/env.js';
|
|
12
15
|
let _queue = null;
|
|
13
16
|
let _connection = null;
|
|
@@ -21,31 +24,29 @@ function getQueue() {
|
|
|
21
24
|
}
|
|
22
25
|
return _queue;
|
|
23
26
|
}
|
|
24
|
-
/**
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
*/
|
|
28
|
-
export async function enqueueEmbedding(semanticHash) {
|
|
29
|
-
await getQueue().add('embed', { semanticHash }, {
|
|
30
|
-
jobId: semanticHash, // 去重键:相同 hash 幂等
|
|
27
|
+
/** 单个 semanticHash 入队 */
|
|
28
|
+
export async function enqueueEmbedding(semanticHash, symbolsTable) {
|
|
29
|
+
await getQueue().add('embed', { semanticHash, symbolsTable: symbolsTable ?? env.symbolsTable }, {
|
|
31
30
|
attempts: 5,
|
|
32
31
|
backoff: { type: 'exponential', delay: 5_000 },
|
|
33
32
|
});
|
|
34
33
|
}
|
|
35
|
-
/**
|
|
36
|
-
|
|
34
|
+
/**
|
|
35
|
+
* 批量入队(同一 CI run 内已由调用方 new Set 去重)。
|
|
36
|
+
* worker 消费时查 DB 决定是否真正调 embedding API。
|
|
37
|
+
*/
|
|
38
|
+
export async function enqueueEmbeddingBatch(semanticHashes, symbolsTable) {
|
|
39
|
+
const table = symbolsTable ?? env.symbolsTable;
|
|
37
40
|
const queue = getQueue();
|
|
38
41
|
const jobs = semanticHashes.map((hash) => ({
|
|
39
42
|
name: 'embed',
|
|
40
|
-
data: { semanticHash: hash },
|
|
43
|
+
data: { semanticHash: hash, symbolsTable: table },
|
|
41
44
|
opts: {
|
|
42
|
-
jobId: hash,
|
|
43
45
|
attempts: 5,
|
|
44
46
|
backoff: { type: 'exponential', delay: 5_000 },
|
|
45
47
|
},
|
|
46
48
|
}));
|
|
47
|
-
|
|
48
|
-
await queue.addBulkJobs(jobs);
|
|
49
|
+
await queue.addBulk(jobs);
|
|
49
50
|
}
|
|
50
51
|
/** 进程退出前关闭连接(CI 脚本必须调用,否则进程挂起) */
|
|
51
52
|
export async function closeEmbeddingQueue() {
|
package/dist/services/reindex.js
CHANGED
|
@@ -1,64 +1,108 @@
|
|
|
1
|
-
import { resolve } from 'node:path';
|
|
2
|
-
import {
|
|
3
|
-
import
|
|
4
|
-
import {
|
|
5
|
-
import {
|
|
1
|
+
import { resolve, join } from 'node:path';
|
|
2
|
+
import { readFileSync } from 'node:fs';
|
|
3
|
+
import fg from 'fast-glob';
|
|
4
|
+
import { env } from '../config/env.js';
|
|
5
|
+
import { getPool } from '../db/postgres.js';
|
|
6
|
+
import { getAllTableSQLs } from '../db/schema.js';
|
|
7
|
+
import { indexProject, DEFAULT_IGNORE } from '../indexer/indexProject.js';
|
|
6
8
|
import { upsertSymbols } from '../indexer/persistSymbols.js';
|
|
7
|
-
import {
|
|
8
|
-
import {
|
|
9
|
+
import { computeFileHash } from '../indexer/tsAstNormalizer.js';
|
|
10
|
+
import { getRelativePathForDisplay } from '../indexer/heuristics.js';
|
|
11
|
+
import { enqueueEmbeddingBatch, closeEmbeddingQueue, } from '../services/embeddingQueue.js';
|
|
12
|
+
import { SYMBOL_STATUS } from '../config/symbolStatus.js';
|
|
9
13
|
export async function runReindex(options = {}) {
|
|
10
14
|
const projectRoot = resolve(options.projectRoot ?? process.cwd());
|
|
11
|
-
const { dryRun = false } = options;
|
|
12
|
-
|
|
13
|
-
loadProjectDotenv(projectRoot);
|
|
14
|
-
// 2️⃣ 打印生效的环境变量(便于调试)
|
|
15
|
-
console.error(`[reindex] projectRoot=${projectRoot}, dryRun=${dryRun}, ` +
|
|
16
|
-
`MYSQL_HOST=${process.env.MYSQL_HOST}`);
|
|
17
|
-
// 3️⃣ 只有需要写入数据库时才检查 MySQL 并建立连接
|
|
18
|
-
const embeddingServiceUrl = process.env.EMBEDDING_SERVICE_URL;
|
|
19
|
-
if (!dryRun && embeddingServiceUrl) {
|
|
20
|
-
// 初始化 category embeddings
|
|
21
|
-
await initCategoryEmbeddings();
|
|
22
|
-
}
|
|
15
|
+
const { dryRun = false, forceRebuild = false } = options;
|
|
16
|
+
console.error(`[reindex] projectRoot=${projectRoot}, dryRun=${dryRun}, forceRebuild=${forceRebuild}, PG_URL=${process.env.PG_URL ? '(set)' : '(not set)'}, SYMBOLS_TABLE=${env.symbolsTable}`);
|
|
23
17
|
let pool = null;
|
|
24
18
|
if (!dryRun) {
|
|
25
|
-
pool =
|
|
26
|
-
await pool.query('SELECT 1');
|
|
27
|
-
console.error('[reindex]
|
|
19
|
+
pool = getPool();
|
|
20
|
+
await pool.query('SELECT 1');
|
|
21
|
+
console.error('[reindex] PostgreSQL connection successful');
|
|
22
|
+
// 确保 extension + table + indexes 存在(幂等,多租户表名安全)
|
|
23
|
+
for (const sql of getAllTableSQLs()) {
|
|
24
|
+
await pool.query(sql);
|
|
25
|
+
}
|
|
26
|
+
console.error(`[reindex] schema ready: ${env.symbolsTable}`);
|
|
27
|
+
}
|
|
28
|
+
// ─── 1. glob 解析出全量文件列表(绝对路径)──────────────────────────
|
|
29
|
+
const ignore = [...DEFAULT_IGNORE, ...(options.ignore ?? [])];
|
|
30
|
+
const patterns = (options.globPatterns ?? ['src/**/*.{ts,tsx}']).map((p) => p.startsWith('/') ? p : join(projectRoot, p).replace(/\\/g, '/'));
|
|
31
|
+
const allFiles = await fg(patterns, {
|
|
32
|
+
absolute: true,
|
|
33
|
+
ignore,
|
|
34
|
+
onlyFiles: true,
|
|
35
|
+
dot: false,
|
|
36
|
+
});
|
|
37
|
+
console.error(`[reindex] glob found ${allFiles.length} file(s)`);
|
|
38
|
+
// ─── 2. file_hash 过滤:跳过 AST 未变的文件(CPU 优化)────────────────
|
|
39
|
+
// forceRebuild 时跳过此过滤,file_hash 不可复用(模板/模型变更时相同文件产出不同 content)
|
|
40
|
+
let filesToIndex = allFiles;
|
|
41
|
+
let skippedFiles = 0;
|
|
42
|
+
if (!forceRebuild && pool && allFiles.length > 0) {
|
|
43
|
+
// 计算所有文件当前 hash
|
|
44
|
+
const currentFileHashes = new Map(); // relPath → hash
|
|
45
|
+
for (const absPath of allFiles) {
|
|
46
|
+
const content = readFileSync(absPath, 'utf-8');
|
|
47
|
+
const relPath = getRelativePathForDisplay(projectRoot, absPath);
|
|
48
|
+
currentFileHashes.set(relPath, computeFileHash(content));
|
|
49
|
+
}
|
|
50
|
+
// 一次性批量查 DB 已有的 file_hash
|
|
51
|
+
const relPaths = [...currentFileHashes.keys()];
|
|
52
|
+
const { rows: dbRows } = await pool.query(`SELECT DISTINCT path, file_hash FROM ${env.symbolsTable}
|
|
53
|
+
WHERE path = ANY($1) AND file_hash IS NOT NULL`, [relPaths]);
|
|
54
|
+
const dbFileHash = new Map(dbRows.map((r) => [r.path, r.file_hash]));
|
|
55
|
+
filesToIndex = allFiles.filter((absPath) => {
|
|
56
|
+
const relPath = getRelativePathForDisplay(projectRoot, absPath);
|
|
57
|
+
return currentFileHashes.get(relPath) !== dbFileHash.get(relPath);
|
|
58
|
+
});
|
|
59
|
+
skippedFiles = allFiles.length - filesToIndex.length;
|
|
60
|
+
console.error(`[reindex] file_hash: ${skippedFiles} unchanged (skipped), ${filesToIndex.length} changed (to parse)`);
|
|
28
61
|
}
|
|
29
|
-
|
|
62
|
+
else if (forceRebuild) {
|
|
63
|
+
console.error(`[reindex] forceRebuild=true, skipping file_hash filter — parsing all ${allFiles.length} file(s)`);
|
|
64
|
+
}
|
|
65
|
+
if (filesToIndex.length === 0) {
|
|
66
|
+
console.error('[reindex] all files unchanged, nothing to do');
|
|
67
|
+
return {
|
|
68
|
+
projectRoot,
|
|
69
|
+
extractedCount: 0,
|
|
70
|
+
skippedFiles,
|
|
71
|
+
enqueuedCount: 0,
|
|
72
|
+
upserted: false,
|
|
73
|
+
};
|
|
74
|
+
}
|
|
75
|
+
// ─── 3. 只对变更文件做 AST 解析 ──────────────────────────────────
|
|
76
|
+
const rows = await indexProject({
|
|
30
77
|
projectRoot,
|
|
31
|
-
globPatterns:
|
|
32
|
-
ignore: options.ignore,
|
|
78
|
+
globPatterns: filesToIndex,
|
|
33
79
|
});
|
|
34
|
-
console.error(`[reindex] extracted ${rows.length} symbol(s) from ${
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
embeddingsComputed = true;
|
|
80
|
+
console.error(`[reindex] extracted ${rows.length} symbol(s) from ${filesToIndex.length} changed file(s)`);
|
|
81
|
+
// ─── 4. 写库(全部 pending)→ 入队,worker 异步处理 embedding + category ──
|
|
82
|
+
const nullPayload = rows.map(() => null);
|
|
83
|
+
const pendingHashes = [
|
|
84
|
+
...new Set(rows.map((r) => r.semantic_hash).filter(Boolean)),
|
|
85
|
+
];
|
|
86
|
+
if (!dryRun) {
|
|
87
|
+
// forceRebuild:先清空 DB 中已有的 embedding,使 worker cache check 必然 miss
|
|
88
|
+
if (forceRebuild && pendingHashes.length > 0) {
|
|
89
|
+
await pool.query(`UPDATE ${env.symbolsTable}
|
|
90
|
+
SET embedding = NULL, status = $1
|
|
91
|
+
WHERE semantic_hash = ANY($2)`, [SYMBOL_STATUS.PENDING, pendingHashes]);
|
|
92
|
+
console.error(`[reindex] forceRebuild: cleared embeddings for ${pendingHashes.length} semantic_hash(es)`);
|
|
48
93
|
}
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
94
|
+
await upsertSymbols(pool, rows, nullPayload);
|
|
95
|
+
if (pendingHashes.length > 0) {
|
|
96
|
+
await enqueueEmbeddingBatch(pendingHashes, env.symbolsTable);
|
|
97
|
+
console.error(`[reindex] enqueued ${pendingHashes.length} semantic_hash(es) → worker will handle embedding asynchronously`);
|
|
52
98
|
}
|
|
99
|
+
await closeEmbeddingQueue();
|
|
53
100
|
}
|
|
54
|
-
if (!options.dryRun) {
|
|
55
|
-
await upsertSymbols(pool, rows, embeddingPayload);
|
|
56
|
-
}
|
|
57
|
-
console.error('===out', JSON.stringify(rows));
|
|
58
101
|
return {
|
|
59
102
|
projectRoot,
|
|
60
103
|
extractedCount: rows.length,
|
|
61
|
-
|
|
62
|
-
|
|
104
|
+
skippedFiles,
|
|
105
|
+
enqueuedCount: pendingHashes.length,
|
|
106
|
+
upserted: !dryRun,
|
|
63
107
|
};
|
|
64
108
|
}
|
|
@@ -5,7 +5,9 @@ export const getSymbolDetailInput = z.object({
|
|
|
5
5
|
export function createGetSymbolDetailTool(repository) {
|
|
6
6
|
return {
|
|
7
7
|
name: 'get_symbol_detail',
|
|
8
|
-
description: '
|
|
8
|
+
description: '获取单个代码块的完整详情(含源码、参数类型、调用关系、副作用)。\n' +
|
|
9
|
+
'仅在以下情况调用:search_symbols 返回的摘要信息不足以判断是否适用(如签名模糊、副作用不明确)。\n' +
|
|
10
|
+
'通常对 top 1-3 候选调用,不要对所有结果批量调用。',
|
|
9
11
|
inputSchema: getSymbolDetailInput.shape,
|
|
10
12
|
handler: async (input) => {
|
|
11
13
|
const symbol = await repository.getByName(input.name);
|