@lorrylurui/code-intelligence-mcp 1.1.15 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +74 -3
- package/dist/cli/ci-index-cli.js +85 -0
- package/dist/cli/ci-index.js +76 -0
- package/dist/cli/detect-duplicates.js +1 -6
- package/dist/cli/embedding-worker-cli.js +35 -0
- package/dist/cli/index-codebase-cli.js +64 -0
- package/dist/cli/index-codebase.js +5 -4
- package/dist/config/env.js +53 -81
- package/dist/config/symbolStatus.js +8 -0
- package/dist/db/mysql.js +3 -6
- package/dist/db/postgres.js +13 -0
- package/dist/db/schema.js +41 -19
- package/dist/indexer/astNormalizer.js +201 -0
- package/dist/indexer/babelParser.js +40 -15
- package/dist/indexer/categoryClassifier.js +129 -0
- package/dist/indexer/embedText.js +9 -7
- package/dist/indexer/heuristics.js +42 -23
- package/dist/indexer/indexProject.js +146 -56
- package/dist/indexer/jsAstNormalizer.js +201 -0
- package/dist/indexer/persistSymbols.js +49 -24
- package/dist/indexer/tsAstNormalizer.js +363 -0
- package/dist/prompts/reusableCodeAdvisorPrompt.js +21 -8
- package/dist/repositories/symbolRepository.js +53 -46
- package/dist/services/embeddingQueue.js +57 -0
- package/dist/services/reindex.js +90 -43
- package/dist/tools/getSymbolDetail.js +3 -1
- package/dist/tools/incUsage.js +12 -3
- package/dist/tools/reindex.js +3 -1
- package/dist/tools/searchByStructure.js +6 -2
- package/dist/tools/searchSymbols.js +18 -4
- package/dist/workers/embeddingWorker.js +145 -0
- package/package.json +10 -5
package/dist/services/reindex.js
CHANGED
|
@@ -1,61 +1,108 @@
|
|
|
1
|
-
import { resolve } from 'node:path';
|
|
2
|
-
import {
|
|
3
|
-
import
|
|
4
|
-
import {
|
|
5
|
-
import {
|
|
1
|
+
import { resolve, join } from 'node:path';
|
|
2
|
+
import { readFileSync } from 'node:fs';
|
|
3
|
+
import fg from 'fast-glob';
|
|
4
|
+
import { env } from '../config/env.js';
|
|
5
|
+
import { getPool } from '../db/postgres.js';
|
|
6
|
+
import { getAllTableSQLs } from '../db/schema.js';
|
|
7
|
+
import { indexProject, DEFAULT_IGNORE } from '../indexer/indexProject.js';
|
|
6
8
|
import { upsertSymbols } from '../indexer/persistSymbols.js';
|
|
7
|
-
import {
|
|
9
|
+
import { computeFileHash } from '../indexer/tsAstNormalizer.js';
|
|
10
|
+
import { getRelativePathForDisplay } from '../indexer/heuristics.js';
|
|
11
|
+
import { enqueueEmbeddingBatch, closeEmbeddingQueue, } from '../services/embeddingQueue.js';
|
|
12
|
+
import { SYMBOL_STATUS } from '../config/symbolStatus.js';
|
|
8
13
|
/**
 * Re-index a project's source files and persist the extracted symbols.
 *
 * Stages:
 *   1. Expand the glob patterns into a list of absolute file paths.
 *   2. Unless `forceRebuild` is set (or this is a dry run, which has no DB
 *      pool), drop every file whose current hash equals the `file_hash`
 *      already stored for its relative path.
 *   3. Run `indexProject` on the remaining files only.
 *   4. Upsert the rows with an all-null embedding payload, then enqueue every
 *      distinct `semantic_hash` so embeddings are produced asynchronously.
 *
 * @param {object} [options]
 * @param {string} [options.projectRoot] Project root; defaults to `process.cwd()`.
 * @param {string[]} [options.globPatterns] Patterns relative to the project
 *   root, or absolute. Defaults to the TypeScript sources under `src`.
 * @param {string[]} [options.ignore] Extra ignore patterns appended to `DEFAULT_IGNORE`.
 * @param {boolean} [options.dryRun=false] Extract only; never touch the database or the queue.
 * @param {boolean} [options.forceRebuild=false] Skip the hash filter and reset
 *   stored embeddings for the affected semantic hashes before re-enqueueing.
 * @returns {Promise<{projectRoot: string, extractedCount: number, skippedFiles: number, enqueuedCount: number, upserted: boolean}>}
 * @throws Whatever the database, indexer or queue helpers throw; the embedding
 *   queue is closed before the error propagates.
 */
export async function runReindex(options = {}) {
    const projectRoot = resolve(options.projectRoot ?? process.cwd());
    const { dryRun = false, forceRebuild = false } = options;
    console.error(`[reindex] projectRoot=${projectRoot}, dryRun=${dryRun}, forceRebuild=${forceRebuild}, PG_URL=${process.env.PG_URL ? '(set)' : '(not set)'}, SYMBOLS_TABLE=${env.symbolsTable}`);

    // Only a real run touches the database; a dry run keeps `pool` null.
    let pool = null;
    if (!dryRun) {
        pool = getPool();
        await pool.query('SELECT 1');
        console.error('[reindex] PostgreSQL connection successful');
        // Run every statement from getAllTableSQLs() on each invocation.
        // NOTE(review): the original comment states these are idempotent
        // (extension + table + indexes) — confirm in db/schema.js.
        for (const sql of getAllTableSQLs()) {
            await pool.query(sql);
        }
        console.error(`[reindex] schema ready: ${env.symbolsTable}`);
    }

    // ─── 1. Expand glob patterns into the full (absolute) file list ──────
    const ignore = [...DEFAULT_IGNORE, ...(options.ignore ?? [])];
    const patterns = (options.globPatterns ?? ['src/**/*.{ts,tsx}']).map((pattern) => toAbsoluteGlobPattern(projectRoot, pattern));
    const allFiles = await fg(patterns, {
        absolute: true,
        ignore,
        onlyFiles: true,
        dot: false,
    });
    console.error(`[reindex] glob found ${allFiles.length} file(s)`);

    // ─── 2. file_hash filter: skip files whose hash is already stored ────
    // Bypassed on forceRebuild (every file must be re-parsed) and on dry runs
    // (there is no pool to query).
    let filesToIndex = allFiles;
    let skippedFiles = 0;
    if (!forceRebuild && pool && allFiles.length > 0) {
        // Hash each file once and remember its relative path alongside.
        const candidates = allFiles.map((absPath) => ({
            absPath,
            relPath: getRelativePathForDisplay(projectRoot, absPath),
            hash: computeFileHash(readFileSync(absPath, 'utf-8')),
        }));
        // One batched lookup of the hashes already stored for these paths.
        const relPaths = [...new Set(candidates.map((file) => file.relPath))];
        const { rows: dbRows } = await pool.query(`SELECT DISTINCT path, file_hash FROM ${env.symbolsTable}
             WHERE path = ANY($1) AND file_hash IS NOT NULL`, [relPaths]);
        const dbFileHash = new Map(dbRows.map((r) => [r.path, r.file_hash]));
        filesToIndex = candidates
            .filter((file) => file.hash !== dbFileHash.get(file.relPath))
            .map((file) => file.absPath);
        skippedFiles = allFiles.length - filesToIndex.length;
        console.error(`[reindex] file_hash: ${skippedFiles} unchanged (skipped), ${filesToIndex.length} changed (to parse)`);
    }
    else if (forceRebuild) {
        console.error(`[reindex] forceRebuild=true, skipping file_hash filter — parsing all ${allFiles.length} file(s)`);
    }

    if (filesToIndex.length === 0) {
        // Distinguish "nothing matched" from "everything is up to date" so a
        // wrong glob is not mistaken for a clean index.
        console.error(allFiles.length === 0
            ? '[reindex] no files matched the glob patterns, nothing to do'
            : '[reindex] all files unchanged, nothing to do');
        return {
            projectRoot,
            extractedCount: 0,
            skippedFiles,
            enqueuedCount: 0,
            upserted: false,
        };
    }

    // ─── 3. Parse only the files that survived the filter ────────────────
    const rows = await indexProject({
        projectRoot,
        globPatterns: filesToIndex,
    });
    console.error(`[reindex] extracted ${rows.length} symbol(s) from ${filesToIndex.length} changed file(s)`);

    // ─── 4. Persist with null embeddings, then enqueue for the worker ────
    const pendingHashes = [
        ...new Set(rows.map((r) => r.semantic_hash).filter(Boolean)),
    ];
    if (!dryRun) {
        try {
            // forceRebuild: wipe stored embeddings first so the worker's
            // "already has an online embedding" lookup cannot hit.
            if (forceRebuild && pendingHashes.length > 0) {
                await pool.query(`UPDATE ${env.symbolsTable}
                     SET embedding = NULL, status = $1
                     WHERE semantic_hash = ANY($2)`, [SYMBOL_STATUS.PENDING, pendingHashes]);
                console.error(`[reindex] forceRebuild: cleared embeddings for ${pendingHashes.length} semantic_hash(es)`);
            }
            // One null payload entry per row: embeddings are filled in later.
            await upsertSymbols(pool, rows, rows.map(() => null));
            if (pendingHashes.length > 0) {
                await enqueueEmbeddingBatch(pendingHashes, env.symbolsTable);
                console.error(`[reindex] enqueued ${pendingHashes.length} semantic_hash(es) → worker will handle embedding asynchronously`);
            }
        }
        finally {
            // Always release the queue, even when the upsert or enqueue fails;
            // otherwise its connection is never closed on the error path.
            await closeEmbeddingQueue();
        }
    }
    return {
        projectRoot,
        extractedCount: rows.length,
        skippedFiles,
        enqueuedCount: pendingHashes.length,
        upserted: !dryRun,
    };
}

/**
 * Turn a user-supplied glob pattern into an absolute, forward-slash pattern.
 * POSIX-absolute patterns are returned untouched; Windows drive-letter
 * patterns only get their backslashes normalised; everything else is joined
 * onto the project root.
 */
function toAbsoluteGlobPattern(projectRoot, pattern) {
    if (pattern.startsWith('/')) {
        return pattern;
    }
    if (/^[A-Za-z]:[\\/]/.test(pattern)) {
        return pattern.replace(/\\/g, '/');
    }
    return join(projectRoot, pattern).replace(/\\/g, '/');
}
|
|
@@ -5,7 +5,9 @@ export const getSymbolDetailInput = z.object({
|
|
|
5
5
|
export function createGetSymbolDetailTool(repository) {
|
|
6
6
|
return {
|
|
7
7
|
name: 'get_symbol_detail',
|
|
8
|
-
description: '
|
|
8
|
+
description: '获取单个代码块的完整详情(含源码、参数类型、调用关系、副作用)。\n' +
|
|
9
|
+
'仅在以下情况调用:search_symbols 返回的摘要信息不足以判断是否适用(如签名模糊、副作用不明确)。\n' +
|
|
10
|
+
'通常对 top 1-3 候选调用,不要对所有结果批量调用。',
|
|
9
11
|
inputSchema: getSymbolDetailInput.shape,
|
|
10
12
|
handler: async (input) => {
|
|
11
13
|
const symbol = await repository.getByName(input.name);
|
package/dist/tools/incUsage.js
CHANGED
|
@@ -6,7 +6,9 @@ export const incUsageInput = z.object({
|
|
|
6
6
|
export function createIncUsageTool(repository) {
|
|
7
7
|
return {
|
|
8
8
|
name: 'inc_usage',
|
|
9
|
-
description: '
|
|
9
|
+
description: '在用户明确确认"采纳推荐"后调用,记录复用行为用于排序优化(usage_count +1)。\n' +
|
|
10
|
+
'注意:仅在用户主动确认采纳时调用,不要在推荐后自动调用。\n' +
|
|
11
|
+
'symbolId 从 search_symbols 或 search_by_structure 返回结果的 id 字段获取。',
|
|
10
12
|
inputSchema: incUsageInput.shape,
|
|
11
13
|
handler: async (input) => {
|
|
12
14
|
const success = await repository.incUsage(input.symbolId);
|
|
@@ -15,7 +17,10 @@ export function createIncUsageTool(repository) {
|
|
|
15
17
|
content: [
|
|
16
18
|
{
|
|
17
19
|
type: 'text',
|
|
18
|
-
text: JSON.stringify({
|
|
20
|
+
text: JSON.stringify({
|
|
21
|
+
error: '未找到该代码块',
|
|
22
|
+
symbolId: input.symbolId,
|
|
23
|
+
}, null, 2),
|
|
19
24
|
},
|
|
20
25
|
],
|
|
21
26
|
};
|
|
@@ -24,7 +29,11 @@ export function createIncUsageTool(repository) {
|
|
|
24
29
|
content: [
|
|
25
30
|
{
|
|
26
31
|
type: 'text',
|
|
27
|
-
text: JSON.stringify({
|
|
32
|
+
text: JSON.stringify({
|
|
33
|
+
ok: true,
|
|
34
|
+
symbolId: input.symbolId,
|
|
35
|
+
message: 'usage_count 已 +1',
|
|
36
|
+
}, null, 2),
|
|
28
37
|
},
|
|
29
38
|
],
|
|
30
39
|
};
|
package/dist/tools/reindex.js
CHANGED
|
@@ -9,7 +9,9 @@ export const reindexInput = z.object({
|
|
|
9
9
|
export function createReindexTool() {
|
|
10
10
|
return {
|
|
11
11
|
name: 'reindex',
|
|
12
|
-
description: '
|
|
12
|
+
description: '⚠️ 高成本操作(耗时可能超过数分钟),仅在用户明确要求"重建索引"时调用,不要因搜索结果不佳而自动调用。\n' +
|
|
13
|
+
'重建源码代码块索引并写入 MySQL。设置 dryRun=true 时仅预览抽取数量,不落库。\n' +
|
|
14
|
+
'写入后 embedding 由后台 worker 异步处理,队列清空后打印完成信号。',
|
|
13
15
|
inputSchema: reindexInput.shape,
|
|
14
16
|
handler: async (input) => {
|
|
15
17
|
const startedAt = Date.now();
|
|
@@ -3,7 +3,9 @@ import { z } from 'zod';
|
|
|
3
3
|
import { rankSymbols } from '../services/ranking.js';
|
|
4
4
|
export const searchByStructureInput = z.object({
|
|
5
5
|
fields: z.array(z.string().min(1)).min(1),
|
|
6
|
-
type: z
|
|
6
|
+
type: z
|
|
7
|
+
.enum(['component', 'function', 'hook', 'type', 'interface', 'class'])
|
|
8
|
+
.optional(),
|
|
7
9
|
category: z.string().optional(),
|
|
8
10
|
limit: z.number().int().min(1).max(100).optional().default(20),
|
|
9
11
|
ranked: z.boolean().optional().default(true),
|
|
@@ -11,7 +13,9 @@ export const searchByStructureInput = z.object({
|
|
|
11
13
|
export function createSearchByStructureTool(repository) {
|
|
12
14
|
return {
|
|
13
15
|
name: 'search_by_structure',
|
|
14
|
-
description: '
|
|
16
|
+
description: '按代码块的结构字段(props/params/hooks)检索,适合已知接口形态时使用。\n' +
|
|
17
|
+
'示例:需要一个接受 value、onChange、error 三个 prop 的输入组件 → fields: ["value", "onChange", "error"], type: "component"\n' +
|
|
18
|
+
'与 search_symbols 配合:先语义检索候选,再用本工具做 API 结构过滤以精确匹配。',
|
|
15
19
|
inputSchema: searchByStructureInput.shape,
|
|
16
20
|
handler: async (input) => {
|
|
17
21
|
const rows = await repository.searchByStructure(input.fields, {
|
|
@@ -2,16 +2,24 @@ import { z } from 'zod';
|
|
|
2
2
|
import { rankSemanticHits, rankSymbols } from '../services/ranking.js';
|
|
3
3
|
export const searchSymbolsInput = z.object({
|
|
4
4
|
query: z.string().min(1),
|
|
5
|
-
type: z
|
|
5
|
+
type: z
|
|
6
|
+
.enum(['component', 'function', 'hook', 'type', 'interface', 'class'])
|
|
7
|
+
.optional(),
|
|
6
8
|
ranked: z.boolean().optional().default(true),
|
|
7
9
|
/** Phase 5:自然语言 / 描述句检索(需 EMBEDDING_SERVICE_URL + 索引已写入 embedding) */
|
|
8
10
|
semantic: z.boolean().optional().default(false),
|
|
9
11
|
limit: z.number().int().min(1).max(100).optional().default(20),
|
|
10
12
|
});
|
|
13
|
+
const SCORE_THRESHOLD_FOR_FINAL = 0.45; // 综合排序分阈值(语义相似度占50%权重,原始0.5相似度 ≈ 综合0.35起)
|
|
14
|
+
const TOP_K_FOR_FINAL_RESULTS = 20; // 结果上限,返回相似度高的,保证数据质量
|
|
11
15
|
export function createSearchSymbolsTool(repository) {
|
|
12
16
|
return {
|
|
13
17
|
name: 'search_symbols',
|
|
14
|
-
description: '
|
|
18
|
+
description: '搜索项目中已有的可复用代码块(函数、组件、Hook、类型等)。在生成新代码之前必须先调用本工具,确认是否已有实现。\n' +
|
|
19
|
+
'- 有明确名称时(如 "useDebounce"):semantic=false(默认),直接关键词检索\n' +
|
|
20
|
+
'- 描述功能意图时(如 "防抖"、"处理表单提交"):semantic=true,进行语义检索(需 embedding 服务已就绪)\n' +
|
|
21
|
+
'- 不确定 type 时省略该参数,不要猜测\n' +
|
|
22
|
+
'- 返回结果含 semanticSimilarity 字段:>0.85 高置信度可直接推荐,0.6-0.85 需结合 description 判断,<0.6 说明可能无合适实现',
|
|
15
23
|
inputSchema: searchSymbolsInput.shape,
|
|
16
24
|
handler: async (input) => {
|
|
17
25
|
if (input.semantic) {
|
|
@@ -21,7 +29,8 @@ export function createSearchSymbolsTool(repository) {
|
|
|
21
29
|
});
|
|
22
30
|
const simById = new Map(hits.map((h) => [h.symbol.id, h.similarity]));
|
|
23
31
|
const resultRows = input.ranked
|
|
24
|
-
? rankSemanticHits(hits)
|
|
32
|
+
? rankSemanticHits(hits)
|
|
33
|
+
.map((item) => ({
|
|
25
34
|
id: item.symbol.id,
|
|
26
35
|
name: item.symbol.name,
|
|
27
36
|
type: item.symbol.type,
|
|
@@ -33,6 +42,8 @@ export function createSearchSymbolsTool(repository) {
|
|
|
33
42
|
reasonDetail: item.reason,
|
|
34
43
|
semanticSimilarity: Number((simById.get(item.symbol.id) ?? 0).toFixed(4)),
|
|
35
44
|
}))
|
|
45
|
+
.filter((x) => x.score >= SCORE_THRESHOLD_FOR_FINAL) // 基于综合排序分过滤,保留 usage/recency 高的结果
|
|
46
|
+
.slice(0, TOP_K_FOR_FINAL_RESULTS)
|
|
36
47
|
: hits.map((h) => ({
|
|
37
48
|
id: h.symbol.id,
|
|
38
49
|
name: h.symbol.name,
|
|
@@ -53,7 +64,8 @@ export function createSearchSymbolsTool(repository) {
|
|
|
53
64
|
}
|
|
54
65
|
const rows = await repository.search(input.query, input.type);
|
|
55
66
|
const resultRows = input.ranked
|
|
56
|
-
? rankSymbols(input.query, rows)
|
|
67
|
+
? rankSymbols(input.query, rows)
|
|
68
|
+
.map((item) => ({
|
|
57
69
|
id: item.symbol.id,
|
|
58
70
|
name: item.symbol.name,
|
|
59
71
|
type: item.symbol.type,
|
|
@@ -64,6 +76,8 @@ export function createSearchSymbolsTool(repository) {
|
|
|
64
76
|
reason: item.reason.summary,
|
|
65
77
|
reasonDetail: item.reason,
|
|
66
78
|
}))
|
|
79
|
+
.filter((x) => x.score >= SCORE_THRESHOLD_FOR_FINAL) // 基于综合排序分过滤
|
|
80
|
+
.slice(0, TOP_K_FOR_FINAL_RESULTS)
|
|
67
81
|
: rows.map((r) => ({
|
|
68
82
|
id: r.id,
|
|
69
83
|
name: r.name,
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* BullMQ embedding worker(常驻消费进程)。
|
|
3
|
+
*
|
|
4
|
+
* 流程:
|
|
5
|
+
* 1. 收到 job { semanticHash }
|
|
6
|
+
* 2. 查 semantic_hash 缓存:若已有 status=online 的符号带 embedding → 直接复用(0 次 API 调用)
|
|
7
|
+
* 3. 缓存未命中 → 取一条 pending 行构建语义文本 → 调 embedding API
|
|
8
|
+
* 4. 批量 UPDATE:所有 semantic_hash 相同且 status=pending 的行一次性写入向量并置 online
|
|
9
|
+
*
|
|
10
|
+
* 并发/限流:
|
|
11
|
+
* - concurrency 控制同时处理的 job 数(默认 5)
|
|
12
|
+
* - BullMQ limiter 控制全局 RPM(默认 100/min,留 buffer 低于 OpenAI 3000 RPM)
|
|
13
|
+
*
|
|
14
|
+
* 大仓分片:
|
|
15
|
+
* - 直接启动多个 worker 进程(同一 Redis)即可水平扩展,BullMQ 原生分布式协调
|
|
16
|
+
*/
|
|
17
|
+
import { Worker, QueueEvents } from 'bullmq';
|
|
18
|
+
import { Redis } from 'ioredis';
|
|
19
|
+
import { env } from '../config/env.js';
|
|
20
|
+
import { getPool } from '../db/postgres.js';
|
|
21
|
+
import { createEmbeddingClient } from '../services/embeddingClient.js';
|
|
22
|
+
import { indexedRowToEmbedText } from '../indexer/embedText.js';
|
|
23
|
+
import { initCategoryEmbeddings, resolveCategory, } from '../indexer/categoryClassifier.js';
|
|
24
|
+
import { SYMBOL_STATUS } from '../config/symbolStatus.js';
|
|
25
|
+
/**
 * Process one embedding job for a single `semantic_hash`.
 *
 * Flow:
 *   1. If any row with this hash is already online with an embedding, copy
 *      that vector onto every pending row with the same hash (no embedding
 *      request is made).
 *   2. Otherwise take one pending row, embed its text, resolve a category
 *      for it, and write vector + category to every pending row that shares
 *      the hash.
 *
 * @param {{ data: { semanticHash: string, symbolsTable?: string } }} job
 *   Queue job; `symbolsTable` overrides `env.symbolsTable` when present.
 * @param {{ query: Function }} pool Database pool exposing `query(sql, params)`.
 * @returns {Promise<{ updatedRows: number }>} Number of rows switched to online.
 * @throws {Error} When the job has no semantic hash, the table name is not a
 *   plain identifier, or the embedding service returns no vector.
 */
async function processEmbedJob(job, pool) {
    const { semanticHash } = job.data;
    if (typeof semanticHash !== 'string' || semanticHash.length === 0) {
        throw new Error('[worker] embedding job is missing data.semanticHash');
    }
    // Prefer the table named in the job payload; fall back to env.
    const table = job.data.symbolsTable ?? env.symbolsTable;
    // The table name is interpolated into SQL below and the payload comes from
    // the queue, so anything other than the configured table must be a plain
    // (optionally schema-qualified) identifier.
    const isPlainIdentifier = typeof table === 'string'
        && /^[A-Za-z_][A-Za-z0-9_$]*(\.[A-Za-z_][A-Za-z0-9_$]*)?$/.test(table);
    if (typeof table !== 'string' || (table !== env.symbolsTable && !isPlainIdentifier)) {
        throw new Error(`[worker] refusing unsafe symbols table name: ${JSON.stringify(table)}`);
    }
    const shortHash = semanticHash.slice(0, 10);
    const ts = () => new Date().toISOString();

    // Step 1: cache lookup — does an online row with this hash already carry a vector?
    const { rows: cached } = await pool.query(`SELECT embedding FROM ${table}
         WHERE semantic_hash = $1 AND status = $2 AND embedding IS NOT NULL
         LIMIT 1`, [semanticHash, SYMBOL_STATUS.ONLINE]);
    if (cached.length > 0) {
        // The stored value arrives either as the text form "[x1,x2,...]" or as
        // an array; the text form can be handed straight back to the database.
        const stored = cached[0].embedding;
        const cachedLiteral = typeof stored === 'string' ? stored : `[${stored.join(',')}]`;
        // Only pending rows are touched, so rows already online are left alone.
        const cacheResult = await pool.query(`UPDATE ${table}
             SET embedding = $1::vector, status = $2
             WHERE semantic_hash = $3 AND status = $4`, [
            cachedLiteral,
            SYMBOL_STATUS.ONLINE,
            semanticHash,
            SYMBOL_STATUS.PENDING,
        ]);
        console.error(`[worker] ✅ cache hit [${ts()}] table=${table} hash=${shortHash}… updated ${cacheResult.rowCount ?? 0} row(s) (0 API calls)`);
        return { updatedRows: cacheResult.rowCount ?? 0 };
    }

    // Cache miss: pick one pending row as the representative to embed.
    const { rows: pending } = await pool.query(`SELECT name, type, category, path, description, content, meta
         FROM ${table}
         WHERE semantic_hash = $1 AND status = $2
         LIMIT 1`, [semanticHash, SYMBOL_STATUS.PENDING]);
    if (pending.length === 0) {
        // Nothing left to do for this hash; exit without side effects.
        console.error(`[worker] ⚠️  skip [${ts()}] table=${table} hash=${shortHash}… (no pending rows)`);
        return { updatedRows: 0 };
    }
    const row = pending[0];
    const meta = typeof row.meta === 'string' ? JSON.parse(row.meta) : (row.meta ?? {});
    const rowObj = { ...row, meta };
    console.error(`[worker] 🔄 embedding [${ts()}] table=${table} hash=${shortHash}… ${row.path}:${row.name}`);

    // Prefer the stored content; fall back to the generated text when content
    // is absent or blank (embedding an empty string carries no signal).
    const hasContent = typeof row.content === 'string' && row.content.trim() !== '';
    const doc = hasContent ? row.content : indexedRowToEmbedText(rowObj);
    const embedClient = createEmbeddingClient(env.embeddingServiceUrl);
    const [vector] = await embedClient.embed([doc]);
    if (vector == null || typeof vector.join !== 'function' || vector.length === 0) {
        throw new Error(`[worker] embedding service returned no vector for hash=${shortHash}…`);
    }

    // Resolve a category for the representative row using its fresh vector.
    const [resolvedRow] = await resolveCategory([rowObj], [vector]);
    const resolvedCategory = resolvedRow?.category ?? null;

    // Step 2: write to every pending row sharing the hash; COALESCE keeps the
    // existing category when none was resolved.
    const result = await pool.query(`UPDATE ${table}
         SET embedding = $1::vector, status = $2, category = COALESCE($3, category)
         WHERE semantic_hash = $4 AND status = $5`, [
        `[${vector.join(',')}]`,
        SYMBOL_STATUS.ONLINE,
        resolvedCategory,
        semanticHash,
        SYMBOL_STATUS.PENDING,
    ]);
    console.error(`[worker] ✓ done  [${ts()}] table=${table} hash=${shortHash}… category=${resolvedCategory ?? 'null'} updated ${result.rowCount ?? 0} row(s)`);
    return { updatedRows: result.rowCount ?? 0 };
}
|
|
90
|
+
/**
 * Start the embedding worker and return a handle for shutting it down.
 *
 * @param {object} [opts]
 * @param {number} [opts.concurrency=5] Jobs processed in parallel by this worker.
 * @param {number} [opts.rpmLimit=100] Maximum jobs per 60 seconds, passed to the worker's `limiter`.
 * @returns {Promise<{ worker: object, stop: () => Promise<void> }>} The worker
 *   instance plus `stop()`, which closes the worker, the queue-events
 *   listener and both Redis connections opened here.
 * @throws Whatever `getPool()` or `initCategoryEmbeddings()` throw; in that
 *   case no Redis connection has been opened yet.
 */
export async function startEmbeddingWorker(opts = {}) {
    const { concurrency = 5, rpmLimit = 100 } = opts;

    // Do the fallible start-up work first so a failure here cannot leave
    // Redis connections dangling.
    const pool = getPool();
    // Warm up category embeddings once at start-up, only when an embedding
    // service is configured.
    if (env.embeddingServiceUrl) {
        await initCategoryEmbeddings();
        console.error('[embedding-worker] category embeddings initialized');
    }

    const openRedis = () => new Redis(env.redisUrl, {
        maxRetriesPerRequest: null,
        enableReadyCheck: false,
    });
    const connection = openRedis();
    // Queue events get their own connection rather than sharing the worker's.
    const eventsConnection = openRedis();
    const queueEvents = new QueueEvents('embedding', {
        connection: eventsConnection,
    });

    const worker = new Worker('embedding', (job) => processEmbedJob(job, pool), {
        connection,
        concurrency,
        // Rate limit: at most `rpmLimit` jobs per minute.
        // NOTE(review): the original comment says this limit is shared across
        // all worker processes — confirm against the BullMQ limiter docs.
        limiter: { max: rpmLimit, duration: 60_000 },
        // Drop completed jobs right away; keep the last 100 failures.
        // NOTE(review): presumably jobs are keyed by hash so a lingering job id
        // would block re-enqueueing — the producer is not visible here.
        removeOnComplete: { count: 0 },
        removeOnFail: { count: 100 },
    });

    // Running totals, reset every time the queue reports it has drained.
    const stats = { completed: 0, updatedRows: 0 };
    worker.on('completed', (_job, result) => {
        stats.completed++;
        stats.updatedRows += result?.updatedRows ?? 0;
    });
    worker.on('failed', (job, err) => {
        console.error(`[worker] ✗ failed [${new Date().toISOString()}] table=${job?.data?.symbolsTable ?? env.symbolsTable} hash=${job?.data?.semanticHash?.slice(0, 10)}… err=${err.message}`);
    });
    worker.on('error', (err) => {
        console.error(`[worker] error: ${err.message}`);
    });
    // An emitter without an 'error' listener throws when it emits one, so log
    // queue-event errors instead of letting them crash the process.
    queueEvents.on('error', (err) => {
        console.error(`[worker] queue events error: ${err.message}`);
    });
    // Print a summary whenever the queue drains, then start counting afresh.
    queueEvents.on('drained', () => {
        console.error(`[worker] ✅ queue drained [${new Date().toISOString()}] completed=${stats.completed} jobs rows_updated=${stats.updatedRows}`);
        stats.completed = 0;
        stats.updatedRows = 0;
    });

    const stop = async () => {
        try {
            await worker.close();
        }
        finally {
            try {
                await queueEvents.close();
            }
            finally {
                // These connections were created here, so they are released
                // here; allSettled tolerates one that is already closed.
                await Promise.allSettled([connection.quit(), eventsConnection.quit()]);
            }
        }
    };
    return { worker, stop };
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@lorrylurui/code-intelligence-mcp",
|
|
3
|
-
"version": "
|
|
3
|
+
"version": "2.0.1",
|
|
4
4
|
"private": false,
|
|
5
5
|
"description": "MCP server 提供仓库内可复用代码块(ts/tsx/js/jsx/css/less)的索引和查询能力,支持基于代码上下文的智能推荐。",
|
|
6
6
|
"type": "module",
|
|
@@ -10,34 +10,39 @@
|
|
|
10
10
|
],
|
|
11
11
|
"bin": {
|
|
12
12
|
"code-intelligence-mcp": "./dist/index.js",
|
|
13
|
-
"code-intelligence-index": "./dist/cli/index-codebase.js"
|
|
13
|
+
"code-intelligence-index": "./dist/cli/index-codebase-cli.js"
|
|
14
14
|
},
|
|
15
15
|
"scripts": {
|
|
16
16
|
"dev": "tsx watch --clear-screen=false --exclude node_modules --exclude dist src/index.ts",
|
|
17
17
|
"dev:mcp": "node ./scripts/mcp-dev-watch.mjs",
|
|
18
18
|
"build": "tsc -p tsconfig.json",
|
|
19
19
|
"start": "node dist/index.js",
|
|
20
|
-
"index": "tsx src/cli/index-codebase.ts",
|
|
21
|
-
"
|
|
20
|
+
"index": "tsx src/cli/index-codebase-cli.ts",
|
|
21
|
+
"ci-index": "tsx src/cli/ci-index-cli.ts",
|
|
22
|
+
"worker:embedding": "tsx src/cli/embedding-worker-cli.ts",
|
|
22
23
|
"embedding:dev": "cd embedding-service && python3 -m uvicorn app:app --host 127.0.0.1 --port 8765",
|
|
23
24
|
"docker:up": "docker compose up -d",
|
|
24
25
|
"docker:down": "docker compose down",
|
|
25
|
-
"docker:logs": "docker compose logs -f
|
|
26
|
+
"docker:logs": "docker compose logs -f postgres"
|
|
26
27
|
},
|
|
27
28
|
"dependencies": {
|
|
28
29
|
"@babel/parser": "^7.29.2",
|
|
29
30
|
"@babel/types": "^7.29.0",
|
|
30
31
|
"@modelcontextprotocol/sdk": "^1.12.3",
|
|
31
32
|
"@types/react": "^19.2.14",
|
|
33
|
+
"bullmq": "^5.74.1",
|
|
32
34
|
"dotenv": "^16.4.5",
|
|
33
35
|
"fast-glob": "^3.3.2",
|
|
36
|
+
"ioredis": "^5.10.1",
|
|
34
37
|
"mysql2": "^3.11.3",
|
|
38
|
+
"pg": "^8.20.0",
|
|
35
39
|
"react": "^19.2.4",
|
|
36
40
|
"ts-morph": "^25.0.0",
|
|
37
41
|
"zod": "^3.23.8"
|
|
38
42
|
},
|
|
39
43
|
"devDependencies": {
|
|
40
44
|
"@types/node": "^22.10.1",
|
|
45
|
+
"@types/pg": "^8.20.0",
|
|
41
46
|
"tsx": "^4.19.2",
|
|
42
47
|
"typescript": "^5.6.3"
|
|
43
48
|
},
|