@lorrylurui/code-intelligence-mcp 1.1.14 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +91 -598
- package/dist/cli/ci-index-cli.js +66 -0
- package/dist/cli/ci-index.js +80 -0
- package/dist/cli/detect-duplicates.js +1 -6
- package/dist/cli/embedding-worker-cli.js +35 -0
- package/dist/cli/index-codebase.js +6 -7
- package/dist/config/env.js +3 -102
- package/dist/config/symbolStatus.js +8 -0
- package/dist/db/mysql.js +3 -6
- package/dist/db/schema.js +9 -2
- package/dist/indexer/astNormalizer.js +201 -0
- package/dist/indexer/babelParser.js +257 -28
- package/dist/indexer/categoryClassifier.js +129 -0
- package/dist/indexer/embedText.js +9 -7
- package/dist/indexer/extractMeta.js +7 -2
- package/dist/indexer/heuristics.js +42 -23
- package/dist/indexer/indexProject.js +145 -55
- package/dist/indexer/jsAstNormalizer.js +201 -0
- package/dist/indexer/persistSymbols.js +7 -3
- package/dist/indexer/tsAstNormalizer.js +363 -0
- package/dist/prompts/reusableCodeAdvisorPrompt.js +6 -3
- package/dist/repositories/symbolRepository.js +81 -7
- package/dist/services/embeddingQueue.js +56 -0
- package/dist/services/reindex.js +12 -9
- package/dist/tools/searchByStructure.js +3 -1
- package/dist/tools/searchSymbols.js +14 -3
- package/dist/workers/embeddingWorker.js +100 -0
- package/package.json +7 -4
|
@@ -0,0 +1,363 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* tsAstNormalizer.ts
|
|
3
|
+
* 对 ts-morph Node 做语义级标准化,生成 semantic_hash。
|
|
4
|
+
*
|
|
5
|
+
* 去掉:参数名、泛型参数名、函数体实现、空白格式、字面量值
|
|
6
|
+
* 保留:参数类型结构、返回类型、sideEffects、hooks
|
|
7
|
+
*/
|
|
8
|
+
import { createHash } from 'node:crypto';
|
|
9
|
+
import { Node, SyntaxKind } from 'ts-morph';
|
|
10
|
+
// ─────────────────────────────────────────────
// Built-in type whitelist: names listed here are kept verbatim by
// normalizeTypeName instead of being replaced with the `$T` placeholder.
// ─────────────────────────────────────────────
const BUILTIN_TYPES = new Set([
    // Primitive and keyword types
    'string', 'number', 'boolean', 'void', 'null', 'undefined',
    'never', 'unknown', 'any', 'object', 'symbol', 'bigint',
    // Standard containers
    'Promise', 'Array', 'Record', 'Map', 'Set', 'WeakMap', 'WeakSet',
    // TypeScript utility types
    'Partial', 'Required', 'Readonly', 'Pick', 'Omit', 'Exclude',
    'Extract', 'NonNullable', 'ReturnType', 'InstanceType',
    // React names
    'React', 'ReactNode', 'ReactElement', 'FC',
    // Event types
    'MouseEvent', 'KeyboardEvent', 'ChangeEvent',
    // DOM element types
    'HTMLElement', 'HTMLDivElement', 'HTMLInputElement',
    // Styling and refs
    'CSSProperties', 'RefObject', 'MutableRefObject',
]);
|
|
57
|
+
/**
 * Maps a single type identifier to its normalized form.
 *
 * Whitelisted names (BUILTIN_TYPES) are returned unchanged. Any other name that
 * starts with `T` followed by an uppercase letter, or that is exactly one
 * uppercase letter, becomes the placeholder `$T`. Everything else is returned
 * unchanged.
 *
 * @param {string} name - Type identifier to normalize.
 * @returns {string} `'$T'` or the original name.
 */
function normalizeTypeName(name) {
    // Evaluated lazily so whitelisted names never reach the pattern checks.
    const looksLikeGenericParam = () => /^T[A-Z]/.test(name) || (name.length === 1 && /[A-Z]/.test(name));
    return !BUILTIN_TYPES.has(name) && looksLikeGenericParam() ? '$T' : name;
}
|
|
64
|
+
/**
 * Normalizes the source text of a type annotation.
 *
 * Every identifier that starts with an uppercase letter is passed through
 * normalizeTypeName, then whitespace runs are collapsed to a single space and
 * the result is trimmed.
 *
 * @param {string} typeStr - Raw type text.
 * @returns {string} Normalized type text.
 */
function normalizeTypeString(typeStr) {
    const withPlaceholders = typeStr.replace(/\b([A-Z][A-Za-z0-9]*)\b/g, (identifier) => normalizeTypeName(identifier));
    const singleSpaced = withPlaceholders.replace(/\s+/g, ' ');
    return singleSpaced.trim();
}
|
|
70
|
+
/**
 * Collects `name[?]:type` descriptors for the property signatures of a type node.
 *
 * - Type literal (`{ a: string; b?: number }`): one entry per property
 *   signature; other member kinds are ignored. A property without a type
 *   annotation is reported as `$unknown`.
 * - Intersection (`A & B`): entries of every member that is itself a type
 *   literal are concatenated; non-literal members contribute nothing.
 * - Any other node: empty array.
 *
 * @param {import('ts-morph').Node} typeNode - Type node to inspect.
 * @returns {string[]} Property descriptors in declaration order.
 */
function extractPropertiesFromType(typeNode) {
    const props = [];
    if (Node.isTypeLiteral(typeNode)) {
        for (const member of typeNode.getMembers()) {
            if (!Node.isPropertySignature(member)) {
                continue;
            }
            const memberTypeNode = member.getTypeNode();
            const typeStr = memberTypeNode
                ? normalizeTypeString(memberTypeNode.getText())
                : '$unknown';
            const optional = member.hasQuestionToken() ? '?' : '';
            props.push(`${member.getName()}${optional}:${typeStr}`);
        }
    }
    if (Node.isIntersectionTypeNode(typeNode)) {
        // getTypeNodes() returns the intersected member types directly.
        // getChildren() would only yield the wrapping syntax list, so no
        // member would ever be recognised as a type literal.
        for (const part of typeNode.getTypeNodes()) {
            if (Node.isTypeLiteral(part)) {
                props.push(...extractPropertiesFromType(part));
            }
        }
    }
    return props;
}
|
|
97
|
+
// ─────────────────────────────────────────────
// normalizeNode: walks the AST recursively and emits a normalized string
// ─────────────────────────────────────────────
/**
 * Serializes a ts-morph node into a normalized string.
 *
 * Rewrites applied while walking the tree:
 * - block                 -> `{}` (implementation is ignored)
 * - parameter             -> `[...]$p:<type>[=$default]`, or `[...]{}:<type>[=$default]`
 *                            when the parameter is a destructuring pattern.
 *                            The placeholder is not indexed: every named
 *                            parameter is emitted as `$p`.
 * - type parameter        -> `$T[ extends <constraint>]`
 * - type reference        -> normalizeTypeString of its text
 * - JSX element           -> `<JSX/>`
 * - string literal        -> `"$s"`; numeric/bigint literal -> `$n`; true/false -> `$b`
 * - leaf node             -> its own text
 * - any other node        -> the concatenation of its children, with no separator
 *
 * Whitespace runs in the final string are collapsed to one space and the result
 * is trimmed.
 *
 * @param {import('ts-morph').Node} node - Root node to serialize.
 * @returns {string} Normalized representation.
 */
export function normalizeNode(node) {
    function visit(n) {
        const kind = n.getKind();
        // Function bodies and other blocks: implementation details are dropped.
        if (kind === SyntaxKind.Block) {
            return '{}';
        }
        // Parameters: keep only the type shape, never the name.
        if (Node.isParameterDeclaration(n)) {
            const nameNode = n.getNameNode();
            const typeNode = n.getTypeNode();
            const typeStr = typeNode
                ? normalizeTypeString(typeNode.getText())
                : '$unknown';
            const prefix = n.isRestParameter() ? '...' : '';
            const suffix = n.hasInitializer() ? '=$default' : '';
            const isDestructured = Node.isObjectBindingPattern(nameNode) ||
                Node.isArrayBindingPattern(nameNode);
            // Destructured parameter: `{ userId, options }: Config` -> `{}:Config`
            return `${prefix}${isDestructured ? '{}' : '$p'}:${typeStr}${suffix}`;
        }
        // Generic parameters: `T` / `TData` -> `$T`
        if (Node.isTypeParameterDeclaration(n)) {
            const constraint = n.getConstraint();
            return `$T${constraint ? ` extends ${normalizeTypeString(constraint.getText())}` : ''}`;
        }
        // Type references: normalize the referenced names.
        if (Node.isTypeReference(n)) {
            return normalizeTypeString(n.getText());
        }
        // JSX: only its presence is recorded.
        if (Node.isJsxElement(n) || Node.isJsxSelfClosingElement(n)) {
            return '<JSX/>';
        }
        // Literals are replaced by fixed placeholders.
        if (kind === SyntaxKind.StringLiteral) {
            return '"$s"';
        }
        if (kind === SyntaxKind.NumericLiteral ||
            kind === SyntaxKind.BigIntLiteral) {
            return '$n';
        }
        if (kind === SyntaxKind.TrueKeyword || kind === SyntaxKind.FalseKeyword) {
            return '$b';
        }
        const children = n.getChildren();
        if (children.length === 0) {
            return n.getText();
        }
        return children.map(visit).join('');
    }
    return visit(node).replace(/\s+/g, ' ').trim();
}
|
|
156
|
+
/**
 * Builds the normalized descriptor of one function parameter.
 *
 * Plain parameter:        `[...]$p<index>:<type>[=$default]`
 * Destructured parameter: `[...]{<name>[?]:<valueType>,...}:<type>[=$default]`
 *
 * For destructured parameters each binding element contributes
 * `<name>:<valueType>`, where `<valueType>` is the normalized initializer when
 * the element has one and `$d` otherwise. Omitted array slots are skipped.
 * The entries are sorted so that the order of the bindings does not affect the
 * result.
 *
 * @param {import('ts-morph').ParameterDeclaration} param - Parameter to describe.
 * @param {number} index - Position used in the `$p<index>` placeholder.
 * @returns {string} Normalized parameter descriptor.
 */
function normalizeParameter(param, index) {
    const declaredType = param.getTypeNode();
    const typeStr = declaredType
        ? normalizeTypeWithStructure(declaredType)
        : '$unknown';
    const prefix = param.isRestParameter() ? '...' : '';
    const suffix = param.hasInitializer() ? '=$default' : '';
    const binding = param.getNameNode();
    const isDestructured = Node.isObjectBindingPattern(binding) ||
        Node.isArrayBindingPattern(binding);
    if (!isDestructured) {
        return `${prefix}$p${index}:${typeStr}${suffix}`;
    }
    const entries = binding
        .getElements()
        .filter((element) => !Node.isOmittedExpression(element))
        .map((element) => {
            const name = element.getName?.() ?? 'unknown';
            // A `?` token among the element's children marks it as optional.
            const optionalMark = element
                .getChildren()
                .some((child) => child.getKind() === SyntaxKind.QuestionToken)
                ? '?'
                : '';
            // When a default value exists, its normalized text stands in for the type.
            const defaultValue = element.getInitializer();
            const valueType = defaultValue
                ? normalizeTypeWithStructure(defaultValue)
                : '$d';
            return `${name}${optionalMark}:${valueType}`;
        });
    // Sorted so that binding order alone cannot change the hash.
    entries.sort();
    return `${prefix}{${entries.join(',')}}:${typeStr}${suffix}`;
}
|
|
196
|
+
/**
 * Normalizes a type node, expanding object shapes structurally where possible.
 *
 * - Type literal: `{name[?]:type,...}` built from its property signatures.
 * - Type reference with type arguments: `Name<arg,...>` with each argument
 *   normalized recursively (the name itself is kept as written).
 * - Type reference without arguments:
 *     - whitelisted in BUILTIN_TYPES -> the name;
 *     - resolves to an interface declaration -> `{name[?]:type,...}` of its properties;
 *     - resolves to a type alias declaration -> the normalized aliased type;
 *     - otherwise (or when resolution throws) -> the name.
 * - Anything else: normalizeTypeString of the node text.
 *
 * Self-referential declarations (e.g. `interface N { next: N }`) are expanded
 * once; the nested reference falls back to the plain type name instead of
 * recursing without bound.
 *
 * @param {import('ts-morph').Node} typeNode - Node to normalize.
 * @param {Set<object>} [seen] - Declarations already being expanded on the
 *   current path. Internal recursion guard; callers normally omit it.
 * @returns {string} Normalized type text.
 */
function normalizeTypeWithStructure(typeNode, seen = new Set()) {
    // Formats one property as `name[?]:type`, recursing with the given guard set.
    const describeProperty = (prop, visited) => {
        const propTypeNode = prop.getTypeNode();
        const propType = propTypeNode
            ? normalizeTypeWithStructure(propTypeNode, visited)
            : '$unknown';
        return `${prop.getName()}${prop.hasQuestionToken() ? '?' : ''}:${propType}`;
    };
    // Type literal: { name: string, age: number }
    if (Node.isTypeLiteral(typeNode)) {
        const props = typeNode
            .getMembers()
            .filter((member) => Node.isPropertySignature(member))
            .map((prop) => describeProperty(prop, seen));
        return `{${props.join(',')}}`;
    }
    // Type reference (e.g. Param, Array<string>, Promise<User>)
    if (Node.isTypeReference(typeNode)) {
        const typeName = typeNode.getTypeName().getText();
        // Type arguments take priority, whether or not the name is whitelisted.
        const typeArgs = typeNode.getTypeArguments();
        if (typeArgs.length > 0) {
            const normalizedArgs = typeArgs.map((arg) => normalizeTypeWithStructure(arg, seen));
            return `${typeName}<${normalizedArgs.join(',')}>`;
        }
        if (BUILTIN_TYPES.has(typeName)) {
            return typeName;
        }
        // Best effort: try to resolve the reference to its declaration.
        try {
            const decl = typeNode.getType().getSymbol()?.getDeclarations()[0];
            // A declaration already being expanded on this path is a cycle:
            // stop here and fall through to the plain name.
            if (decl && !seen.has(decl)) {
                // Copy per path so sibling uses of the same type still expand.
                const visited = new Set(seen).add(decl);
                if (Node.isInterfaceDeclaration(decl)) {
                    const props = decl
                        .getProperties()
                        .map((prop) => describeProperty(prop, visited));
                    return `{${props.join(',')}}`;
                }
                if (Node.isTypeAliasDeclaration(decl)) {
                    const aliasedType = decl.getTypeNode();
                    if (aliasedType) {
                        return normalizeTypeWithStructure(aliasedType, visited);
                    }
                }
            }
        }
        catch {
            // Resolution failed: deliberately fall back to the type name.
        }
        return typeName;
    }
    // Everything else (unions, intersections, keyword types, ...)
    return normalizeTypeString(typeNode.getText());
}
|
|
261
|
+
/**
 * Produces the normalized signature string for a declaration node.
 *
 * - Function declaration / arrow function / function expression:
 *   `fn[<typeParams>](<params>)=><returnType>`. Type parameters are only read
 *   from function declarations. Parameters keep their declared order. A
 *   missing return annotation is reported as `$inferred`.
 * - Variable declaration: when the initializer is an arrow function or a
 *   function expression, the signature of that function; otherwise
 *   normalizeNode of the declaration.
 * - Interface: `interface{<sorted props joined by ;>}[ extends <bases>]`.
 * - Type alias: `type{<sorted props joined by ;>}` when
 *   extractPropertiesFromType finds properties, otherwise normalizeNode.
 * - Class: `class{<method signatures joined by ;>}` in declaration order. Each
 *   method's normalized parameter strings are sorted before joining.
 * - Anything else: normalizeNode of the node.
 *
 * @param {import('ts-morph').Node} node - Declaration to describe.
 * @returns {string} Normalized signature.
 */
export function extractNormalizedSignature(node) {
    // Function declaration / arrow function / function expression
    if (Node.isFunctionDeclaration(node) ||
        Node.isArrowFunction(node) ||
        Node.isFunctionExpression(node)) {
        const typeParams = Node.isFunctionDeclaration(node)
            ? node.getTypeParameters().map((tp) => normalizeNode(tp))
            : [];
        // Declared order is kept: the position feeds the `$p<index>` placeholder.
        const params = node
            .getParameters()
            .map((p, i) => normalizeParameter(p, i));
        const retNode = node.getReturnTypeNode?.();
        const returnType = retNode
            ? normalizeTypeString(retNode.getText())
            : '$inferred';
        const tpStr = typeParams.length ? `<${typeParams.join(',')}>` : '';
        return `fn${tpStr}(${params.join(',')})=>${returnType}`;
    }
    // Variable declaration (const foo = () => {})
    if (Node.isVariableDeclaration(node)) {
        const init = node.getInitializer();
        if (init &&
            (Node.isArrowFunction(init) || Node.isFunctionExpression(init))) {
            return extractNormalizedSignature(init);
        }
        return normalizeNode(node);
    }
    // Interface: list every property, with normalized type names
    if (Node.isInterfaceDeclaration(node)) {
        const props = node.getProperties().map((prop) => {
            const typeNode = prop.getTypeNode();
            const typeStr = typeNode
                ? normalizeTypeString(typeNode.getText())
                : '$unknown';
            const optional = prop.hasQuestionToken() ? '?' : '';
            return `${prop.getName()}${optional}:${typeStr}`;
        });
        const extendsClause = node.getExtends();
        const extendsStr = extendsClause.length
            ? ` extends ${extendsClause.map((e) => normalizeTypeString(e.getText())).join(',')}`
            : '';
        return `interface{${props.sort().join(';')}}${extendsStr}`;
    }
    // Type alias (object-like): use the structured property list when available
    if (Node.isTypeAliasDeclaration(node)) {
        const typeNode = node.getTypeNode();
        if (typeNode) {
            // Handles `type Foo = { ... }`
            const props = extractPropertiesFromType(typeNode);
            if (props.length > 0) {
                return `type{${props.sort().join(';')}}`;
            }
        }
        return normalizeNode(node);
    }
    // Class: only the list of method signatures
    if (Node.isClassDeclaration(node)) {
        const methods = node.getMethods().map((m) => {
            const params = m.getParameters().map((p) => normalizeNode(p));
            const retNode = m.getReturnTypeNode();
            const ret = retNode
                ? normalizeTypeString(retNode.getText())
                : '$inferred';
            // Parameter strings are sorted, so their order does not affect the result.
            return `${m.getName()}(${params.sort().join(',')})=>${ret}`;
        });
        return `class{${methods.join(';')}}`;
    }
    return normalizeNode(node);
}
|
|
337
|
+
// ─────────────────────────────────────────────
// computeSemanticHash
// Included: normalized signature + name + type + description + sideEffects + hooks
// ─────────────────────────────────────────────
/**
 * Computes the semantic hash of an indexed row.
 *
 * The hashed payload is the JSON serialization of, in this key order:
 * `name`, `type`, `description` (null when absent), `signature` (normalized
 * signature of `row.node`, or `''` when there is no node), `sideEffects` and
 * `hooks` (sorted copies of the arrays found on `row.meta`, empty when absent).
 *
 * @param {object} row - Indexed row; reads `name`, `type`, `description`, `node` and `meta`.
 * @returns {[string, string]} Tuple of the SHA-256 hex digest and the JSON string that was hashed.
 */
export function computeSemanticHash(row) {
    const meta = row.meta || {};
    const signature = row.node ? extractNormalizedSignature(row.node) : '';
    // Copies before sorting so the row's own arrays are never reordered.
    const sortedCopy = (values) => [...(values ?? [])].sort();
    const stableStr = JSON.stringify({
        name: row.name,
        type: row.type,
        description: row.description ?? null,
        signature,
        sideEffects: sortedCopy(meta.sideEffects),
        hooks: sortedCopy(meta.hooks),
    });
    const digest = createHash('sha256').update(stableStr).digest('hex');
    return [digest, stableStr];
}
|
|
358
|
+
// ─────────────────────────────────────────────
// computeFileHash: hash of the raw file content
// ─────────────────────────────────────────────
/**
 * Returns the SHA-256 digest of a file's content.
 *
 * @param {string | Buffer} fileContent - Raw content to hash.
 * @returns {string} Lowercase hex digest.
 */
export function computeFileHash(fileContent) {
    const hasher = createHash('sha256');
    hasher.update(fileContent);
    return hasher.digest('hex');
}
|
|
@@ -16,7 +16,9 @@ const REUSABLE_CODE_ADVISOR_MARKDOWN = `# 可复用代码推荐
|
|
|
16
16
|
- **API 是否简单**、入参是否合适
|
|
17
17
|
- **依赖与副作用**风险
|
|
18
18
|
- **复用安全性**(稳定性、耦合度、是否便于扩展)
|
|
19
|
-
6.
|
|
19
|
+
6. 给出**唯一首选**推荐,并说明理由,同时使用 **AskUserQuestion **工具,提供两个选项:
|
|
20
|
+
- 采纳推荐
|
|
21
|
+
- 取消
|
|
20
22
|
|
|
21
23
|
## 回复结构
|
|
22
24
|
|
|
@@ -28,6 +30,7 @@ const REUSABLE_CODE_ADVISOR_MARKDOWN = `# 可复用代码推荐
|
|
|
28
30
|
- **理由:** 1~3 条要点
|
|
29
31
|
- **其他候选:** 简要列出及取舍(同步标注副作用)
|
|
30
32
|
- **用法提示:** 结合用户场景的最小集成说明
|
|
33
|
+
- **是否采纳:** 展示两个选项: 选项1.采纳推荐 选项2.取消。等待用户确认
|
|
31
34
|
|
|
32
35
|
## 约束
|
|
33
36
|
|
|
@@ -37,8 +40,8 @@ const REUSABLE_CODE_ADVISOR_MARKDOWN = `# 可复用代码推荐
|
|
|
37
40
|
|
|
38
41
|
## 使用反馈
|
|
39
42
|
|
|
40
|
-
|
|
41
|
-
|
|
43
|
+
当选择‘采纳推荐’必须调用 inc_usage 工具记录采纳行为,调用格式如下:
|
|
44
|
+
“inc_usage({ symbolId: <选中的代码块 id> })”
|
|
42
45
|
其中 symbolId 从 search_symbols 或 search_by_structure 返回结果的 id 字段获取。这条记录会用于后续排序优化。
|
|
43
46
|
|
|
44
47
|
## 更多示例
|
|
@@ -2,6 +2,9 @@ import { env } from '../config/env.js';
|
|
|
2
2
|
import { getMySqlPool } from '../db/mysql.js';
|
|
3
3
|
import { createEmbeddingClient } from '../services/embeddingClient.js';
|
|
4
4
|
import { cosineSimilarity } from '../services/vectorMath.js';
|
|
5
|
+
import { SEARCHABLE_STATUS } from '../config/symbolStatus.js';
|
|
6
|
+
const THREADHOLD_SIMILARITY_BEFORE_RANKED = 0.5;
|
|
7
|
+
const TOP_K_FOR_RANKING = 100; // 进入复杂排序的候选数上限(语义相似度初筛后保留的结果数,过大会增加排序成本)
|
|
5
8
|
const inMemorySymbols = [
|
|
6
9
|
{
|
|
7
10
|
id: 1,
|
|
@@ -18,7 +21,7 @@ const inMemorySymbols = [
|
|
|
18
21
|
{
|
|
19
22
|
id: 2,
|
|
20
23
|
name: 'formatDate',
|
|
21
|
-
type: '
|
|
24
|
+
type: 'function',
|
|
22
25
|
category: 'date',
|
|
23
26
|
path: 'src/utils/date.ts',
|
|
24
27
|
description: 'Format date to YYYY-MM-DD',
|
|
@@ -99,6 +102,7 @@ export class SymbolRepository {
|
|
|
99
102
|
SELECT id, name, type, category, path, description, content, CAST(meta AS CHAR) AS meta, usage_count, created_at
|
|
100
103
|
FROM ${env.mysqlSymbolsTable}
|
|
101
104
|
WHERE (name LIKE ? OR description LIKE ?)
|
|
105
|
+
AND status = ${SEARCHABLE_STATUS}
|
|
102
106
|
`;
|
|
103
107
|
params.push(`%${query}%`);
|
|
104
108
|
if (type) {
|
|
@@ -110,8 +114,14 @@ export class SymbolRepository {
|
|
|
110
114
|
return rows.map((r) => mapRow(r));
|
|
111
115
|
}
|
|
112
116
|
/**
|
|
113
|
-
|
|
114
|
-
|
|
117
|
+
* Phase 5:对自然语言查询做向量检索,启用分桶采样策略,返回代码
|
|
118
|
+
块与余弦相似度。
|
|
119
|
+
* 分桶策略:
|
|
120
|
+
* - 第一层:按 category 占比计算每个分类应采样条数(保底10条)
|
|
121
|
+
* - 第二层:每个 path 子桶内乱序后采样 Math.max(5,
|
|
122
|
+
floor(catLimit / pathCount)) 条
|
|
123
|
+
* 最终选择topK,进入排序
|
|
124
|
+
*/
|
|
115
125
|
async searchSemanticHits(query, opts) {
|
|
116
126
|
if (!env.embeddingServiceUrl) {
|
|
117
127
|
throw new Error('语义检索需配置 EMBEDDING_SERVICE_URL 并启动嵌入服务');
|
|
@@ -120,38 +130,102 @@ export class SymbolRepository {
|
|
|
120
130
|
return [];
|
|
121
131
|
}
|
|
122
132
|
const candidateLimit = opts?.candidateLimit ?? 3000;
|
|
123
|
-
const limit = opts?.limit ??
|
|
133
|
+
const limit = opts?.limit ?? TOP_K_FOR_RANKING;
|
|
124
134
|
const type = opts?.type;
|
|
125
135
|
const client = createEmbeddingClient(env.embeddingServiceUrl);
|
|
126
136
|
const [queryVec] = await client.embed([query.trim()]);
|
|
127
137
|
if (!queryVec?.length) {
|
|
128
138
|
throw new Error('查询向量为空');
|
|
129
139
|
}
|
|
140
|
+
// 查询足够的数据以支持分桶采样(3倍候选数以覆盖各桶)
|
|
141
|
+
const fetchLimit = candidateLimit * 3;
|
|
130
142
|
let sql = `
|
|
131
143
|
SELECT id, name, type, category, path, description, content, CAST(meta AS CHAR) AS meta, usage_count, created_at, embedding
|
|
132
144
|
FROM ${env.mysqlSymbolsTable}
|
|
133
145
|
WHERE embedding IS NOT NULL
|
|
146
|
+
AND status = ${SEARCHABLE_STATUS}
|
|
134
147
|
`;
|
|
135
148
|
const params = [];
|
|
136
149
|
if (type) {
|
|
137
150
|
sql += ' AND type = ?';
|
|
138
151
|
params.push(type);
|
|
139
152
|
}
|
|
140
|
-
sql += '
|
|
141
|
-
params.push(
|
|
153
|
+
sql += ' DESC LIMIT ?';
|
|
154
|
+
params.push(fetchLimit);
|
|
142
155
|
const [rows] = await this.pool.query(sql, params);
|
|
143
156
|
const withVec = rows
|
|
144
157
|
.map((r) => mapRow(r, { includeEmbedding: true }))
|
|
145
158
|
.filter((s) => s.embedding && s.embedding.length === queryVec.length);
|
|
146
|
-
|
|
159
|
+
// 分桶采样:按 category + path 两层分桶
|
|
160
|
+
const sampled = this.bucketSampling(withVec, candidateLimit);
|
|
161
|
+
return sampled
|
|
147
162
|
.map((s) => {
|
|
148
163
|
const sim = cosineSimilarity(queryVec, s.embedding);
|
|
149
164
|
const { embedding: _, ...rest } = s;
|
|
150
165
|
return { symbol: rest, similarity: sim };
|
|
151
166
|
})
|
|
167
|
+
.filter((x) => x.similarity >= THREADHOLD_SIMILARITY_BEFORE_RANKED) // 初筛阈值,过滤掉明显不相关的结果
|
|
152
168
|
.sort((a, b) => b.similarity - a.similarity)
|
|
153
169
|
.slice(0, limit);
|
|
154
170
|
}
|
|
171
|
+
/**
|
|
172
|
+
* 分桶采样核心逻辑
|
|
173
|
+
* - 第一层:按 category 占比计算每个分类应采样条数(保底10条)
|
|
174
|
+
* - 第二层:每个 path 子桶内乱序后采样 Math.max(5,
|
|
175
|
+
floor(catLimit / pathCount)) 条
|
|
176
|
+
*/
|
|
177
|
+
bucketSampling(symbols, limit) {
|
|
178
|
+
if (symbols.length === 0)
|
|
179
|
+
return [];
|
|
180
|
+
// 按 category 分组
|
|
181
|
+
const categoryGroups = new Map();
|
|
182
|
+
for (const s of symbols) {
|
|
183
|
+
const cat = s.category ?? '__null__';
|
|
184
|
+
if (!categoryGroups.has(cat)) {
|
|
185
|
+
categoryGroups.set(cat, []);
|
|
186
|
+
}
|
|
187
|
+
categoryGroups.get(cat).push(s);
|
|
188
|
+
}
|
|
189
|
+
const total = symbols.length;
|
|
190
|
+
const sampled = [];
|
|
191
|
+
// 第一层:按 category 占比计算采样数,保底10条
|
|
192
|
+
for (const [, catSymbols] of categoryGroups) {
|
|
193
|
+
const catCount = catSymbols.length;
|
|
194
|
+
const catRatio = catCount / total;
|
|
195
|
+
const catLimit = Math.max(10, Math.floor(limit * catRatio));
|
|
196
|
+
// 按 path 分组(提取目录部分)
|
|
197
|
+
const pathGroups = new Map();
|
|
198
|
+
for (const s of catSymbols) {
|
|
199
|
+
const dir = s.path.includes('/')
|
|
200
|
+
? s.path.slice(0, s.path.lastIndexOf('/'))
|
|
201
|
+
: '__root__';
|
|
202
|
+
if (!pathGroups.has(dir)) {
|
|
203
|
+
pathGroups.set(dir, []);
|
|
204
|
+
}
|
|
205
|
+
pathGroups.get(dir).push(s);
|
|
206
|
+
}
|
|
207
|
+
const pathCount = pathGroups.size;
|
|
208
|
+
const perPathSample = Math.max(5, Math.floor(catLimit / pathCount));
|
|
209
|
+
// 第二层:每个 path 子桶内乱序后采样
|
|
210
|
+
for (const pathSymbols of pathGroups.values()) {
|
|
211
|
+
// 原地乱序(Fisher- Y ates)
|
|
212
|
+
for (let i = pathSymbols.length - 1; i > 0; i--) {
|
|
213
|
+
const j = Math.floor(Math.random() * (i + 1));
|
|
214
|
+
[pathSymbols[i], pathSymbols[j]] = [
|
|
215
|
+
pathSymbols[j],
|
|
216
|
+
pathSymbols[i],
|
|
217
|
+
];
|
|
218
|
+
}
|
|
219
|
+
const pathSampleCount = Math.min(perPathSample, pathSymbols.length);
|
|
220
|
+
sampled.push(...pathSymbols.slice(0, pathSampleCount));
|
|
221
|
+
if (sampled.length >= limit)
|
|
222
|
+
break;
|
|
223
|
+
}
|
|
224
|
+
if (sampled.length >= limit)
|
|
225
|
+
break;
|
|
226
|
+
}
|
|
227
|
+
return sampled.slice(0, limit);
|
|
228
|
+
}
|
|
155
229
|
async getByName(name) {
|
|
156
230
|
if (!this.pool) {
|
|
157
231
|
return (inMemorySymbols.find((s) => s.name.toLowerCase() === name.toLowerCase()) ?? null);
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* BullMQ embedding 队列 producer。
|
|
3
|
+
*
|
|
4
|
+
* 设计要点:
|
|
5
|
+
* - jobId = semanticHash → 相同语义模板自动去重,N 个符号相同 hash 只入队一次
|
|
6
|
+
* - CI 流程只负责 enqueue,worker 异步消费,CI 不阻塞
|
|
7
|
+
* - 调用方在进程退出前需调用 closeEmbeddingQueue() 释放连接
|
|
8
|
+
*/
|
|
9
|
+
import { Queue } from 'bullmq';
|
|
10
|
+
import Redis from 'ioredis';
|
|
11
|
+
import { env } from '../config/env.js';
|
|
12
|
+
let _queue = null;
|
|
13
|
+
let _connection = null;
|
|
14
|
+
/**
 * Returns the module-level `embedding` queue, creating it together with its
 * Redis connection on first use. Later calls return the same instance.
 *
 * @returns {Queue} The shared queue.
 */
function getQueue() {
    if (_queue) {
        return _queue;
    }
    const redisOptions = {
        maxRetriesPerRequest: null, // BullMQ required
        enableReadyCheck: false,
    };
    _connection = new Redis(env.redisUrl, redisOptions);
    _queue = new Queue('embedding', { connection: _connection });
    return _queue;
}
|
|
24
|
+
/**
 * Enqueues the embedding job for one semantic hash.
 *
 * The hash doubles as the job id, so it serves as the de-duplication key for
 * the queue. The job is configured with 5 attempts and exponential backoff
 * starting at 5 seconds.
 *
 * @param {string} semanticHash - Hash identifying the semantic template to embed.
 * @returns {Promise<void>}
 */
export async function enqueueEmbedding(semanticHash) {
    const queue = getQueue();
    const payload = { semanticHash };
    const jobOptions = {
        jobId: semanticHash, // de-duplication key: same hash, same job
        attempts: 5,
        backoff: { type: 'exponential', delay: 5_000 },
    };
    await queue.add('embed', payload, jobOptions);
}
|
|
35
|
+
/**
 * Enqueues embedding jobs for many semantic hashes in one call; intended for
 * full-scan runs.
 *
 * Each job uses its hash as the job id (the de-duplication key), 5 attempts
 * and exponential backoff starting at 5 seconds. An empty or missing list
 * returns immediately without creating the queue or its Redis connection.
 *
 * @param {string[]} semanticHashes - Hashes to enqueue.
 * @returns {Promise<void>}
 */
export async function enqueueEmbeddingBatch(semanticHashes) {
    if (!semanticHashes?.length) {
        return;
    }
    const jobs = semanticHashes.map((hash) => ({
        name: 'embed',
        data: { semanticHash: hash },
        opts: {
            jobId: hash,
            attempts: 5,
            backoff: { type: 'exponential', delay: 5_000 },
        },
    }));
    // Queue#addBulk is the BullMQ bulk-insert API.
    await getQueue().addBulk(jobs);
}
|
|
50
|
+
/**
 * Closes the shared queue and its Redis connection. CI scripts must call this
 * before exiting, otherwise the open connection keeps the process alive.
 *
 * The module state is cleared before any await, so a later getQueue() call
 * always builds a fresh queue. The Redis connection is quit even when closing
 * the queue fails; the failure is still propagated to the caller. Calling this
 * when nothing was opened is a no-op.
 *
 * @returns {Promise<void>}
 */
export async function closeEmbeddingQueue() {
    const queue = _queue;
    const connection = _connection;
    _queue = null;
    _connection = null;
    try {
        await queue?.close();
    }
    finally {
        // Always release the socket, even if queue.close() rejected.
        await connection?.quit();
    }
}
|
package/dist/services/reindex.js
CHANGED
|
@@ -4,6 +4,7 @@ import { getMySqlPool } from '../db/mysql.js';
|
|
|
4
4
|
import { indexedRowToEmbedText } from '../indexer/embedText.js';
|
|
5
5
|
import { indexProject } from '../indexer/indexProject.js';
|
|
6
6
|
import { upsertSymbols } from '../indexer/persistSymbols.js';
|
|
7
|
+
import { initCategoryEmbeddings, resolveCategory, } from '../indexer/categoryClassifier.js';
|
|
7
8
|
import { createEmbeddingClient, embedAll, } from '../services/embeddingClient.js';
|
|
8
9
|
export async function runReindex(options = {}) {
|
|
9
10
|
const projectRoot = resolve(options.projectRoot ?? process.cwd());
|
|
@@ -12,23 +13,20 @@ export async function runReindex(options = {}) {
|
|
|
12
13
|
loadProjectDotenv(projectRoot);
|
|
13
14
|
// 2️ 打印生效的环境变量(便于调试)
|
|
14
15
|
console.error(`[reindex] projectRoot=${projectRoot}, dryRun=${dryRun}, ` +
|
|
15
|
-
`MYSQL_ENABLED=${process.env.MYSQL_ENABLED}, ` +
|
|
16
16
|
`MYSQL_HOST=${process.env.MYSQL_HOST}`);
|
|
17
17
|
// 3️⃣ 只有需要写入数据库时才检查 MySQL 并建立连接
|
|
18
|
-
// 注意:直接检查 process.env,因为 env.mysqlEnabled 是模块加载时计算的,不会反映 loadProjectDotenv 的更新
|
|
19
|
-
const mysqlEnabled = process.env.MYSQL_ENABLED === 'true';
|
|
20
18
|
const embeddingServiceUrl = process.env.EMBEDDING_SERVICE_URL;
|
|
19
|
+
if (!dryRun && embeddingServiceUrl) {
|
|
20
|
+
// 初始化 category embeddings
|
|
21
|
+
await initCategoryEmbeddings();
|
|
22
|
+
}
|
|
21
23
|
let pool = null;
|
|
22
24
|
if (!dryRun) {
|
|
23
|
-
if (!mysqlEnabled) {
|
|
24
|
-
throw new Error(`最新!${JSON.stringify(process.env)}执行 reindex 写入数据库需要 MYSQL_ENABLED=true。' +
|
|
25
|
-
'第三方项目可在 .env 中配置此变量(未配置则使用 MCP Server 本地配置)。`);
|
|
26
|
-
}
|
|
27
25
|
pool = getMySqlPool();
|
|
28
26
|
await pool.query('SELECT 1'); // 测试连接
|
|
29
27
|
console.error('[reindex] MySQL connection successful');
|
|
30
28
|
}
|
|
31
|
-
|
|
29
|
+
let rows = await indexProject({
|
|
32
30
|
projectRoot,
|
|
33
31
|
globPatterns: options.globPatterns,
|
|
34
32
|
ignore: options.ignore,
|
|
@@ -39,8 +37,12 @@ export async function runReindex(options = {}) {
|
|
|
39
37
|
if (!options.dryRun && rows.length > 0 && embeddingServiceUrl) {
|
|
40
38
|
try {
|
|
41
39
|
const client = createEmbeddingClient(embeddingServiceUrl);
|
|
42
|
-
|
|
40
|
+
// 先实现ts语义模板,js保留原逻辑
|
|
41
|
+
const texts = rows.map((row) => row.semantic_hash ?? indexedRowToEmbedText(row));
|
|
43
42
|
const vecs = await embedAll(client, texts);
|
|
43
|
+
console.error('==vecs', vecs?.length);
|
|
44
|
+
// 生成category
|
|
45
|
+
rows = await resolveCategory(rows, vecs);
|
|
44
46
|
embeddingPayload = vecs;
|
|
45
47
|
embeddingsComputed = true;
|
|
46
48
|
}
|
|
@@ -52,6 +54,7 @@ export async function runReindex(options = {}) {
|
|
|
52
54
|
if (!options.dryRun) {
|
|
53
55
|
await upsertSymbols(pool, rows, embeddingPayload);
|
|
54
56
|
}
|
|
57
|
+
console.error('===out', JSON.stringify(rows));
|
|
55
58
|
return {
|
|
56
59
|
projectRoot,
|
|
57
60
|
extractedCount: rows.length,
|
|
@@ -3,7 +3,9 @@ import { z } from 'zod';
|
|
|
3
3
|
import { rankSymbols } from '../services/ranking.js';
|
|
4
4
|
export const searchByStructureInput = z.object({
|
|
5
5
|
fields: z.array(z.string().min(1)).min(1),
|
|
6
|
-
type: z
|
|
6
|
+
type: z
|
|
7
|
+
.enum(['component', 'function', 'hook', 'type', 'interface', 'class'])
|
|
8
|
+
.optional(),
|
|
7
9
|
category: z.string().optional(),
|
|
8
10
|
limit: z.number().int().min(1).max(100).optional().default(20),
|
|
9
11
|
ranked: z.boolean().optional().default(true),
|