@lorrylurui/code-intelligence-mcp 2.0.5 → 2.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/config/env.js +9 -0
- package/dist/config/tuning.js +114 -0
- package/dist/db/schema.js +37 -0
- package/dist/indexer/chunkText.js +164 -0
- package/dist/indexer/embedText.js +2 -2
- package/dist/repositories/chunkRepository.js +181 -0
- package/dist/repositories/symbolRepository.js +4 -5
- package/dist/server/createServer.js +5 -1
- package/dist/services/contextAssembler.js +150 -0
- package/dist/services/ranking.js +37 -39
- package/dist/services/recommendationService.js +325 -104
- package/dist/tools/queryDocs.js +113 -0
- package/dist/tools/searchSymbols.js +3 -2
- package/dist/types/chunk.js +1 -0
- package/package.json +1 -1
package/dist/config/env.js
CHANGED
|
@@ -60,6 +60,15 @@ export const env = {
|
|
|
60
60
|
get symbolsTable() {
|
|
61
61
|
return process.env.SYMBOLS_TABLE ?? 'symbols';
|
|
62
62
|
},
|
|
63
|
+
/**
|
|
64
|
+
* 文档 chunks 表名。
|
|
65
|
+
* 默认跟随 SYMBOLS_TABLE 派生,例如:symbols -> symbols_chunks,repo_a_symbols -> repo_a_symbols_chunks。
|
|
66
|
+
* 如需显式覆盖,仍可单独传入 CHUNKS_TABLE。
|
|
67
|
+
*/
|
|
68
|
+
get chunksTable() {
|
|
69
|
+
const symbolsTable = process.env.SYMBOLS_TABLE ?? 'symbols';
|
|
70
|
+
return process.env.CHUNKS_TABLE ?? `${symbolsTable}_chunks`;
|
|
71
|
+
},
|
|
63
72
|
/** Python FastAPI 嵌入服务根 URL,如 http://127.0.0.1:8765 */
|
|
64
73
|
get embeddingServiceUrl() {
|
|
65
74
|
return (process.env.EMBEDDING_SERVICE_URL ?? '').trim();
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
/**
 * tuning.ts — central configuration for every tunable parameter.
 *
 * Thresholds, weights and limits that need repeated adjustment in production are defined
 * here; business code must not hard-code magic numbers.
 *
 * Object and array exports are frozen so that a consumer cannot accidentally mutate the
 * shared configuration at runtime (ES modules are strict, so a write attempt throws).
 */
// ─── Chunk splitting (chunkText.ts) ──────────────────────────────────────────
/** Target chunk size in characters: once reached, the current chunk is closed at the next boundary. */
export const CHUNK_TARGET_CHARS = 900;
/** Maximum size of a single chunk in characters: anything larger must be split again. */
export const CHUNK_MAX_CHARS = 1200;
/** Number of characters shared between neighbouring chunks, to reduce information loss at boundaries. */
export const CHUNK_OVERLAP_CHARS = 120;
/** Minimum relative position of a sentence/newline boundary; earlier boundaries are ignored to avoid very short chunks. */
export const CHUNK_SENTENCE_BREAK_MIN_RATIO = 0.6;
/** Maximum length of a chunk summary (used for display and as auxiliary embedding text). */
export const CHUNK_SUMMARY_MAX_CHARS = 160;
// ─── Chunk semantic search (chunkRepository.ts) ──────────────────────────────
/** Minimum cosine similarity: chunks below this value are not returned to the caller. */
export const CHUNK_SIMILARITY_THRESHOLD = 0;
/** Default upper bound on the number of chunks returned by a semantic search. */
export const CHUNK_TOP_K = 8;
// ─── Symbol semantic search (symbolRepository.ts) ────────────────────────────
/** Minimum cosine similarity: symbols below this value are not returned to the caller. */
export const SYMBOL_SIMILARITY_THRESHOLD = 0;
/** Default upper bound on the number of symbols returned by a semantic search. */
export const SYMBOL_TOP_K = 20;
// ─── Embedding text truncation (embedText.ts) ────────────────────────────────
/** Maximum number of source characters sent to the embedding service; the rest is cut off. */
export const EMBED_MAX_CONTENT_LENGTH = 1200;
// ─── Ranking weights (ranking.ts) ────────────────────────────────────────────
/** Weights of the four ranking dimensions; they must sum to 1. */
export const RANK_WEIGHTS = Object.freeze({
    textMatch: 0.5,
    usage: 0.3,
    recency: 0.1,
    commonPath: 0.1,
});
/** Score added for each matched callee name. */
export const CALLEE_MATCH_SCORE_PER_MATCH = 0.05;
/** Upper bound of the callee-match score (keeps many callee matches from dominating the total). */
export const CALLEE_MATCH_SCORE_MAX = 0.2;
/**
 * Token-overlap scoring tiers (evaluated in order; the first tier whose conditions hold wins).
 * - minMatches: minimum number of query tokens that must be hit
 * - minRatio: lower bound of the hit ratio
 * - score: resulting text-match score
 */
export const TOKEN_OVERLAP_TIERS = Object.freeze([
    { minMatches: 4, minRatio: 0.45, score: 0.78 },
    { minMatches: 3, minRatio: 0.3, score: 0.68 },
    { minMatches: 2, minRatio: 0.18, score: 0.56 },
].map((tier) => Object.freeze(tier)));
/** Fixed text-match scores for match kinds other than token overlap. */
export const TEXT_MATCH_SCORES = Object.freeze({
    nameContains: 0.85,
    descriptionContains: 0.65,
    weak: 0.2,
});
/**
 * Recency scoring tiers (evaluated in ascending maxDays order; the first matching tier wins).
 * - maxDays: upper bound of the age in days since creation
 * - score: resulting recency score
 */
export const RECENCY_SCORE_TIERS = Object.freeze([
    { maxDays: 7, score: 1.0 },
    { maxDays: 30, score: 0.8 },
    { maxDays: 90, score: 0.6 },
    { maxDays: 180, score: 0.4 },
].map((tier) => Object.freeze(tier)));
/** Recency score used when createdAt is missing. */
export const RECENCY_SCORE_DEFAULT = 0.4;
/** Lowest recency score, used when createdAt is older than every tier (> 180 days). */
export const RECENCY_SCORE_OLDEST = 0.25;
/** Path-dimension score for symbols whose path contains /common/ or /shared/. */
export const COMMON_PATH_SCORE_YES = 1;
/** Path-dimension score for symbols outside common/shared directories. */
export const COMMON_PATH_SCORE_NO = 0.35;
/** Divisor applied to the logarithm in the usage-frequency score (larger value = flatter curve). */
export const USAGE_SCORE_LOG_DIVISOR = 3;
/** Above this semantic similarity the "high semantic similarity" reason label is emitted. */
export const SEMANTIC_REASON_THRESHOLD_HIGH = 0.55;
/** Above this semantic similarity the "semantically related" reason label is emitted. */
export const SEMANTIC_REASON_THRESHOLD_MED = 0.4;
/** Above this usage score the "frequently used" reason label is emitted. */
export const USAGE_REASON_THRESHOLD_HIGH = 0.6;
// ─── Recommendation quality gates (recommendationService.ts) ─────────────────
/** Minimum combined score a candidate needs to pass the quality gate. */
export const MIN_RECOMMENDATION_SCORE = Object.freeze({
    semantic: 0.5,
    keyword: 0.45,
});
/** Lower bound of the text-dimension score in semantic mode (used for the high-confidence path). */
export const MIN_SEMANTIC_TEXT_MATCH_SCORE = 0.6;
/** Relaxed lower bound of the combined score when the name/path matches literally. */
export const MIN_LITERAL_MATCH_SCORE = 0.18;
/** Relaxed lower bound of the combined score when all props/hooks structure fields are hit. */
export const REQUIRED_FIELD_FALLBACK_MIN_SCORE = 0.4;
/** Amount added to the priority score on a literal hit (name or file name matches a query term). */
export const LITERAL_MATCH_PRIORITY_BOOST = 0.22;
/** Amount subtracted from the priority score for demo/example style paths. */
export const DEMO_PATH_PRIORITY_PENALTY = 0.18;
// ─── Search tool result filtering (tools/searchSymbols.ts) ───────────────────
/** Minimum combined score required for a result to be returned. */
export const SEARCH_SCORE_THRESHOLD = 0.45;
/** Upper bound on the number of symbols finally returned. */
export const SEARCH_TOP_K = 20;
// ─── RAG context assembly (services/contextAssembler.ts) ─────────────────────
/** Number of neighbouring chunks added before and after each hit (reduces boundary truncation loss). */
export const CONTEXT_ADJACENT_RADIUS = 1;
/** Total character budget of the context injected into the prompt (lowest-similarity chunks are cut first). */
export const CONTEXT_MAX_CHARS = 6000;
/** Maximum number of chunks kept after neighbour expansion. */
export const CONTEXT_MAX_CHUNKS = 12;
|
package/dist/db/schema.js
CHANGED
|
@@ -49,3 +49,40 @@ export function getAllTableSQLs() {
|
|
|
49
49
|
...getSymbolsIndexSQLs(),
|
|
50
50
|
];
|
|
51
51
|
}
|
|
52
|
+
/**
 * Build the `CREATE TABLE IF NOT EXISTS` statement for the document-chunks table.
 *
 * The table name comes from `env.chunksTable` and is also embedded in the name of the
 * unique constraint on (path, chunk_index).
 *
 * @returns {string} A single SQL DDL statement.
 */
export function getChunksTableSQL() {
    const table = env.chunksTable;
    const defaultStatus = DEFAULT_STATUS_ON_UPSERT;
    return `CREATE TABLE IF NOT EXISTS ${table} (
    id SERIAL PRIMARY KEY,
    source_id VARCHAR(255),
    title TEXT NOT NULL,
    path TEXT NOT NULL,
    chunk_index INT NOT NULL,
    chunk_count INT NOT NULL,
    content TEXT NOT NULL,
    summary TEXT,
    category VARCHAR(255),
    meta JSONB,
    embedding vector(384),
    semantic_hash VARCHAR(64) NOT NULL,
    status SMALLINT NOT NULL DEFAULT ${defaultStatus},
    created_at TIMESTAMP NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMP NOT NULL DEFAULT NOW(),
    CONSTRAINT uk_${table}_path_chunk UNIQUE (path, chunk_index)
)`;
}
|
|
73
|
+
/**
 * Build the `CREATE INDEX IF NOT EXISTS` statements for the chunks table.
 *
 * One single-column index is produced for each of source_id, semantic_hash, status and path,
 * in that order.
 *
 * @returns {string[]} SQL DDL statements, one per index.
 */
export function getChunksIndexSQLs() {
    const table = env.chunksTable;
    const indexedColumns = ['source_id', 'semantic_hash', 'status', 'path'];
    return indexedColumns.map((column) => `CREATE INDEX IF NOT EXISTS idx_${table}_${column} ON ${table}(${column})`);
}
|
|
82
|
+
/**
 * Collect every DDL statement needed for the chunks table, in execution order:
 * the extension statement from getEnsureExtensionSQL(), the table itself, then its indexes.
 *
 * @returns {string[]} SQL statements to run sequentially.
 */
export function getAllChunkTableSQLs() {
    const statements = [getEnsureExtensionSQL(), getChunksTableSQL()];
    statements.push(...getChunksIndexSQLs());
    return statements;
}
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
import { createHash } from 'node:crypto';
|
|
2
|
+
import { CHUNK_MAX_CHARS, CHUNK_OVERLAP_CHARS, CHUNK_SENTENCE_BREAK_MIN_RATIO, CHUNK_SUMMARY_MAX_CHARS, CHUNK_TARGET_CHARS, } from '../config/tuning.js';
|
|
3
|
+
/**
 * Normalize raw document text before it is split.
 *
 * Converts Windows (`\r\n`) and bare carriage-return (`\r`) line endings to `\n` and strips
 * leading/trailing whitespace, so that line-ending differences do not influence splitting.
 *
 * @param {string} content Raw document text.
 * @returns {string} Text with `\n` line endings and no surrounding whitespace.
 */
function normalizeText(content) {
    // `\r\n?` also covers a lone `\r`, which would otherwise keep the whole text on one "line".
    return content.replace(/\r\n?/g, '\n').trim();
}
|
|
7
|
+
/**
 * Stage 1: split Markdown-style text into structural blocks.
 *
 * Produces three kinds of block:
 * - `heading`: a single line whose trimmed text starts with `#`
 * - `code`: a fenced code block, including its opening and closing ``` lines
 * - `paragraph`: consecutive non-blank lines between the other two kinds
 *
 * A code fence that is still open at the end of the input is emitted as a `code` block.
 *
 * @param {string} content Raw document text.
 * @returns {{ kind: 'heading' | 'paragraph' | 'code', text: string }[]} Blocks in document order.
 */
function splitCodeAwareBlocks(content) {
    const normalized = normalizeText(content);
    if (!normalized)
        return [];
    const blocks = [];
    let buffer = [];
    // Tracks whether we are inside a fenced code block, so code lines are never treated as prose.
    let inCodeFence = false;
    // Emit the buffered lines as one block of the given kind and reset the buffer.
    const flushBuffer = (kind) => {
        if (buffer.length === 0)
            return;
        const text = buffer.join('\n').trim();
        buffer = [];
        if (text)
            blocks.push({ kind, text });
    };
    for (const line of normalized.split('\n')) {
        const trimmed = line.trim();
        if (trimmed.startsWith('```')) {
            if (inCodeFence) {
                // Closing fence: keep it as part of the code block.
                buffer.push(line);
                flushBuffer('code');
                inCodeFence = false;
            }
            else {
                // Opening fence: close the pending paragraph first.
                flushBuffer('paragraph');
                inCodeFence = true;
                buffer.push(line);
            }
            continue;
        }
        if (inCodeFence) {
            buffer.push(line);
            continue;
        }
        if (trimmed.startsWith('#')) {
            flushBuffer('paragraph');
            blocks.push({ kind: 'heading', text: trimmed });
            continue;
        }
        if (!trimmed) {
            // A blank line ends the current paragraph.
            flushBuffer('paragraph');
            continue;
        }
        buffer.push(line);
    }
    // An unterminated fence runs to the end of the document and is still code.
    flushBuffer(inCodeFence ? 'code' : 'paragraph');
    return blocks;
}
|
|
60
|
+
/**
 * Stage 2: cut one oversized block into windows of at most `maxChars` characters.
 *
 * Each window prefers to end at the last sentence or line boundary (`. `, `。`, `! `, `? `
 * or a newline) inside it, provided that boundary lies beyond `maxChars * minBreakRatio`;
 * otherwise the window is cut at `maxChars`. The next window starts `overlapChars` before
 * the end of the previous one, always advancing by at least one character.
 *
 * Cut positions never fall between the two halves of a UTF-16 surrogate pair, so windows do
 * not begin or end with half of an astral character such as an emoji.
 *
 * @param {string} text Block text to cut.
 * @param {number} maxChars Maximum window length in UTF-16 code units.
 * @param {number} overlapChars Number of characters shared by consecutive windows.
 * @param {number} [minBreakRatio] Minimum relative position of an accepted natural boundary;
 *   defaults to CHUNK_SENTENCE_BREAK_MIN_RATIO.
 * @returns {string[]} Trimmed, non-empty windows in order.
 */
function sliceWithOverlap(text, maxChars, overlapChars, minBreakRatio = CHUNK_SENTENCE_BREAK_MIN_RATIO) {
    const normalized = text.trim();
    if (!normalized)
        return [];
    if (normalized.length <= maxChars)
        return [normalized];
    const isHighSurrogate = (index) => {
        const code = normalized.charCodeAt(index);
        return code >= 0xd800 && code <= 0xdbff;
    };
    const isLowSurrogate = (index) => {
        const code = normalized.charCodeAt(index);
        return code >= 0xdc00 && code <= 0xdfff;
    };
    // A boundary this early would produce too short a chunk, so it is ignored.
    const minBreakIndex = Math.floor(maxChars * minBreakRatio);
    const out = [];
    let start = 0;
    while (start < normalized.length) {
        let end = Math.min(start + maxChars, normalized.length);
        if (end < normalized.length) {
            const window = normalized.slice(start, end);
            // Prefer a natural sentence/line boundary so a sentence is not cut in half.
            const sentenceBreak = Math.max(window.lastIndexOf('. '), window.lastIndexOf('。'), window.lastIndexOf('! '), window.lastIndexOf('? '), window.lastIndexOf('\n'));
            if (sentenceBreak > minBreakIndex) {
                end = start + sentenceBreak + 1;
            }
            else if (end - start > 1 && isHighSurrogate(end - 1)) {
                // A hard cut would separate a surrogate pair; end one unit earlier.
                end -= 1;
            }
        }
        out.push(normalized.slice(start, end).trim());
        if (end >= normalized.length)
            break;
        let next = Math.max(end - overlapChars, start + 1);
        if (next < end && isLowSurrogate(next) && isHighSurrogate(next - 1)) {
            // Do not start the overlap on the second half of a surrogate pair.
            next += 1;
        }
        start = next;
    }
    return out.filter(Boolean);
}
|
|
88
|
+
/**
 * Close the blocks accumulated so far into one chunk and compute the overlap to carry over.
 *
 * The blocks are joined with a blank line, trimmed and appended to `chunks`. The last
 * `overlapChars` characters of that chunk are returned as the seed of the next chunk, so
 * that context at the boundary is not lost.
 *
 * @param {string[]} chunks Output list; the finished chunk is pushed onto it (mutated).
 * @param {string[]} currentBlocks Block texts collected for the chunk being closed.
 * @param {number} overlapChars Number of trailing characters to carry over; `<= 0` disables overlap.
 * @returns {string[]} `[tail]` when there is overlap text to carry over, otherwise `[]`.
 */
function finalizeChunk(chunks, currentBlocks, overlapChars) {
    if (currentBlocks.length === 0)
        return [];
    const chunk = currentBlocks.join('\n\n').trim();
    if (!chunk)
        return [];
    chunks.push(chunk);
    if (overlapChars <= 0)
        return [];
    let tail = chunk.slice(-overlapChars);
    const firstCode = tail.charCodeAt(0);
    if (tail.length < chunk.length && firstCode >= 0xdc00 && firstCode <= 0xdfff) {
        // The cut landed inside a surrogate pair; drop the orphaned second half.
        tail = tail.slice(1);
    }
    tail = tail.trim();
    return tail ? [tail] : [];
}
|
|
102
|
+
/**
 * Public entry point: split a document into overlapping text chunks.
 *
 * 1. Structural split: the text is divided into heading / paragraph / code blocks.
 * 2. Natural-boundary split: a block longer than `maxChars` is cut into windows by
 *    sliceWithOverlap().
 * 3. Packing: parts are accumulated until the chunk has reached `targetChars` or the next
 *    part would push it past `maxChars`; the chunk is then closed and the next one starts
 *    with the tail of the previous chunk as overlap.
 *
 * Two rules keep the overlap from inflating chunks:
 * - a continuation window of an oversized block already begins with the overlap produced by
 *   sliceWithOverlap(), so no additional tail is prepended to it;
 * - otherwise the carried tail is shortened (or dropped) so that tail + separator + part
 *   stays within `maxChars`.
 *
 * @param {string} content Raw document text.
 * @param {{ targetChars?: number, maxChars?: number, overlapChars?: number }} [options]
 *   Per-call overrides; each falls back to the matching CHUNK_* tuning constant.
 * @returns {string[]} Chunk texts in document order; blocks inside a chunk are separated by a blank line.
 */
export function splitTextIntoChunks(content, options = {}) {
    const targetChars = options.targetChars ?? CHUNK_TARGET_CHARS;
    const maxChars = options.maxChars ?? CHUNK_MAX_CHARS;
    const overlapChars = options.overlapChars ?? CHUNK_OVERLAP_CHARS;
    const blocks = splitCodeAwareBlocks(content);
    if (blocks.length === 0)
        return [];
    const chunks = [];
    let currentBlocks = [];
    let currentLength = 0;
    for (const block of blocks) {
        const parts = block.text.length > maxChars
            ? sliceWithOverlap(block.text, maxChars, overlapChars)
            : [block.text];
        for (const [partIndex, part] of parts.entries()) {
            // Joining adds a two-character separator unless the chunk is still empty.
            const additionLength = currentLength === 0 ? part.length : part.length + 2;
            const wouldOverflowMax = currentLength > 0 && currentLength + additionLength > maxChars;
            const reachedTarget = currentLength >= targetChars;
            if (wouldOverflowMax || reachedTarget) {
                const [tail] = finalizeChunk(chunks, currentBlocks, overlapChars);
                currentBlocks = [];
                // partIndex > 0: this window already starts with the previous window's overlap.
                if (tail !== undefined && partIndex === 0) {
                    const budget = maxChars - part.length - 2;
                    const fitted = budget > 0 ? tail.slice(-budget).trim() : '';
                    if (fitted)
                        currentBlocks.push(fitted);
                }
            }
            currentBlocks.push(part);
            currentLength = currentBlocks.join('\n\n').length;
        }
    }
    // Flush whatever is left; no overlap is needed after the final chunk.
    finalizeChunk(chunks, currentBlocks, 0);
    return chunks;
}
|
|
136
|
+
/**
 * Build a short single-line summary of a chunk, used for display and as auxiliary
 * embedding text.
 *
 * All whitespace runs are collapsed to one space and the result is cut to `maxChars`
 * characters. The cut never leaves trailing whitespace or half of a UTF-16 surrogate pair.
 *
 * @param {string} content Chunk text.
 * @param {number} [maxChars] Maximum summary length; defaults to CHUNK_SUMMARY_MAX_CHARS.
 * @returns {string | null} The summary, or `null` when the chunk contains only whitespace.
 */
function buildChunkSummary(content, maxChars = CHUNK_SUMMARY_MAX_CHARS) {
    const flattened = content.replace(/\s+/g, ' ').trim();
    if (!flattened)
        return null;
    let summary = flattened.slice(0, maxChars);
    const lastCode = summary.charCodeAt(summary.length - 1);
    if (summary.length < flattened.length && lastCode >= 0xd800 && lastCode <= 0xdbff) {
        // The cut separated a surrogate pair; drop the orphaned first half.
        summary = summary.slice(0, -1);
    }
    return summary.trimEnd();
}
|
|
143
|
+
/**
 * Compute the hash that identifies one specific chunk.
 *
 * The digest covers the document path, the chunk index and the chunk content, so it changes
 * when any of the three changes.
 *
 * @param {string} path Document path.
 * @param {number} chunkIndex Zero-based position of the chunk inside the document.
 * @param {string} content Chunk text.
 * @returns {string} Hex-encoded SHA-256 digest.
 */
function computeChunkHash(path, chunkIndex, content) {
    const hasher = createHash('sha256');
    hasher.update(`${path}#${chunkIndex}\n${content}`);
    return hasher.digest('hex');
}
|
|
149
|
+
/**
 * Turn a raw document into chunk records that are ready to be stored.
 *
 * The document content is split with splitTextIntoChunks(); every resulting chunk is paired
 * with its index, the total chunk count, a summary and a semantic hash, plus the document's
 * own title, path, sourceId, category and metadata.
 *
 * @param {{ content: string, title: string, path: string, sourceId?: string | null,
 *   category?: string | null, metadata?: unknown }} document Source document.
 * @param {object} [options] Splitting overrides forwarded to splitTextIntoChunks().
 * @returns {object[]} One record per chunk, in document order.
 */
export function buildDocumentChunks(document, options = {}) {
    const pieces = splitTextIntoChunks(document.content, options);
    const total = pieces.length;
    const records = [];
    for (const [position, text] of pieces.entries()) {
        records.push({
            sourceId: document.sourceId ?? null,
            title: document.title,
            path: document.path,
            chunkIndex: position,
            chunkCount: total,
            content: text,
            summary: buildChunkSummary(text),
            category: document.category ?? null,
            meta: document.metadata ?? null,
            semanticHash: computeChunkHash(document.path, position, text),
        });
    }
    return records;
}
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
// 仅js类型使用,后续会删掉
|
|
2
|
-
|
|
2
|
+
import { EMBED_MAX_CONTENT_LENGTH } from '../config/tuning.js';
|
|
3
3
|
function briefMeta(meta) {
|
|
4
4
|
const keys = ['props', 'params', 'properties', 'hooks'];
|
|
5
5
|
const parts = [];
|
|
@@ -23,7 +23,7 @@ export function indexedRowToEmbedText(row) {
|
|
|
23
23
|
row.path,
|
|
24
24
|
row.description ?? '',
|
|
25
25
|
metaBit,
|
|
26
|
-
(row.content ?? '').slice(0,
|
|
26
|
+
(row.content ?? '').slice(0, EMBED_MAX_CONTENT_LENGTH),
|
|
27
27
|
]
|
|
28
28
|
.filter((s) => s.length > 0)
|
|
29
29
|
.join('\n');
|
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
import { env } from '../config/env.js';
|
|
2
|
+
import { CHUNK_SIMILARITY_THRESHOLD, CHUNK_TOP_K } from '../config/tuning.js';
|
|
3
|
+
import { SEARCHABLE_STATUS, SYMBOL_STATUS } from '../config/symbolStatus.js';
|
|
4
|
+
import { getPool } from '../db/postgres.js';
|
|
5
|
+
import { getAllChunkTableSQLs } from '../db/schema.js';
|
|
6
|
+
import { buildDocumentChunks } from '../indexer/chunkText.js';
|
|
7
|
+
import { createEmbeddingClient, embedAll, } from '../services/embeddingClient.js';
|
|
8
|
+
/**
 * Parse an embedding value into an array of numbers.
 *
 * Accepts either an array or a JSON string that decodes to one. Every element is converted
 * with Number(); if any element is not a finite number the whole value is rejected.
 *
 * @param {unknown} raw Value of the embedding column.
 * @returns {number[] | null} The numeric vector, or `null` when `raw` is missing or malformed.
 */
function parseEmbedding(raw) {
    if (typeof raw === 'string') {
        let decoded;
        try {
            decoded = JSON.parse(raw);
        }
        catch {
            return null;
        }
        return parseEmbedding(decoded);
    }
    if (!Array.isArray(raw))
        return null;
    const values = [];
    for (const item of raw) {
        const value = Number(item);
        // One bad element invalidates the whole vector.
        if (!Number.isFinite(value))
            return null;
        values.push(value);
    }
    return values;
}
|
|
27
|
+
/**
 * Map a database row (snake_case columns) to the chunk object used by the service layer.
 *
 * - `meta` may arrive as JSON text (the queries in this file select `meta::text`) or as an
 *   already-parsed value; both are accepted.
 * - `similarity` is only present on semantic-search rows. A value of exactly 0 is preserved;
 *   only a missing column maps to `undefined`.
 *
 * @param {object} row Row returned by the chunks-table queries.
 * @returns {object} Chunk with camelCase fields.
 */
function toStoredChunk(row) {
    let meta = null;
    if (row.meta) {
        meta = typeof row.meta === 'string' ? JSON.parse(row.meta) : row.meta;
    }
    return {
        id: row.id,
        sourceId: row.source_id,
        title: row.title,
        path: row.path,
        chunkIndex: row.chunk_index,
        chunkCount: row.chunk_count,
        content: row.content,
        summary: row.summary,
        category: row.category,
        meta,
        semanticHash: row.semantic_hash,
        embedding: parseEmbedding(row.embedding),
        // `== null` matches both null and undefined, so a similarity of 0 survives.
        similarity: row.similarity == null ? undefined : Number(row.similarity),
        createdAt: row.created_at ?? null,
        updatedAt: row.updated_at ?? null,
    };
}
|
|
47
|
+
/**
 * Build the text that is sent to the embedding service for one chunk.
 *
 * Title, path, summary and the full chunk content are joined with newlines; empty or
 * missing parts are left out.
 *
 * @param {{ title: string, path: string, summary?: string | null, content: string }} chunk
 * @returns {string} Newline-separated embedding input.
 */
function chunkToEmbeddingText(chunk) {
    const candidates = [chunk.title, chunk.path, chunk.summary ?? '', chunk.content];
    const sections = [];
    for (const section of candidates) {
        if (section) {
            sections.push(section);
        }
    }
    return sections.join('\n');
}
|
|
53
|
+
/**
 * Persistence and semantic search for document chunks.
 *
 * Every method is a no-op (returning nothing or an empty list) when getPool() yields no pool.
 */
export class ChunkRepository {
    pool;
    constructor() {
        this.pool = getPool();
    }
    /**
     * Make sure the chunks table and its indexes exist, so the write path can run standalone.
     * Runs every statement from getAllChunkTableSQLs() sequentially; it is invoked on each
     * upsertDocument() call.
     *
     * @returns {Promise<void>}
     */
    async ensureSchema() {
        if (!this.pool)
            return;
        for (const sql of getAllChunkTableSQLs()) {
            await this.pool.query(sql);
        }
    }
    /**
     * Replace all stored chunks of a document with freshly built ones.
     *
     * The document is split into chunks, embedded in one batch when an embedding service is
     * configured, and written inside a single transaction: existing rows for the same path
     * are deleted first so stale chunks cannot show up in later searches.
     *
     * @param {object} document Source document (content, title, path, ...).
     * @param {object} [options] Splitting overrides forwarded to buildDocumentChunks().
     * @returns {Promise<object[]>} The stored chunks; empty when there is no pool or the
     *   document yields no chunks (in which case nothing is deleted either).
     * @throws Rethrows any error raised while writing; the transaction is rolled back first.
     */
    async upsertDocument(document, options = {}) {
        if (!this.pool)
            return [];
        await this.ensureSchema();
        // Structural split first, then overlap: the chunk list of one document.
        const built = buildDocumentChunks(document, options);
        if (built.length === 0)
            return [];
        let embeddings = built.map(() => null);
        if (env.embeddingServiceUrl) {
            // Embed in one batch to save network round trips and API calls.
            const client = createEmbeddingClient(env.embeddingServiceUrl);
            embeddings = await embedAll(client, built.map(chunkToEmbeddingText));
        }
        const db = await this.pool.connect();
        // Set when ROLLBACK itself fails; such a connection must not go back into the pool.
        let connectionBroken = false;
        try {
            await db.query('BEGIN');
            // Delete the old version before writing the new one. DELETE is a no-op when no
            // rows match, so no existence check is needed.
            await db.query(`DELETE FROM ${env.chunksTable} WHERE path = $1`, [document.path]);
            const sql = `
                INSERT INTO ${env.chunksTable}
                (source_id, title, path, chunk_index, chunk_count, content, summary, category, meta,
                embedding, semantic_hash, status)
                VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9::jsonb, $10::vector, $11, $12)
                RETURNING id, source_id, title, path, chunk_index, chunk_count, content, summary, category,
                meta::text AS meta, embedding, semantic_hash, status, created_at, updated_at
            `;
            const inserted = [];
            for (let index = 0; index < built.length; index += 1) {
                const chunk = built[index];
                const embedding = embeddings[index];
                const vecLiteral = Array.isArray(embedding)
                    ? `[${embedding.join(',')}]`
                    : null;
                // Without a vector the row is stored as pending so a worker can embed it later.
                const { rows } = await db.query(sql, [
                    chunk.sourceId,
                    chunk.title,
                    chunk.path,
                    chunk.chunkIndex,
                    chunk.chunkCount,
                    chunk.content,
                    chunk.summary,
                    chunk.category,
                    JSON.stringify(chunk.meta),
                    vecLiteral,
                    chunk.semanticHash,
                    vecLiteral ? SYMBOL_STATUS.ONLINE : SYMBOL_STATUS.PENDING,
                ]);
                inserted.push(toStoredChunk(rows[0]));
            }
            await db.query('COMMIT');
            return inserted;
        }
        catch (error) {
            try {
                await db.query('ROLLBACK');
            }
            catch (rollbackError) {
                // Keep the original failure as the thrown error; only report the rollback problem.
                connectionBroken = true;
                console.error('[code-intelligence-mcp] chunkRepository.upsertDocument.rollbackFailed error=%s', String(rollbackError));
            }
            throw error;
        }
        finally {
            // A truthy argument tells the pool to destroy the client instead of reusing it.
            db.release(connectionBroken);
        }
    }
    /**
     * Find the chunks most similar to a free-text query.
     *
     * The query is embedded, rows are ordered in the database by vector distance to it, and
     * twice the requested number of rows is fetched so that filtering by
     * CHUNK_SIMILARITY_THRESHOLD still leaves up to `limit` results.
     *
     * @param {string} query Free-text query; trimmed before embedding.
     * @param {{ limit?: number, path?: string }} [opts] `limit` defaults to CHUNK_TOP_K;
     *   `path` restricts the search to one document.
     * @returns {Promise<object[]>} Matching chunks with a `similarity` field, best first.
     * @throws {Error} When EMBEDDING_SERVICE_URL is not configured or the query vector is empty.
     */
    async searchSemantic(query, opts) {
        if (!env.embeddingServiceUrl) {
            throw new Error('语义 chunk 检索需配置 EMBEDDING_SERVICE_URL');
        }
        if (!this.pool)
            return [];
        const limit = opts?.limit ?? CHUNK_TOP_K;
        const client = createEmbeddingClient(env.embeddingServiceUrl);
        // Embed the query first, then let the database rank by vector distance.
        const [queryVec] = await client.embed([query.trim()]);
        if (!queryVec?.length) {
            throw new Error('查询向量为空');
        }
        const params = [
            `[${queryVec.join(',')}]`,
            SEARCHABLE_STATUS,
        ];
        let sql = `
            SELECT id, source_id, title, path, chunk_index, chunk_count, content, summary, category,
            meta::text AS meta, embedding, semantic_hash, status, created_at, updated_at,
            1 - (embedding <=> $1::vector) AS similarity
            FROM ${env.chunksTable}
            WHERE embedding IS NOT NULL
            AND status = $2
        `;
        if (opts?.path) {
            params.push(opts.path);
            sql += ` AND path = $${params.length}`;
        }
        // Over-fetch so the threshold filter below can still fill `limit` results.
        params.push(limit * 2);
        sql += ` ORDER BY embedding <=> $1::vector LIMIT $${params.length}`;
        const { rows } = await this.pool.query(sql, params);
        return rows
            .map(toStoredChunk)
            .filter((chunk) => (chunk.similarity ?? 0) >= CHUNK_SIMILARITY_THRESHOLD)
            .slice(0, limit);
    }
    /**
     * Fetch a chunk together with its neighbours, to give answers a more complete context.
     *
     * @param {string} path Document path.
     * @param {number} chunkIndex Index of the hit chunk.
     * @param {number} [radius=1] Number of chunks to include on each side.
     * @returns {Promise<object[]>} Chunks with index in [chunkIndex - radius, chunkIndex + radius]
     *   (lower bound clamped to 0), ordered by chunk index. Rows are not filtered by status.
     */
    async getAdjacentChunks(path, chunkIndex, radius = 1) {
        if (!this.pool)
            return [];
        const { rows } = await this.pool.query(`
            SELECT id, source_id, title, path, chunk_index, chunk_count, content, summary, category,
            meta::text AS meta, embedding, semantic_hash, status, created_at, updated_at
            FROM ${env.chunksTable}
            WHERE path = $1 AND chunk_index BETWEEN $2 AND $3
            ORDER BY chunk_index ASC
        `, [path, Math.max(0, chunkIndex - radius), chunkIndex + radius]);
        return rows.map(toStoredChunk);
    }
}
|
|
@@ -1,9 +1,8 @@
|
|
|
1
1
|
import { env } from '../config/env.js';
|
|
2
2
|
import { getPool } from '../db/postgres.js';
|
|
3
|
+
import { SYMBOL_SIMILARITY_THRESHOLD, SYMBOL_TOP_K } from '../config/tuning.js';
|
|
3
4
|
import { createEmbeddingClient } from '../services/embeddingClient.js';
|
|
4
5
|
import { SEARCHABLE_STATUS } from '../config/symbolStatus.js';
|
|
5
|
-
const SIMILARITY_THRESHOLD = 0;
|
|
6
|
-
const TOP_K = 20;
|
|
7
6
|
const inMemorySymbols = [
|
|
8
7
|
{
|
|
9
8
|
id: 1,
|
|
@@ -188,7 +187,7 @@ export class SymbolRepository {
|
|
|
188
187
|
* 不再需要在 Node 拉取全量向量做内存计算。
|
|
189
188
|
*/
|
|
190
189
|
async searchSemanticHits(query, opts) {
|
|
191
|
-
console.error('[code-intelligence-mcp] repository.searchSemanticHits.start query=%s type=%s table=%s limit=%s threshold=%s searchableStatus=%s hasPool=%s', query, opts?.type ?? '', env.symbolsTable, String(opts?.limit ??
|
|
190
|
+
console.error('[code-intelligence-mcp] repository.searchSemanticHits.start query=%s type=%s table=%s limit=%s threshold=%s searchableStatus=%s hasPool=%s', query, opts?.type ?? '', env.symbolsTable, String(opts?.limit ?? SYMBOL_TOP_K), String(SYMBOL_SIMILARITY_THRESHOLD), String(SEARCHABLE_STATUS), String(Boolean(this.pool)));
|
|
192
191
|
if (!env.embeddingServiceUrl) {
|
|
193
192
|
console.error('[code-intelligence-mcp] repository.searchSemanticHits.error missingEmbeddingServiceUrl');
|
|
194
193
|
throw new Error('语义检索需配置 EMBEDDING_SERVICE_URL 并启动嵌入服务');
|
|
@@ -197,7 +196,7 @@ export class SymbolRepository {
|
|
|
197
196
|
console.error('[code-intelligence-mcp] repository.searchSemanticHits.noPool returnEmpty');
|
|
198
197
|
return [];
|
|
199
198
|
}
|
|
200
|
-
const limit = opts?.limit ??
|
|
199
|
+
const limit = opts?.limit ?? SYMBOL_TOP_K;
|
|
201
200
|
const client = createEmbeddingClient(env.embeddingServiceUrl);
|
|
202
201
|
const [queryVec] = await client.embed([query.trim()]);
|
|
203
202
|
if (!queryVec?.length) {
|
|
@@ -226,7 +225,7 @@ export class SymbolRepository {
|
|
|
226
225
|
symbol: mapRow(r),
|
|
227
226
|
similarity: Number(r.similarity),
|
|
228
227
|
}));
|
|
229
|
-
const passed = mapped.filter((x) => x.similarity >=
|
|
228
|
+
const passed = mapped.filter((x) => x.similarity >= SYMBOL_SIMILARITY_THRESHOLD);
|
|
230
229
|
console.error('[code-intelligence-mcp] repository.searchSemanticHits.db table=%s rawRows=%s passedThreshold=%s topRaw=%s', env.symbolsTable, String(rows.length), String(passed.length), JSON.stringify(mapped.slice(0, 5).map((x) => ({
|
|
231
230
|
id: x.symbol.id,
|
|
232
231
|
name: x.symbol.name,
|
|
@@ -8,6 +8,7 @@ import { createSearchByStructureTool } from '../tools/searchByStructure.js';
|
|
|
8
8
|
import { createIncUsageTool } from '../tools/incUsage.js';
|
|
9
9
|
import { RecommendationService } from '../services/recommendationService.js';
|
|
10
10
|
import { createRecommendComponentTool } from '../tools/recommendComponent.js';
|
|
11
|
+
import { createQueryDocsTool } from '../tools/queryDocs.js';
|
|
11
12
|
export function createServer() {
|
|
12
13
|
console.error('[code-intelligence-mcp] createServer.init');
|
|
13
14
|
const server = new McpServer({
|
|
@@ -39,6 +40,9 @@ export function createServer() {
|
|
|
39
40
|
console.error('[code-intelligence-mcp] tool.registered %s', recommendComponentTool.name);
|
|
40
41
|
registerReusableCodeAdvisorPrompt(server);
|
|
41
42
|
console.error('[code-intelligence-mcp] prompt.registered reusable-code-advisor');
|
|
42
|
-
|
|
43
|
+
const queryDocsTool = createQueryDocsTool();
|
|
44
|
+
server.tool(queryDocsTool.name, queryDocsTool.description, queryDocsTool.inputSchema, queryDocsTool.handler);
|
|
45
|
+
console.error('[code-intelligence-mcp] tool.registered %s', queryDocsTool.name);
|
|
46
|
+
console.error('[code-intelligence-mcp] createServer.ready toolCount=7 promptCount=1');
|
|
43
47
|
return server;
|
|
44
48
|
}
|