@lorrylurui/code-intelligence-mcp 2.0.4 → 2.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +41 -0
- package/dist/config/env.js +9 -0
- package/dist/config/tuning.js +114 -0
- package/dist/db/schema.js +37 -0
- package/dist/index.js +1 -0
- package/dist/indexer/babelParser.js +2 -1
- package/dist/indexer/chunkText.js +164 -0
- package/dist/indexer/embedText.js +2 -2
- package/dist/indexer/indexProject.js +193 -22
- package/dist/indexer/jsAstNormalizer.js +36 -6
- package/dist/prompts/reusableCodeAdvisorPrompt.js +63 -34
- package/dist/repositories/chunkRepository.js +181 -0
- package/dist/repositories/symbolRepository.js +108 -15
- package/dist/server/createServer.js +16 -0
- package/dist/services/contextAssembler.js +150 -0
- package/dist/services/ranking.js +109 -58
- package/dist/services/recommendationService.js +515 -46
- package/dist/services/reindex.js +25 -0
- package/dist/tools/getSymbolDetail.js +2 -1
- package/dist/tools/queryDocs.js +113 -0
- package/dist/tools/recommendComponent.js +86 -10
- package/dist/tools/searchByStructure.js +2 -1
- package/dist/tools/searchSymbols.js +57 -21
- package/dist/types/chunk.js +1 -0
- package/dist/workers/embeddingWorker.js +0 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -10,6 +10,14 @@
|
|
|
10
10
|
- Prompt: `reusable-code-advisor`
|
|
11
11
|
- Cursor Skill:`reusable-code-advisor`(`.cursor/skills/reusable-code-advisor/`,
|
|
12
12
|
|
|
13
|
+
## 开发
|
|
14
|
+
|
|
15
|
+
```
|
|
16
|
+
1. npm run dev:mcp  # 启动 MCP server
|
|
17
|
+
2. npm run embedding:dev  # 启动本地 Python 环境
|
|
18
|
+
3. npm run worker:embedding  # 启动 worker 队列
|
|
19
|
+
```
|
|
20
|
+
|
|
13
21
|
## 1) 配置mcp servers
|
|
14
22
|
|
|
15
23
|
```
|
|
@@ -38,6 +46,39 @@
|
|
|
38
46
|
MYSQL_SYMBOLS_TABLE=frontend_collections_symbols
|
|
39
47
|
INDEX_GLOB=xxx/\*\*/\*.{js,jsx,ts,tsx}
|
|
40
48
|
|
|
49
|
+
# 召回效果优化
|
|
50
|
+
|
|
51
|
+
针对本项目的场景,“召回优化”建议分 3 层来做,效果最好:
|
|
52
|
+
|
|
53
|
+
**1. 先把候选尽量捞全(Recall 层)**
|
|
54
|
+
|
|
55
|
+
1. 提高语义检索候选池:把语义 `topK` 从现在的小值提高到 `50~200`,不要在第一层就截断太早。
|
|
56
|
+
2. 语义 + 关键词并行召回:同时跑向量检索、名称/路径/描述关键词检索、结构字段检索(props/hooks/sideEffects),最后并集去重。
|
|
57
|
+
3. 做查询扩展:`affix` 自动扩成 `fixed/sticky/offsetTop/offsetBottom/固钉/吸顶/吸底/固定定位`,提升召回覆盖率。
|
|
58
|
+
4. 加别名词典:为组件建立别名(如 `Affix -> 固钉, 固定定位, sticky`),召回时强制并入。
|
|
59
|
+
5. 路径先验加白名单:`Components/`、`components/` 这类目录可加权,`demo/examples/pages` 不直接过滤,但降权。
|
|
60
|
+
|
|
61
|
+
**2. 再做精排(Ranking 层)**
|
|
62
|
+
|
|
63
|
+
1. 两阶段排序:第一阶段只看“能不能进池子”,第二阶段再做质量门槛。
|
|
64
|
+
2. 显式命中强加权:用户 query 出现明确词(如 `affix`)时,名称/文件名命中要有明显 boost。
|
|
65
|
+
3. 避免 usage 绑架:`usage_count` 只做弱特征,别让高 usage 的 demo/page 抢过真实组件。
|
|
66
|
+
4. 引入轻量 reranker:对 Top-50 用交叉编码器或规则+LLM 打分,可明显提升 Top-1 准确率。
|
|
67
|
+
|
|
68
|
+
**3. 建立评测闭环(Offline Eval)**
|
|
69
|
+
|
|
70
|
+
1. 做一套 50~200 条真实查询集(中英混合、别名、口语化)。
|
|
71
|
+
2. 每次改召回都跑:`Recall@10`、`Recall@50`、`MRR@10`、`nDCG@10`。
|
|
72
|
+
3. 对失败样本做“误杀分析”:是没召回、被质量门槛挡掉、还是排序被 demo 抢位。
|
|
73
|
+
|
|
74
|
+
本仓库当前最应优先做的 3 个点是:
|
|
75
|
+
|
|
76
|
+
1. 把第一阶段候选池再放大(至少 50+)。
|
|
77
|
+
2. 固化 query 扩展词典(Affix 这类高频组件先做)。
|
|
78
|
+
3. 把质量门槛后移,先召回后精排,减少“找到了但被早期过滤掉”。
|
|
79
|
+
|
|
80
|
+
后续可以补充一版“可落地参数表”(为每个阈值给出默认值与调参范围),便于直接做 A/B 测试。
|
|
81
|
+
|
|
41
82
|
# 待优化项
|
|
42
83
|
|
|
43
84
|
修复优先级:
|
package/dist/config/env.js
CHANGED
|
@@ -60,6 +60,15 @@ export const env = {
|
|
|
60
60
|
get symbolsTable() {
|
|
61
61
|
return process.env.SYMBOLS_TABLE ?? 'symbols';
|
|
62
62
|
},
|
|
63
|
+
/**
|
|
64
|
+
* 文档 chunks 表名。
|
|
65
|
+
* 默认跟随 SYMBOLS_TABLE 派生,例如:symbols -> symbols_chunks,repo_a_symbols -> repo_a_symbols_chunks。
|
|
66
|
+
* 如需显式覆盖,仍可单独传入 CHUNKS_TABLE。
|
|
67
|
+
*/
|
|
68
|
+
get chunksTable() {
|
|
69
|
+
const symbolsTable = process.env.SYMBOLS_TABLE ?? 'symbols';
|
|
70
|
+
return process.env.CHUNKS_TABLE ?? `${symbolsTable}_chunks`;
|
|
71
|
+
},
|
|
63
72
|
/** Python FastAPI 嵌入服务根 URL,如 http://127.0.0.1:8765 */
|
|
64
73
|
get embeddingServiceUrl() {
|
|
65
74
|
return (process.env.EMBEDDING_SERVICE_URL ?? '').trim();
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* tuning.ts — 所有可调参数的集中配置。
|
|
3
|
+
*
|
|
4
|
+
* 生产环境中需要多次微调的阈值、权重和限制值均在此定义,
|
|
5
|
+
* 禁止在业务代码里直接写魔法数字。
|
|
6
|
+
*/
|
|
7
|
+
// ─── Chunk 切分参数 (chunkText.ts) ───────────────────────────────────────────
|
|
8
|
+
/** 目标 chunk 字符数:达到此值后尽快在下一个边界处收敛当前块 */
|
|
9
|
+
export const CHUNK_TARGET_CHARS = 900;
|
|
10
|
+
/** 单个 chunk 最大字符数:超过此值必须做二次切分 */
|
|
11
|
+
export const CHUNK_MAX_CHARS = 1200;
|
|
12
|
+
/** 相邻 chunk 的重叠字符数:用于减少边界信息丢失 */
|
|
13
|
+
export const CHUNK_OVERLAP_CHARS = 120;
|
|
14
|
+
/** 句子/换行边界的最小位置比例:不足此比例则不回退到该边界,避免生成过短的 chunk */
|
|
15
|
+
export const CHUNK_SENTENCE_BREAK_MIN_RATIO = 0.6;
|
|
16
|
+
/** chunk 摘要的最大字符数(仅用于展示与 embedding 辅助信息) */
|
|
17
|
+
export const CHUNK_SUMMARY_MAX_CHARS = 160;
|
|
18
|
+
// ─── Chunk 语义检索参数 (chunkRepository.ts) ─────────────────────────────────
|
|
19
|
+
/** 最低 cosine 相似度:低于此值的 chunk 不返回给调用方 */
|
|
20
|
+
export const CHUNK_SIMILARITY_THRESHOLD = 0;
|
|
21
|
+
/** 语义检索默认返回的 chunk 数量上限 */
|
|
22
|
+
export const CHUNK_TOP_K = 8;
|
|
23
|
+
// ─── Symbol 语义检索参数 (symbolRepository.ts) ───────────────────────────────
|
|
24
|
+
/** 最低 cosine 相似度:低于此值的 symbol 不返回给调用方 */
|
|
25
|
+
export const SYMBOL_SIMILARITY_THRESHOLD = 0;
|
|
26
|
+
/** 语义检索默认返回的 symbol 数量上限 */
|
|
27
|
+
export const SYMBOL_TOP_K = 20;
|
|
28
|
+
// ─── Embedding 文本截断 (embedText.ts) ──────────────────────────────────────
|
|
29
|
+
/** 源码片段送入 embedding 前的最大字符数,超出部分截断 */
|
|
30
|
+
export const EMBED_MAX_CONTENT_LENGTH = 1200;
|
|
31
|
+
// ─── 排名权重 (ranking.ts) ────────────────────────────────────────────────────
|
|
32
|
+
/** 综合排名四维度权重,总和须为 1 */
|
|
33
|
+
export const RANK_WEIGHTS = {
|
|
34
|
+
textMatch: 0.5,
|
|
35
|
+
usage: 0.3,
|
|
36
|
+
recency: 0.1,
|
|
37
|
+
commonPath: 0.1,
|
|
38
|
+
};
|
|
39
|
+
/** 每匹配到一个 callee 名称所增加的分数 */
|
|
40
|
+
export const CALLEE_MATCH_SCORE_PER_MATCH = 0.05;
|
|
41
|
+
/** callee 匹配分数的上限(防止大量 callee 匹配主导总分) */
|
|
42
|
+
export const CALLEE_MATCH_SCORE_MAX = 0.2;
|
|
43
|
+
/**
|
|
44
|
+
* Token 重叠度评分阶梯(按顺序匹配,首个满足条件的阶梯生效)。
|
|
45
|
+
* - minMatches: 查询 token 中至少命中的数量
|
|
46
|
+
* - minRatio: 命中比例下限
|
|
47
|
+
* - score: 对应的文本匹配分数
|
|
48
|
+
*/
|
|
49
|
+
export const TOKEN_OVERLAP_TIERS = [
|
|
50
|
+
{ minMatches: 4, minRatio: 0.45, score: 0.78 },
|
|
51
|
+
{ minMatches: 3, minRatio: 0.3, score: 0.68 },
|
|
52
|
+
{ minMatches: 2, minRatio: 0.18, score: 0.56 },
|
|
53
|
+
];
|
|
54
|
+
/** 非 token-overlap 类型的文本匹配固定分数 */
|
|
55
|
+
export const TEXT_MATCH_SCORES = {
|
|
56
|
+
nameContains: 0.85,
|
|
57
|
+
descriptionContains: 0.65,
|
|
58
|
+
weak: 0.2,
|
|
59
|
+
};
|
|
60
|
+
/**
|
|
61
|
+
* 时效性评分阶梯(按 maxDays 升序评估,首个满足条件的阶梯生效)。
|
|
62
|
+
* - maxDays: 创建距今天数上限
|
|
63
|
+
* - score: 对应的时效分数
|
|
64
|
+
*/
|
|
65
|
+
export const RECENCY_SCORE_TIERS = [
|
|
66
|
+
{ maxDays: 7, score: 1.0 },
|
|
67
|
+
{ maxDays: 30, score: 0.8 },
|
|
68
|
+
{ maxDays: 90, score: 0.6 },
|
|
69
|
+
{ maxDays: 180, score: 0.4 },
|
|
70
|
+
];
|
|
71
|
+
/** 无 createdAt 时使用的默认时效分数 */
|
|
72
|
+
export const RECENCY_SCORE_DEFAULT = 0.4;
|
|
73
|
+
/** createdAt 超出所有阶梯(>180天)时的最低时效分数 */
|
|
74
|
+
export const RECENCY_SCORE_OLDEST = 0.25;
|
|
75
|
+
/** 路径含 /common/ 或 /shared/ 的符号所获得的路径维度分数 */
|
|
76
|
+
export const COMMON_PATH_SCORE_YES = 1;
|
|
77
|
+
/** 路径不在 common/shared 目录时的路径维度分数 */
|
|
78
|
+
export const COMMON_PATH_SCORE_NO = 0.35;
|
|
79
|
+
/** 使用频率评分的 log 底数除数(值越大,曲线越平坦) */
|
|
80
|
+
export const USAGE_SCORE_LOG_DIVISOR = 3;
|
|
81
|
+
/** 语义相似度高于此值时,输出"语义相似度高"标签 */
|
|
82
|
+
export const SEMANTIC_REASON_THRESHOLD_HIGH = 0.55;
|
|
83
|
+
/** 语义相似度高于此值时,输出"语义相关"标签 */
|
|
84
|
+
export const SEMANTIC_REASON_THRESHOLD_MED = 0.4;
|
|
85
|
+
/** 使用频率分高于此值时,输出"使用频率高"标签 */
|
|
86
|
+
export const USAGE_REASON_THRESHOLD_HIGH = 0.6;
|
|
87
|
+
// ─── 推荐质量门控阈值 (recommendationService.ts) ─────────────────────────────
|
|
88
|
+
/** 候选通过质量门控所需的最低综合分数 */
|
|
89
|
+
export const MIN_RECOMMENDATION_SCORE = {
|
|
90
|
+
semantic: 0.5,
|
|
91
|
+
keyword: 0.45,
|
|
92
|
+
};
|
|
93
|
+
/** 语义模式下文本维度分数须达到的下限(用于高置信度路径) */
|
|
94
|
+
export const MIN_SEMANTIC_TEXT_MATCH_SCORE = 0.6;
|
|
95
|
+
/** 名称/路径字面命中时,放宽的综合分数下限 */
|
|
96
|
+
export const MIN_LITERAL_MATCH_SCORE = 0.18;
|
|
97
|
+
/** props/hooks 结构字段全部命中时,放宽的综合分数下限 */
|
|
98
|
+
export const REQUIRED_FIELD_FALLBACK_MIN_SCORE = 0.4;
|
|
99
|
+
/** 字面命中(名称或文件名匹配查询词)时,对优先级分数增加的值 */
|
|
100
|
+
export const LITERAL_MATCH_PRIORITY_BOOST = 0.22;
|
|
101
|
+
/** 路径为 demo/example 风格时,对优先级分数扣减的值 */
|
|
102
|
+
export const DEMO_PATH_PRIORITY_PENALTY = 0.18;
|
|
103
|
+
// ─── 搜索工具结果过滤 (tools/searchSymbols.ts) ───────────────────────────────
|
|
104
|
+
/** 最终返回结果所需的最低综合评分 */
|
|
105
|
+
export const SEARCH_SCORE_THRESHOLD = 0.45;
|
|
106
|
+
/** 最终返回的 symbol 数量上限 */
|
|
107
|
+
export const SEARCH_TOP_K = 20;
|
|
108
|
+
// ─── RAG 上下文组装参数 (services/contextAssembler.ts) ───────────────────────
|
|
109
|
+
/** 每个命中 chunk 向前后各扩展的邻块数量(减少边界截断信息丢失) */
|
|
110
|
+
export const CONTEXT_ADJACENT_RADIUS = 1;
|
|
111
|
+
/** 注入 prompt 的上下文总字符数预算(超出则从相似度最低的 chunk 开始截断) */
|
|
112
|
+
export const CONTEXT_MAX_CHARS = 6000;
|
|
113
|
+
/** 邻块扩展后保留的最大 chunk 数量 */
|
|
114
|
+
export const CONTEXT_MAX_CHUNKS = 12;
|
package/dist/db/schema.js
CHANGED
|
@@ -49,3 +49,40 @@ export function getAllTableSQLs() {
|
|
|
49
49
|
...getSymbolsIndexSQLs(),
|
|
50
50
|
];
|
|
51
51
|
}
|
|
52
|
+
export function getChunksTableSQL() {
|
|
53
|
+
const tableName = env.chunksTable;
|
|
54
|
+
return `CREATE TABLE IF NOT EXISTS ${tableName} (
|
|
55
|
+
id SERIAL PRIMARY KEY,
|
|
56
|
+
source_id VARCHAR(255),
|
|
57
|
+
title TEXT NOT NULL,
|
|
58
|
+
path TEXT NOT NULL,
|
|
59
|
+
chunk_index INT NOT NULL,
|
|
60
|
+
chunk_count INT NOT NULL,
|
|
61
|
+
content TEXT NOT NULL,
|
|
62
|
+
summary TEXT,
|
|
63
|
+
category VARCHAR(255),
|
|
64
|
+
meta JSONB,
|
|
65
|
+
embedding vector(384),
|
|
66
|
+
semantic_hash VARCHAR(64) NOT NULL,
|
|
67
|
+
status SMALLINT NOT NULL DEFAULT ${DEFAULT_STATUS_ON_UPSERT},
|
|
68
|
+
created_at TIMESTAMP NOT NULL DEFAULT NOW(),
|
|
69
|
+
updated_at TIMESTAMP NOT NULL DEFAULT NOW(),
|
|
70
|
+
CONSTRAINT uk_${tableName}_path_chunk UNIQUE (path, chunk_index)
|
|
71
|
+
)`;
|
|
72
|
+
}
|
|
73
|
+
export function getChunksIndexSQLs() {
|
|
74
|
+
const t = env.chunksTable;
|
|
75
|
+
return [
|
|
76
|
+
`CREATE INDEX IF NOT EXISTS idx_${t}_source_id ON ${t}(source_id)`,
|
|
77
|
+
`CREATE INDEX IF NOT EXISTS idx_${t}_semantic_hash ON ${t}(semantic_hash)`,
|
|
78
|
+
`CREATE INDEX IF NOT EXISTS idx_${t}_status ON ${t}(status)`,
|
|
79
|
+
`CREATE INDEX IF NOT EXISTS idx_${t}_path ON ${t}(path)`,
|
|
80
|
+
];
|
|
81
|
+
}
|
|
82
|
+
export function getAllChunkTableSQLs() {
|
|
83
|
+
return [
|
|
84
|
+
getEnsureExtensionSQL(),
|
|
85
|
+
getChunksTableSQL(),
|
|
86
|
+
...getChunksIndexSQLs(),
|
|
87
|
+
];
|
|
88
|
+
}
|
package/dist/index.js
CHANGED
|
@@ -6,6 +6,7 @@ async function main() {
|
|
|
6
6
|
// 加载第三方项目的 .env(通过 INDEX_ROOT 指定,或默认当前工作目录)
|
|
7
7
|
const projectRoot = process.env.INDEX_ROOT || process.cwd();
|
|
8
8
|
loadProjectDotenv(projectRoot);
|
|
9
|
+
console.error('[code-intelligence-mcp] env.loaded env.projectRoot=%s', projectRoot);
|
|
9
10
|
const server = createServer();
|
|
10
11
|
const transport = new StdioServerTransport();
|
|
11
12
|
await server.connect(transport);
|
|
@@ -357,6 +357,7 @@ function createRowFromFunction(name, decl, filePath, projectRoot, isJsx) {
|
|
|
357
357
|
function createRowFromClass(name, _decl, filePath, projectRoot) {
|
|
358
358
|
const relPath = getRelativePathForDisplay(projectRoot, filePath);
|
|
359
359
|
const category = inferCategoryFromPath(filePath);
|
|
360
|
+
const jsdoc = parseJsDocInfo(_decl);
|
|
360
361
|
// 大写开头的类视为组件
|
|
361
362
|
const type = /^[A-Z]/.test(name) ? 'component' : 'function';
|
|
362
363
|
return {
|
|
@@ -364,7 +365,7 @@ function createRowFromClass(name, _decl, filePath, projectRoot) {
|
|
|
364
365
|
type,
|
|
365
366
|
category,
|
|
366
367
|
path: relPath,
|
|
367
|
-
description:
|
|
368
|
+
description: jsdoc.description,
|
|
368
369
|
// content meta.kind 暂时废弃不用,
|
|
369
370
|
content: null,
|
|
370
371
|
meta: {},
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
import { createHash } from 'node:crypto';
|
|
2
|
+
import { CHUNK_MAX_CHARS, CHUNK_OVERLAP_CHARS, CHUNK_SENTENCE_BREAK_MIN_RATIO, CHUNK_SUMMARY_MAX_CHARS, CHUNK_TARGET_CHARS, } from '../config/tuning.js';
|
|
3
|
+
// 统一换行并去掉首尾空白,避免切分时混入无意义差异。
|
|
4
|
+
function normalizeText(content) {
|
|
5
|
+
return content.replace(/\r\n/g, '\n').trim();
|
|
6
|
+
}
|
|
7
|
+
// 第一阶段:先按 Markdown 风格结构拆成 heading / paragraph / code 三类 block。
|
|
8
|
+
function splitCodeAwareBlocks(content) {
|
|
9
|
+
const normalized = normalizeText(content);
|
|
10
|
+
if (!normalized)
|
|
11
|
+
return [];
|
|
12
|
+
const lines = normalized.split('\n');
|
|
13
|
+
const blocks = [];
|
|
14
|
+
let buffer = [];
|
|
15
|
+
let inCodeFence = false; // 追踪是否在代码块内,避免误把代码内容当普通文本切分。
|
|
16
|
+
const flushParagraphs = () => {
|
|
17
|
+
if (buffer.length === 0)
|
|
18
|
+
return;
|
|
19
|
+
const text = buffer.join('\n').trim();
|
|
20
|
+
buffer = [];
|
|
21
|
+
if (!text)
|
|
22
|
+
return;
|
|
23
|
+
const kind = text.startsWith('#') ? 'heading' : 'paragraph';
|
|
24
|
+
blocks.push({ kind, text });
|
|
25
|
+
};
|
|
26
|
+
for (const line of lines) {
|
|
27
|
+
const trimmed = line.trim();
|
|
28
|
+
if (trimmed.startsWith('```')) {
|
|
29
|
+
if (inCodeFence) {
|
|
30
|
+
buffer.push(line);
|
|
31
|
+
blocks.push({ kind: 'code', text: buffer.join('\n').trim() });
|
|
32
|
+
buffer = [];
|
|
33
|
+
inCodeFence = false;
|
|
34
|
+
}
|
|
35
|
+
else {
|
|
36
|
+
flushParagraphs();
|
|
37
|
+
inCodeFence = true;
|
|
38
|
+
buffer.push(line);
|
|
39
|
+
}
|
|
40
|
+
continue;
|
|
41
|
+
}
|
|
42
|
+
if (inCodeFence) {
|
|
43
|
+
buffer.push(line);
|
|
44
|
+
continue;
|
|
45
|
+
}
|
|
46
|
+
if (trimmed.startsWith('#')) {
|
|
47
|
+
flushParagraphs();
|
|
48
|
+
blocks.push({ kind: 'heading', text: trimmed });
|
|
49
|
+
continue;
|
|
50
|
+
}
|
|
51
|
+
if (!trimmed) {
|
|
52
|
+
flushParagraphs();
|
|
53
|
+
continue;
|
|
54
|
+
}
|
|
55
|
+
buffer.push(line);
|
|
56
|
+
}
|
|
57
|
+
flushParagraphs();
|
|
58
|
+
return blocks;
|
|
59
|
+
}
|
|
60
|
+
// 第二阶段:如果某个单独 block 过长,再按自然边界做窗口切分,并保留 overlap。
|
|
61
|
+
function sliceWithOverlap(text, maxChars, overlapChars) {
|
|
62
|
+
const normalized = text.trim();
|
|
63
|
+
if (!normalized)
|
|
64
|
+
return [];
|
|
65
|
+
if (normalized.length <= maxChars)
|
|
66
|
+
return [normalized];
|
|
67
|
+
const out = [];
|
|
68
|
+
let start = 0;
|
|
69
|
+
while (start < normalized.length) {
|
|
70
|
+
let end = Math.min(start + maxChars, normalized.length);
|
|
71
|
+
if (end < normalized.length) {
|
|
72
|
+
const window = normalized.slice(start, end);
|
|
73
|
+
// 优先回退到更自然的句子/换行边界,避免把一句话截成两半。
|
|
74
|
+
const sentenceBreak = Math.max(window.lastIndexOf('. '), window.lastIndexOf('。'), window.lastIndexOf('! '), window.lastIndexOf('? '), window.lastIndexOf('\n'));
|
|
75
|
+
if (sentenceBreak >
|
|
76
|
+
Math.floor(maxChars * CHUNK_SENTENCE_BREAK_MIN_RATIO)) {
|
|
77
|
+
// 边界不能太靠前,否则可能导致过多 chunk 和过短的上下文,设置一个阈值(如 maxChars 的 60%)来平衡。
|
|
78
|
+
end = start + sentenceBreak + 1;
|
|
79
|
+
}
|
|
80
|
+
}
|
|
81
|
+
out.push(normalized.slice(start, end).trim());
|
|
82
|
+
if (end >= normalized.length)
|
|
83
|
+
break;
|
|
84
|
+
start = Math.max(end - overlapChars, start + 1);
|
|
85
|
+
}
|
|
86
|
+
return out.filter(Boolean);
|
|
87
|
+
}
|
|
88
|
+
// 将当前累计 block 收敛成一个 chunk,并把末尾 overlap 作为下一块的前缀上下文。
|
|
89
|
+
function finalizeChunk(chunks, currentBlocks, overlapChars) {
|
|
90
|
+
if (currentBlocks.length === 0)
|
|
91
|
+
return [];
|
|
92
|
+
const chunk = currentBlocks.join('\n\n').trim();
|
|
93
|
+
if (!chunk)
|
|
94
|
+
return [];
|
|
95
|
+
chunks.push(chunk);
|
|
96
|
+
if (overlapChars <= 0)
|
|
97
|
+
return [];
|
|
98
|
+
// 从上一个 chunk 末尾切出 overlapChars 长度的文本,作为下一 chunk 的前置上下文,减少边界信息丢失。
|
|
99
|
+
const tail = chunk.slice(-overlapChars).trim();
|
|
100
|
+
return tail ? [tail] : [];
|
|
101
|
+
}
|
|
102
|
+
// 对外主入口:结构切分优先,其次按 target/max 控制块大小,最后用 overlap 补边界。
|
|
103
|
+
export function splitTextIntoChunks(content, options = {}) {
|
|
104
|
+
const targetChars = options.targetChars ?? CHUNK_TARGET_CHARS;
|
|
105
|
+
const maxChars = options.maxChars ?? CHUNK_MAX_CHARS;
|
|
106
|
+
const overlapChars = options.overlapChars ?? CHUNK_OVERLAP_CHARS;
|
|
107
|
+
// 1. 语义切分:按照结构拆分(eg: ```, #)
|
|
108
|
+
const blocks = splitCodeAwareBlocks(content);
|
|
109
|
+
if (blocks.length === 0)
|
|
110
|
+
return [];
|
|
111
|
+
const chunks = []; // 每一个元素是如下结构组成的字符串:0-首行,之后的索引对应值都是:上一个 chunk 的末尾 overlap 文本 + 当前 block 文本,块与块之间用双换行分隔。
|
|
112
|
+
let currentBlocks = [];
|
|
113
|
+
let currentLength = 0;
|
|
114
|
+
for (const block of blocks) {
|
|
115
|
+
// 2. 自然边界切分:每一个block再按照[.。!?、换行等]自然边界切分
|
|
116
|
+
const oversizedParts = block.text.length > maxChars
|
|
117
|
+
? sliceWithOverlap(block.text, maxChars, overlapChars)
|
|
118
|
+
: [block.text];
|
|
119
|
+
for (const part of oversizedParts) {
|
|
120
|
+
// 3. overlap 滑动窗口:每当累计块接近目标大小或即将超出上限时,先把当前块收敛成一个 chunk,再开始下一块。每个新块的开头会带上前一个块末尾 overlapChars 长度的文本,减少边界信息丢失。
|
|
121
|
+
const additionLength = currentLength === 0 ? part.length : part.length + 2;
|
|
122
|
+
const wouldOverflowMax = currentLength > 0 && currentLength + additionLength > maxChars;
|
|
123
|
+
const reachedTarget = currentLength >= targetChars;
|
|
124
|
+
// 已接近目标大小或即将超出上限时,先收敛当前 chunk,再开始下一块。
|
|
125
|
+
if (wouldOverflowMax || reachedTarget) {
|
|
126
|
+
currentBlocks = finalizeChunk(chunks, currentBlocks, overlapChars);
|
|
127
|
+
currentLength = currentBlocks.join('\n\n').length;
|
|
128
|
+
}
|
|
129
|
+
currentBlocks.push(part);
|
|
130
|
+
currentLength = currentBlocks.join('\n\n').length;
|
|
131
|
+
}
|
|
132
|
+
}
|
|
133
|
+
finalizeChunk(chunks, currentBlocks, 0);
|
|
134
|
+
return chunks;
|
|
135
|
+
}
|
|
136
|
+
// 截取 chunk,做chunk embedding 时的摘要展示,提升检索结果的可读性和判断相关性的效率。
|
|
137
|
+
function buildChunkSummary(content) {
|
|
138
|
+
const flattened = content.replace(/\s+/g, ' ').trim();
|
|
139
|
+
if (!flattened)
|
|
140
|
+
return null;
|
|
141
|
+
return flattened.slice(0, CHUNK_SUMMARY_MAX_CHARS);
|
|
142
|
+
}
|
|
143
|
+
// chunk hash 以 path + chunkIndex + content 生成,用于稳定标识具体分块。
|
|
144
|
+
function computeChunkHash(path, chunkIndex, content) {
|
|
145
|
+
return createHash('sha256')
|
|
146
|
+
.update(`${path}#${chunkIndex}\n${content}`)
|
|
147
|
+
.digest('hex');
|
|
148
|
+
}
|
|
149
|
+
// 将原始文档输入转为可入库的 chunk 记录,附带索引、总块数、摘要和 semantic hash。
|
|
150
|
+
export function buildDocumentChunks(document, options = {}) {
|
|
151
|
+
const chunks = splitTextIntoChunks(document.content, options);
|
|
152
|
+
return chunks.map((content, index) => ({
|
|
153
|
+
sourceId: document.sourceId ?? null,
|
|
154
|
+
title: document.title,
|
|
155
|
+
path: document.path,
|
|
156
|
+
chunkIndex: index,
|
|
157
|
+
chunkCount: chunks.length,
|
|
158
|
+
content,
|
|
159
|
+
summary: buildChunkSummary(content),
|
|
160
|
+
category: document.category ?? null,
|
|
161
|
+
meta: document.metadata ?? null,
|
|
162
|
+
semanticHash: computeChunkHash(document.path, index, content),
|
|
163
|
+
}));
|
|
164
|
+
}
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
// 仅js类型使用,后续会删掉
|
|
2
|
-
|
|
2
|
+
import { EMBED_MAX_CONTENT_LENGTH } from '../config/tuning.js';
|
|
3
3
|
function briefMeta(meta) {
|
|
4
4
|
const keys = ['props', 'params', 'properties', 'hooks'];
|
|
5
5
|
const parts = [];
|
|
@@ -23,7 +23,7 @@ export function indexedRowToEmbedText(row) {
|
|
|
23
23
|
row.path,
|
|
24
24
|
row.description ?? '',
|
|
25
25
|
metaBit,
|
|
26
|
-
(row.content ?? '').slice(0,
|
|
26
|
+
(row.content ?? '').slice(0, EMBED_MAX_CONTENT_LENGTH),
|
|
27
27
|
]
|
|
28
28
|
.filter((s) => s.length > 0)
|
|
29
29
|
.join('\n');
|