@llangtop/pwiki-core 1.0.8 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +39 -39
- package/dist/WikiEngine.d.ts.map +1 -1
- package/dist/WikiEngine.js +19 -1
- package/dist/WikiEngine.js.map +1 -1
- package/dist/config.d.ts +1 -0
- package/dist/config.d.ts.map +1 -1
- package/dist/config.js +3 -0
- package/dist/config.js.map +1 -1
- package/dist/index.d.ts +4 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +4 -1
- package/dist/index.js.map +1 -1
- package/dist/lib/bm25.d.ts +30 -0
- package/dist/lib/bm25.d.ts.map +1 -0
- package/dist/lib/bm25.js +155 -0
- package/dist/lib/bm25.js.map +1 -0
- package/dist/lib/indexer.d.ts +1 -0
- package/dist/lib/indexer.d.ts.map +1 -1
- package/dist/lib/indexer.js +1 -0
- package/dist/lib/indexer.js.map +1 -1
- package/dist/lib/search.d.ts +3 -0
- package/dist/lib/search.d.ts.map +1 -1
- package/dist/lib/search.js +128 -20
- package/dist/lib/search.js.map +1 -1
- package/dist/lib/semantic-compiler.js +165 -165
- package/dist/lib/store-index.d.ts +3 -0
- package/dist/lib/store-index.d.ts.map +1 -1
- package/dist/lib/store-index.js +14 -1
- package/dist/lib/store-index.js.map +1 -1
- package/dist/lib/store.d.ts +1 -1
- package/dist/lib/store.d.ts.map +1 -1
- package/dist/lib/store.js +1 -1
- package/dist/lib/store.js.map +1 -1
- package/dist/lib/tokenizer.d.ts +10 -0
- package/dist/lib/tokenizer.d.ts.map +1 -0
- package/dist/lib/tokenizer.js +87 -0
- package/dist/lib/tokenizer.js.map +1 -0
- package/package.json +34 -34
- package/dist/WikiEngine.d.ts +0 -73
- package/dist/lib/ast-chunker.d.ts +0 -23
- package/dist/lib/content-cache.d.ts +0 -13
- package/dist/lib/embedder.d.ts +0 -22
- package/dist/lib/file-manifest.d.ts +0 -36
- package/dist/lib/indexer-compile.d.ts +0 -18
- package/dist/lib/indexer-embed.d.ts +0 -21
- package/dist/lib/indexer-scan.d.ts +0 -4
- package/dist/lib/model-registry.d.ts +0 -18
- package/dist/lib/parser.d.ts +0 -9
- package/dist/lib/preprocessor.d.ts +0 -36
- package/dist/lib/semantic-compiler.d.ts +0 -44
- package/dist/lib/semantic-search.d.ts +0 -4
- package/dist/lib/store-config.d.ts +0 -25
- package/dist/lib/store-vectors.d.ts +0 -17
- package/dist/lib/types.d.ts +0 -108
package/dist/lib/search.js
CHANGED
|
@@ -1,30 +1,119 @@
|
|
|
1
|
-
// search.ts — 关键词搜索
|
|
2
|
-
//
|
|
3
|
-
|
|
1
|
+
// search.ts — 关键词搜索 (BM25)
|
|
2
|
+
//
|
|
3
|
+
// BM25 模式下:
|
|
4
|
+
// 正文 = BM25 评分 × 10 缩放
|
|
5
|
+
// title/path/tags = token 命中额外加分(不参与 BM25 公式)
|
|
6
|
+
// 降级: bm25_stats.json 不存在时,fallback 到旧算法
|
|
7
|
+
import { getIndex, readBm25Stats } from "./store-index.js";
|
|
4
8
|
import { getContent } from "./content-cache.js";
|
|
9
|
+
import { tokenize } from "./tokenizer.js";
|
|
10
|
+
import { bm25Score, getDocTokens } from "./bm25.js";
|
|
5
11
|
import { resolve } from "node:path";
|
|
6
12
|
import { existsSync, readFileSync } from "node:fs";
|
|
7
|
-
|
|
13
|
+
/** BM25 得分缩放因子(使分值落在和旧算法相近的范围) */
|
|
14
|
+
const BM25_SCALE = 10;
|
|
15
|
+
/** 字段 boost 权重 */
|
|
16
|
+
const TITLE_WEIGHT = 10;
|
|
17
|
+
const PATH_WEIGHT = 5;
|
|
18
|
+
const TAG_WEIGHT = 3;
|
|
19
|
+
/** 上下文摘取最大行长度 */
|
|
20
|
+
const MAX_LINE_LEN = 100;
|
|
21
|
+
/**
|
|
22
|
+
* 从文档内容中摘取包含查询 token 的上下文行
|
|
23
|
+
*/
|
|
24
|
+
function bm25Snippet(content, queryTokens, maxLen = MAX_LINE_LEN) {
|
|
8
25
|
const lower = content.toLowerCase();
|
|
9
|
-
const q = query.toLowerCase();
|
|
10
|
-
const pos = lower.indexOf(q);
|
|
11
|
-
if (pos < 0)
|
|
12
|
-
return "";
|
|
13
|
-
const before = content.slice(0, pos);
|
|
14
|
-
const lineNum = before.split("\n").length;
|
|
15
26
|
const lines = content.split("\n");
|
|
16
|
-
const prev = lineNum > 1 ? lines[lineNum - 2].trim() : "";
|
|
17
|
-
const curr = lines[lineNum - 1].trim();
|
|
18
|
-
const next = lineNum < lines.length ? lines[lineNum].trim() : "";
|
|
19
27
|
const parts = [];
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
28
|
+
// 找到第一个命中任意 query token 的行
|
|
29
|
+
for (let li = 0; li < lines.length; li++) {
|
|
30
|
+
const lineLower = lines[li].toLowerCase();
|
|
31
|
+
const matched = queryTokens.some(tok => lineLower.includes(tok));
|
|
32
|
+
if (!matched)
|
|
33
|
+
continue;
|
|
34
|
+
const prev = li > 0 ? `L${li}: ${lines[li - 1].trim().slice(0, maxLen)}` : null;
|
|
35
|
+
const curr = `L${li + 1}: ${lines[li].trim().slice(0, maxLen)}`;
|
|
36
|
+
const next = li < lines.length - 1 ? `L${li + 2}: ${lines[li + 1].trim().slice(0, maxLen)}` : null;
|
|
37
|
+
if (prev && !parts.includes(prev))
|
|
38
|
+
parts.push(prev);
|
|
39
|
+
if (!parts.includes(curr))
|
|
40
|
+
parts.push(curr);
|
|
41
|
+
if (next && !parts.includes(next))
|
|
42
|
+
parts.push(next);
|
|
43
|
+
if (parts.length >= 5)
|
|
44
|
+
break; // 最多 5 行上下文
|
|
45
|
+
li++; // 跳过 next 行,避免重复命中
|
|
46
|
+
}
|
|
25
47
|
return parts.join("\n");
|
|
26
48
|
}
|
|
49
|
+
/** 获取文档内容(缓存优先,磁盘 fallback) */
|
|
50
|
+
function getDocContent(entry) {
|
|
51
|
+
let content = getContent(entry.relPath);
|
|
52
|
+
if (content)
|
|
53
|
+
return content;
|
|
54
|
+
const fullPath = resolve(entry.sourceDir, entry.relPath);
|
|
55
|
+
if (!existsSync(fullPath))
|
|
56
|
+
return null;
|
|
57
|
+
try {
|
|
58
|
+
return readFileSync(fullPath, "utf-8");
|
|
59
|
+
}
|
|
60
|
+
catch {
|
|
61
|
+
return null;
|
|
62
|
+
}
|
|
63
|
+
}
|
|
64
|
+
/**
|
|
65
|
+
* BM25 关键词搜索
|
|
66
|
+
*/
|
|
27
67
|
export function keywordSearch(query) {
|
|
68
|
+
const stats = readBm25Stats();
|
|
69
|
+
return stats ? keywordSearchBm25(query, stats) : keywordSearchLegacy(query);
|
|
70
|
+
}
|
|
71
|
+
/**
|
|
72
|
+
* BM25 模式搜索
|
|
73
|
+
*/
|
|
74
|
+
function keywordSearchBm25(query, stats) {
|
|
75
|
+
const idx = getIndex();
|
|
76
|
+
const queryTokens = tokenize(query);
|
|
77
|
+
const hits = [];
|
|
78
|
+
for (const [relPath, entry] of Object.entries(idx)) {
|
|
79
|
+
let score = 0;
|
|
80
|
+
// 字段 boost:改用 token 匹配
|
|
81
|
+
const titleLower = entry.title.toLowerCase();
|
|
82
|
+
const tagsLower = entry.tags.map(t => t.toLowerCase());
|
|
83
|
+
for (const tok of queryTokens) {
|
|
84
|
+
if (titleLower.includes(tok))
|
|
85
|
+
score += TITLE_WEIGHT;
|
|
86
|
+
if (relPath.toLowerCase().includes(tok))
|
|
87
|
+
score += PATH_WEIGHT / queryTokens.length; // 拆分权重
|
|
88
|
+
if (tagsLower.some(t => t.includes(tok)))
|
|
89
|
+
score += TAG_WEIGHT;
|
|
90
|
+
}
|
|
91
|
+
// 正文 BM25 评分
|
|
92
|
+
const docTokens = getDocTokens(entry); // 含路径前缀,与 stats 构建对齐
|
|
93
|
+
if (docTokens) {
|
|
94
|
+
const bm = bm25Score(queryTokens, docTokens, stats);
|
|
95
|
+
score += bm * BM25_SCALE;
|
|
96
|
+
}
|
|
97
|
+
if (score <= 0)
|
|
98
|
+
continue;
|
|
99
|
+
// 摘取上下文
|
|
100
|
+
let snippet = "";
|
|
101
|
+
const rawContent = getDocContent(entry);
|
|
102
|
+
if (rawContent) {
|
|
103
|
+
snippet = bm25Snippet(rawContent, queryTokens);
|
|
104
|
+
}
|
|
105
|
+
hits.push({
|
|
106
|
+
relPath: entry.relPath, sourceDir: entry.sourceDir,
|
|
107
|
+
title: entry.title, tags: entry.tags,
|
|
108
|
+
snippet, score: Math.round(score),
|
|
109
|
+
});
|
|
110
|
+
}
|
|
111
|
+
return hits.sort((a, b) => b.score - a.score);
|
|
112
|
+
}
|
|
113
|
+
/**
|
|
114
|
+
* 旧版降级搜索(bm25_stats.json 不存在时使用)
|
|
115
|
+
*/
|
|
116
|
+
function keywordSearchLegacy(query) {
|
|
28
117
|
const idx = getIndex();
|
|
29
118
|
const q = query.toLowerCase();
|
|
30
119
|
const hits = [];
|
|
@@ -38,7 +127,6 @@ export function keywordSearch(query) {
|
|
|
38
127
|
if (entry.tags.some(t => t.toLowerCase().includes(q)))
|
|
39
128
|
score += 3;
|
|
40
129
|
let content = getContent(relPath);
|
|
41
|
-
// Fallback: read from disk if not in cache (CLI/MCP separate process)
|
|
42
130
|
if (!content) {
|
|
43
131
|
const fullPath = resolve(entry.sourceDir, relPath);
|
|
44
132
|
if (existsSync(fullPath)) {
|
|
@@ -55,7 +143,7 @@ export function keywordSearch(query) {
|
|
|
55
143
|
count++;
|
|
56
144
|
if (count === 1)
|
|
57
145
|
score += 1;
|
|
58
|
-
const ctx =
|
|
146
|
+
const ctx = legacyContext(content, query);
|
|
59
147
|
if (ctx && !parts.some(pp => pp.includes(ctx.slice(0, 30)))) {
|
|
60
148
|
parts.push(ctx);
|
|
61
149
|
}
|
|
@@ -73,5 +161,25 @@ export function keywordSearch(query) {
|
|
|
73
161
|
}
|
|
74
162
|
return hits.sort((a, b) => b.score - a.score);
|
|
75
163
|
}
|
|
164
|
+
function legacyContext(content, query, maxLen = 100) {
|
|
165
|
+
const lower = content.toLowerCase();
|
|
166
|
+
const q = query.toLowerCase();
|
|
167
|
+
const pos = lower.indexOf(q);
|
|
168
|
+
if (pos < 0)
|
|
169
|
+
return "";
|
|
170
|
+
const before = content.slice(0, pos);
|
|
171
|
+
const lineNum = before.split("\n").length;
|
|
172
|
+
const lines = content.split("\n");
|
|
173
|
+
const prev = lineNum > 1 ? lines[lineNum - 2].trim() : "";
|
|
174
|
+
const curr = lines[lineNum - 1].trim();
|
|
175
|
+
const next = lineNum < lines.length ? lines[lineNum].trim() : "";
|
|
176
|
+
const parts = [];
|
|
177
|
+
if (prev)
|
|
178
|
+
parts.push(`L${lineNum - 1}: ${prev.slice(0, maxLen)}`);
|
|
179
|
+
parts.push(`L${lineNum}: ${curr.slice(0, maxLen)}`);
|
|
180
|
+
if (next)
|
|
181
|
+
parts.push(`L${lineNum + 1}: ${next.slice(0, maxLen)}`);
|
|
182
|
+
return parts.join("\n");
|
|
183
|
+
}
|
|
76
184
|
export const search = keywordSearch;
|
|
77
185
|
//# sourceMappingURL=search.js.map
|
package/dist/lib/search.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"search.js","sourceRoot":"","sources":["../../src/lib/search.ts"],"names":[],"mappings":"AAAA,
|
|
1
|
+
{"version":3,"file":"search.js","sourceRoot":"","sources":["../../src/lib/search.ts"],"names":[],"mappings":"AAAA,2BAA2B;AAC3B,EAAE;AACF,YAAY;AACZ,yBAAyB;AACzB,gDAAgD;AAChD,2CAA2C;AAE3C,OAAO,EAAE,QAAQ,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AAC3D,OAAO,EAAE,UAAU,EAAE,MAAM,oBAAoB,CAAC;AAChD,OAAO,EAAE,QAAQ,EAAE,MAAM,gBAAgB,CAAC;AAC1C,OAAO,EAAE,SAAS,EAAE,YAAY,EAAE,MAAM,WAAW,CAAC;AAEpD,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACpC,OAAO,EAAE,UAAU,EAAE,YAAY,EAAE,MAAM,SAAS,CAAC;AAGnD,kCAAkC;AAClC,MAAM,UAAU,GAAG,EAAE,CAAC;AAEtB,kBAAkB;AAClB,MAAM,YAAY,GAAG,EAAE,CAAC;AACxB,MAAM,WAAW,GAAG,CAAC,CAAC;AACtB,MAAM,UAAU,GAAG,CAAC,CAAC;AAErB,iBAAiB;AACjB,MAAM,YAAY,GAAG,GAAG,CAAC;AAEzB;;GAEG;AACH,SAAS,WAAW,CAAC,OAAe,EAAE,WAAqB,EAAE,MAAM,GAAG,YAAY;IAChF,MAAM,KAAK,GAAG,OAAO,CAAC,WAAW,EAAE,CAAC;IACpC,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAClC,MAAM,KAAK,GAAa,EAAE,CAAC;IAE3B,2BAA2B;IAC3B,KAAK,IAAI,EAAE,GAAG,CAAC,EAAE,EAAE,GAAG,KAAK,CAAC,MAAM,EAAE,EAAE,EAAE,EAAE,CAAC;QACzC,MAAM,SAAS,GAAG,KAAK,CAAC,EAAE,CAAC,CAAC,WAAW,EAAE,CAAC;QAC1C,MAAM,OAAO,GAAG,WAAW,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,SAAS,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC;QACjE,IAAI,CAAC,OAAO;YAAE,SAAS;QAEvB,MAAM,IAAI,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,KAAK,KAAK,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,CAAC,EAAE,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC;QAChF,MAAM,IAAI,GAAG,IAAI,EAAE,GAAG,CAAC,KAAK,KAAK,CAAC,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,CAAC,EAAE,MAAM,CAAC,EAAE,CAAC;QAChE,MAAM,IAAI,GAAG,EAAE,GAAG,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,GAAG,CAAC,KAAK,KAAK,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,CAAC,EAAE,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC;QAEnG,IAAI,IAAI,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,IAAI,CAAC;YAAE,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACpD,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,IAAI,CAAC;YAAE,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC5C,IAAI,IAAI,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,IAAI,CAAC;YAAE,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACpD,IAAI,KAAK,CAAC,MAAM,IAAI,CAAC;YAAE,MAAM,CAAC,YAAY;QAC1C,EAA
E,EAAE,CAAC,CAAC,mBAAmB;IAC3B,CAAC;IAED,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC1B,CAAC;AAED,+BAA+B;AAC/B,SAAS,aAAa,CAAC,KAAgB;IACrC,IAAI,OAAO,GAAG,UAAU,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;IACxC,IAAI,OAAO;QAAE,OAAO,OAAO,CAAC;IAC5B,MAAM,QAAQ,GAAG,OAAO,CAAC,KAAK,CAAC,SAAS,EAAE,KAAK,CAAC,OAAO,CAAC,CAAC;IACzD,IAAI,CAAC,UAAU,CAAC,QAAQ,CAAC;QAAE,OAAO,IAAI,CAAC;IACvC,IAAI,CAAC;QAAC,OAAO,YAAY,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;IAAC,CAAC;IAAC,MAAM,CAAC;QAAC,OAAO,IAAI,CAAC;IAAC,CAAC;AACxE,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,aAAa,CAAC,KAAa;IACzC,MAAM,KAAK,GAAG,aAAa,EAAE,CAAC;IAC9B,OAAO,KAAK,CAAC,CAAC,CAAC,iBAAiB,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC,CAAC,CAAC,mBAAmB,CAAC,KAAK,CAAC,CAAC;AAC9E,CAAC;AAED;;GAEG;AACH,SAAS,iBAAiB,CAAC,KAAa,EAAE,KAAgB;IACxD,MAAM,GAAG,GAAG,QAAQ,EAAE,CAAC;IACvB,MAAM,WAAW,GAAG,QAAQ,CAAC,KAAK,CAAC,CAAC;IACpC,MAAM,IAAI,GAAgB,EAAE,CAAC;IAE7B,KAAK,MAAM,CAAC,OAAO,EAAE,KAAK,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC;QACnD,IAAI,KAAK,GAAG,CAAC,CAAC;QAEd,uBAAuB;QACvB,MAAM,UAAU,GAAG,KAAK,CAAC,KAAK,CAAC,WAAW,EAAE,CAAC;QAC7C,MAAM,SAAS,GAAG,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,WAAW,EAAE,CAAC,CAAC;QACvD,KAAK,MAAM,GAAG,IAAI,WAAW,EAAE,CAAC;YAC9B,IAAI,UAAU,CAAC,QAAQ,CAAC,GAAG,CAAC;gBAAE,KAAK,IAAI,YAAY,CAAC;YACpD,IAAI,OAAO,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,GAAG,CAAC;gBAAE,KAAK,IAAI,WAAW,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC,OAAO;YAC3F,IAAI,SAAS,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC;gBAAE,KAAK,IAAI,UAAU,CAAC;QAChE,CAAC;QAED,aAAa;QACb,MAAM,SAAS,GAAG,YAAY,CAAC,KAAK,CAAC,CAAC,CAAE,qBAAqB;QAC7D,IAAI,SAAS,EAAE,CAAC;YACd,MAAM,EAAE,GAAG,SAAS,CAAC,WAAW,EAAE,SAAS,EAAE,KAAK,CAAC,CAAC;YACpD,KAAK,IAAI,EAAE,GAAG,UAAU,CAAC;QAC3B,CAAC;QAED,IAAI,KAAK,IAAI,CAAC;YAAE,SAAS;QAEzB,QAAQ;QACR,IAAI,OAAO,GAAG,EAAE,CAAC;QACjB,MAAM,UAAU,GAAG,aAAa,CAAC,KAAK,CAAC,CAAC;QACxC,IAAI,UAAU,EAAE,CAAC;YACf,OAAO,GAAG,WAAW,CAAC,UAAU,EAAE,WAAW,CAAC,CAAC;QACjD,CAAC;QAED,IAAI,CAAC,IAAI,CAAC;YACR,OAAO,EAAE,KAAK,CAAC,OAAO,EAAE,SAAS,EAAE,KAAK,CAAC,SAAS;YAClD,KAAK,EAAE,KAAK,CAAC,KAAK,EAAE
,IAAI,EAAE,KAAK,CAAC,IAAI;YACpC,OAAO,EAAE,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC;SAClC,CAAC,CAAC;IACL,CAAC;IAED,OAAO,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC;AAChD,CAAC;AAED;;GAEG;AACH,SAAS,mBAAmB,CAAC,KAAa;IACxC,MAAM,GAAG,GAAG,QAAQ,EAAE,CAAC;IACvB,MAAM,CAAC,GAAG,KAAK,CAAC,WAAW,EAAE,CAAC;IAC9B,MAAM,IAAI,GAAgB,EAAE,CAAC;IAE7B,KAAK,MAAM,CAAC,OAAO,EAAE,KAAK,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC;QACnD,IAAI,KAAK,GAAG,CAAC,CAAC;QACd,MAAM,KAAK,GAAa,EAAE,CAAC;QAE3B,IAAI,KAAK,CAAC,KAAK,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,CAAC,CAAC;YAAE,KAAK,IAAI,EAAE,CAAC;QACvD,IAAI,OAAO,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,CAAC,CAAC;YAAE,KAAK,IAAI,CAAC,CAAC;QAClD,IAAI,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC;YAAE,KAAK,IAAI,CAAC,CAAC;QAElE,IAAI,OAAO,GAAG,UAAU,CAAC,OAAO,CAAC,CAAC;QAClC,IAAI,CAAC,OAAO,EAAE,CAAC;YACb,MAAM,QAAQ,GAAG,OAAO,CAAC,KAAK,CAAC,SAAS,EAAE,OAAO,CAAC,CAAC;YACnD,IAAI,UAAU,CAAC,QAAQ,CAAC,EAAE,CAAC;gBACzB,IAAI,CAAC;oBAAC,OAAO,GAAG,YAAY,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;gBAAC,CAAC;gBAAC,MAAM,CAAC,CAAC,UAAU,CAAC,CAAC;YACzE,CAAC;QACH,CAAC;QACD,IAAI,OAAO,EAAE,CAAC;YACZ,MAAM,KAAK,GAAG,OAAO,CAAC,WAAW,EAAE,CAAC;YACpC,IAAI,KAAK,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;YACpC,OAAO,CAAC,IAAI,CAAC,IAAI,KAAK,GAAG,CAAC,EAAE,CAAC;gBAC3B,KAAK,EAAE,CAAC;gBACR,IAAI,KAAK,KAAK,CAAC;oBAAE,KAAK,IAAI,CAAC,CAAC;gBAC5B,MAAM,GAAG,GAAG,aAAa,CAAC,OAAO,EAAE,KAAK,CAAC,CAAC;gBAC1C,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,QAAQ,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;oBAC5D,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;gBAClB,CAAC;gBACD,CAAC,GAAG,KAAK,CAAC,OAAO,CAAC,CAAC,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;YAC9B,CAAC;YACD,KAAK,IAAI,IAAI,CAAC,GAAG,CAAC,KAAK,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC;QAClC,CAAC;QAED,IAAI,KAAK,GAAG,CAAC,EAAE,CAAC;YACd,IAAI,CAAC,IAAI,CAAC;gBACR,OAAO,EAAE,KAAK,CAAC,OAAO,EAAE,SAAS,EAAE,KAAK,CAAC,SAAS;gBAClD,KAAK,EAAE,KAAK,CAAC,
KAAK,EAAE,IAAI,EAAE,KAAK,CAAC,IAAI;gBACpC,OAAO,EAAE,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,KAAK;aACjC,CAAC,CAAC;QACL,CAAC;IACH,CAAC;IAED,OAAO,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC;AAChD,CAAC;AAED,SAAS,aAAa,CAAC,OAAe,EAAE,KAAa,EAAE,MAAM,GAAG,GAAG;IACjE,MAAM,KAAK,GAAG,OAAO,CAAC,WAAW,EAAE,CAAC;IACpC,MAAM,CAAC,GAAG,KAAK,CAAC,WAAW,EAAE,CAAC;IAC9B,MAAM,GAAG,GAAG,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;IAC7B,IAAI,GAAG,GAAG,CAAC;QAAE,OAAO,EAAE,CAAC;IACvB,MAAM,MAAM,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC;IACrC,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,MAAM,CAAC;IAC1C,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAClC,MAAM,IAAI,GAAG,OAAO,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,GAAG,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;IAC1D,MAAM,IAAI,GAAG,KAAK,CAAC,OAAO,GAAG,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;IACvC,MAAM,IAAI,GAAG,OAAO,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;IACjE,MAAM,KAAK,GAAa,EAAE,CAAC;IAC3B,IAAI,IAAI;QAAE,KAAK,CAAC,IAAI,CAAC,IAAI,OAAO,GAAG,CAAC,KAAK,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,MAAM,CAAC,EAAE,CAAC,CAAC;IAClE,KAAK,CAAC,IAAI,CAAC,IAAI,OAAO,KAAK,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,MAAM,CAAC,EAAE,CAAC,CAAC;IACpD,IAAI,IAAI;QAAE,KAAK,CAAC,IAAI,CAAC,IAAI,OAAO,GAAG,CAAC,KAAK,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,MAAM,CAAC,EAAE,CAAC,CAAC;IAClE,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC1B,CAAC;AAED,MAAM,CAAC,MAAM,MAAM,GAAG,aAAa,CAAC"}
|
|
@@ -13,31 +13,31 @@ export const BATCH_SIZE = 25;
|
|
|
13
13
|
// ============================================================
|
|
14
14
|
// System Prompt (蓝图 §24)
|
|
15
15
|
// ============================================================
|
|
16
|
-
export const COMPILE_SYSTEM_PROMPT = `你是一个"知识语义编译器"。
|
|
17
|
-
|
|
18
|
-
你的任务不是总结内容。
|
|
19
|
-
|
|
20
|
-
你的任务是:
|
|
21
|
-
将人类随手记录的非结构化笔记,
|
|
22
|
-
转换为适合机器语义索引、概念检索、
|
|
23
|
-
知识聚类、长期演化的"认知知识单元"。
|
|
24
|
-
|
|
25
|
-
核心原则:
|
|
26
|
-
1. 保留原始信息 — 不删技术细节
|
|
27
|
-
2. 不改变原意 — 只规范化表达
|
|
28
|
-
3. 补全隐式表达 — 补充省略的主语、展开缩写
|
|
29
|
-
4. 统一术语 — 将同义表达归一(如 "状态污染" ↔ "stale closure")
|
|
30
|
-
5. 提取核心概念 — 识别技术关键词
|
|
31
|
-
6. 保持单主题 — 一个 chunk 只描述一个认知主题
|
|
32
|
-
7. 输出结构化 JSON — 严格遵循 schema
|
|
33
|
-
|
|
34
|
-
禁止:
|
|
35
|
-
1. 过度总结
|
|
36
|
-
2. 删除原文
|
|
37
|
-
3. 改写逻辑
|
|
38
|
-
4. 主观推断
|
|
39
|
-
5. 引入不存在的信息
|
|
40
|
-
|
|
16
|
+
export const COMPILE_SYSTEM_PROMPT = `你是一个"知识语义编译器"。
|
|
17
|
+
|
|
18
|
+
你的任务不是总结内容。
|
|
19
|
+
|
|
20
|
+
你的任务是:
|
|
21
|
+
将人类随手记录的非结构化笔记,
|
|
22
|
+
转换为适合机器语义索引、概念检索、
|
|
23
|
+
知识聚类、长期演化的"认知知识单元"。
|
|
24
|
+
|
|
25
|
+
核心原则:
|
|
26
|
+
1. 保留原始信息 — 不删技术细节
|
|
27
|
+
2. 不改变原意 — 只规范化表达
|
|
28
|
+
3. 补全隐式表达 — 补充省略的主语、展开缩写
|
|
29
|
+
4. 统一术语 — 将同义表达归一(如 "状态污染" ↔ "stale closure")
|
|
30
|
+
5. 提取核心概念 — 识别技术关键词
|
|
31
|
+
6. 保持单主题 — 一个 chunk 只描述一个认知主题
|
|
32
|
+
7. 输出结构化 JSON — 严格遵循 schema
|
|
33
|
+
|
|
34
|
+
禁止:
|
|
35
|
+
1. 过度总结
|
|
36
|
+
2. 删除原文
|
|
37
|
+
3. 改写逻辑
|
|
38
|
+
4. 主观推断
|
|
39
|
+
5. 引入不存在的信息
|
|
40
|
+
|
|
41
41
|
你的角色是:"语义标准化器",不是"内容作者"。`;
|
|
42
42
|
// ============================================================
|
|
43
43
|
// User Prompt 构建 (蓝图 §25)
|
|
@@ -75,67 +75,67 @@ export function buildCompilePrompt(chunks) {
|
|
|
75
75
|
return contextLines.join("\n");
|
|
76
76
|
})
|
|
77
77
|
.join("\n\n");
|
|
78
|
-
return `请分析以下 ${chunks.length} 个笔记块。
|
|
79
|
-
|
|
80
|
-
## 上下文说明
|
|
81
|
-
|
|
82
|
-
每个块附带以下上下文信息,请善用以理解块在文件和笔记体系中的位置:
|
|
83
|
-
- FILE_TAGS: 文件级别的标签,反映所属领域(如 AMI, DLMS)
|
|
84
|
-
- HEADING_PATH: 块的标题层级路径(如 "10.18AMI更新 > 排查过程")
|
|
85
|
-
- SIBLING_HEADINGS: 同一文件内所有块的标题,帮助你理解块之间的前后关系
|
|
86
|
-
- CHUNK_POSITION: 本块在文件中的位置和角色(开头/中间/结尾)
|
|
87
|
-
|
|
88
|
-
## 输出字段
|
|
89
|
-
|
|
90
|
-
对每个块,输出一个 JSON 对象,包含以下字段:
|
|
91
|
-
|
|
92
|
-
{
|
|
93
|
-
"key": "块标识(与输入的 KEY 一致)",
|
|
94
|
-
"topic": "核心主题(一句话)",
|
|
95
|
-
"summary": "一句话摘要(≤30 字)",
|
|
96
|
-
"concepts": ["提取的技术概念"],
|
|
97
|
-
"entities": ["实体名称"],
|
|
98
|
-
"aliases": ["同义表达,格式 '中文 ↔ English'"],
|
|
99
|
-
"keywords": ["检索关键词"],
|
|
100
|
-
"normalizedText": "规范化后的文本(补全省略、统一术语、保留所有技术细节)",
|
|
101
|
-
"chunkType": "concept | note | code | reference | todo | idea | question | architecture | decision | log | research",
|
|
102
|
-
"importance": 0.0-1.0,
|
|
103
|
-
"confidence": 0.0-1.0,
|
|
104
|
-
"contentClass": "knowledge | event | conversation | reference",
|
|
105
|
-
"temporalAnchor": "如果内容是事件类,提取时间锚点(如 '2024-10-18'),否则省略",
|
|
106
|
-
"followsChunk": "如果有 SIBLING_HEADINGS 且本块不是第一个,填写前一个块的 KEY",
|
|
107
|
-
"precedesChunk": "如果有 SIBLING_HEADINGS 且本块不是最后一个,填写后一个块的 KEY",
|
|
108
|
-
"siblingHeadings": ["同文件所有块的标题列表(复制 SIBLING_HEADINGS)"]
|
|
109
|
-
}
|
|
110
|
-
|
|
111
|
-
## 关系字段判断指南
|
|
112
|
-
|
|
113
|
-
contentClass 判断:
|
|
114
|
-
- knowledge: 无时间性的技术知识、编码规范、原理说明
|
|
115
|
-
- event: 有时间锚点的故障记录、更新日志、会议记录
|
|
116
|
-
- conversation: 对话推理链中的片段(隐式引用前文)
|
|
117
|
-
- reference: 纯参考列表、配置清单、目录结构
|
|
118
|
-
|
|
119
|
-
followsChunk/precedesChunk:
|
|
120
|
-
- 仅在 SIBLING_HEADINGS 明确显示了前后顺序时填写
|
|
121
|
-
- 使用 CHUNK #N 中的 KEY 值
|
|
122
|
-
- 如果块之间无明显因果/时序关系,留空
|
|
123
|
-
|
|
124
|
-
temporalAnchor:
|
|
125
|
-
- 仅在 contentClass=event 时尝试提取
|
|
126
|
-
- 从 FILE_TAGS、HEADING_PATH 或文本本身提取日期
|
|
127
|
-
- 格式: YYYY-MM-DD,无法确定则留空
|
|
128
|
-
|
|
129
|
-
## 注意事项
|
|
130
|
-
|
|
131
|
-
- normalizedText 必须保留 ALL 技术细节
|
|
132
|
-
- importance: 高价值技术知识 0.8+,TODO/碎片 0.3-,普通笔记 0.5
|
|
133
|
-
- confidence: 信息完整明确 0.9+,曖昧 0.5-
|
|
134
|
-
- 所有关系字段为可选 —— 如果无法判断,留空而非猜测
|
|
135
|
-
|
|
136
|
-
返回格式:一个 JSON 数组,包含 ${chunks.length} 个对象。
|
|
137
|
-
不要 markdown 代码块包裹,直接输出 JSON 数组。
|
|
138
|
-
|
|
78
|
+
return `请分析以下 ${chunks.length} 个笔记块。
|
|
79
|
+
|
|
80
|
+
## 上下文说明
|
|
81
|
+
|
|
82
|
+
每个块附带以下上下文信息,请善用以理解块在文件和笔记体系中的位置:
|
|
83
|
+
- FILE_TAGS: 文件级别的标签,反映所属领域(如 AMI, DLMS)
|
|
84
|
+
- HEADING_PATH: 块的标题层级路径(如 "10.18AMI更新 > 排查过程")
|
|
85
|
+
- SIBLING_HEADINGS: 同一文件内所有块的标题,帮助你理解块之间的前后关系
|
|
86
|
+
- CHUNK_POSITION: 本块在文件中的位置和角色(开头/中间/结尾)
|
|
87
|
+
|
|
88
|
+
## 输出字段
|
|
89
|
+
|
|
90
|
+
对每个块,输出一个 JSON 对象,包含以下字段:
|
|
91
|
+
|
|
92
|
+
{
|
|
93
|
+
"key": "块标识(与输入的 KEY 一致)",
|
|
94
|
+
"topic": "核心主题(一句话)",
|
|
95
|
+
"summary": "一句话摘要(≤30 字)",
|
|
96
|
+
"concepts": ["提取的技术概念"],
|
|
97
|
+
"entities": ["实体名称"],
|
|
98
|
+
"aliases": ["同义表达,格式 '中文 ↔ English'"],
|
|
99
|
+
"keywords": ["检索关键词"],
|
|
100
|
+
"normalizedText": "规范化后的文本(补全省略、统一术语、保留所有技术细节)",
|
|
101
|
+
"chunkType": "concept | note | code | reference | todo | idea | question | architecture | decision | log | research",
|
|
102
|
+
"importance": 0.0-1.0,
|
|
103
|
+
"confidence": 0.0-1.0,
|
|
104
|
+
"contentClass": "knowledge | event | conversation | reference",
|
|
105
|
+
"temporalAnchor": "如果内容是事件类,提取时间锚点(如 '2024-10-18'),否则省略",
|
|
106
|
+
"followsChunk": "如果有 SIBLING_HEADINGS 且本块不是第一个,填写前一个块的 KEY",
|
|
107
|
+
"precedesChunk": "如果有 SIBLING_HEADINGS 且本块不是最后一个,填写后一个块的 KEY",
|
|
108
|
+
"siblingHeadings": ["同文件所有块的标题列表(复制 SIBLING_HEADINGS)"]
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
## 关系字段判断指南
|
|
112
|
+
|
|
113
|
+
contentClass 判断:
|
|
114
|
+
- knowledge: 无时间性的技术知识、编码规范、原理说明
|
|
115
|
+
- event: 有时间锚点的故障记录、更新日志、会议记录
|
|
116
|
+
- conversation: 对话推理链中的片段(隐式引用前文)
|
|
117
|
+
- reference: 纯参考列表、配置清单、目录结构
|
|
118
|
+
|
|
119
|
+
followsChunk/precedesChunk:
|
|
120
|
+
- 仅在 SIBLING_HEADINGS 明确显示了前后顺序时填写
|
|
121
|
+
- 使用 CHUNK #N 中的 KEY 值
|
|
122
|
+
- 如果块之间无明显因果/时序关系,留空
|
|
123
|
+
|
|
124
|
+
temporalAnchor:
|
|
125
|
+
- 仅在 contentClass=event 时尝试提取
|
|
126
|
+
- 从 FILE_TAGS、HEADING_PATH 或文本本身提取日期
|
|
127
|
+
- 格式: YYYY-MM-DD,无法确定则留空
|
|
128
|
+
|
|
129
|
+
## 注意事项
|
|
130
|
+
|
|
131
|
+
- normalizedText 必须保留 ALL 技术细节
|
|
132
|
+
- importance: 高价值技术知识 0.8+,TODO/碎片 0.3-,普通笔记 0.5
|
|
133
|
+
- confidence: 信息完整明确 0.9+,曖昧 0.5-
|
|
134
|
+
- 所有关系字段为可选 —— 如果无法判断,留空而非猜测
|
|
135
|
+
|
|
136
|
+
返回格式:一个 JSON 数组,包含 ${chunks.length} 个对象。
|
|
137
|
+
不要 markdown 代码块包裹,直接输出 JSON 数组。
|
|
138
|
+
|
|
139
139
|
${chunkBlocks}`;
|
|
140
140
|
}
|
|
141
141
|
// ============================================================
|
|
@@ -198,22 +198,22 @@ export function parseCompiledResult(text) {
|
|
|
198
198
|
// v5.2 文件级编译
|
|
199
199
|
// ============================================================
|
|
200
200
|
/** 文件级编译 System Prompt(只要求 LLM 做 4 件事) */
|
|
201
|
-
export const FILE_COMPILE_SYSTEM_PROMPT = `你是一个"知识语义编译器"。
|
|
202
|
-
|
|
203
|
-
你的任务: 将整篇笔记转换为结构化的语义知识单元。
|
|
204
|
-
|
|
205
|
-
你需要做的 4 件事:
|
|
206
|
-
1. 自行判断语义边界 — 将文件分成若干连续的语义片段(segments)
|
|
207
|
-
2. 为每个片段写出 topic(核心主题,一句话)
|
|
208
|
-
3. 为每个片段写出 normalizedText(规范化文本:补全省略、统一术语、保留所有技术细节)
|
|
209
|
-
4. 为每个片段提取 concepts(技术概念)和 aliases(同义表达,格式 "中文 ↔ English")
|
|
210
|
-
|
|
211
|
-
核心原则:
|
|
212
|
-
- 保留原始信息 — 不删技术细节(API 名、参数、错误信息、缩写)
|
|
213
|
-
- 不改变原意 — 只规范化表达
|
|
214
|
-
- 语义边界 = 同一认知主题的自然段或连续段落
|
|
215
|
-
- 如果整个文件是单一主题,只输出 1 个 segment
|
|
216
|
-
|
|
201
|
+
export const FILE_COMPILE_SYSTEM_PROMPT = `你是一个"知识语义编译器"。
|
|
202
|
+
|
|
203
|
+
你的任务: 将整篇笔记转换为结构化的语义知识单元。
|
|
204
|
+
|
|
205
|
+
你需要做的 4 件事:
|
|
206
|
+
1. 自行判断语义边界 — 将文件分成若干连续的语义片段(segments)
|
|
207
|
+
2. 为每个片段写出 topic(核心主题,一句话)
|
|
208
|
+
3. 为每个片段写出 normalizedText(规范化文本:补全省略、统一术语、保留所有技术细节)
|
|
209
|
+
4. 为每个片段提取 concepts(技术概念)和 aliases(同义表达,格式 "中文 ↔ English")
|
|
210
|
+
|
|
211
|
+
核心原则:
|
|
212
|
+
- 保留原始信息 — 不删技术细节(API 名、参数、错误信息、缩写)
|
|
213
|
+
- 不改变原意 — 只规范化表达
|
|
214
|
+
- 语义边界 = 同一认知主题的自然段或连续段落
|
|
215
|
+
- 如果整个文件是单一主题,只输出 1 个 segment
|
|
216
|
+
|
|
217
217
|
禁止: 过度总结、删除原文、改写逻辑、主观推断、引入不存在的信息。`;
|
|
218
218
|
/**
|
|
219
219
|
* 为文件级编译构建 prompt
|
|
@@ -226,36 +226,36 @@ export function buildFileCompilePrompt(relPath, fullText, preprocessed) {
|
|
|
226
226
|
.slice(0, 5)
|
|
227
227
|
.map((p) => ` - ${p.heading || "(无标题)"} [${p.chunkType}, ${p.contentClass}, imp=${p.importance.toFixed(1)}]`)
|
|
228
228
|
.join("\n");
|
|
229
|
-
return `请分析以下笔记文件。
|
|
230
|
-
|
|
231
|
-
文件: ${relPath}
|
|
232
|
-
程序预分析(仅供参考,你不需填写这些字段):
|
|
233
|
-
${preSummary}
|
|
234
|
-
|
|
235
|
-
你需要输出一个 JSON 对象:
|
|
236
|
-
|
|
237
|
-
{
|
|
238
|
-
"segments": [
|
|
239
|
-
{
|
|
240
|
-
"text": "片段原文(从文件中截取)",
|
|
241
|
-
"topic": "核心主题(一句话)",
|
|
242
|
-
"normalizedText": "规范化文本",
|
|
243
|
-
"concepts": ["技术概念"],
|
|
244
|
-
"aliases": ["同义表达 (格式: 中文 ↔ English)"]
|
|
245
|
-
}
|
|
246
|
-
]
|
|
247
|
-
}
|
|
248
|
-
|
|
249
|
-
注意事项:
|
|
250
|
-
- segments 按文件顺序排列
|
|
251
|
-
- 语义边界你自己判断: 同一认知主题归为一个 segment
|
|
252
|
-
- 一个 segment 可以包含多个自然段
|
|
253
|
-
- 至少输出 1 个 segment
|
|
254
|
-
- normalizedText 必须保留 ALL 技术细节
|
|
255
|
-
- 不要 markdown 代码块包裹,直接输出 JSON
|
|
256
|
-
|
|
257
|
-
=== 文件全文 ===
|
|
258
|
-
|
|
229
|
+
return `请分析以下笔记文件。
|
|
230
|
+
|
|
231
|
+
文件: ${relPath}
|
|
232
|
+
程序预分析(仅供参考,你不需填写这些字段):
|
|
233
|
+
${preSummary}
|
|
234
|
+
|
|
235
|
+
你需要输出一个 JSON 对象:
|
|
236
|
+
|
|
237
|
+
{
|
|
238
|
+
"segments": [
|
|
239
|
+
{
|
|
240
|
+
"text": "片段原文(从文件中截取)",
|
|
241
|
+
"topic": "核心主题(一句话)",
|
|
242
|
+
"normalizedText": "规范化文本",
|
|
243
|
+
"concepts": ["技术概念"],
|
|
244
|
+
"aliases": ["同义表达 (格式: 中文 ↔ English)"]
|
|
245
|
+
}
|
|
246
|
+
]
|
|
247
|
+
}
|
|
248
|
+
|
|
249
|
+
注意事项:
|
|
250
|
+
- segments 按文件顺序排列
|
|
251
|
+
- 语义边界你自己判断: 同一认知主题归为一个 segment
|
|
252
|
+
- 一个 segment 可以包含多个自然段
|
|
253
|
+
- 至少输出 1 个 segment
|
|
254
|
+
- normalizedText 必须保留 ALL 技术细节
|
|
255
|
+
- 不要 markdown 代码块包裹,直接输出 JSON
|
|
256
|
+
|
|
257
|
+
=== 文件全文 ===
|
|
258
|
+
|
|
259
259
|
${fullText}`;
|
|
260
260
|
}
|
|
261
261
|
/** 从 LLM 响应中提取 FileSegment 数组 */
|
|
@@ -283,44 +283,44 @@ export function parseFileSegments(text) {
|
|
|
283
283
|
// v5.4 文件级编译(简化版:不要求 segments,只要求 1 个文件的 4 字段)
|
|
284
284
|
// ============================================================
|
|
285
285
|
/** v5.4 文件级 System Prompt(极简版) */
|
|
286
|
-
export const FILE_LLM_SYSTEM_PROMPT = `你是一个"知识语义编译器"。
|
|
287
|
-
|
|
288
|
-
你的任务: 将整篇笔记转换为结构化的语义元数据,用于增强语义搜索。
|
|
289
|
-
|
|
290
|
-
你需要输出的 4 个字段:
|
|
291
|
-
1. topic — 核心主题(一句话概括全文)
|
|
292
|
-
2. normalizedText — 规范化文本(补全省略主语、统一术语、保留所有技术细节)
|
|
293
|
-
3. concepts — 技术概念列表(3-8 个核心概念)
|
|
294
|
-
4. aliases — 同义表达(格式 "中文 ↔ English",2-5 组)
|
|
295
|
-
|
|
296
|
-
核心原则:
|
|
297
|
-
- 保留所有技术细节(API 名、参数、错误信息、缩写、版本号)
|
|
298
|
-
- 不改变原意,只规范化表达
|
|
299
|
-
- concepts 提取技术关键词,不是摘要
|
|
300
|
-
- aliases 覆盖中英对照和缩写展开
|
|
301
|
-
|
|
286
|
+
export const FILE_LLM_SYSTEM_PROMPT = `你是一个"知识语义编译器"。
|
|
287
|
+
|
|
288
|
+
你的任务: 将整篇笔记转换为结构化的语义元数据,用于增强语义搜索。
|
|
289
|
+
|
|
290
|
+
你需要输出的 4 个字段:
|
|
291
|
+
1. topic — 核心主题(一句话概括全文)
|
|
292
|
+
2. normalizedText — 规范化文本(补全省略主语、统一术语、保留所有技术细节)
|
|
293
|
+
3. concepts — 技术概念列表(3-8 个核心概念)
|
|
294
|
+
4. aliases — 同义表达(格式 "中文 ↔ English",2-5 组)
|
|
295
|
+
|
|
296
|
+
核心原则:
|
|
297
|
+
- 保留所有技术细节(API 名、参数、错误信息、缩写、版本号)
|
|
298
|
+
- 不改变原意,只规范化表达
|
|
299
|
+
- concepts 提取技术关键词,不是摘要
|
|
300
|
+
- aliases 覆盖中英对照和缩写展开
|
|
301
|
+
|
|
302
302
|
禁止: 过度总结、删除原文、改写逻辑、主观推断、引入不存在的信息。`;
|
|
303
303
|
/**
|
|
304
304
|
* v5.4 构建简化文件级编译 prompt
|
|
305
305
|
*/
|
|
306
306
|
export function buildFileLLMPrompt(relPath, fullText) {
|
|
307
|
-
return `请分析以下笔记文件,提取语义元数据。
|
|
308
|
-
|
|
309
|
-
文件: ${relPath}
|
|
310
|
-
|
|
311
|
-
输出一个 JSON 对象:
|
|
312
|
-
|
|
313
|
-
{
|
|
314
|
-
"topic": "核心主题(一句话)",
|
|
315
|
-
"normalizedText": "规范化文本",
|
|
316
|
-
"concepts": ["技术概念1", "技术概念2"],
|
|
317
|
-
"aliases": ["中文 ↔ English"]
|
|
318
|
-
}
|
|
319
|
-
|
|
320
|
-
不要 markdown 代码块包裹,直接输出 JSON。
|
|
321
|
-
|
|
322
|
-
=== 文件全文 ===
|
|
323
|
-
|
|
307
|
+
return `请分析以下笔记文件,提取语义元数据。
|
|
308
|
+
|
|
309
|
+
文件: ${relPath}
|
|
310
|
+
|
|
311
|
+
输出一个 JSON 对象:
|
|
312
|
+
|
|
313
|
+
{
|
|
314
|
+
"topic": "核心主题(一句话)",
|
|
315
|
+
"normalizedText": "规范化文本",
|
|
316
|
+
"concepts": ["技术概念1", "技术概念2"],
|
|
317
|
+
"aliases": ["中文 ↔ English"]
|
|
318
|
+
}
|
|
319
|
+
|
|
320
|
+
不要 markdown 代码块包裹,直接输出 JSON。
|
|
321
|
+
|
|
322
|
+
=== 文件全文 ===
|
|
323
|
+
|
|
324
324
|
${fullText}`;
|
|
325
325
|
}
|
|
326
326
|
/**
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import type { FileEntry } from "./types.js";
|
|
2
|
+
import type { Bm25Stats } from "./bm25.js";
|
|
2
3
|
export declare function readIndex(): Record<string, FileEntry>;
|
|
3
4
|
export declare function writeIndex(idx: Record<string, FileEntry>): void;
|
|
4
5
|
export declare function getIndex(): Record<string, FileEntry>;
|
|
@@ -11,4 +12,6 @@ export declare function removeEntriesBySource(sourceDir: string): number;
|
|
|
11
12
|
export declare function indexStats(): {
|
|
12
13
|
files: number;
|
|
13
14
|
};
|
|
15
|
+
export declare function readBm25Stats(): Bm25Stats | null;
|
|
16
|
+
export declare function writeBm25Stats(stats: Bm25Stats): void;
|
|
14
17
|
//# sourceMappingURL=store-index.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"store-index.d.ts","sourceRoot":"","sources":["../../src/lib/store-index.ts"],"names":[],"mappings":"AAOA,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,YAAY,CAAC;
|
|
1
|
+
{"version":3,"file":"store-index.d.ts","sourceRoot":"","sources":["../../src/lib/store-index.ts"],"names":[],"mappings":"AAOA,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,YAAY,CAAC;AAC5C,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,WAAW,CAAC;AAK3C,wBAAgB,SAAS,IAAI,MAAM,CAAC,MAAM,EAAE,SAAS,CAAC,CAMrD;AAED,wBAAgB,UAAU,CAAC,GAAG,EAAE,MAAM,CAAC,MAAM,EAAE,SAAS,CAAC,GAAG,IAAI,CAE/D;AAID,wBAAgB,QAAQ,IAAI,MAAM,CAAC,MAAM,EAAE,SAAS,CAAC,CAEpD;AAED,wBAAgB,QAAQ,CAAC,OAAO,EAAE,MAAM,GAAG,SAAS,GAAG,IAAI,CAE1D;AAID,wBAAgB,UAAU,CAAC,OAAO,EAAE,SAAS,EAAE,GAAG,IAAI,CAOrD;AAED,wBAAgB,WAAW,CAAC,OAAO,EAAE,MAAM,GAAG,OAAO,CAMpD;AAED,wBAAgB,eAAe,CAC7B,UAAU,EAAE,MAAM,EAClB,UAAU,EAAE,MAAM,EAClB,KAAK,EAAE,SAAS,GACf,OAAO,CAOT;AAED,2BAA2B;AAC3B,wBAAgB,qBAAqB,CAAC,SAAS,EAAE,MAAM,GAAG,MAAM,CAW/D;AAID,wBAAgB,UAAU,IAAI;IAAE,KAAK,EAAE,MAAM,CAAA;CAAE,CAE9C;AAID,wBAAgB,aAAa,IAAI,SAAS,GAAG,IAAI,CAMhD;AAED,wBAAgB,cAAc,CAAC,KAAK,EAAE,SAAS,GAAG,IAAI,CAErD"}
|
package/dist/lib/store-index.js
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
// 管理: 文件索引 Record<relPath, FileEntry>
|
|
4
4
|
// 拆自旧 store-settings.ts,索引不再嵌入 config.json
|
|
5
5
|
import { existsSync, readFileSync, writeFileSync } from "node:fs";
|
|
6
|
-
import { indexFile } from "../config.js";
|
|
6
|
+
import { indexFile, bm25StatsFile } from "../config.js";
|
|
7
7
|
import { setLastScan } from "./store-config.js";
|
|
8
8
|
// ---- 核心读写 ----
|
|
9
9
|
export function readIndex() {
|
|
@@ -69,4 +69,17 @@ export function removeEntriesBySource(sourceDir) {
|
|
|
69
69
|
export function indexStats() {
|
|
70
70
|
return { files: Object.keys(readIndex()).length };
|
|
71
71
|
}
|
|
72
|
+
// ---- BM25 统计持久化 ----
|
|
73
|
+
export function readBm25Stats() {
|
|
74
|
+
try {
|
|
75
|
+
const p = bm25StatsFile();
|
|
76
|
+
if (existsSync(p))
|
|
77
|
+
return JSON.parse(readFileSync(p, "utf-8"));
|
|
78
|
+
}
|
|
79
|
+
catch { /* ignore */ }
|
|
80
|
+
return null;
|
|
81
|
+
}
|
|
82
|
+
export function writeBm25Stats(stats) {
|
|
83
|
+
writeFileSync(bm25StatsFile(), JSON.stringify(stats, null, 2), "utf-8");
|
|
84
|
+
}
|
|
72
85
|
//# sourceMappingURL=store-index.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"store-index.js","sourceRoot":"","sources":["../../src/lib/store-index.ts"],"names":[],"mappings":"AAAA,mCAAmC;AACnC,EAAE;AACF,sCAAsC;AACtC,2CAA2C;AAE3C,OAAO,EAAE,UAAU,EAAE,YAAY,EAAE,aAAa,EAAE,MAAM,SAAS,CAAC;AAClE,OAAO,EAAE,SAAS,EAAE,MAAM,cAAc,CAAC;
|
|
1
|
+
{"version":3,"file":"store-index.js","sourceRoot":"","sources":["../../src/lib/store-index.ts"],"names":[],"mappings":"AAAA,mCAAmC;AACnC,EAAE;AACF,sCAAsC;AACtC,2CAA2C;AAE3C,OAAO,EAAE,UAAU,EAAE,YAAY,EAAE,aAAa,EAAE,MAAM,SAAS,CAAC;AAClE,OAAO,EAAE,SAAS,EAAE,aAAa,EAAE,MAAM,cAAc,CAAC;AAGxD,OAAO,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAEhD,iBAAiB;AAEjB,MAAM,UAAU,SAAS;IACvB,IAAI,CAAC;QACH,MAAM,CAAC,GAAG,SAAS,EAAE,CAAC;QACtB,IAAI,UAAU,CAAC,CAAC,CAAC;YAAE,OAAO,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC,CAAC;IACjE,CAAC;IAAC,MAAM,CAAC,CAAC,YAAY,CAAC,CAAC;IACxB,OAAO,EAAE,CAAC;AACZ,CAAC;AAED,MAAM,UAAU,UAAU,CAAC,GAA8B;IACvD,aAAa,CAAC,SAAS,EAAE,EAAE,IAAI,CAAC,SAAS,CAAC,GAAG,EAAE,IAAI,EAAE,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC;AACpE,CAAC;AAED,eAAe;AAEf,MAAM,UAAU,QAAQ;IACtB,OAAO,SAAS,EAAE,CAAC;AACrB,CAAC;AAED,MAAM,UAAU,QAAQ,CAAC,OAAe;IACtC,OAAO,SAAS,EAAE,CAAC,OAAO,CAAC,IAAI,IAAI,CAAC;AACtC,CAAC;AAED,eAAe;AAEf,MAAM,UAAU,UAAU,CAAC,OAAoB;IAC7C,MAAM,GAAG,GAAG,SAAS,EAAE,CAAC;IACxB,KAAK,MAAM,CAAC,IAAI,OAAO,EAAE,CAAC;QACxB,GAAG,CAAC,CAAC,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;IACrB,CAAC;IACD,UAAU,CAAC,GAAG,CAAC,CAAC;IAChB,WAAW,CAAC,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC,CAAC;AACxC,CAAC;AAED,MAAM,UAAU,WAAW,CAAC,OAAe;IACzC,MAAM,GAAG,GAAG,SAAS,EAAE,CAAC;IACxB,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC;QAAE,OAAO,KAAK,CAAC;IAChC,OAAO,GAAG,CAAC,OAAO,CAAC,CAAC;IACpB,UAAU,CAAC,GAAG,CAAC,CAAC;IAChB,OAAO,IAAI,CAAC;AACd,CAAC;AAED,MAAM,UAAU,eAAe,CAC7B,UAAkB,EAClB,UAAkB,EAClB,KAAgB;IAEhB,MAAM,GAAG,GAAG,SAAS,EAAE,CAAC;IACxB,IAAI,CAAC,GAAG,CAAC,UAAU,CAAC;QAAE,OAAO,KAAK,CAAC;IACnC,OAAO,GAAG,CAAC,UAAU,CAAC,CAAC;IACvB,GAAG,CAAC,UAAU,CAAC,GAAG,KAAK,CAAC;IACxB,UAAU,CAAC,GAAG,CAAC,CAAC;IAChB,OAAO,IAAI,CAAC;AACd,CAAC;AAED,2BAA2B;AAC3B,MAAM,UAAU,qBAAqB,CAAC,SAAiB;IACrD,MAAM,GAAG,GAAG,SAAS,EAAE,CAAC;IACxB,IAAI,KAAK,GAAG,CAAC,CAAC;IACd,KAAK,MAAM,CAAC,GAAG,EAAE,KAAK,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC;QAC/C,IAAI,KAAK,CAAC,SAAS,KAAK,SAAS,EAAE,CAAC;YAClC,OAAO,GAAG,CAAC,GAAG,CAAC,CAAC;YAChB,KAAK,EAAE,CAAC;QACV,CAAC;IACH,CAAC;IACD,IAA
I,KAAK;QAAE,UAAU,CAAC,GAAG,CAAC,CAAC;IAC3B,OAAO,KAAK,CAAC;AACf,CAAC;AAED,eAAe;AAEf,MAAM,UAAU,UAAU;IACxB,OAAO,EAAE,KAAK,EAAE,MAAM,CAAC,IAAI,CAAC,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC;AACpD,CAAC;AAED,uBAAuB;AAEvB,MAAM,UAAU,aAAa;IAC3B,IAAI,CAAC;QACH,MAAM,CAAC,GAAG,aAAa,EAAE,CAAC;QAC1B,IAAI,UAAU,CAAC,CAAC,CAAC;YAAE,OAAO,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC,CAAC;IACjE,CAAC;IAAC,MAAM,CAAC,CAAC,YAAY,CAAC,CAAC;IACxB,OAAO,IAAI,CAAC;AACd,CAAC;AAED,MAAM,UAAU,cAAc,CAAC,KAAgB;IAC7C,aAAa,CAAC,aAAa,EAAE,EAAE,IAAI,CAAC,SAAS,CAAC,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC;AAC1E,CAAC"}
|
package/dist/lib/store.d.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
export { getSources, addSource, removeSource, getSemanticEnabled, setSemanticEnabled, readModelId, writeModelId, getWikiModel, configStats, } from "./store-config.js";
|
|
2
|
-
export { getIndex, mergeIndex, removeEntry, updateEntryPath, getEntry, removeEntriesBySource, indexStats, } from "./store-index.js";
|
|
2
|
+
export { getIndex, mergeIndex, removeEntry, updateEntryPath, getEntry, removeEntriesBySource, indexStats, readBm25Stats, writeBm25Stats, } from "./store-index.js";
|
|
3
3
|
export { getEmbeddings, setEmbeddings, getChunkInfo, setChunkInfo, removeEmbedding, getEmbeddingModel, getEmbeddingDim, vectorsStats, } from "./store-vectors.js";
|
|
4
4
|
export declare function stats(): {
|
|
5
5
|
sources: number;
|