kc-beta 0.5.5 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/QUICKSTART.md +17 -4
- package/README.md +58 -11
- package/bin/kc-beta.js +35 -1
- package/package.json +1 -1
- package/src/agent/bundle-tree.js +553 -0
- package/src/agent/context.js +40 -1
- package/src/agent/engine.js +644 -28
- package/src/agent/llm-client.js +67 -18
- package/src/agent/pipelines/finalization.js +186 -0
- package/src/agent/pipelines/index.js +8 -0
- package/src/agent/pipelines/initializer.js +40 -0
- package/src/agent/pipelines/skill-authoring.js +100 -6
- package/src/agent/skill-loader.js +54 -4
- package/src/agent/task-manager.js +66 -3
- package/src/agent/tools/agent-tool.js +283 -35
- package/src/agent/tools/bundle-search.js +146 -0
- package/src/agent/tools/document-chunk.js +246 -0
- package/src/agent/tools/document-classify.js +311 -0
- package/src/agent/tools/document-parse.js +8 -1
- package/src/agent/tools/phase-advance.js +30 -7
- package/src/agent/tools/registry.js +10 -0
- package/src/agent/tools/rule-catalog.js +17 -3
- package/src/agent/tools/sandbox-exec.js +30 -0
- package/src/agent/workspace.js +168 -14
- package/src/cli/components.js +165 -17
- package/src/cli/index.js +166 -19
- package/src/cli/meme.js +58 -0
- package/src/config.js +39 -2
- package/src/model-tiers.json +3 -2
- package/src/providers.js +34 -1
- package/template/skills/en/meta-meta/evolution-loop/SKILL.md +13 -1
- package/template/skills/en/meta-meta/rule-extraction/SKILL.md +74 -0
- package/template/skills/zh/meta-meta/evolution-loop/SKILL.md +7 -1
- package/template/skills/zh/meta-meta/rule-extraction/SKILL.md +73 -0
package/src/agent/bundle-tree.js
ADDED

@@ -0,0 +1,553 @@
/**
 * Onion-peeler chunker with virtual-root multi-file bundle support.
 *
 * Ported from archive/pr_verify_app/backend/shared/chunker.py. The Python
 * version is battle-tested across E2E #3 and the AMC verification app's
 * ~378-task run; this is a faithful Node translation kept close to the
 * original so future AMC-side fixes port cleanly.
 *
 * Shape:
 *
 * root (bundle)
 * ├── file: foo.pdf
 * │   ├── §1 重要提示
 * │   ├── §2 产品概况
 * │   │   └── 2.1 名称...
 * │   └── §3 财务指标
 * └── file: bar.xlsx
 *     └── (single leaf — non-paged doc)
 *
 * - Each leaf carries `pageRange: [start, end]` in the source file.
 * - Leaves are bounded by `maxTokensPerChunk` (default 2000 ≈ 5000 chars CJK).
 * - A CJK-bigram + English-word keyword index maps tokens → chunkIds for
 *   O(1) RAG lookup without embedding models.
 * - Output is JSON-serializable for disk caching.
 */

// ------------------ Constants ------------------

const HEADING_RE = /^(#{1,6})\s+(.+?)\s*$/gm;
const TOKEN_CHARS = 2.5; // CJK-heavy rough estimate; matches Python side

// Chinese + English tokenizers for the keyword index.
const CJK_CHUNK_RE = /[\u4e00-\u9fff]{2,}/g;
const EN_WORD_RE = /[A-Za-z][A-Za-z0-9_-]{2,}/g;

export function estimateTokens(text) {
  if (!text) return 1;
  return Math.max(1, Math.floor(text.length / TOKEN_CHARS));
}

// ------------------ BundleTree ------------------

/**
 * A chunk is a plain JS object with the shape:
 *   {
 *     chunk_id, kind, title,
 *     header_path: [],
 *     source_file, page_range: [start, end],
 *     children: [chunkId], content, tokens
 *   }
 * Kept as a plain object (not a class) so JSON (de)serialization is trivial.
 */

export class BundleTree {
  constructor({ rootId, chunks, keywordIndex, leavesOrder }) {
    this.root_id = rootId;
    this.chunks = chunks;
    this.keyword_index = keywordIndex;
    this.leaves_order = leavesOrder;
  }

  toJSON() {
    return {
      root_id: this.root_id,
      chunks: this.chunks,
      keyword_index: this.keyword_index,
      leaves_order: this.leaves_order,
    };
  }

  static fromJSON(obj) {
    return new BundleTree({
      rootId: obj.root_id,
      chunks: obj.chunks,
      keywordIndex: obj.keyword_index,
      leavesOrder: obj.leaves_order,
    });
  }

  // --- Query API (mirrors chunker.py BundleTree) ---

  get(chunkId) {
    return this.chunks[chunkId] || null;
  }

  /**
   * Compact textual outline — handed to the agent so it can pick which chunks
   * to fetch. One line per node, indented by depth.
   */
  outline(maxDepth = 3) {
    const lines = [];
    const walk = (cid, depth) => {
      const ch = this.chunks[cid];
      if (!ch || depth > maxDepth) return;
      const prefix = " ".repeat(depth);
      let label;
      if (ch.kind === "root") label = "📦 bundle";
      else if (ch.kind === "file") label = `📄 ${ch.source_file || ch.title || ""}`;
      else if (ch.kind === "section") label = `§ ${ch.title}`;
      else if (ch.kind === "leaf") label = `• ${ch.title} [${ch.chunk_id}]`;
      else label = ch.title || "";

      const pr = ch.page_range || [1, 1];
      const loc = ch.kind === "leaf" && pr[0] > 1 ? ` (p.${pr[0]})` : "";
      if (ch.kind === "leaf") {
        lines.push(`${prefix}${label}${loc} · ${ch.tokens || 0} tokens`);
      } else {
        lines.push(`${prefix}${label}`);
      }
      for (const childId of ch.children || []) walk(childId, depth + 1);
    };
    walk(this.root_id, 0);
    return lines.join("\n");
  }

  /**
   * Score leaves by how many of the keywords hit them (index + substring
   * fallback for multi-word phrases). Return up to `limit` ranked results.
   */
  search(keywords, limit = 8) {
    if (!Array.isArray(keywords) || keywords.length === 0) return [];
    const kws = keywords
      .map((k) => (typeof k === "string" ? k.trim().toLowerCase() : ""))
      .filter(Boolean);
    if (kws.length === 0) return [];

    const scores = new Map();
    for (const kw of kws) {
      const hits = this.keyword_index[kw] || [];
      for (const cid of hits) {
        scores.set(cid, (scores.get(cid) || 0) + 1);
      }
    }

    // Substring fallback for keywords not in the index (e.g. multi-word phrases).
    const indexed = new Set(Object.keys(this.keyword_index));
    const unindexed = kws.filter((k) => !indexed.has(k));
    if (unindexed.length > 0) {
      for (const cid of this.leaves_order) {
        const ch = this.chunks[cid];
        if (!ch) continue;
        const hay = ((ch.content || "") + "\n" + (ch.header_path || []).join("/"))
          .toLowerCase();
        for (const kw of unindexed) {
          if (hay.includes(kw)) {
            scores.set(cid, (scores.get(cid) || 0) + 1);
          }
        }
      }
    }

    // Rank: higher score first, then document order.
    const positionOf = new Map(this.leaves_order.map((cid, i) => [cid, i]));
    const ranked = Array.from(scores.entries()).sort((a, b) => {
      if (b[1] !== a[1]) return b[1] - a[1];
      const pa = positionOf.get(a[0]) ?? Infinity;
      const pb = positionOf.get(b[0]) ?? Infinity;
      return pa - pb;
    });

    return ranked.slice(0, limit).map(([cid]) => this.chunks[cid]);
  }

  allLeaves() {
    return this.leaves_order.map((cid) => this.chunks[cid]).filter(Boolean);
  }

  /** Direct children of the synthetic root (one entry per input file). */
  files() {
    const root = this.chunks[this.root_id];
    if (!root) return [];
    return (root.children || []).map((cid) => this.chunks[cid]).filter(Boolean);
  }
}

// ------------------ Builder ------------------

/**
 * Build a BundleTree from parsed files.
 *
 * @param {Array<{source_file: string, total_pages: number, blocks: Array<{page: number, markdown: string}>, parse_error?: string}>} parsedFiles
 * @param {{ maxTokensPerChunk?: number }} [opts]
 * @returns {BundleTree}
 */
export function buildBundleTree(parsedFiles, { maxTokensPerChunk = 2000 } = {}) {
  const chunks = {};
  const leavesOrder = [];

  const add = (ch) => {
    chunks[ch.chunk_id] = ch;
    if (ch.kind === "leaf") leavesOrder.push(ch.chunk_id);
    return ch.chunk_id;
  };

  // Root
  const rootId = "bundle_root";
  add({
    chunk_id: rootId, kind: "root", title: "文档包(Bundle)",
    header_path: [], source_file: "", page_range: [1, 1], children: [],
    content: "", tokens: 0,
  });

  parsedFiles.forEach((pf, pfIdx) => {
    // Parse-error placeholder leaf
    if ((!pf.blocks || pf.blocks.length === 0) && pf.parse_error) {
      const errId = `file${String(pfIdx).padStart(2, "0")}_error`;
      add({
        chunk_id: errId, kind: "leaf",
        title: `${pf.source_file} (解析失败)`,
        source_file: pf.source_file, page_range: [1, 1],
        content: `[parse error] ${pf.parse_error}`,
        tokens: estimateTokens(pf.parse_error || ""),
        header_path: [`${pf.source_file} (解析失败)`],
        children: [],
      });
      chunks[rootId].children.push(errId);
      return;
    }

    // File node
    const fileChunkId = `file${String(pfIdx).padStart(2, "0")}`;
    add({
      chunk_id: fileChunkId, kind: "file", title: pf.source_file,
      source_file: pf.source_file,
      page_range: [1, pf.total_pages || 1],
      header_path: [], content: "", tokens: 0, children: [],
    });
    chunks[rootId].children.push(fileChunkId);

    // Per-file section tree
    const sectionsRoot = parseFileIntoSections(pf);
    emitSections({
      parentChunkId: fileChunkId,
      node: sectionsRoot,
      pf,
      chunks,
      leavesOrder,
      maxTokens: maxTokensPerChunk,
      counter: { n: 0 },
      filePrefix: fileChunkId,
      headerAncestry: [],
    });
  });

  const keywordIndex = buildKeywordIndex(chunks, leavesOrder);

  return new BundleTree({
    rootId,
    chunks,
    keywordIndex,
    leavesOrder,
  });
}

// ------------------ Per-file section tree ------------------

/**
 * Internal section node used during tree construction. Not serialized.
 *
 * @typedef {{
 *   header: string,
 *   level: number,
 *   body: string,
 *   page_range: [number, number],
 *   children: SectionNode[],
 * }} SectionNode
 */

function parseFileIntoSections(pf) {
  // Concatenate blocks with page separators, tracking (start, end, page).
  const fullParts = [];
  const pageMap = []; // [start, end, page]
  let cum = 0;
  for (const b of pf.blocks) {
    const body = b.markdown || "";
    const start = cum;
    cum += body.length + 2; // + "\n\n"
    pageMap.push([start, cum, b.page]);
    fullParts.push(body);
  }
  const fullText = fullParts.join("\n\n");

  // All markdown headers
  const headers = []; // [start, end, level, title]
  HEADING_RE.lastIndex = 0;
  let m;
  while ((m = HEADING_RE.exec(fullText)) !== null) {
    headers.push([m.index, m.index + m[0].length, m[1].length, m[2].trim()]);
  }

  // Segments: (start, end, level, title)
  const segments = [];
  if (headers.length === 0) {
    segments.push([0, fullText.length, 1, ""]);
  } else {
    if (headers[0][0] > 0) segments.push([0, headers[0][0], 1, ""]);
    for (let i = 0; i < headers.length; i++) {
      const [, eHdr, lvl, title] = headers[i];
      const bodyStart = eHdr;
      const bodyEnd = i + 1 < headers.length ? headers[i + 1][0] : fullText.length;
      segments.push([bodyStart, bodyEnd, lvl, title]);
    }
  }

  const root = { header: "", level: 0, body: "", page_range: [1, 1], children: [] };
  const stack = [root];
  for (const [start, end, level, title] of segments) {
    const body = fullText.slice(start, end).trim();
    const pr = pageRange(start, end, pageMap);
    const node = { header: title, level, body, page_range: pr, children: [] };
    while (stack.length > 0 && stack[stack.length - 1].level >= level) {
      stack.pop();
    }
    if (stack.length === 0) stack.push(root);
    stack[stack.length - 1].children.push(node);
    stack.push(node);
  }

  return root;
}

function pageRange(start, end, pageMap) {
  if (pageMap.length === 0) return [1, 1];
  let first = pageMap[pageMap.length - 1][2];
  let last = pageMap[0][2];
  for (const [s, e, p] of pageMap) {
    if (s <= start && start < e && p < first) first = p;
  }
  for (const [s, e, p] of pageMap) {
    if (s < end && end <= e && p > last) last = p;
    else if (s < end) last = Math.max(last, p);
  }
  first = Math.max(1, Math.min(first, last));
  return [first, last];
}

// ------------------ Emit chunks recursively ------------------

function subtreeTokens(node) {
  let n = estimateTokens(node.body);
  for (const c of node.children) n += subtreeTokens(c);
  return n;
}

function emitSections({
  parentChunkId, node, pf, chunks, leavesOrder,
  maxTokens, counter, filePrefix, headerAncestry,
}) {
  // Whole subtree fits → single leaf
  if (subtreeTokens(node) <= maxTokens && (node.body || node.children.length > 0)) {
    const bodyParts = [];
    if (node.header && node.level > 0) {
      bodyParts.push(`${"#".repeat(node.level)} ${node.header}`);
    }
    if (node.body.trim()) bodyParts.push(node.body.trim());

    const walkBody = (child) => {
      if (child.header && child.level > 0) {
        bodyParts.push(`${"#".repeat(child.level)} ${child.header}`);
      }
      if (child.body.trim()) bodyParts.push(child.body.trim());
      for (const c of child.children) walkBody(c);
    };
    for (const c of node.children) walkBody(c);

    // Union page range across entire subtree
    const pr = [node.page_range[0], node.page_range[1]];
    const unionPr = (nn) => {
      pr[0] = Math.min(pr[0], nn.page_range[0]);
      pr[1] = Math.max(pr[1], nn.page_range[1]);
      for (const c of nn.children) unionPr(c);
    };
    unionPr(node);

    counter.n += 1;
    const leafId = `${filePrefix}_c${String(counter.n).padStart(3, "0")}`;
    const title =
      node.header ||
      deriveTitle(bodyParts[0] || "") ||
      `段落 ${counter.n}`;
    const fullContent = bodyParts.join("\n\n").trim();
    const ch = {
      chunk_id: leafId, kind: "leaf", title,
      source_file: pf.source_file, page_range: pr,
      content: fullContent,
      tokens: estimateTokens(fullContent),
      header_path: [...headerAncestry, ...(node.header ? [node.header] : [])],
      children: [],
    };
    chunks[leafId] = ch;
    leavesOrder.push(leafId);
    chunks[parentChunkId].children.push(leafId);
    return;
  }

  // Too large → emit as section, recurse into children
  const myAncestry = [...headerAncestry, ...(node.header ? [node.header] : [])];

  // Own body → size-bounded leaves
  if (node.body.trim()) {
    for (const [splitBody, splitPr] of splitTextIntoSizedParts(
      node.body, node.page_range, maxTokens,
    )) {
      counter.n += 1;
      const leafId = `${filePrefix}_c${String(counter.n).padStart(3, "0")}`;
      const firstLineTitle = deriveTitle(splitBody) || node.header || `段落 ${counter.n}`;
      const prefixHeader =
        node.header && node.level > 0 ? `${"#".repeat(node.level)} ${node.header}\n\n` : "";
      const fullContent = (prefixHeader + splitBody).trim();
      const ch = {
        chunk_id: leafId, kind: "leaf",
        title: node.header || firstLineTitle,
        source_file: pf.source_file, page_range: splitPr,
        content: fullContent,
        tokens: estimateTokens(fullContent),
        header_path: myAncestry,
        children: [],
      };
      chunks[leafId] = ch;
      leavesOrder.push(leafId);
      chunks[parentChunkId].children.push(leafId);
    }
  }

  // Children → section container + recurse
  if (node.children.length > 0) {
    if (
      node.level > 0 &&
      (node.body.trim() || !sectionAlreadyEmitted(chunks, parentChunkId, node.header))
    ) {
      const sectionId = `${filePrefix}_s${String(Object.keys(chunks).length).padStart(4, "0")}`;
      chunks[sectionId] = {
        chunk_id: sectionId, kind: "section",
        title: node.header || "(无标题段)",
        source_file: pf.source_file,
        page_range: [node.page_range[0], node.page_range[1]],
        header_path: myAncestry,
        children: [], content: "", tokens: 0,
      };
      chunks[parentChunkId].children.push(sectionId);
      for (const child of node.children) {
        emitSections({
          parentChunkId: sectionId, node: child, pf, chunks, leavesOrder,
          maxTokens, counter, filePrefix, headerAncestry: myAncestry,
        });
      }
    } else {
      for (const child of node.children) {
        emitSections({
          parentChunkId, node: child, pf, chunks, leavesOrder,
          maxTokens, counter, filePrefix, headerAncestry: myAncestry,
        });
      }
    }
  }
}

function sectionAlreadyEmitted(chunks, parentId, header) {
  if (!header) return false;
  const parent = chunks[parentId];
  if (!parent) return false;
  for (const cid of parent.children || []) {
    const ch = chunks[cid];
    if (ch && ch.title === header && ch.kind === "section") return true;
  }
  return false;
}

function deriveTitle(text) {
  if (!text) return "";
  for (const line of text.split("\n")) {
    const stripped = line.trim().replace(/^#+/, "").trim();
    if (stripped) return stripped.slice(0, 60);
  }
  return "";
}

function splitTextIntoSizedParts(text, pageRangeArr, maxTokens) {
  if (estimateTokens(text) <= maxTokens) {
    return [[text, [pageRangeArr[0], pageRangeArr[1]]]];
  }
  const maxChars = Math.floor(maxTokens * TOKEN_CHARS);
  const parts = [];
  const paragraphs = text.split(/\n\s*\n/).filter((p) => p.trim());
  let buf = [];
  let bufLen = 0;
  for (const p of paragraphs) {
    if (bufLen + p.length + 2 > maxChars && buf.length > 0) {
      parts.push([buf.join("\n\n"), [pageRangeArr[0], pageRangeArr[1]]]);
      buf = [];
      bufLen = 0;
    }
    buf.push(p);
    bufLen += p.length + 2;
  }
  if (buf.length > 0) {
    parts.push([buf.join("\n\n"), [pageRangeArr[0], pageRangeArr[1]]]);
  }

  // Hard-slice any paragraph that's still too big
  const final = [];
  for (const [chunk, pr] of parts) {
    if (estimateTokens(chunk) <= maxTokens) {
      final.push([chunk, pr]);
      continue;
    }
    for (let i = 0; i < chunk.length; i += maxChars) {
      final.push([chunk.slice(i, i + maxChars), pr]);
    }
  }
  return final;
}

// ------------------ Keyword index ------------------

/**
 * CJK + English tokenization. For CJK, use 2-character sliding windows so
 * "现金管理" matches "现金管理类"; for English, lowercase word tokens.
 */
export function tokenizeForIndex(text) {
  const out = new Set();
  if (!text) return out;
  const cjkMatches = text.match(CJK_CHUNK_RE) || [];
  for (const m of cjkMatches) {
    for (let i = 0; i < m.length - 1; i++) {
      out.add(m.slice(i, i + 2));
    }
  }
  const enMatches = text.match(EN_WORD_RE) || [];
  for (const m of enMatches) {
    out.add(m.toLowerCase());
  }
  return out;
}

function buildKeywordIndex(chunks, leavesOrder) {
  const idx = {};
  for (const cid of leavesOrder) {
    const ch = chunks[cid];
    if (!ch) continue;
    const hay = [
      ch.title || "",
      (ch.header_path || []).join("/"),
      (ch.content || "").slice(0, 2000),
    ].join(" ");
    const tokens = tokenizeForIndex(hay);
    for (const t of tokens) {
      if (!idx[t]) idx[t] = [];
      idx[t].push(cid);
    }
  }
  return idx;
}
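For orientation, a minimal usage sketch of the new module, assuming it is imported as a sibling module and fed parse output in the `{ source_file, total_pages, blocks }` shape documented in the builder's JSDoc. The file name and markdown below are hypothetical, not taken from the package:

```js
import { BundleTree, buildBundleTree } from "./bundle-tree.js";

// Hypothetical parse output; in kc-beta this shape would come from the
// document parsing tooling, not from hand-written objects.
const parsedFiles = [
  {
    source_file: "foo.pdf",
    total_pages: 2,
    blocks: [
      { page: 1, markdown: "# 重要提示\n管理人依照诚实信用原则管理本产品。" },
      { page: 2, markdown: "# 财务指标\n现金管理类产品的业绩比较基准见下文。" },
    ],
  },
];

const tree = buildBundleTree(parsedFiles, { maxTokensPerChunk: 2000 });

// Compact outline handed to the agent so it can pick chunk_ids to fetch.
console.log(tree.outline());

// Keyword search: 2-character CJK bigrams and English words hit the index
// directly; longer CJK phrases fall through to the substring fallback.
const hits = tree.search(["业绩比较基准", "现金管理"], 3);
console.log(hits.map((c) => [c.chunk_id, c.title, c.page_range]));

// The tree round-trips through JSON, so it can be cached on disk.
const restored = BundleTree.fromJSON(JSON.parse(JSON.stringify(tree)));
console.log(restored.allLeaves().length === tree.allLeaves().length); // true
```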
package/src/agent/context.js
CHANGED
@@ -97,7 +97,46 @@ the primary view.
 
 The developer user configures the project, provides regulations and samples, and \
 makes business decisions (accuracy thresholds, cost trade-offs, rule scope). Discuss \
-unclear regulations with them. Present results and let them judge
+unclear regulations with them. Present results and let them judge.
+
+## Samples Are Not Labeled
+
+The developer user may provide samples that are a MIX of compliant and \
+non-compliant documents — not pre-classified, not pre-annotated. Do not assume \
+any sample is correct. YOU are the ground truth: apply each rule to each sample \
+and determine the verdict from the rule text + document content, not from any \
+implicit labeling. If a sample appears to be a "golden" reference (all rules \
+pass), verify that explicitly rather than trusting its position or filename. \
+This is project-agnostic baseline behavior — it applies even when AGENT.md \
+does not restate it.
+
+## Phase-Boundary Markdown Reports
+
+When a phase completes (either via exit criteria or manual phase_advance), \
+write a short markdown summary to \`logs/phase_<name>_<YYYYMMDD_HHMMSS>.md\` \
+capturing: what was done, what worked, what didn't, open questions for the \
+next phase. Aim for 100-300 lines — enough detail for someone resuming the \
+session to pick up context, not an exhaustive log. These reports are soft \
+— they're not enforced by pipeline state and won't block phase_advance. \
+Write them before invoking phase_advance so the report reflects the phase \
+you just completed.
+
+Other good write-a-markdown moments: after finishing a batch of skill \
+authoring, after an evolution-loop iteration wraps, after a QC round. Any \
+natural "chapter boundary" in the work. Store in \`logs/\` so the git auto-\
+commit captures them without polluting rule_skills/ or output/.
+
+## Retry Output Convention
+
+When re-running a workflow, skill, or evolution iteration that produces \
+output files, write **sibling files with a \`_vN\` suffix**, not nested \
+\`run_1/\` subfolders. E.g. if \`output/distillation/14b_A.log\` already \
+exists and you're retrying, write \`output/distillation/14b_A_v2.log\` \
+next to it, then \`_v3\`, \`_v4\`. This keeps output/ flat and greppable — \
+\`ls *_v*.log\` shows retry history at a glance, and the finalization \
+phase's coverage report can present retries as a single bullet per rule. \
+Nested per-run subfolders (\`run_1/\`, \`run_2/\`) force readers to walk \
+the tree to see what was produced.`;
 
 /**
  * Builds the system prompt from multiple context sources.