@zhouchangui/math-ati 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
- import { readdir } from 'node:fs/promises';
1
+ import { mkdir, readdir, readFile, writeFile } from 'node:fs/promises';
2
2
  import path from 'node:path';
3
- import { callChatAgent, callVisionAgent } from './agentClient.js';
3
+ import { callChatAgent, callChatTextAgent, callVisionTextAgent } from './agentClient.js';
4
4
  import {
5
5
  chapterDataPaths,
6
6
  ensureChapterDataDirs,
@@ -17,13 +17,24 @@ const KNOWLEDGE_PAGE_TIMEOUT_MS = Number(process.env.KNOWLEDGE_EXTRACT_PAGE_TIME
17
17
  const KNOWLEDGE_SUMMARY_TIMEOUT_MS = Number(process.env.KNOWLEDGE_EXTRACT_SUMMARY_TIMEOUT_MS || 120000);
18
18
  const KNOWLEDGE_PAGE_RETRIES = Number(process.env.KNOWLEDGE_EXTRACT_PAGE_RETRIES || 2);
19
19
  const KNOWLEDGE_SUMMARY_RETRIES = Number(process.env.KNOWLEDGE_EXTRACT_SUMMARY_RETRIES || 2);
20
+ const KNOWLEDGE_SUMMARY_PAGE_CHUNK_SIZE = Number(process.env.KNOWLEDGE_SUMMARY_PAGE_CHUNK_SIZE || 4);
21
+ const KNOWLEDGE_MAX_CORE_POINTS = Number(process.env.KNOWLEDGE_MAX_CORE_POINTS || 24);
22
+ const KNOWLEDGE_MAX_MISTAKE_POINTS = Number(process.env.KNOWLEDGE_MAX_MISTAKE_POINTS || 8);
20
23
 
21
24
  function extractionDir(chapterId) {
22
25
  return chapterDataPaths(chapterId).pageExtracts;
23
26
  }
24
27
 
25
28
  function pageExtractPath(chapterId, imageFile) {
26
- return path.join(extractionDir(chapterId), `${path.basename(imageFile, path.extname(imageFile))}.json`);
29
+ return path.join(extractionDir(chapterId), `${path.basename(imageFile, path.extname(imageFile))}.md`);
30
+ }
31
+
32
+ function chunkSummaryDir(chapterId) {
33
+ return path.join(chapterDataPaths(chapterId).knowledge, 'chunk_summaries');
34
+ }
35
+
36
+ function chunkSummaryPath(chapterId, chunkIndex) {
37
+ return path.join(chunkSummaryDir(chapterId), `chunk-${String(chunkIndex + 1).padStart(2, '0')}.md`);
27
38
  }
28
39
 
29
40
  function summaryPath(chapterId) {
@@ -46,17 +57,15 @@ async function chapterImages(chapter) {
46
57
  }
47
58
 
48
59
  function normalizeExtractProfile(profile = {}) {
49
- const detailLevels = new Set(['exam_focus', 'balanced', 'fine_grained']);
50
60
  const baselines = new Set(['strong', 'normal', 'weak']);
51
- const allowedFocus = new Set(['exam_points', 'error_prone', 'prerequisite_gaps', 'calculation_links']);
52
- const focus = Array.isArray(profile.focus)
53
- ? profile.focus.filter((item) => allowedFocus.has(item))
54
- : [];
55
61
  return {
56
- detailLevel: detailLevels.has(profile.detailLevel) ? profile.detailLevel : 'exam_focus',
62
+ detailLevel: 'exam_focus',
57
63
  studentBaseline: baselines.has(profile.studentBaseline) ? profile.studentBaseline : 'strong',
58
- focus: focus.length ? focus : ['exam_points', 'error_prone'],
59
- displayLayer: profile.displayLayer === 'fine_grained_wall' ? 'fine_grained_wall' : 'exam_point_wall'
64
+ focus: ['exam_points', 'error_prone'],
65
+ displayLayer: 'exam_point_wall',
66
+ maxCorePointCount: Math.max(8, Math.min(40, Number(profile.maxCorePointCount || KNOWLEDGE_MAX_CORE_POINTS))),
67
+ maxMistakePointCount: Math.max(0, Math.min(16, Number(profile.maxMistakePointCount || KNOWLEDGE_MAX_MISTAKE_POINTS))),
68
+ summaryChunkSize: Math.max(2, Math.min(8, Number(profile.summaryChunkSize || KNOWLEDGE_SUMMARY_PAGE_CHUNK_SIZE)))
60
69
  };
61
70
  }
62
71
 
@@ -90,6 +99,97 @@ function normalizePageExtract(chapter, imagePath, pageIndex, data, source = 'age
90
99
  };
91
100
  }
92
101
 
102
+ function markdownSection(markdown, title) {
103
+ const lines = String(markdown || '').split(/\r?\n/);
104
+ const start = lines.findIndex((line) => new RegExp(`^##\\s+${title}\\s*$`).test(line.trim()));
105
+ if (start < 0) return '';
106
+ const end = lines.findIndex((line, index) => index > start && /^##\s+/.test(line.trim()));
107
+ return lines.slice(start + 1, end < 0 ? undefined : end).join('\n').trim();
108
+ }
109
+
110
+ function markdownHeadingItems(sectionText) {
111
+ const items = [];
112
+ const lines = String(sectionText || '').split(/\r?\n/);
113
+ let current = null;
114
+ for (const line of lines) {
115
+ const heading = line.match(/^###\s+(.+?)\s*$/);
116
+ if (heading) {
117
+ current = { title: heading[1].trim(), body: [] };
118
+ items.push(current);
119
+ } else if (current) {
120
+ current.body.push(line);
121
+ }
122
+ }
123
+ return items;
124
+ }
125
+
126
+ function fieldFromMarkdownBody(body, label) {
127
+ const pattern = new RegExp(`^\\s*-\\s*(?:\\*\\*)?${label}(?:\\*\\*)?[::]\\s*`);
128
+ const line = body.find((item) => pattern.test(item.trim()));
129
+ return line ? line.replace(pattern, '').trim() : '';
130
+ }
131
+
132
+ function markdownFieldList(body, label) {
133
+ const pattern = new RegExp(`^\\s*-\\s*(?:\\*\\*)?${label}(?:\\*\\*)?[::]\\s*`);
134
+ const nextFieldPattern = /^\s*-\s*(?:\*\*)?[\u4e00-\u9fa5A-Za-z0-9 /_-]+(?:\*\*)?[::]/;
135
+ const start = body.findIndex((line) => pattern.test(line.trim()));
136
+ if (start < 0) return [];
137
+ const firstValue = body[start].replace(pattern, '').trim();
138
+ const values = firstValue ? [firstValue] : [];
139
+ for (let index = start + 1; index < body.length; index += 1) {
140
+ const line = body[index];
141
+ if (nextFieldPattern.test(line.trim())) break;
142
+ const listItem = line.match(/^\s*-\s+(.+?)\s*$/);
143
+ if (listItem?.[1]) values.push(listItem[1].trim());
144
+ }
145
+ return values.filter(Boolean);
146
+ }
147
+
148
+ function parsePageMarkdownExtract({ chapter, imagePath, pageIndex, pageCount, markdown }) {
149
+ const knowledgeItems = markdownHeadingItems(markdownSection(markdown, '知识点'));
150
+ const mistakeItems = markdownHeadingItems(markdownSection(markdown, '易错点'));
151
+ const pageTitle = markdownSection(markdown, '页面标题').split(/\r?\n/).find(Boolean) || path.basename(imagePath);
152
+ return {
153
+ chapterId: chapter.id,
154
+ chapterTitle: chapter.fullTitle,
155
+ imageFile: path.basename(imagePath),
156
+ pageIndex,
157
+ pageCount,
158
+ source: 'agent_markdown',
159
+ extractedAt: new Date().toISOString(),
160
+ pageTitle: pageTitle.replace(/^#+\s*/, '').trim(),
161
+ rawOutline: markdownSection(markdown, '原文结构')
162
+ .split(/\r?\n/)
163
+ .map((line) => line.replace(/^\s*-\s*/, '').trim())
164
+ .filter(Boolean),
165
+ knowledgePoints: knowledgeItems.map((item) => ({
166
+ title: item.title,
167
+ summary: fieldFromMarkdownBody(item.body, '摘要'),
168
+ formulas: fieldFromMarkdownBody(item.body, '公式')
169
+ .split(/[;;]/)
170
+ .map((value) => value.trim())
171
+ .filter(Boolean),
172
+ examples: fieldFromMarkdownBody(item.body, '例子')
173
+ .split(/[;;]/)
174
+ .map((value) => value.trim())
175
+ .filter(Boolean),
176
+ prerequisite: fieldFromMarkdownBody(item.body, '前置'),
177
+ difficulty: fieldFromMarkdownBody(item.body, '难度') || 'basic'
178
+ })),
179
+ easyMistakes: mistakeItems.map((item) => ({
180
+ title: item.title,
181
+ errorType: fieldFromMarkdownBody(item.body, '错因') || item.title,
182
+ description: fieldFromMarkdownBody(item.body, '说明'),
183
+ correction: fieldFromMarkdownBody(item.body, '纠正')
184
+ })),
185
+ exerciseHints: markdownSection(markdown, '出题方向')
186
+ .split(/\r?\n/)
187
+ .map((line) => line.replace(/^\s*-\s*/, '').trim())
188
+ .filter(Boolean),
189
+ markdown
190
+ };
191
+ }
192
+
93
193
  function knowledgeExtractionError(reason, detail = '') {
94
194
  const error = new Error(`knowledge_extraction_failed:${reason}`);
95
195
  error.status = 502;
@@ -118,8 +218,9 @@ export async function extractChapterPage({
118
218
  await ensureChapterWorkspace(chapter);
119
219
  const outputPath = pageExtractPath(chapter.id, imagePath);
120
220
  if (!force) {
121
- const existing = await readJson(outputPath, null);
221
+ const existing = await readFile(outputPath, 'utf8').catch(() => '');
122
222
  if (existing) {
223
+ const parsed = parsePageMarkdownExtract({ chapter, imagePath, pageIndex, pageCount, markdown: existing });
123
224
  onProgress?.({
124
225
  step: 'knowledge_extract.page.cached',
125
226
  message: `第 ${pageIndex}/${pageCount || '?'} 页已有提取缓存,直接复用。`,
@@ -127,7 +228,7 @@ export async function extractChapterPage({
127
228
  pageCount,
128
229
  imageFile: path.basename(imagePath)
129
230
  });
130
- return existing;
231
+ return parsed;
131
232
  }
132
233
  }
133
234
  const systemPrompt = await readPrompt('knowledge-extract.system.md');
@@ -138,46 +239,36 @@ export async function extractChapterPage({
138
239
  pageCount,
139
240
  imageFile: path.basename(imagePath)
140
241
  });
141
- const agent = await callVisionAgent({
242
+ const agent = await callVisionTextAgent({
142
243
  timeoutMs: KNOWLEDGE_PAGE_TIMEOUT_MS,
143
244
  retries: KNOWLEDGE_PAGE_RETRIES,
144
245
  system: systemPrompt,
145
- text: promptPayload({
146
- task: '从这一页提分笔记图片中逐项提取知识点、公式、例题线索和易错点。',
147
- context: {
148
- chapter: {
149
- id: chapter.id,
150
- title: chapter.fullTitle,
151
- track: chapter.track
152
- },
153
- extractProfile: normalizeExtractProfile(extractProfile || {})
154
- },
155
- requirements: [
156
- 'knowledgePoints 要覆盖页面出现的每个概念、性质、公式、方法或题型。',
157
- 'easyMistakes 要提取页面明确写出的易错点,也可以从页面中的提醒、比较、条件限制中归纳,但不能凭空添加。',
158
- 'summary 用学生能懂的短句;公式必须使用 $...$。',
159
- 'exerciseHints 只写题型方向,不写完整答案。'
160
- ],
161
- schema: {
162
- pageTitle: 'string',
163
- rawOutline: ['string'],
164
- knowledgePoints: [{
165
- title: 'string',
166
- summary: 'string',
167
- formulas: ['string with LaTeX'],
168
- examples: ['short example or expression'],
169
- prerequisite: 'string',
170
- difficulty: 'basic|medium|challenge'
171
- }],
172
- easyMistakes: [{
173
- title: 'string',
174
- errorType: 'string',
175
- description: 'string',
176
- correction: 'string'
177
- }],
178
- exerciseHints: ['string']
179
- }
180
- }),
246
+ text: [
247
+ `任务:从这一页提分笔记图片中提取可用于后续汇总的 Markdown 笔记。`,
248
+ '',
249
+ `章节:${chapter.id} ${chapter.fullTitle}`,
250
+ `主线:${chapter.track}`,
251
+ `提取策略:考点和易错点优先;不要把例子拆成独立知识点;不要补充图片没有出现的内容。`,
252
+ '',
253
+ '输出必须是 Markdown,且只包含以下标题:',
254
+ '# 页面知识提取',
255
+ '## 页面标题',
256
+ '## 原文结构',
257
+ '## 知识点',
258
+ '### <知识点标题>',
259
+ '- 摘要:<一句话>',
260
+ '- 公式:<可为空,多个用分号>',
261
+ '- 例子:<可为空,多个用分号>',
262
+ '- 前置:<可为空>',
263
+ '- 难度:basic|medium|challenge',
264
+ '## 易错点',
265
+ '### <易错点标题>',
266
+ '- 错因:<错误类型>',
267
+ '- 说明:<错误表现>',
268
+ '- 纠正:<正确做法>',
269
+ '## 出题方向',
270
+ '- <题型方向,只写方向,不写完整答案>'
271
+ ].join('\n'),
181
272
  imagePaths: [imagePath],
182
273
  onAttempt: ({ phase, attempt, attempts, delayMs, result }) => {
183
274
  const base = `第 ${pageIndex}/${pageCount || '?'} 页识别`;
@@ -210,9 +301,12 @@ export async function extractChapterPage({
210
301
  `第 ${pageIndex}/${pageCount || '?'} 页 ${path.basename(imagePath)} 识别失败,已尝试 ${agent.attempts || 1} 次。${agent.detail || ''}`.trim()
211
302
  );
212
303
  }
213
- const extract = normalizePageExtract(chapter, imagePath, pageIndex, agent.data, 'agent');
214
- await writeJson(outputPath, extract);
215
- await writeJson(path.join(paths.knowledgeExtracts, chapter.id, `${path.basename(imagePath, path.extname(imagePath))}.json`), extract);
304
+ const markdown = String(agent.data || '').trim();
305
+ const extract = parsePageMarkdownExtract({ chapter, imagePath, pageIndex, pageCount, markdown });
306
+ await writeFile(outputPath, `${markdown}\n`, 'utf8');
307
+ const mirrorPath = path.join(paths.knowledgeExtracts, chapter.id, `${path.basename(imagePath, path.extname(imagePath))}.md`);
308
+ await mkdir(path.dirname(mirrorPath), { recursive: true });
309
+ await writeFile(mirrorPath, `${markdown}\n`, 'utf8');
216
310
  onProgress?.({
217
311
  step: 'knowledge_extract.page.done',
218
312
  message: `第 ${pageIndex}/${pageCount || '?'} 页识别完成,提取 ${extract.knowledgePoints.length} 个知识点。`,
@@ -226,7 +320,7 @@ export async function extractChapterPage({
226
320
  function dedupeByTitle(items) {
227
321
  const seen = new Map();
228
322
  for (const item of items) {
229
- const key = String(item.title || item.errorType || '').replace(/\s+/g, '');
323
+ const key = normalizedKnowledgeKey(item.title || item.errorType || '');
230
324
  if (!key) continue;
231
325
  if (!seen.has(key)) {
232
326
  seen.set(key, { ...item });
@@ -242,6 +336,131 @@ function dedupeByTitle(items) {
242
336
  return [...seen.values()];
243
337
  }
244
338
 
339
+ function normalizedKnowledgeKey(value) {
340
+ return String(value || '')
341
+ .toLowerCase()
342
+ .replace(/[“”"‘'`]/g, '')
343
+ .replace(/[((].*?[))]/g, '')
344
+ .replace(/[::,,。;;、\s·\-—_]/g, '')
345
+ .replace(/的概念|概念|性质|定义|方法|判定|定理|公式/g, '')
346
+ .trim();
347
+ }
348
+
349
+ function chunkArray(items, size) {
350
+ const chunks = [];
351
+ for (let index = 0; index < items.length; index += size) {
352
+ chunks.push(items.slice(index, index + size));
353
+ }
354
+ return chunks;
355
+ }
356
+
357
+ function countDocPoints(doc) {
358
+ return (doc.sections || []).reduce((sum, section) => sum + (section.points?.length || 0), 0);
359
+ }
360
+
361
+ function pointHasUsableTemplate(point) {
362
+ return Array.isArray(point.questionTemplates)
363
+ && point.questionTemplates.some((template) => Array.isArray(template) && template[0] && template[1]);
364
+ }
365
+
366
+ function normalizePointForBudget(point) {
367
+ return {
368
+ ...point,
369
+ title: String(point.title || '').trim(),
370
+ summary: String(point.summary || '').trim(),
371
+ formulas: Array.isArray(point.formulas) ? point.formulas.filter(Boolean).slice(0, 4) : [],
372
+ pitfalls: Array.isArray(point.pitfalls) ? point.pitfalls.filter(Boolean).slice(0, 5) : [],
373
+ examples: Array.isArray(point.examples) ? point.examples.filter(Boolean).slice(0, 4) : [],
374
+ questionTemplates: Array.isArray(point.questionTemplates) && point.questionTemplates.length
375
+ ? point.questionTemplates.filter((template) => Array.isArray(template)).slice(0, 3)
376
+ : [[
377
+ `围绕「${point.title || '本知识点'}」完成一道基础覆盖题,并写出关键结论。`,
378
+ point.summary || '答案需符合知识点定义、性质或方法。',
379
+ point.title || '知识点理解错误'
380
+ ]]
381
+ };
382
+ }
383
+
384
+ function mergeDuplicatePoints(points) {
385
+ const byKey = new Map();
386
+ for (const rawPoint of points) {
387
+ const point = normalizePointForBudget(rawPoint);
388
+ const key = normalizedKnowledgeKey(point.title);
389
+ if (!key) continue;
390
+ if (!byKey.has(key)) {
391
+ byKey.set(key, point);
392
+ continue;
393
+ }
394
+ const current = byKey.get(key);
395
+ current.summary = current.summary.length >= point.summary.length ? current.summary : point.summary;
396
+ current.formulas = [...new Set([...(current.formulas || []), ...(point.formulas || [])])].slice(0, 4);
397
+ current.pitfalls = [...new Set([...(current.pitfalls || []), ...(point.pitfalls || [])])].slice(0, 5);
398
+ current.examples = [...new Set([...(current.examples || []), ...(point.examples || [])])].slice(0, 4);
399
+ current.questionTemplates = [...(current.questionTemplates || []), ...(point.questionTemplates || [])]
400
+ .filter((template, index, templates) => (
401
+ Array.isArray(template)
402
+ && templates.findIndex((candidate) => Array.isArray(candidate) && candidate[0] === template[0]) === index
403
+ ))
404
+ .slice(0, 3);
405
+ current.sources = [...new Set([...(current.sources || []), ...(point.sources || [])])];
406
+ }
407
+ return [...byKey.values()];
408
+ }
409
+
410
+ function pointPriority(point) {
411
+ let score = 0;
412
+ if (pointHasUsableTemplate(point)) score += 4;
413
+ if (point.summary) score += 2;
414
+ if (point.pitfalls?.length) score += 2;
415
+ if (point.formulas?.length) score += 1;
416
+ if (point.examples?.length) score += 1;
417
+ return score;
418
+ }
419
+
420
+ function enforceKnowledgeBudget(chapter, doc, profile) {
421
+ const maxCore = profile.maxCorePointCount || KNOWLEDGE_MAX_CORE_POINTS;
422
+ const maxMistakes = profile.maxMistakePointCount ?? KNOWLEDGE_MAX_MISTAKE_POINTS;
423
+ const corePoints = [];
424
+ const mistakePoints = [];
425
+ for (const section of doc.sections || []) {
426
+ const isMistakeSection = /易错|错题|错误|mistake/i.test(section.title || '');
427
+ for (const point of section.points || []) {
428
+ if (isMistakeSection) mistakePoints.push(point);
429
+ else corePoints.push(point);
430
+ }
431
+ }
432
+ const dedupedCore = mergeDuplicatePoints(corePoints)
433
+ .sort((a, b) => pointPriority(b) - pointPriority(a))
434
+ .slice(0, maxCore)
435
+ .map((point, index) => ({
436
+ ...point,
437
+ id: `${chapter.id}-kp-${String(index + 1).padStart(2, '0')}`
438
+ }));
439
+ const dedupedMistakes = mergeDuplicatePoints(mistakePoints)
440
+ .sort((a, b) => pointPriority(b) - pointPriority(a))
441
+ .slice(0, maxMistakes)
442
+ .map((point, index) => ({
443
+ ...point,
444
+ id: `${chapter.id}-mistake-${String(index + 1).padStart(2, '0')}`
445
+ }));
446
+ return {
447
+ ...doc,
448
+ sections: [
449
+ { title: '知识点覆盖', points: dedupedCore },
450
+ { title: '易错题专项', points: dedupedMistakes }
451
+ ].filter((section) => section.points.length),
452
+ review: {
453
+ ...(doc.review || {}),
454
+ pointBudget: {
455
+ maxCorePointCount: maxCore,
456
+ maxMistakePointCount: maxMistakes,
457
+ corePointCount: dedupedCore.length,
458
+ mistakePointCount: dedupedMistakes.length
459
+ }
460
+ }
461
+ };
462
+ }
463
+
245
464
  function localMergeChapter(chapter, pageExtracts) {
246
465
  const points = dedupeByTitle(pageExtracts.flatMap((page) =>
247
466
  page.knowledgePoints.map((point) => ({
@@ -302,79 +521,132 @@ function localMergeChapter(chapter, pageExtracts) {
302
521
  };
303
522
  }
304
523
 
305
- export async function summarizeChapterExtraction({
524
+ async function callKnowledgeSummaryAgent({
525
+ systemPrompt,
306
526
  chapter,
307
- pageExtracts,
308
- extractProfile = null,
309
- resetLearningState = false,
310
- onProgress = null
527
+ task,
528
+ context,
529
+ requirements,
530
+ schema,
531
+ onProgress,
532
+ progressPrefix
311
533
  }) {
312
- const local = localMergeChapter(chapter, pageExtracts);
313
- const normalizedProfile = normalizeExtractProfile(extractProfile || {});
314
- const systemPrompt = await readPrompt('knowledge-summarize.system.md');
315
- onProgress?.({
316
- step: 'knowledge_extract.summary.start',
317
- message: `正在合并 ${pageExtracts.length} 页提取结果,生成章节知识点。`,
318
- pageCount: pageExtracts.length
319
- });
320
- const agent = await callChatAgent({
534
+ return callChatAgent({
321
535
  timeoutMs: KNOWLEDGE_SUMMARY_TIMEOUT_MS,
322
536
  retries: KNOWLEDGE_SUMMARY_RETRIES,
323
537
  temperature: 0.1,
324
538
  system: systemPrompt,
325
- user: promptPayload({
326
- task: '把逐页提取结果合并成章节知识文档,并做覆盖检查。',
327
- context: {
328
- chapter,
329
- pageExtracts,
330
- localDraft: local,
331
- extractProfile: normalizedProfile
332
- },
333
- requirements: [
334
- '合并同义知识点,保留来源页。',
335
- normalizedProfile.detailLevel === 'fine_grained'
336
- ? '当前提取画像要求细粒度拆分:保留必要前置概念、步骤性方法和基础易错点。'
337
- : '当前提取画像要求考点优先:优先保留考试常见考点、易错点、变式边界和必要前置关系,不把教材说明拆得过碎。',
338
- 'sections 至少包含“知识点覆盖”;如果有易错点,单独包含“易错题专项”。',
339
- '每个知识点必须有 id、title、summary、formulas、pitfalls、questionTemplates。',
340
- 'questionTemplates 用于后续出题,题干只写题目,不写解题过程。',
341
- 'review.missingOrWeak 列出疑似遗漏或需要人工复核的点。'
342
- ],
343
- schema: {
344
- sections: [{
345
- title: '知识点覆盖',
346
- points: [{
347
- id: `${chapter.id}-kp-01`,
348
- title: 'string',
349
- summary: 'string',
350
- formulas: ['string with LaTeX'],
351
- pitfalls: ['string'],
352
- examples: ['string'],
353
- questionTemplates: [['stem', 'answer', 'expectedErrorType']],
354
- sources: ['image filename']
355
- }]
356
- }],
357
- review: {
358
- passed: true,
359
- coverageSummary: 'string',
360
- missingOrWeak: ['string'],
361
- duplicateMerged: ['string']
362
- }
539
+ user: promptPayload({ task, context, requirements, schema }),
540
+ onAttempt: ({ phase, attempt, attempts, delayMs, result }) => {
541
+ if (phase === 'start') {
542
+ onProgress?.({
543
+ step: `${progressPrefix}.attempt`,
544
+ message: `${chapter.fullTitle}知识点合并:第 ${attempt}/${attempts} 次尝试。`,
545
+ attempt,
546
+ attempts
547
+ });
548
+ }
549
+ if (phase === 'retry') {
550
+ onProgress?.({
551
+ step: `${progressPrefix}.retry`,
552
+ message: `知识点合并遇到${retryReasonText(result?.reason)},${Math.round(delayMs / 1000)} 秒后自动重试。`,
553
+ attempt,
554
+ attempts,
555
+ reason: result?.reason || null
556
+ });
363
557
  }
364
- }),
558
+ }
559
+ });
560
+ }
561
+
562
+ function knowledgeSummarySchema(chapter) {
563
+ return {
564
+ sections: [{
565
+ title: '知识点覆盖',
566
+ points: [{
567
+ id: `${chapter.id}-kp-01`,
568
+ title: 'string',
569
+ summary: 'string',
570
+ formulas: ['string with LaTeX'],
571
+ pitfalls: ['string'],
572
+ examples: ['string'],
573
+ questionTemplates: [['stem', 'answer', 'expectedErrorType']],
574
+ sources: ['image filename']
575
+ }]
576
+ }],
577
+ review: {
578
+ passed: true,
579
+ coverageSummary: 'string',
580
+ missingOrWeak: ['string'],
581
+ duplicateMerged: ['string']
582
+ }
583
+ };
584
+ }
585
+
586
+ async function summarizePageChunk({ chapter, pages, chunkIndex, chunkCount, systemPrompt, normalizedProfile, onProgress }) {
587
+ const pageLabels = pages.map((page) => page.imageFile).join('、');
588
+ const cachedMarkdown = await readFile(chunkSummaryPath(chapter.id, chunkIndex), 'utf8').catch(() => '');
589
+ if (cachedMarkdown) {
590
+ const knowledgePointCount = (cachedMarkdown.match(/^###\s+/gm) || []).length;
591
+ onProgress?.({
592
+ step: 'knowledge_extract.summary.chunk.cached',
593
+ message: `知识点分组 ${chunkIndex + 1}/${chunkCount} 已有 Markdown 汇总缓存,直接复用。`,
594
+ chunkIndex: chunkIndex + 1,
595
+ chunkCount,
596
+ knowledgePointCount
597
+ });
598
+ return {
599
+ chunkIndex: chunkIndex + 1,
600
+ pageFiles: pages.map((page) => page.imageFile),
601
+ markdown: cachedMarkdown
602
+ };
603
+ }
604
+ onProgress?.({
605
+ step: 'knowledge_extract.summary.chunk.start',
606
+ message: `正在合并知识点分组 ${chunkIndex + 1}/${chunkCount}(${pageLabels})。`,
607
+ chunkIndex: chunkIndex + 1,
608
+ chunkCount,
609
+ pageCount: pages.length
610
+ });
611
+ const agent = await callChatTextAgent({
612
+ system: systemPrompt,
613
+ timeoutMs: KNOWLEDGE_SUMMARY_TIMEOUT_MS,
614
+ retries: KNOWLEDGE_SUMMARY_RETRIES,
615
+ temperature: 0.1,
616
+ user: [
617
+ `任务:合并《${chapter.fullTitle}》第 ${chunkIndex + 1}/${chunkCount} 组逐页 Markdown 提取结果。`,
618
+ '',
619
+ `本组页面:${pageLabels}`,
620
+ `数量控制:核心知识点不超过 ${Math.ceil(normalizedProfile.maxCorePointCount / chunkCount) + 4} 个;易错点不超过 ${Math.ceil(normalizedProfile.maxMistakePointCount / chunkCount) + 2} 个。`,
621
+ '',
622
+ '要求:',
623
+ '- 只基于输入页面合并知识点,不新增页面没有依据的内容。',
624
+ '- 合并同义、过细、重复候选,保留来源页。',
625
+ '- 优先保留考试常见考点、易错边界、几何概念辨析和可出题的方法。',
626
+ '- 输出 Markdown,不输出 JSON。',
627
+ '- 必须包含标题:# 分组知识汇总、## 知识点覆盖、## 易错题专项、## 合并说明。',
628
+ '- 每个知识点用三级标题,包含:摘要、来源、公式、易错边界、出题模板。',
629
+ '',
630
+ '逐页 Markdown:',
631
+ ...pages.map((page) => [
632
+ `\n---\n`,
633
+ `来源页:${page.imageFile}`,
634
+ page.markdown || ''
635
+ ].join('\n'))
636
+ ].join('\n'),
365
637
  onAttempt: ({ phase, attempt, attempts, delayMs, result }) => {
366
638
  if (phase === 'start') {
367
639
  onProgress?.({
368
- step: 'knowledge_extract.summary.attempt',
369
- message: `章节汇总:第 ${attempt}/${attempts} 次尝试。`,
640
+ step: 'knowledge_extract.summary.chunk.attempt',
641
+ message: `知识点分组 ${chunkIndex + 1}/${chunkCount}:第 ${attempt}/${attempts} 次尝试。`,
370
642
  attempt,
371
643
  attempts
372
644
  });
373
645
  }
374
646
  if (phase === 'retry') {
375
647
  onProgress?.({
376
- step: 'knowledge_extract.summary.retry',
377
- message: `章节汇总遇到${retryReasonText(result?.reason)},${Math.round(delayMs / 1000)} 秒后自动重试。`,
648
+ step: 'knowledge_extract.summary.chunk.retry',
649
+ message: `知识点分组 ${chunkIndex + 1}/${chunkCount} 遇到${retryReasonText(result?.reason)},${Math.round(delayMs / 1000)} 秒后自动重试。`,
378
650
  attempt,
379
651
  attempts,
380
652
  reason: result?.reason || null
@@ -382,13 +654,178 @@ export async function summarizeChapterExtraction({
382
654
  }
383
655
  }
384
656
  });
385
- if (!agent.ok || !Array.isArray(agent.data?.sections)) {
657
+ if (!agent.ok || !agent.data) {
386
658
  throw knowledgeExtractionError(
387
- agent.reason || 'invalid_agent_response',
388
- `章节汇总失败,已尝试 ${agent.attempts || 1} 次。${agent.detail || ''}`.trim()
659
+ agent.reason || 'empty_response',
660
+ `知识点分组 ${chunkIndex + 1}/${chunkCount} 合并失败,已尝试 ${agent.attempts || 1} 次。${agent.detail || ''}`.trim()
389
661
  );
390
662
  }
391
- const merged = agent.data;
663
+ const markdown = String(agent.data || '').trim();
664
+ await mkdir(chunkSummaryDir(chapter.id), { recursive: true });
665
+ await writeFile(chunkSummaryPath(chapter.id, chunkIndex), `${markdown}\n`, 'utf8');
666
+ const knowledgePointCount = (markdown.match(/^###\s+/gm) || []).length;
667
+ onProgress?.({
668
+ step: 'knowledge_extract.summary.chunk.done',
669
+ message: `知识点分组 ${chunkIndex + 1}/${chunkCount} 合并完成,得到约 ${knowledgePointCount} 个候选点。`,
670
+ chunkIndex: chunkIndex + 1,
671
+ chunkCount,
672
+ knowledgePointCount
673
+ });
674
+ return {
675
+ chunkIndex: chunkIndex + 1,
676
+ pageFiles: pages.map((page) => page.imageFile),
677
+ markdown
678
+ };
679
+ }
680
+
681
+ function cleanMarkdownTitle(title) {
682
+ return String(title || '')
683
+ .replace(/^\s*\d+[.、]\s*/, '')
684
+ .replace(/^#+\s*/, '')
685
+ .trim();
686
+ }
687
+
688
+ function splitMarkdownValues(value) {
689
+ return String(value || '')
690
+ .split(/[;;、,,]/)
691
+ .map((item) => item.trim())
692
+ .filter((item) => item && item !== '无');
693
+ }
694
+
695
+ function chunkMarkdownPoints(markdown, sectionTitle, pointType = 'core') {
696
+ const section = markdownSection(markdown, sectionTitle);
697
+ return markdownHeadingItems(section).map((item) => {
698
+ const summary = fieldFromMarkdownBody(item.body, '摘要');
699
+ const sources = splitMarkdownValues(fieldFromMarkdownBody(item.body, '来源'));
700
+ const formulas = splitMarkdownValues(fieldFromMarkdownBody(item.body, '公式'));
701
+ const pitfalls = [
702
+ ...splitMarkdownValues(fieldFromMarkdownBody(item.body, '易错边界')),
703
+ ...splitMarkdownValues(fieldFromMarkdownBody(item.body, '错因')),
704
+ ...splitMarkdownValues(fieldFromMarkdownBody(item.body, '说明'))
705
+ ].filter(Boolean);
706
+ const templateStems = markdownFieldList(item.body, '出题模板');
707
+ const title = cleanMarkdownTitle(item.title);
708
+ return {
709
+ title,
710
+ summary,
711
+ formulas,
712
+ pitfalls: [...new Set(pitfalls)].slice(0, 5),
713
+ examples: [],
714
+ questionTemplates: (templateStems.length ? templateStems : [
715
+ `围绕「${title}」设计一道${pointType === 'mistake' ? '易错辨析' : '基础覆盖'}题。`
716
+ ]).slice(0, 3).map((stem) => [
717
+ stem,
718
+ summary || `正确运用「${title}」相关概念、性质或方法。`,
719
+ pitfalls[0] || title
720
+ ]),
721
+ sources
722
+ };
723
+ }).filter((point) => point.title);
724
+ }
725
+
726
+ function localMergeChunkSummaries(chapter, chunkDocs) {
727
+ const corePoints = [];
728
+ const mistakePoints = [];
729
+ const mergeNotes = [];
730
+ for (const chunk of chunkDocs) {
731
+ corePoints.push(...chunkMarkdownPoints(chunk.markdown, '知识点覆盖', 'core'));
732
+ mistakePoints.push(...chunkMarkdownPoints(chunk.markdown, '易错题专项', 'mistake'));
733
+ mergeNotes.push(...markdownSection(chunk.markdown, '合并说明')
734
+ .split(/\r?\n/)
735
+ .map((line) => line.replace(/^\s*-\s*/, '').trim())
736
+ .filter(Boolean));
737
+ }
738
+ return {
739
+ sections: [
740
+ { title: '知识点覆盖', points: corePoints },
741
+ { title: '易错题专项', points: mistakePoints }
742
+ ].filter((section) => section.points.length),
743
+ review: {
744
+ passed: corePoints.length > 0,
745
+ coverageSummary: `由 ${chunkDocs.length} 个分组 Markdown 汇总合并生成最终章节知识点,并在保存前执行去重和数量控制。`,
746
+ missingOrWeak: [],
747
+ duplicateMerged: mergeNotes.slice(0, 20)
748
+ }
749
+ };
750
+ }
751
+
752
+ export async function summarizeChapterExtraction({
753
+ chapter,
754
+ pageExtracts,
755
+ extractProfile = null,
756
+ resetLearningState = false,
757
+ onProgress = null
758
+ }) {
759
+ const local = localMergeChapter(chapter, pageExtracts);
760
+ const normalizedProfile = normalizeExtractProfile(extractProfile || {});
761
+ const systemPrompt = await readPrompt('knowledge-summarize.system.md');
762
+ onProgress?.({
763
+ step: 'knowledge_extract.summary.start',
764
+ message: `正在合并 ${pageExtracts.length} 页提取结果,生成章节知识点。`,
765
+ pageCount: pageExtracts.length
766
+ });
767
+ const pageChunks = chunkArray(pageExtracts, normalizedProfile.summaryChunkSize);
768
+ const chunkDocs = [];
769
+ if (pageChunks.length > 1) {
770
+ for (let index = 0; index < pageChunks.length; index += 1) {
771
+ chunkDocs.push(await summarizePageChunk({
772
+ chapter,
773
+ pages: pageChunks[index],
774
+ chunkIndex: index,
775
+ chunkCount: pageChunks.length,
776
+ systemPrompt,
777
+ normalizedProfile,
778
+ onProgress
779
+ }));
780
+ }
781
+ }
782
+ const finalContext = pageChunks.length > 1
783
+ ? { chapter, chunkDocs, localDraft: local, extractProfile: normalizedProfile }
784
+ : { chapter, pageExtracts, localDraft: local, extractProfile: normalizedProfile };
785
+ onProgress?.({
786
+ step: 'knowledge_extract.summary.final.start',
787
+ message: pageChunks.length > 1
788
+ ? `正在把 ${chunkDocs.length} 个知识点分组合并为最终章节知识。`
789
+ : '正在生成最终章节知识点。',
790
+ chunkCount: chunkDocs.length
791
+ });
792
+ let finalDoc = null;
793
+ if (pageChunks.length > 1) {
794
+ finalDoc = localMergeChunkSummaries(chapter, chunkDocs);
795
+ onProgress?.({
796
+ step: 'knowledge_extract.summary.final.local',
797
+ message: `已从 ${chunkDocs.length} 个分组 Markdown 生成最终章节知识草稿,正在去重和限量。`,
798
+ chunkCount: chunkDocs.length
799
+ });
800
+ } else {
801
+ const agent = await callKnowledgeSummaryAgent({
802
+ systemPrompt,
803
+ chapter,
804
+ task: '把逐页提取结果合并成章节知识文档,并做覆盖检查。',
805
+ progressPrefix: 'knowledge_extract.summary.final',
806
+ onProgress,
807
+ context: finalContext,
808
+ requirements: [
809
+ '合并同义知识点,保留来源页。',
810
+ `最终“知识点覆盖”核心点数量控制在 ${normalizedProfile.maxCorePointCount} 个以内。`,
811
+ `最终“易错题专项”数量控制在 ${normalizedProfile.maxMistakePointCount} 个以内。`,
812
+ '优先保留考试常见考点、易错点、变式边界、几何概念辨析和必要前置关系,不把教材说明拆得过碎。',
813
+ 'sections 至少包含“知识点覆盖”;如果有易错点,单独包含“易错题专项”。',
814
+ '每个知识点必须有 title、summary、formulas、pitfalls、questionTemplates。',
815
+ 'questionTemplates 用于后续出题,题干只写题目,不写解题过程。',
816
+ 'review.missingOrWeak 列出疑似遗漏或需要人工复核的点。'
817
+ ],
818
+ schema: knowledgeSummarySchema(chapter)
819
+ });
820
+ if (!agent.ok || !Array.isArray(agent.data?.sections)) {
821
+ throw knowledgeExtractionError(
822
+ agent.reason || 'invalid_agent_response',
823
+ `章节汇总失败,已尝试 ${agent.attempts || 1} 次。${agent.detail || ''}`.trim()
824
+ );
825
+ }
826
+ finalDoc = agent.data;
827
+ }
828
+ const merged = enforceKnowledgeBudget(chapter, finalDoc, normalizedProfile);
392
829
  const extractedAt = new Date().toISOString();
393
830
  const normalized = await saveKnowledgeDoc(chapter, merged, 'agent', {
394
831
  extractProfile: normalizedProfile,
@@ -404,6 +841,7 @@ export async function summarizeChapterExtraction({
404
841
  updatedAt: extractedAt,
405
842
  extractProfile: normalizedProfile,
406
843
  extractorVersion: 1,
844
+ summaryStrategy: pageChunks.length > 1 ? 'chunked_markdown_local_merge' : 'single_merge',
407
845
  review: merged.review || null,
408
846
  knowledgePointCount: normalized.sections.reduce((sum, section) => sum + section.points.length, 0)
409
847
  };