@open330/kiwimu 0.8.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,10 @@
1
1
  import { LLMClient } from "../llm-client";
2
2
  import type { Store } from "../store";
3
3
  import { slugify } from "./chunker";
4
- import type { Persona } from "../config";
4
+ import type { Persona, WikiSchema } from "../config";
5
+ import { compileTerms, standardizeTerms } from "./standardizer";
6
+ import { parseCitations } from "./citations";
7
+ import { stripJsonFences } from "../utils";
5
8
 
6
9
  // ── Phase 1: Extract original document structure ──
7
10
 
@@ -16,15 +19,16 @@ TEXT:
16
19
 
17
20
  Return a JSON array of sections in order. Each element:
18
21
  - "title": string — Original section/chapter title from the document
19
- - "content": string — The full content of this section, converted to clean markdown. Preserve all information. Use LaTeX ($..$ inline, $$...$$ display) for equations. Clean up OCR artifacts.
22
+ - "content": string — The full content of this section, converted to clean markdown. Preserve all information. Use LaTeX ($..$ inline, $$...$$ display) for equations. Clean up OCR artifacts. When the content describes processes, workflows, hierarchies, state transitions, or relationships, add a Mermaid diagram using fenced code blocks (\`\`\`mermaid). Supported types: flowchart, sequenceDiagram, classDiagram, stateDiagram-v2, erDiagram, gantt, pie, mindmap, timeline.
20
23
  - "level": number — 1 for chapter, 2 for section, 3 for subsection
21
24
 
22
25
  Keep the content faithful to the original. Do not add or remove information. Just clean up formatting.
26
+ When appropriate, enhance understanding by including Mermaid diagrams that visualize key concepts, flows, or relationships described in the text.
23
27
  Return at most 8 sections per response to keep output manageable.`;
24
28
 
25
29
  // ── Phase 2: Extract concepts for separate pages ──
26
30
 
27
- function getConceptSystem(persona: Persona | null): string {
31
+ function getConceptSystem(persona: Persona | null, schema?: WikiSchema): string {
28
32
  const base = `You are a study wiki editor. Given source material pages, identify important concepts, terms, and definitions that deserve their own dedicated wiki pages.
29
33
 
30
34
  Rules:
@@ -32,21 +36,53 @@ Rules:
32
36
  - Each concept page should have substantial educational content (2+ paragraphs)
33
37
  - Explain the concept clearly with definitions, formulas, examples, and context
34
38
  - Use [[wiki links]] to reference other concepts and source pages. Example: "[[Synchrotron Radiation]] is observed at [[radio frequencies]]"
39
+ - Use LaTeX ($..$ inline, $$...$$ display) for equations
40
+ - When a concept involves processes, relationships, hierarchies, or state transitions, include a Mermaid diagram using fenced code blocks (\`\`\`mermaid). Supported: flowchart, sequenceDiagram, classDiagram, stateDiagram-v2, erDiagram, mindmap, pie
35
41
  - Suggest Wikipedia links for further reading
36
42
 
37
43
  Return valid JSON only. No markdown fences.`;
38
44
 
45
+ let schemaRules = "";
46
+ if (schema) {
47
+ const rules: string[] = [];
48
+ if (schema.categories?.length) {
49
+ rules.push(`- Assign each concept to one of these categories: ${schema.categories.join(", ")}. Include a "category" field in your JSON output.`);
50
+ }
51
+ if (schema.page_template?.sections?.length) {
52
+ rules.push(`- Structure each concept page with these sections (use ## headings): ${schema.page_template.sections.join(", ")}`);
53
+ }
54
+ if (schema.naming_convention) {
55
+ const conventions: Record<string, string> = {
56
+ noun_phrase: "Use noun phrases for titles (e.g., 'Neural Network', 'Gradient Descent')",
57
+ question: "Use question form for titles (e.g., 'What is a Neural Network?', 'How does Gradient Descent work?')",
58
+ topic: "Use simple topic words for titles (e.g., 'Backpropagation', 'Optimization')",
59
+ };
60
+ rules.push(`- Title format: ${conventions[schema.naming_convention] || schema.naming_convention}`);
61
+ }
62
+ if (schema.terms && Object.keys(schema.terms).length > 0) {
63
+ const termList = Object.entries(schema.terms).map(([k, v]) => `${k} -> ${v}`).join(", ");
64
+ rules.push(`- Use these standard terms (replace abbreviations with full forms): ${termList}`);
65
+ }
66
+ if (rules.length > 0) {
67
+ schemaRules = `\n\nSchema rules:\n${rules.join("\n")}`;
68
+ }
69
+ }
70
+
39
71
  if (persona) {
40
- return `${persona.system_prompt}\n\n${base}\n\nIMPORTANT: ${persona.content_style}`;
72
+ return `${persona.system_prompt}\n\n${base}${schemaRules}\n\nIMPORTANT: ${persona.content_style}`;
41
73
  }
42
- return base;
74
+ return `${base}${schemaRules}`;
43
75
  }
44
76
 
45
- function getConceptPrompt(persona: Persona | null): string {
77
+ function getConceptPrompt(persona: Persona | null, schema?: WikiSchema): string {
46
78
  const styleNote = persona
47
79
  ? `\n\nWrite content in the following style:\n${persona.content_style}`
48
80
  : "";
49
81
 
82
+ const categoryField = schema?.categories?.length
83
+ ? `\n- "category": string — One of: ${schema.categories.join(", ")}`
84
+ : "";
85
+
50
86
  return `Based on these source pages, create concept/glossary wiki pages for important terms.
51
87
 
52
88
  Source pages already created:
@@ -56,9 +92,15 @@ Create 3-6 concept pages for the most important terms, definitions, laws, and eq
56
92
  Do NOT duplicate the source pages — instead, create focused concept pages that the source pages can link to.
57
93
  Keep each page concise (2-3 paragraphs).${styleNote}
58
94
 
95
+ IMPORTANT — Provenance citations:
96
+ When a claim or fact comes from a specific source page, add an inline citation marker at the end of that sentence using the format [^src:SOURCE_PAGE_SLUG].
97
+ The SOURCE_PAGE_SLUG must match one of the source page slugs listed above (the hyphenated identifier shown after the title).
98
+ Example: "Quantum entanglement allows particles to share states instantly [^src:chapter-3-quantum-states]"
99
+ Only cite when a fact clearly originates from a specific source page. Not every sentence needs a citation.
100
+
59
101
  Return a JSON array where each element has:
60
102
  - "title": string — Short concept name, 1-3 words (e.g., "Synchrotron Radiation", "Flux Density", "Angular Resolution"). Keep titles short so they match naturally in text.
61
- - "content": string — Educational markdown content with [[wiki links]] to other concepts and source pages
103
+ - "content": string — Educational markdown content with [[wiki links]] to other concepts and source pages, and [^src:slug] citations where appropriate${categoryField}
62
104
  - "suggested_links": Array<{text: string, url: string}> — Wikipedia/external reference links`;
63
105
  }
64
106
 
@@ -82,6 +124,7 @@ interface StructurePage {
82
124
  interface ConceptPage {
83
125
  title: string;
84
126
  content: string;
127
+ category?: string;
85
128
  suggested_links?: Array<{ text: string; url: string }>;
86
129
  }
87
130
 
@@ -146,7 +189,7 @@ function splitBySize(text: string, maxSize: number): Array<{ chapterHint: string
146
189
  }
147
190
 
148
191
  function parseJSON<T>(raw: string): T {
149
- let cleaned = raw.replace(/^```json?\n?/m, "").replace(/\n?```\s*$/m, "").trim();
192
+ let cleaned = stripJsonFences(raw);
150
193
  try {
151
194
  return JSON.parse(cleaned);
152
195
  } catch (e1) {
@@ -215,15 +258,17 @@ export async function llmChunkDocument(
215
258
  store: Store,
216
259
  maxChunks: number = 0, // 0 = unlimited
217
260
  persona: Persona | null = null,
218
- llmClient?: LLMClient
261
+ llmClient: LLMClient,
262
+ onProgress?: (status: string) => void,
263
+ schema?: WikiSchema
219
264
  ): Promise<{ sourceCount: number; conceptCount: number }> {
220
- // Use provided client or fall back to deprecated global chatComplete
221
- const chat = llmClient
222
- ? (system: string, user: string, maxTokens?: number) => llmClient.chatComplete(system, user, maxTokens)
223
- : async (system: string, user: string, maxTokens?: number) => {
224
- const { chatComplete } = await import("../llm-client");
225
- return chatComplete(system, user, maxTokens);
226
- };
265
+ const chat = (system: string, user: string, maxTokens?: number) =>
266
+ llmClient.chatComplete(system, user, maxTokens);
267
+
268
+ // Pre-compile term standardization regexes if schema.terms is defined
269
+ const compiledTerms = schema?.terms && Object.keys(schema.terms).length > 0
270
+ ? compileTerms(schema.terms)
271
+ : null;
227
272
 
228
273
  let chunks = splitByChapters(rawText);
229
274
  if (maxChunks > 0 && chunks.length > maxChunks) {
@@ -233,133 +278,229 @@ export async function llmChunkDocument(
233
278
  if (persona) {
234
279
  console.log(`\x1b[35m🎭 페르소나: ${persona.name}\x1b[0m`);
235
280
  }
236
- console.log(`\x1b[34m⏳ Phase 1: 원본 구조 추출 중... (${chunks.length}개 청크)\x1b[0m`);
237
-
238
281
  // ── Phase 1: Extract source pages (parallel LLM calls) ──
239
- const phase1Start = performance.now();
240
- let completedCount = 0;
241
- const structureSystem = getStructureSystem(persona);
242
-
243
- const chunkResults = await parallelMap(chunks, 3, async (chunk, i) => {
244
- console.log(` Phase 1: 처리 중 [${i + 1}/${chunks.length}] ${chunk.chapterHint}...`);
282
+ let sourceCount: number;
283
+ let sourcePageSummaries: string[];
284
+
285
+ if (store.hasPhaseCheckpoint(sourceId, 'phase1')) {
286
+ // Resume: Phase 1 already done, rebuild summaries from DB
287
+ const existingPages = store.getSourcePages(sourceId);
288
+ sourceCount = existingPages.length;
289
+ sourcePageSummaries = existingPages.map(p =>
290
+ `- ${p.title} [slug: ${p.slug}]: ${p.content.slice(0, 150).replace(/\n/g, " ")}`
291
+ );
292
+ console.log(`\x1b[32m⏭ Phase 1 건너뜀 (이미 완료) — 📖 ${sourceCount}개 원본 페이지\x1b[0m`);
293
+ onProgress?.(`Phase 1 건너뜀 (${sourceCount}개 페이지 이미 존재)`);
294
+ } else {
295
+ console.log(`\x1b[34m⏳ Phase 1: 원본 구조 추출 중... (${chunks.length}개 청크)\x1b[0m`);
296
+ onProgress?.(`Phase 1: 원본 구조 추출 중... (${chunks.length}개 청크)`);
297
+
298
+ const phase1Start = performance.now();
299
+ let completedCount = 0;
300
+ const structureSystem = getStructureSystem(persona);
301
+
302
+ const chunkResults = await parallelMap(chunks, 3, async (chunk, i) => {
303
+ console.log(` Phase 1: 처리 중 [${i + 1}/${chunks.length}] ${chunk.chapterHint}...`);
304
+
305
+ const prompt = STRUCTURE_PROMPT
306
+ .replace("{sourceTitle}", sourceTitle)
307
+ .replace("{text}", chunk.text.slice(0, 80000));
245
308
 
246
- const prompt = STRUCTURE_PROMPT
247
- .replace("{sourceTitle}", sourceTitle)
248
- .replace("{text}", chunk.text.slice(0, 80000));
249
-
250
- try {
251
- let raw = await chat(structureSystem, prompt, 16384);
252
- if (!raw || raw.trim().length < 10) {
253
- console.log(` \x1b[33m⚠ 빈 응답, 재시도...\x1b[0m`);
254
- raw = await chat(structureSystem, prompt, 16384);
309
+ try {
310
+ let raw = await chat(structureSystem, prompt, 16384);
255
311
  if (!raw || raw.trim().length < 10) {
256
- console.log(` \x1b[31m✗ 재시도도 빈 응답\x1b[0m`);
257
- completedCount++;
258
- return [] as StructurePage[];
312
+ console.log(` \x1b[33m⚠ 빈 응답, 재시도...\x1b[0m`);
313
+ raw = await chat(structureSystem, prompt, 16384);
314
+ if (!raw || raw.trim().length < 10) {
315
+ console.log(` \x1b[31m✗ 재시도도 빈 응답\x1b[0m`);
316
+ completedCount++;
317
+ return [] as StructurePage[];
318
+ }
259
319
  }
320
+ const sections = parseJSON<StructurePage[]>(raw).filter(s => s.title && s.content && s.content.length > 30);
321
+ completedCount++;
322
+ console.log(` → ${sections.length}개 섹션 (완료 ${completedCount}/${chunks.length})`);
323
+ onProgress?.(`Phase 1: ${completedCount}/${chunks.length} 청크 완료`);
324
+ return sections;
325
+ } catch (e: unknown) {
326
+ const message = e instanceof Error ? e.message : String(e);
327
+ console.log(` \x1b[31m✗ 실패: ${message}\x1b[0m`);
328
+ completedCount++;
329
+ return [] as StructurePage[];
260
330
  }
261
- const sections = parseJSON<StructurePage[]>(raw).filter(s => s.title && s.content && s.content.length > 30);
262
- completedCount++;
263
- console.log(` → ${sections.length}개 섹션 (완료 ${completedCount}/${chunks.length})`);
264
- return sections;
265
- } catch (e: unknown) {
266
- const message = e instanceof Error ? e.message : String(e);
267
- console.log(` \x1b[31m✗ 실패: ${message}\x1b[0m`);
268
- completedCount++;
269
- return [] as StructurePage[];
270
- }
271
- });
331
+ });
272
332
 
273
- // Store results sequentially (SQLite writes must be sequential)
274
- let orderCounter = 0;
275
- const sourcePageSummaries: string[] = [];
333
+ // Store results sequentially (SQLite writes must be sequential)
334
+ let orderCounter = 0;
335
+ sourcePageSummaries = [];
276
336
 
277
- for (const sections of chunkResults) {
278
- for (const section of sections) {
279
- const slug = slugify(section.title);
280
- if (!slug) continue;
337
+ for (const sections of chunkResults) {
338
+ for (const section of sections) {
339
+ const slug = slugify(section.title);
340
+ if (!slug) continue;
281
341
 
282
- const existing = store.getPage(slug);
283
- if (existing) {
284
- store.updatePageContent(existing.id, existing.content + "\n\n" + section.content);
285
- } else {
286
- store.addPage(slug, section.title, section.content, sourceId, slug, "source", orderCounter++);
287
- sourcePageSummaries.push(`- ${section.title}: ${section.content.slice(0, 150).replace(/\n/g, " ")}`);
342
+ const existing = store.getPage(slug);
343
+ if (existing) {
344
+ store.updatePageContent(existing.id, existing.content + "\n\n" + section.content);
345
+ } else {
346
+ const page = store.addPage(slug, section.title, section.content, sourceId, slug, "source", orderCounter++);
347
+ store.addActivityLog('page_created', `Created page: ${section.title}`, 'page', page.id);
348
+ sourcePageSummaries.push(`- ${section.title} [slug: ${slug}]: ${section.content.slice(0, 150).replace(/\n/g, " ")}`);
349
+
350
+ }
288
351
  }
289
352
  }
290
- }
291
353
 
292
- const sourceCount = orderCounter;
293
- const phase1Sec = ((performance.now() - phase1Start) / 1000).toFixed(1);
294
- console.log(`\x1b[32m✅ Phase 1 완료 (${phase1Sec}초) — 📖 ${sourceCount}개 원본 페이지 생성\x1b[0m`);
354
+ sourceCount = orderCounter;
355
+ store.setCheckpoint(sourceId, 'phase1');
356
+ const phase1Sec = ((performance.now() - phase1Start) / 1000).toFixed(1);
357
+ console.log(`\x1b[32m✅ Phase 1 완료 (${phase1Sec}초) — 📖 ${sourceCount}개 원본 페이지 생성\x1b[0m`);
358
+ }
295
359
 
296
360
  // ── Phase 2: Extract concept pages ──
297
- const phase2Start = performance.now();
298
- console.log(`\x1b[34m⏳ Phase 2: 개념 추출 중...\x1b[0m`);
299
-
300
- // Process source pages in small batches for concept extraction
301
361
  const batchSize = 5;
302
362
  let conceptCount = 0;
363
+ // Cache concept pages list for reuse in Phase 2 and Phase 2.5
364
+ let cachedConceptPages: ReturnType<typeof store.listConceptPages> | null = null;
365
+
366
+ if (sourcePageSummaries.length === 0) {
367
+ console.log(`\x1b[33m⏭ Phase 2 건너뜀 (원본 페이지 없음)\x1b[0m`);
368
+ onProgress?.(`Phase 2 건너뜀 (원본 페이지 없음)`);
369
+ } else {
370
+ const totalBatches = Math.ceil(sourcePageSummaries.length / batchSize);
371
+ const lastCompletedBatch = store.getLastCompletedBatch(sourceId, 'phase2');
372
+
373
+ if (lastCompletedBatch >= totalBatches - 1 && store.hasPhaseCheckpoint(sourceId, 'phase2')) {
374
+ cachedConceptPages = store.listConceptPages();
375
+ conceptCount = cachedConceptPages.length;
376
+ console.log(`\x1b[32m⏭ Phase 2 건너뜀 (이미 완료) — 📝 ${conceptCount}개 개념 페이지\x1b[0m`);
377
+ onProgress?.(`Phase 2 건너뜀 (${conceptCount}개 개념 이미 존재)`);
378
+ } else {
379
+ const phase2Start = performance.now();
380
+ const resumeFrom = lastCompletedBatch + 1;
381
+ if (resumeFrom > 0) {
382
+ console.log(`\x1b[34m⏳ Phase 2: 개념 추출 재개 (배치 ${resumeFrom + 1}/${totalBatches}부터)...\x1b[0m`);
383
+ onProgress?.(`Phase 2: 배치 ${resumeFrom + 1}/${totalBatches}부터 재개`);
384
+ } else {
385
+ console.log(`\x1b[34m⏳ Phase 2: 개념 추출 중...\x1b[0m`);
386
+ onProgress?.(`Phase 2: 개념 추출 중...`);
387
+ }
303
388
 
304
- for (let i = 0; i < sourcePageSummaries.length; i += batchSize) {
305
- const batch = sourcePageSummaries.slice(i, i + batchSize);
306
- const batchLabel = ` [${Math.floor(i / batchSize) + 1}/${Math.ceil(sourcePageSummaries.length / batchSize)}]`;
307
- console.log(`${batchLabel} 개념 추출 중...`);
389
+ // Cache existing concept titles in memory to avoid repeated DB queries
390
+ const existingConceptTitles = new Set(store.listConceptPages().map(p => p.title));
308
391
 
309
- const existingConcepts = conceptCount > 0
310
- ? `\n\nAlready created concept pages (do not duplicate): ${store.listConceptPages().map(p => p.title).join(", ")}`
311
- : "";
312
- const conceptPrompt = getConceptPrompt(persona);
313
- const prompt = conceptPrompt.replace("{sourcePages}", batch.join("\n")) + existingConcepts;
314
- const conceptSystem = getConceptSystem(persona);
392
+ for (let i = 0; i < sourcePageSummaries.length; i += batchSize) {
393
+ const batchIdx = Math.floor(i / batchSize);
394
+ const batchLabel = ` [${batchIdx + 1}/${totalBatches}]`;
315
395
 
316
- try {
317
- const raw = await chat(conceptSystem, prompt, 16384);
318
- const concepts = parseJSON<ConceptPage[]>(raw).filter(c => c.title && c.content && c.content.length > 50);
396
+ if (batchIdx <= lastCompletedBatch) {
397
+ console.log(`${batchLabel} 이미 완료 건너뜀`);
398
+ continue;
399
+ }
319
400
 
320
- for (const concept of concepts) {
321
- const slug = slugify(concept.title);
322
- if (!slug) continue;
401
+ console.log(`${batchLabel} 개념 추출 중...`);
323
402
 
324
- // Don't create concept page if source page with same slug exists
325
- const existing = store.getPage(slug);
326
- if (existing) continue;
403
+ const batch = sourcePageSummaries.slice(i, i + batchSize);
404
+ const existingConceptsNote = existingConceptTitles.size > 0
405
+ ? `\n\nAlready created concept pages (do not duplicate): ${[...existingConceptTitles].join(", ")}`
406
+ : "";
407
+ const conceptPrompt = getConceptPrompt(persona, schema);
408
+ const prompt = conceptPrompt.replace("{sourcePages}", batch.join("\n")) + existingConceptsNote;
409
+ const conceptSystem = getConceptSystem(persona, schema);
410
+
411
+ try {
412
+ const raw = await chat(conceptSystem, prompt, 16384);
413
+ const concepts = parseJSON<ConceptPage[]>(raw).filter(c => c.title && c.content && c.content.length > 50);
414
+
415
+ for (const concept of concepts) {
416
+ const slug = slugify(concept.title);
417
+ if (!slug) continue;
418
+
419
+ const existing = store.getPage(slug);
420
+ if (existing) continue;
421
+
422
+ let content = concept.content;
423
+ // Apply term standardization if schema.terms is defined
424
+ if (compiledTerms) {
425
+ content = standardizeTerms(content, compiledTerms);
426
+ }
427
+ if (concept.suggested_links?.length) {
428
+ content += "\n\n## External References\n\n";
429
+ for (const link of concept.suggested_links) {
430
+ content += `- [${link.text}](${link.url})\n`;
431
+ }
432
+ }
327
433
 
328
- let content = concept.content;
329
- if (concept.suggested_links?.length) {
330
- content += "\n\n## External References\n\n";
331
- for (const link of concept.suggested_links) {
332
- content += `- [${link.text}](${link.url})\n`;
434
+ const conceptPage = store.addPage(slug, concept.title, content, sourceId, slug, "concept", 0);
435
+ store.addActivityLog('page_created', `Created page: ${concept.title}`, 'page', conceptPage.id);
436
+ // Store category if provided by LLM and schema supports it
437
+ if (concept.category && schema?.categories?.length) {
438
+ store.updatePageCategory(conceptPage.id, concept.category);
439
+ }
440
+ existingConceptTitles.add(concept.title);
441
+ conceptCount++;
333
442
  }
443
+ store.setCheckpoint(sourceId, 'phase2', batchIdx);
444
+ console.log(` → ${concepts.length}개 개념`);
445
+ onProgress?.(`Phase 2: ${batchIdx + 1}/${totalBatches} 배치 완료`);
446
+ } catch (e: unknown) {
447
+ const message = e instanceof Error ? e.message : String(e);
448
+ console.log(` \x1b[31m✗ 배치 ${batchIdx + 1} 실패 (건너뜀): ${message}\x1b[0m`);
449
+ // Non-retryable errors (parse failures, etc.) — skip batch, continue pipeline
450
+ // Rate-limit errors are already retried in LLMClient; if we reach here, retries exhausted
451
+ store.setCheckpoint(sourceId, 'phase2', batchIdx);
334
452
  }
335
-
336
- store.addPage(slug, concept.title, content, sourceId, slug, "concept", 0);
337
- conceptCount++;
338
453
  }
339
- console.log(` → ${concepts.length}개 개념`);
340
- } catch (e: unknown) {
341
- const message = e instanceof Error ? e.message : String(e);
342
- console.log(` \x1b[31m✗ 실패: ${message}\x1b[0m`);
454
+
455
+ const phase2Sec = ((performance.now() - phase2Start) / 1000).toFixed(1);
456
+ console.log(`\x1b[32m✅ Phase 2 완료 (${phase2Sec}초) — 📝 ${conceptCount}개 개념 페이지 생성\x1b[0m`);
457
+ // Invalidate cache since new concepts were added
458
+ cachedConceptPages = null;
343
459
  }
344
460
  }
345
461
 
346
- const phase2Sec = ((performance.now() - phase2Start) / 1000).toFixed(1);
347
- console.log(`\x1b[32m✅ Phase 2 완료 (${phase2Sec}초) — 📝 ${conceptCount}개 개념 페이지 생성\x1b[0m`);
462
+ // ── Phase 2 post-processing: Parse citation markers ──
463
+ {
464
+ const conceptPagesForCitations = store.listConceptPages();
465
+ let citationCount = 0;
466
+ for (const page of conceptPagesForCitations) {
467
+ if (page.content.includes("[^src:")) {
468
+ const parsed = parseCitations(page.content, page.id, store);
469
+ if (parsed !== page.content) {
470
+ store.updatePageContent(page.id, parsed);
471
+ citationCount++;
472
+ }
473
+ }
474
+ }
475
+ if (citationCount > 0) {
476
+ console.log(`\x1b[32m 📚 ${citationCount}개 페이지에서 인용 정보 생성 완료\x1b[0m`);
477
+ }
478
+ }
348
479
 
349
480
  // ── Phase 2.5: Generate quizzes from concept pages ──
350
481
  let quizCount = 0;
351
- try {
352
- const conceptPagesForQuiz = store.listConceptPages();
353
- if (conceptPagesForQuiz.length > 0) {
354
- console.log(`\x1b[34m⏳ Phase 2.5: 퀴즈 생성 중... (${conceptPagesForQuiz.length}개 개념 페이지)\x1b[0m`);
355
-
356
- const quizSystem = `You are a quiz generator for a study wiki. Generate quiz questions that test UNDERSTANDING, not just memorization.
357
- Focus on higher-order thinking: "왜?", "어떻게?", "비교하라", "설명하라" style questions.
482
+ if (store.hasPhaseCheckpoint(sourceId, 'phase2_5')) {
483
+ console.log(`\x1b[32m⏭ Phase 2.5 건너뜀 (퀴즈 이미 생성됨)\x1b[0m`);
484
+ onProgress?.(`Phase 2.5 건너뜀 (퀴즈 이미 존재)`);
485
+ } else {
486
+ try {
487
+ const conceptPagesForQuiz = cachedConceptPages ?? store.listConceptPages();
488
+ if (conceptPagesForQuiz.length > 0) {
489
+ console.log(`\x1b[34m⏳ Phase 2.5: 퀴즈 생성 중... (${conceptPagesForQuiz.length}개 개념 페이지)\x1b[0m`);
490
+ onProgress?.(`Phase 2.5: 퀴즈 생성 중...`);
491
+
492
+ let quizSystemExtra = "";
493
+ if (schema?.terms && Object.keys(schema.terms).length > 0) {
494
+ const termList = Object.entries(schema.terms).map(([k, v]) => `${k} -> ${v}`).join(", ");
495
+ quizSystemExtra = `\nUse these standard terms in questions and answers (replace abbreviations with full forms): ${termList}`;
496
+ }
497
+ const quizSystem = `You are a quiz generator for a study wiki. Generate quiz questions that test UNDERSTANDING, not just memorization.
498
+ Focus on higher-order thinking: "왜?", "어떻게?", "비교하라", "설명하라" style questions.${quizSystemExtra}
358
499
  Return valid JSON only. No markdown fences.`;
359
500
 
360
- await parallelMap(conceptPagesForQuiz, 3, async (page, i) => {
361
- try {
362
- const quizPrompt = `Based on this wiki content, generate 2-3 quiz questions that test UNDERSTANDING, not just memorization.
501
+ await parallelMap(conceptPagesForQuiz, 3, async (page, i) => {
502
+ try {
503
+ const quizPrompt = `Based on this wiki content, generate 2-3 quiz questions that test UNDERSTANDING, not just memorization.
363
504
  Include questions that ask "왜?", "어떻게?", "비교하라" etc.
364
505
  Types: "fill_blank" (빈칸 채우기), "ox" (OX 퀴즈 - true/false), "short_answer" (단답형)
365
506
 
@@ -378,28 +519,33 @@ Rules:
378
519
  - Questions should test understanding, application, or analysis — not just recall
379
520
  - Write questions in Korean when the content is in Korean`;
380
521
 
381
- const raw = await chat(quizSystem, quizPrompt, 2048);
382
- const quizzes = parseJSON<Array<{ question: string; answer: string; explanation?: string; type: string }>>(raw);
383
-
384
- for (const q of quizzes) {
385
- if (q.question && q.answer && q.type) {
386
- store.addQuiz(page.id, q.question, q.answer, q.type, q.explanation || "");
387
- quizCount++;
522
+ const raw = await chat(quizSystem, quizPrompt, 2048);
523
+ const quizzes = parseJSON<Array<{ question: string; answer: string; explanation?: string; type: string }>>(raw);
524
+
525
+ for (const q of quizzes) {
526
+ if (q.question && q.answer && q.type) {
527
+ const question = compiledTerms ? standardizeTerms(q.question, compiledTerms) : q.question;
528
+ const answer = compiledTerms ? standardizeTerms(q.answer, compiledTerms) : q.answer;
529
+ const explanation = compiledTerms && q.explanation ? standardizeTerms(q.explanation, compiledTerms) : (q.explanation || "");
530
+ store.addQuiz(page.id, question, answer, q.type, explanation);
531
+ quizCount++;
532
+ }
388
533
  }
534
+ } catch (e: unknown) {
535
+ // Quiz generation is non-critical; silently skip failures
536
+ const message = e instanceof Error ? e.message : String(e);
537
+ console.log(` \x1b[33m⚠ 퀴즈 생성 실패 (${page.title}): ${message}\x1b[0m`);
389
538
  }
390
- } catch (e: unknown) {
391
- // Quiz generation is non-critical; silently skip failures
392
- const message = e instanceof Error ? e.message : String(e);
393
- console.log(` \x1b[33m⚠ 퀴즈 생성 실패 (${page.title}): ${message}\x1b[0m`);
394
- }
395
- });
539
+ });
396
540
 
397
- console.log(`\x1b[32m 🧩 ${quizCount}개 퀴즈 생성 완료\x1b[0m`);
541
+ store.setCheckpoint(sourceId, 'phase2_5');
542
+ console.log(`\x1b[32m 🧩 ${quizCount}개 퀴즈 생성 완료\x1b[0m`);
543
+ }
544
+ } catch (e: unknown) {
545
+ // Phase 2.5 is optional — don't block the pipeline
546
+ const message = e instanceof Error ? e.message : String(e);
547
+ console.log(`\x1b[33m ⚠ 퀴즈 생성 단계 건너뜀: ${message}\x1b[0m`);
398
548
  }
399
- } catch (e: unknown) {
400
- // Phase 2.5 is optional — don't block the pipeline
401
- const message = e instanceof Error ? e.message : String(e);
402
- console.log(`\x1b[33m ⚠ 퀴즈 생성 단계 건너뜀: ${message}\x1b[0m`);
403
549
  }
404
550
 
405
551
  // ── Phase 3: Resolve wiki links + inject concept links into source pages ──
@@ -477,8 +623,8 @@ Rules:
477
623
  return { sourceCount, conceptCount };
478
624
  }
479
625
 
480
- export function htmlToRawText(html: string): string {
481
- const { load } = require("cheerio");
626
+ export async function htmlToRawText(html: string): Promise<string> {
627
+ const { load } = await import("cheerio");
482
628
  const $ = load(html);
483
629
  $("script, style, nav, header, footer, noscript").remove();
484
630
  return $("body").text() || $.text();
@@ -0,0 +1,41 @@
1
+ /**
2
+ * Term standardization post-processor.
3
+ * Replaces abbreviations/variants with their standard forms,
4
+ * using case-insensitive word-boundary matching.
5
+ */
6
+
7
/** One abbreviation/variant paired with the pattern that finds it. */
interface CompiledTerm {
  /** Global, case-insensitive pattern matching a single abbreviation. */
  regex: RegExp;
  /** Standard form that replaces every match of `regex`. */
  replacement: string;
}

/**
 * Compile term mappings into reusable RegExp objects.
 * Call once, reuse the result for multiple standardizeTerms calls.
 *
 * Each pattern is case-insensitive and only matches the abbreviation as a
 * whole token: it must not be directly preceded or followed by a word
 * character. Matches that touch a square bracket, or that are followed by a
 * `]` with no `[` in between, are skipped so text inside `[...]` link
 * labels is left alone.
 *
 * @param terms Map of abbreviation/variant -> standard form.
 * @returns One compiled entry per usable key, in the key order of `terms`.
 *          Keys that are empty or whitespace-only are ignored.
 */
export function compileTerms(terms: Record<string, string>): CompiledTerm[] {
  const compiled: CompiledTerm[] = [];
  for (const [abbrev, standard] of Object.entries(terms)) {
    // An empty key would compile to a pattern that matches at every word
    // boundary and inject the replacement throughout the text.
    if (abbrev.trim().length === 0) continue;

    // Escape special regex chars in the abbreviation
    const escaped = abbrev.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");

    // Lookarounds instead of \b: for keys that start and end with a word
    // character this is equivalent, but it also lets keys such as "C++"
    // match, where \b can never succeed next to the trailing "+".
    compiled.push({
      regex: new RegExp(`(?<![\\w\\[])(${escaped})(?![\\w\\]])(?![^[]*\\])`, "gi"),
      replacement: standard,
    });
  }
  return compiled;
}
30
+
31
/**
 * Apply term standardization to content.
 * Replaces abbreviations with standard terms using pre-compiled regexes.
 *
 * Terms are applied one after another in array order, each to the output of
 * the previous one.
 *
 * @param content Text to standardize.
 * @param compiledTerms Entries produced by compileTerms.
 * @returns The text with every match replaced by its standard form.
 */
export function standardizeTerms(content: string, compiledTerms: CompiledTerm[]): string {
  return compiledTerms.reduce(
    // A replacer function inserts the standard form verbatim. A plain string
    // replacement would expand "$1", "$&", "$$" etc. found in the term.
    (text, { regex, replacement }) => text.replace(regex, () => replacement),
    content
  );
}