@open330/kiwimu 0.7.1 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +189 -62
- package/package.json +1 -1
- package/src/build/renderer.ts +273 -32
- package/src/build/static/dynamic-qa.js +423 -0
- package/src/build/static/edit-page.js +58 -0
- package/src/build/static/peek-panel.css +201 -0
- package/src/build/static/peek-panel.js +470 -0
- package/src/build/static/search.js +30 -15
- package/src/build/static/style.css +821 -6
- package/src/build/templates.ts +757 -49
- package/src/config.ts +41 -3
- package/src/demo/sample-data.ts +75 -8
- package/src/demo/setup.ts +26 -7
- package/src/expand/llm.ts +2 -2
- package/src/index.ts +497 -64
- package/src/ingest/docx.ts +1 -1
- package/src/ingest/markdown.ts +21 -0
- package/src/ingest/pdf.ts +4 -2
- package/src/llm-client.ts +63 -69
- package/src/pipeline/citations.ts +107 -0
- package/src/pipeline/llm-chunker.ts +281 -128
- package/src/pipeline/standardizer.ts +41 -0
- package/src/server.ts +466 -33
- package/src/services/dynamic-qa.ts +190 -0
- package/src/services/embedding.ts +122 -0
- package/src/services/index-generator.ts +185 -0
- package/src/services/ingest.ts +84 -26
- package/src/services/lint.ts +249 -0
- package/src/services/promote.ts +150 -0
- package/src/store.test.ts +11 -0
- package/src/store.ts +652 -15
- package/src/utils.ts +30 -0
|
@@ -1,7 +1,10 @@
|
|
|
1
1
|
import { LLMClient } from "../llm-client";
|
|
2
2
|
import type { Store } from "../store";
|
|
3
3
|
import { slugify } from "./chunker";
|
|
4
|
-
import type { Persona } from "../config";
|
|
4
|
+
import type { Persona, WikiSchema } from "../config";
|
|
5
|
+
import { compileTerms, standardizeTerms } from "./standardizer";
|
|
6
|
+
import { parseCitations } from "./citations";
|
|
7
|
+
import { stripJsonFences } from "../utils";
|
|
5
8
|
|
|
6
9
|
// ── Phase 1: Extract original document structure ──
|
|
7
10
|
|
|
@@ -16,15 +19,16 @@ TEXT:
|
|
|
16
19
|
|
|
17
20
|
Return a JSON array of sections in order. Each element:
|
|
18
21
|
- "title": string — Original section/chapter title from the document
|
|
19
|
-
- "content": string — The full content of this section, converted to clean markdown. Preserve all information. Use LaTeX ($..$ inline, $$...$$ display) for equations. Clean up OCR artifacts.
|
|
22
|
+
- "content": string — The full content of this section, converted to clean markdown. Preserve all information. Use LaTeX ($..$ inline, $$...$$ display) for equations. Clean up OCR artifacts. When the content describes processes, workflows, hierarchies, state transitions, or relationships, add a Mermaid diagram using fenced code blocks (\`\`\`mermaid). Supported types: flowchart, sequenceDiagram, classDiagram, stateDiagram-v2, erDiagram, gantt, pie, mindmap, timeline.
|
|
20
23
|
- "level": number — 1 for chapter, 2 for section, 3 for subsection
|
|
21
24
|
|
|
22
25
|
Keep the content faithful to the original. Do not add or remove information. Just clean up formatting.
|
|
26
|
+
When appropriate, enhance understanding by including Mermaid diagrams that visualize key concepts, flows, or relationships described in the text.
|
|
23
27
|
Return at most 8 sections per response to keep output manageable.`;
|
|
24
28
|
|
|
25
29
|
// ── Phase 2: Extract concepts for separate pages ──
|
|
26
30
|
|
|
27
|
-
function getConceptSystem(persona: Persona | null): string {
|
|
31
|
+
function getConceptSystem(persona: Persona | null, schema?: WikiSchema): string {
|
|
28
32
|
const base = `You are a study wiki editor. Given source material pages, identify important concepts, terms, and definitions that deserve their own dedicated wiki pages.
|
|
29
33
|
|
|
30
34
|
Rules:
|
|
@@ -32,21 +36,53 @@ Rules:
|
|
|
32
36
|
- Each concept page should have substantial educational content (2+ paragraphs)
|
|
33
37
|
- Explain the concept clearly with definitions, formulas, examples, and context
|
|
34
38
|
- Use [[wiki links]] to reference other concepts and source pages. Example: "[[Synchrotron Radiation]] is observed at [[radio frequencies]]"
|
|
39
|
+
- Use LaTeX ($..$ inline, $$...$$ display) for equations
|
|
40
|
+
- When a concept involves processes, relationships, hierarchies, or state transitions, include a Mermaid diagram using fenced code blocks (\`\`\`mermaid). Supported: flowchart, sequenceDiagram, classDiagram, stateDiagram-v2, erDiagram, mindmap, pie
|
|
35
41
|
- Suggest Wikipedia links for further reading
|
|
36
42
|
|
|
37
43
|
Return valid JSON only. No markdown fences.`;
|
|
38
44
|
|
|
45
|
+
let schemaRules = "";
|
|
46
|
+
if (schema) {
|
|
47
|
+
const rules: string[] = [];
|
|
48
|
+
if (schema.categories?.length) {
|
|
49
|
+
rules.push(`- Assign each concept to one of these categories: ${schema.categories.join(", ")}. Include a "category" field in your JSON output.`);
|
|
50
|
+
}
|
|
51
|
+
if (schema.page_template?.sections?.length) {
|
|
52
|
+
rules.push(`- Structure each concept page with these sections (use ## headings): ${schema.page_template.sections.join(", ")}`);
|
|
53
|
+
}
|
|
54
|
+
if (schema.naming_convention) {
|
|
55
|
+
const conventions: Record<string, string> = {
|
|
56
|
+
noun_phrase: "Use noun phrases for titles (e.g., 'Neural Network', 'Gradient Descent')",
|
|
57
|
+
question: "Use question form for titles (e.g., 'What is a Neural Network?', 'How does Gradient Descent work?')",
|
|
58
|
+
topic: "Use simple topic words for titles (e.g., 'Backpropagation', 'Optimization')",
|
|
59
|
+
};
|
|
60
|
+
rules.push(`- Title format: ${conventions[schema.naming_convention] || schema.naming_convention}`);
|
|
61
|
+
}
|
|
62
|
+
if (schema.terms && Object.keys(schema.terms).length > 0) {
|
|
63
|
+
const termList = Object.entries(schema.terms).map(([k, v]) => `${k} -> ${v}`).join(", ");
|
|
64
|
+
rules.push(`- Use these standard terms (replace abbreviations with full forms): ${termList}`);
|
|
65
|
+
}
|
|
66
|
+
if (rules.length > 0) {
|
|
67
|
+
schemaRules = `\n\nSchema rules:\n${rules.join("\n")}`;
|
|
68
|
+
}
|
|
69
|
+
}
|
|
70
|
+
|
|
39
71
|
if (persona) {
|
|
40
|
-
return `${persona.system_prompt}\n\n${base}\n\nIMPORTANT: ${persona.content_style}`;
|
|
72
|
+
return `${persona.system_prompt}\n\n${base}${schemaRules}\n\nIMPORTANT: ${persona.content_style}`;
|
|
41
73
|
}
|
|
42
|
-
return base
|
|
74
|
+
return `${base}${schemaRules}`;
|
|
43
75
|
}
|
|
44
76
|
|
|
45
|
-
function getConceptPrompt(persona: Persona | null): string {
|
|
77
|
+
function getConceptPrompt(persona: Persona | null, schema?: WikiSchema): string {
|
|
46
78
|
const styleNote = persona
|
|
47
79
|
? `\n\nWrite content in the following style:\n${persona.content_style}`
|
|
48
80
|
: "";
|
|
49
81
|
|
|
82
|
+
const categoryField = schema?.categories?.length
|
|
83
|
+
? `\n- "category": string — One of: ${schema.categories.join(", ")}`
|
|
84
|
+
: "";
|
|
85
|
+
|
|
50
86
|
return `Based on these source pages, create concept/glossary wiki pages for important terms.
|
|
51
87
|
|
|
52
88
|
Source pages already created:
|
|
@@ -56,9 +92,15 @@ Create 3-6 concept pages for the most important terms, definitions, laws, and eq
|
|
|
56
92
|
Do NOT duplicate the source pages — instead, create focused concept pages that the source pages can link to.
|
|
57
93
|
Keep each page concise (2-3 paragraphs).${styleNote}
|
|
58
94
|
|
|
95
|
+
IMPORTANT — Provenance citations:
|
|
96
|
+
When a claim or fact comes from a specific source page, add an inline citation marker at the end of that sentence using the format [^src:SOURCE_PAGE_SLUG].
|
|
97
|
+
The SOURCE_PAGE_SLUG must match one of the source page slugs listed above (the hyphenated identifier shown after the title).
|
|
98
|
+
Example: "Quantum entanglement allows particles to share states instantly [^src:chapter-3-quantum-states]"
|
|
99
|
+
Only cite when a fact clearly originates from a specific source page. Not every sentence needs a citation.
|
|
100
|
+
|
|
59
101
|
Return a JSON array where each element has:
|
|
60
102
|
- "title": string — Short concept name, 1-3 words (e.g., "Synchrotron Radiation", "Flux Density", "Angular Resolution"). Keep titles short so they match naturally in text.
|
|
61
|
-
- "content": string — Educational markdown content with [[wiki links]] to other concepts and source pages
|
|
103
|
+
- "content": string — Educational markdown content with [[wiki links]] to other concepts and source pages, and [^src:slug] citations where appropriate${categoryField}
|
|
62
104
|
- "suggested_links": Array<{text: string, url: string}> — Wikipedia/external reference links`;
|
|
63
105
|
}
|
|
64
106
|
|
|
@@ -82,6 +124,7 @@ interface StructurePage {
|
|
|
82
124
|
/** Shape of one concept entry parsed from the LLM's concept-extraction JSON response. */
interface ConceptPage {
  /** Concept name; it is slugified to derive the page slug. */
  title: string;
  /** Markdown body of the concept page. */
  content: string;
  /** Category label; the prompt only asks for it when the schema lists categories. */
  category?: string;
  /** External reference links; when present they are appended under "## External References". */
  suggested_links?: Array<{ text: string; url: string }>;
}
|
|
87
130
|
|
|
@@ -146,7 +189,7 @@ function splitBySize(text: string, maxSize: number): Array<{ chapterHint: string
|
|
|
146
189
|
}
|
|
147
190
|
|
|
148
191
|
function parseJSON<T>(raw: string): T {
|
|
149
|
-
let cleaned = raw
|
|
192
|
+
let cleaned = stripJsonFences(raw);
|
|
150
193
|
try {
|
|
151
194
|
return JSON.parse(cleaned);
|
|
152
195
|
} catch (e1) {
|
|
@@ -215,15 +258,17 @@ export async function llmChunkDocument(
|
|
|
215
258
|
store: Store,
|
|
216
259
|
maxChunks: number = 0, // 0 = unlimited
|
|
217
260
|
persona: Persona | null = null,
|
|
218
|
-
llmClient
|
|
261
|
+
llmClient: LLMClient,
|
|
262
|
+
onProgress?: (status: string) => void,
|
|
263
|
+
schema?: WikiSchema
|
|
219
264
|
): Promise<{ sourceCount: number; conceptCount: number }> {
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
265
|
+
const chat = (system: string, user: string, maxTokens?: number) =>
|
|
266
|
+
llmClient.chatComplete(system, user, maxTokens);
|
|
267
|
+
|
|
268
|
+
// Pre-compile term standardization regexes if schema.terms is defined
|
|
269
|
+
const compiledTerms = schema?.terms && Object.keys(schema.terms).length > 0
|
|
270
|
+
? compileTerms(schema.terms)
|
|
271
|
+
: null;
|
|
227
272
|
|
|
228
273
|
let chunks = splitByChapters(rawText);
|
|
229
274
|
if (maxChunks > 0 && chunks.length > maxChunks) {
|
|
@@ -233,128 +278,230 @@ export async function llmChunkDocument(
|
|
|
233
278
|
if (persona) {
|
|
234
279
|
console.log(`\x1b[35m🎭 페르소나: ${persona.name}\x1b[0m`);
|
|
235
280
|
}
|
|
236
|
-
console.log(`\x1b[34m🧠 Phase 1: 원본 구조 추출 (${chunks.length}개 청크)...\x1b[0m`);
|
|
237
|
-
|
|
238
281
|
// ── Phase 1: Extract source pages (parallel LLM calls) ──
|
|
239
|
-
let
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
282
|
+
let sourceCount: number;
|
|
283
|
+
let sourcePageSummaries: string[];
|
|
284
|
+
|
|
285
|
+
if (store.hasPhaseCheckpoint(sourceId, 'phase1')) {
|
|
286
|
+
// Resume: Phase 1 already done, rebuild summaries from DB
|
|
287
|
+
const existingPages = store.getSourcePages(sourceId);
|
|
288
|
+
sourceCount = existingPages.length;
|
|
289
|
+
sourcePageSummaries = existingPages.map(p =>
|
|
290
|
+
`- ${p.title} [slug: ${p.slug}]: ${p.content.slice(0, 150).replace(/\n/g, " ")}`
|
|
291
|
+
);
|
|
292
|
+
console.log(`\x1b[32m⏭ Phase 1 건너뜀 (이미 완료) — 📖 ${sourceCount}개 원본 페이지\x1b[0m`);
|
|
293
|
+
onProgress?.(`Phase 1 건너뜀 (${sourceCount}개 페이지 이미 존재)`);
|
|
294
|
+
} else {
|
|
295
|
+
console.log(`\x1b[34m⏳ Phase 1: 원본 구조 추출 중... (${chunks.length}개 청크)\x1b[0m`);
|
|
296
|
+
onProgress?.(`Phase 1: 원본 구조 추출 중... (${chunks.length}개 청크)`);
|
|
297
|
+
|
|
298
|
+
const phase1Start = performance.now();
|
|
299
|
+
let completedCount = 0;
|
|
300
|
+
const structureSystem = getStructureSystem(persona);
|
|
301
|
+
|
|
302
|
+
const chunkResults = await parallelMap(chunks, 3, async (chunk, i) => {
|
|
303
|
+
console.log(` Phase 1: 처리 중 [${i + 1}/${chunks.length}] ${chunk.chapterHint}...`);
|
|
304
|
+
|
|
305
|
+
const prompt = STRUCTURE_PROMPT
|
|
306
|
+
.replace("{sourceTitle}", sourceTitle)
|
|
307
|
+
.replace("{text}", chunk.text.slice(0, 80000));
|
|
244
308
|
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
.replace("{text}", chunk.text.slice(0, 80000));
|
|
248
|
-
|
|
249
|
-
try {
|
|
250
|
-
let raw = await chat(structureSystem, prompt, 16384);
|
|
251
|
-
if (!raw || raw.trim().length < 10) {
|
|
252
|
-
console.log(` \x1b[33m⚠ 빈 응답, 재시도...\x1b[0m`);
|
|
253
|
-
raw = await chat(structureSystem, prompt, 16384);
|
|
309
|
+
try {
|
|
310
|
+
let raw = await chat(structureSystem, prompt, 16384);
|
|
254
311
|
if (!raw || raw.trim().length < 10) {
|
|
255
|
-
console.log(` \x1b[
|
|
256
|
-
|
|
257
|
-
|
|
312
|
+
console.log(` \x1b[33m⚠ 빈 응답, 재시도...\x1b[0m`);
|
|
313
|
+
raw = await chat(structureSystem, prompt, 16384);
|
|
314
|
+
if (!raw || raw.trim().length < 10) {
|
|
315
|
+
console.log(` \x1b[31m✗ 재시도도 빈 응답\x1b[0m`);
|
|
316
|
+
completedCount++;
|
|
317
|
+
return [] as StructurePage[];
|
|
318
|
+
}
|
|
258
319
|
}
|
|
320
|
+
const sections = parseJSON<StructurePage[]>(raw).filter(s => s.title && s.content && s.content.length > 30);
|
|
321
|
+
completedCount++;
|
|
322
|
+
console.log(` → ${sections.length}개 섹션 (완료 ${completedCount}/${chunks.length})`);
|
|
323
|
+
onProgress?.(`Phase 1: ${completedCount}/${chunks.length} 청크 완료`);
|
|
324
|
+
return sections;
|
|
325
|
+
} catch (e: unknown) {
|
|
326
|
+
const message = e instanceof Error ? e.message : String(e);
|
|
327
|
+
console.log(` \x1b[31m✗ 실패: ${message}\x1b[0m`);
|
|
328
|
+
completedCount++;
|
|
329
|
+
return [] as StructurePage[];
|
|
259
330
|
}
|
|
260
|
-
|
|
261
|
-
completedCount++;
|
|
262
|
-
console.log(` → ${sections.length}개 섹션 (완료 ${completedCount}/${chunks.length})`);
|
|
263
|
-
return sections;
|
|
264
|
-
} catch (e: unknown) {
|
|
265
|
-
const message = e instanceof Error ? e.message : String(e);
|
|
266
|
-
console.log(` \x1b[31m✗ 실패: ${message}\x1b[0m`);
|
|
267
|
-
completedCount++;
|
|
268
|
-
return [] as StructurePage[];
|
|
269
|
-
}
|
|
270
|
-
});
|
|
331
|
+
});
|
|
271
332
|
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
333
|
+
// Store results sequentially (SQLite writes must be sequential)
|
|
334
|
+
let orderCounter = 0;
|
|
335
|
+
sourcePageSummaries = [];
|
|
275
336
|
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
337
|
+
for (const sections of chunkResults) {
|
|
338
|
+
for (const section of sections) {
|
|
339
|
+
const slug = slugify(section.title);
|
|
340
|
+
if (!slug) continue;
|
|
280
341
|
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
342
|
+
const existing = store.getPage(slug);
|
|
343
|
+
if (existing) {
|
|
344
|
+
store.updatePageContent(existing.id, existing.content + "\n\n" + section.content);
|
|
345
|
+
} else {
|
|
346
|
+
const page = store.addPage(slug, section.title, section.content, sourceId, slug, "source", orderCounter++);
|
|
347
|
+
store.addActivityLog('page_created', `Created page: ${section.title}`, 'page', page.id);
|
|
348
|
+
sourcePageSummaries.push(`- ${section.title} [slug: ${slug}]: ${section.content.slice(0, 150).replace(/\n/g, " ")}`);
|
|
349
|
+
|
|
350
|
+
}
|
|
287
351
|
}
|
|
288
352
|
}
|
|
289
|
-
}
|
|
290
353
|
|
|
291
|
-
|
|
292
|
-
|
|
354
|
+
sourceCount = orderCounter;
|
|
355
|
+
store.setCheckpoint(sourceId, 'phase1');
|
|
356
|
+
const phase1Sec = ((performance.now() - phase1Start) / 1000).toFixed(1);
|
|
357
|
+
console.log(`\x1b[32m✅ Phase 1 완료 (${phase1Sec}초) — 📖 ${sourceCount}개 원본 페이지 생성\x1b[0m`);
|
|
358
|
+
}
|
|
293
359
|
|
|
294
360
|
// ── Phase 2: Extract concept pages ──
|
|
295
|
-
console.log(`\x1b[34m🧠 Phase 2: 개념 페이지 추출...\x1b[0m`);
|
|
296
|
-
|
|
297
|
-
// Process source pages in small batches for concept extraction
|
|
298
361
|
const batchSize = 5;
|
|
299
362
|
let conceptCount = 0;
|
|
363
|
+
// Cache concept pages list for reuse in Phase 2 and Phase 2.5
|
|
364
|
+
let cachedConceptPages: ReturnType<typeof store.listConceptPages> | null = null;
|
|
365
|
+
|
|
366
|
+
if (sourcePageSummaries.length === 0) {
|
|
367
|
+
console.log(`\x1b[33m⏭ Phase 2 건너뜀 (원본 페이지 없음)\x1b[0m`);
|
|
368
|
+
onProgress?.(`Phase 2 건너뜀 (원본 페이지 없음)`);
|
|
369
|
+
} else {
|
|
370
|
+
const totalBatches = Math.ceil(sourcePageSummaries.length / batchSize);
|
|
371
|
+
const lastCompletedBatch = store.getLastCompletedBatch(sourceId, 'phase2');
|
|
372
|
+
|
|
373
|
+
if (lastCompletedBatch >= totalBatches - 1 && store.hasPhaseCheckpoint(sourceId, 'phase2')) {
|
|
374
|
+
cachedConceptPages = store.listConceptPages();
|
|
375
|
+
conceptCount = cachedConceptPages.length;
|
|
376
|
+
console.log(`\x1b[32m⏭ Phase 2 건너뜀 (이미 완료) — 📝 ${conceptCount}개 개념 페이지\x1b[0m`);
|
|
377
|
+
onProgress?.(`Phase 2 건너뜀 (${conceptCount}개 개념 이미 존재)`);
|
|
378
|
+
} else {
|
|
379
|
+
const phase2Start = performance.now();
|
|
380
|
+
const resumeFrom = lastCompletedBatch + 1;
|
|
381
|
+
if (resumeFrom > 0) {
|
|
382
|
+
console.log(`\x1b[34m⏳ Phase 2: 개념 추출 재개 (배치 ${resumeFrom + 1}/${totalBatches}부터)...\x1b[0m`);
|
|
383
|
+
onProgress?.(`Phase 2: 배치 ${resumeFrom + 1}/${totalBatches}부터 재개`);
|
|
384
|
+
} else {
|
|
385
|
+
console.log(`\x1b[34m⏳ Phase 2: 개념 추출 중...\x1b[0m`);
|
|
386
|
+
onProgress?.(`Phase 2: 개념 추출 중...`);
|
|
387
|
+
}
|
|
300
388
|
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
const batchLabel = ` [${Math.floor(i / batchSize) + 1}/${Math.ceil(sourcePageSummaries.length / batchSize)}]`;
|
|
304
|
-
console.log(`${batchLabel} 개념 추출 중...`);
|
|
389
|
+
// Cache existing concept titles in memory to avoid repeated DB queries
|
|
390
|
+
const existingConceptTitles = new Set(store.listConceptPages().map(p => p.title));
|
|
305
391
|
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
const conceptPrompt = getConceptPrompt(persona);
|
|
310
|
-
const prompt = conceptPrompt.replace("{sourcePages}", batch.join("\n")) + existingConcepts;
|
|
311
|
-
const conceptSystem = getConceptSystem(persona);
|
|
392
|
+
for (let i = 0; i < sourcePageSummaries.length; i += batchSize) {
|
|
393
|
+
const batchIdx = Math.floor(i / batchSize);
|
|
394
|
+
const batchLabel = ` [${batchIdx + 1}/${totalBatches}]`;
|
|
312
395
|
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
396
|
+
if (batchIdx <= lastCompletedBatch) {
|
|
397
|
+
console.log(`${batchLabel} 이미 완료 — 건너뜀`);
|
|
398
|
+
continue;
|
|
399
|
+
}
|
|
316
400
|
|
|
317
|
-
|
|
318
|
-
const slug = slugify(concept.title);
|
|
319
|
-
if (!slug) continue;
|
|
401
|
+
console.log(`${batchLabel} 개념 추출 중...`);
|
|
320
402
|
|
|
321
|
-
|
|
322
|
-
const
|
|
323
|
-
|
|
403
|
+
const batch = sourcePageSummaries.slice(i, i + batchSize);
|
|
404
|
+
const existingConceptsNote = existingConceptTitles.size > 0
|
|
405
|
+
? `\n\nAlready created concept pages (do not duplicate): ${[...existingConceptTitles].join(", ")}`
|
|
406
|
+
: "";
|
|
407
|
+
const conceptPrompt = getConceptPrompt(persona, schema);
|
|
408
|
+
const prompt = conceptPrompt.replace("{sourcePages}", batch.join("\n")) + existingConceptsNote;
|
|
409
|
+
const conceptSystem = getConceptSystem(persona, schema);
|
|
410
|
+
|
|
411
|
+
try {
|
|
412
|
+
const raw = await chat(conceptSystem, prompt, 16384);
|
|
413
|
+
const concepts = parseJSON<ConceptPage[]>(raw).filter(c => c.title && c.content && c.content.length > 50);
|
|
414
|
+
|
|
415
|
+
for (const concept of concepts) {
|
|
416
|
+
const slug = slugify(concept.title);
|
|
417
|
+
if (!slug) continue;
|
|
418
|
+
|
|
419
|
+
const existing = store.getPage(slug);
|
|
420
|
+
if (existing) continue;
|
|
421
|
+
|
|
422
|
+
let content = concept.content;
|
|
423
|
+
// Apply term standardization if schema.terms is defined
|
|
424
|
+
if (compiledTerms) {
|
|
425
|
+
content = standardizeTerms(content, compiledTerms);
|
|
426
|
+
}
|
|
427
|
+
if (concept.suggested_links?.length) {
|
|
428
|
+
content += "\n\n## External References\n\n";
|
|
429
|
+
for (const link of concept.suggested_links) {
|
|
430
|
+
content += `- [${link.text}](${link.url})\n`;
|
|
431
|
+
}
|
|
432
|
+
}
|
|
324
433
|
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
434
|
+
const conceptPage = store.addPage(slug, concept.title, content, sourceId, slug, "concept", 0);
|
|
435
|
+
store.addActivityLog('page_created', `Created page: ${concept.title}`, 'page', conceptPage.id);
|
|
436
|
+
// Store category if provided by LLM and schema supports it
|
|
437
|
+
if (concept.category && schema?.categories?.length) {
|
|
438
|
+
store.updatePageCategory(conceptPage.id, concept.category);
|
|
439
|
+
}
|
|
440
|
+
existingConceptTitles.add(concept.title);
|
|
441
|
+
conceptCount++;
|
|
330
442
|
}
|
|
443
|
+
store.setCheckpoint(sourceId, 'phase2', batchIdx);
|
|
444
|
+
console.log(` → ${concepts.length}개 개념`);
|
|
445
|
+
onProgress?.(`Phase 2: ${batchIdx + 1}/${totalBatches} 배치 완료`);
|
|
446
|
+
} catch (e: unknown) {
|
|
447
|
+
const message = e instanceof Error ? e.message : String(e);
|
|
448
|
+
console.log(` \x1b[31m✗ 배치 ${batchIdx + 1} 실패 (건너뜀): ${message}\x1b[0m`);
|
|
449
|
+
// Non-retryable errors (parse failures, etc.) — skip batch, continue pipeline
|
|
450
|
+
// Rate-limit errors are already retried in LLMClient; if we reach here, retries exhausted
|
|
451
|
+
store.setCheckpoint(sourceId, 'phase2', batchIdx);
|
|
331
452
|
}
|
|
332
|
-
|
|
333
|
-
store.addPage(slug, concept.title, content, sourceId, slug, "concept", 0);
|
|
334
|
-
conceptCount++;
|
|
335
453
|
}
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
454
|
+
|
|
455
|
+
const phase2Sec = ((performance.now() - phase2Start) / 1000).toFixed(1);
|
|
456
|
+
console.log(`\x1b[32m✅ Phase 2 완료 (${phase2Sec}초) — 📝 ${conceptCount}개 개념 페이지 생성\x1b[0m`);
|
|
457
|
+
// Invalidate cache since new concepts were added
|
|
458
|
+
cachedConceptPages = null;
|
|
340
459
|
}
|
|
341
460
|
}
|
|
342
461
|
|
|
343
|
-
|
|
462
|
+
// ── Phase 2 post-processing: Parse citation markers ──
|
|
463
|
+
{
|
|
464
|
+
const conceptPagesForCitations = store.listConceptPages();
|
|
465
|
+
let citationCount = 0;
|
|
466
|
+
for (const page of conceptPagesForCitations) {
|
|
467
|
+
if (page.content.includes("[^src:")) {
|
|
468
|
+
const parsed = parseCitations(page.content, page.id, store);
|
|
469
|
+
if (parsed !== page.content) {
|
|
470
|
+
store.updatePageContent(page.id, parsed);
|
|
471
|
+
citationCount++;
|
|
472
|
+
}
|
|
473
|
+
}
|
|
474
|
+
}
|
|
475
|
+
if (citationCount > 0) {
|
|
476
|
+
console.log(`\x1b[32m 📚 ${citationCount}개 페이지에서 인용 정보 생성 완료\x1b[0m`);
|
|
477
|
+
}
|
|
478
|
+
}
|
|
344
479
|
|
|
345
480
|
// ── Phase 2.5: Generate quizzes from concept pages ──
|
|
346
481
|
let quizCount = 0;
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
const
|
|
482
|
+
if (store.hasPhaseCheckpoint(sourceId, 'phase2_5')) {
|
|
483
|
+
console.log(`\x1b[32m⏭ Phase 2.5 건너뜀 (퀴즈 이미 생성됨)\x1b[0m`);
|
|
484
|
+
onProgress?.(`Phase 2.5 건너뜀 (퀴즈 이미 존재)`);
|
|
485
|
+
} else {
|
|
486
|
+
try {
|
|
487
|
+
const conceptPagesForQuiz = cachedConceptPages ?? store.listConceptPages();
|
|
488
|
+
if (conceptPagesForQuiz.length > 0) {
|
|
489
|
+
console.log(`\x1b[34m⏳ Phase 2.5: 퀴즈 생성 중... (${conceptPagesForQuiz.length}개 개념 페이지)\x1b[0m`);
|
|
490
|
+
onProgress?.(`Phase 2.5: 퀴즈 생성 중...`);
|
|
491
|
+
|
|
492
|
+
let quizSystemExtra = "";
|
|
493
|
+
if (schema?.terms && Object.keys(schema.terms).length > 0) {
|
|
494
|
+
const termList = Object.entries(schema.terms).map(([k, v]) => `${k} -> ${v}`).join(", ");
|
|
495
|
+
quizSystemExtra = `\nUse these standard terms in questions and answers (replace abbreviations with full forms): ${termList}`;
|
|
496
|
+
}
|
|
497
|
+
const quizSystem = `You are a quiz generator for a study wiki. Generate quiz questions that test UNDERSTANDING, not just memorization.
|
|
498
|
+
Focus on higher-order thinking: "왜?", "어떻게?", "비교하라", "설명하라" style questions.${quizSystemExtra}
|
|
353
499
|
Return valid JSON only. No markdown fences.`;
|
|
354
500
|
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
501
|
+
await parallelMap(conceptPagesForQuiz, 3, async (page, i) => {
|
|
502
|
+
try {
|
|
503
|
+
const quizPrompt = `Based on this wiki content, generate 2-3 quiz questions that test UNDERSTANDING, not just memorization.
|
|
504
|
+
Include questions that ask "왜?", "어떻게?", "비교하라" etc.
|
|
358
505
|
Types: "fill_blank" (빈칸 채우기), "ox" (OX 퀴즈 - true/false), "short_answer" (단답형)
|
|
359
506
|
|
|
360
507
|
Content title: ${page.title}
|
|
@@ -362,37 +509,43 @@ Content:
|
|
|
362
509
|
${page.content.slice(0, 3000)}
|
|
363
510
|
|
|
364
511
|
Respond with a JSON array only:
|
|
365
|
-
[{"question": "___은 양자역학에서 위치와 운동량을 동시에 측정할 수 없다는 원리이다.", "answer": "불확정성 원리", "type": "fill_blank"}]
|
|
512
|
+
[{"question": "___은 양자역학에서 위치와 운동량을 동시에 측정할 수 없다는 원리이다.", "answer": "불확정성 원리", "explanation": "이 원리는 양자역학의 근본적 한계를 보여주며, 측정 행위 자체가 시스템에 영향을 주기 때문입니다.", "type": "fill_blank"}]
|
|
366
513
|
|
|
367
514
|
Rules:
|
|
368
515
|
- For fill_blank: use ___ to mark the blank in the question
|
|
369
516
|
- For ox: question should be a statement, answer should be "O" or "X"
|
|
370
517
|
- For short_answer: question should be answerable in 1-3 words
|
|
371
|
-
-
|
|
518
|
+
- Include "explanation" field: a brief 1-2 sentence explanation of WHY the answer is correct
|
|
519
|
+
- Questions should test understanding, application, or analysis — not just recall
|
|
372
520
|
- Write questions in Korean when the content is in Korean`;
|
|
373
521
|
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
522
|
+
const raw = await chat(quizSystem, quizPrompt, 2048);
|
|
523
|
+
const quizzes = parseJSON<Array<{ question: string; answer: string; explanation?: string; type: string }>>(raw);
|
|
524
|
+
|
|
525
|
+
for (const q of quizzes) {
|
|
526
|
+
if (q.question && q.answer && q.type) {
|
|
527
|
+
const question = compiledTerms ? standardizeTerms(q.question, compiledTerms) : q.question;
|
|
528
|
+
const answer = compiledTerms ? standardizeTerms(q.answer, compiledTerms) : q.answer;
|
|
529
|
+
const explanation = compiledTerms && q.explanation ? standardizeTerms(q.explanation, compiledTerms) : (q.explanation || "");
|
|
530
|
+
store.addQuiz(page.id, question, answer, q.type, explanation);
|
|
531
|
+
quizCount++;
|
|
532
|
+
}
|
|
381
533
|
}
|
|
534
|
+
} catch (e: unknown) {
|
|
535
|
+
// Quiz generation is non-critical; silently skip failures
|
|
536
|
+
const message = e instanceof Error ? e.message : String(e);
|
|
537
|
+
console.log(` \x1b[33m⚠ 퀴즈 생성 실패 (${page.title}): ${message}\x1b[0m`);
|
|
382
538
|
}
|
|
383
|
-
}
|
|
384
|
-
// Quiz generation is non-critical; silently skip failures
|
|
385
|
-
const message = e instanceof Error ? e.message : String(e);
|
|
386
|
-
console.log(` \x1b[33m⚠ 퀴즈 생성 실패 (${page.title}): ${message}\x1b[0m`);
|
|
387
|
-
}
|
|
388
|
-
});
|
|
539
|
+
});
|
|
389
540
|
|
|
390
|
-
|
|
541
|
+
store.setCheckpoint(sourceId, 'phase2_5');
|
|
542
|
+
console.log(`\x1b[32m 🧩 ${quizCount}개 퀴즈 생성 완료\x1b[0m`);
|
|
543
|
+
}
|
|
544
|
+
} catch (e: unknown) {
|
|
545
|
+
// Phase 2.5 is optional — don't block the pipeline
|
|
546
|
+
const message = e instanceof Error ? e.message : String(e);
|
|
547
|
+
console.log(`\x1b[33m ⚠ 퀴즈 생성 단계 건너뜀: ${message}\x1b[0m`);
|
|
391
548
|
}
|
|
392
|
-
} catch (e: unknown) {
|
|
393
|
-
// Phase 2.5 is optional — don't block the pipeline
|
|
394
|
-
const message = e instanceof Error ? e.message : String(e);
|
|
395
|
-
console.log(`\x1b[33m ⚠ 퀴즈 생성 단계 건너뜀: ${message}\x1b[0m`);
|
|
396
549
|
}
|
|
397
550
|
|
|
398
551
|
// ── Phase 3: Resolve wiki links + inject concept links into source pages ──
|
|
@@ -470,8 +623,8 @@ Rules:
|
|
|
470
623
|
return { sourceCount, conceptCount };
|
|
471
624
|
}
|
|
472
625
|
|
|
473
|
-
export function htmlToRawText(html: string): string {
|
|
474
|
-
const { load } =
|
|
626
|
+
export async function htmlToRawText(html: string): Promise<string> {
|
|
627
|
+
const { load } = await import("cheerio");
|
|
475
628
|
const $ = load(html);
|
|
476
629
|
$("script, style, nav, header, footer, noscript").remove();
|
|
477
630
|
return $("body").text() || $.text();
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Term standardization post-processor.
|
|
3
|
+
* Replaces abbreviations/variants with their standard forms,
|
|
4
|
+
* using case-insensitive word-boundary matching.
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
/** One term mapping compiled into a reusable matcher. */
interface CompiledTerm {
  /** Pattern that locates occurrences of the abbreviation/variant. */
  regex: RegExp;
  /** Standard form substituted for each match. */
  replacement: string;
}
|
|
11
|
+
|
|
12
|
+
/**
 * Compile term mappings into reusable RegExp objects.
 * Call once, reuse the result for multiple standardizeTerms calls.
 *
 * Each pattern is global and case-insensitive and matches the abbreviation only
 * as a standalone token: it must not touch an ASCII word character
 * (`[A-Za-z0-9_]`) on either side, and it must not sit inside square brackets
 * (wiki links, markdown link text, citation markers).
 *
 * @param terms - Map from abbreviation/variant to its standard form.
 * @returns One compiled entry per non-empty key, in the map's iteration order.
 */
export function compileTerms(terms: Record<string, string>): CompiledTerm[] {
  const compiled: CompiledTerm[] = [];
  for (const [abbrev, standard] of Object.entries(terms)) {
    // An empty key would compile to a pattern that matches at every position
    // and would splice the replacement throughout the text.
    if (abbrev.length === 0) continue;

    // Escape special regex chars in the abbreviation
    const escaped = abbrev.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");

    // `(?<!\w)` / `(?!\w)` are used instead of `\b`: they are equivalent when the
    // term starts/ends with a word character, but unlike `\b` they also work for
    // terms whose edges are not ASCII word characters (e.g. "C++", ".NET", or
    // non-Latin terms), where `\b` would demand an adjacent word character.
    // The bracket lookarounds avoid matching inside markdown/wiki links.
    compiled.push({
      regex: new RegExp(`(?<!\\[)(?<!\\w)(${escaped})(?!\\w)(?!\\])(?![^[]*\\])`, "gi"),
      replacement: standard,
    });
  }
  return compiled;
}
|
|
30
|
+
|
|
31
|
+
/**
 * Apply term standardization to content.
 * Replaces abbreviations with standard terms using pre-compiled regexes.
 *
 * Terms are applied one after another in array order, so the output of an
 * earlier replacement is visible to later patterns.
 *
 * @param content - Text to standardize.
 * @param compiledTerms - Matchers to apply, in order.
 * @returns The content with every match replaced by its standard form.
 */
export function standardizeTerms(content: string, compiledTerms: CompiledTerm[]): string {
  let result = content;
  for (const { regex, replacement } of compiledTerms) {
    // Use a replacer function so the standard form is inserted verbatim. A plain
    // string replacement would interpret `$$`, `$&`, `$1`, etc. as substitution
    // patterns and corrupt standard forms that contain `$` (e.g. LaTeX).
    result = result.replace(regex, () => replacement);
  }
  return result;
}
|