@open330/kiwimu 0.4.1 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +98 -49
- package/bin/kiwimu +1 -1
- package/package.json +4 -1
- package/personas/namuwiki.json +6 -0
- package/src/build/renderer.ts +50 -2
- package/src/build/static/search.js +33 -2
- package/src/build/static/style.css +84 -1
- package/src/build/templates.ts +353 -167
- package/src/config.ts +35 -29
- package/src/demo/sample-data.ts +70 -0
- package/src/demo/setup.ts +31 -0
- package/src/expand/llm.ts +1 -1
- package/src/index.ts +234 -458
- package/src/ingest/docx.ts +0 -8
- package/src/ingest/legacy.ts +4 -4
- package/src/ingest/pdf.ts +1 -1
- package/src/ingest/pptx.ts +0 -1
- package/src/ingest/web.test.ts +41 -0
- package/src/ingest/web.ts +61 -62
- package/src/llm-client.ts +203 -126
- package/src/pipeline/chunker.test.ts +42 -0
- package/src/pipeline/chunker.ts +1 -48
- package/src/pipeline/llm-chunker.ts +144 -59
- package/src/server.ts +327 -0
- package/src/services/ingest.ts +100 -0
- package/src/store.test.ts +132 -0
- package/src/store.ts +206 -2
- package/src/pipeline/llm-linker.ts +0 -84
package/src/pipeline/chunker.ts
CHANGED
@@ -1,63 +1,16 @@
-import TurndownService from "turndown";
-import type { Section } from "../ingest/web";
-import type { Store } from "../store";
-
-const turndown = new TurndownService({ headingStyle: "atx" });
-turndown.remove(["script", "style"]);
-
 export function slugify(text: string): string {
   return text
-    .normalize("NFKD")
     .toLowerCase()
     .trim()
-    .replace(/[^\w\s
+    .replace(/[^\w\s가-힣ㄱ-ㅎㅏ-ㅣ-]/g, "")
     .replace(/[-\s]+/g, "-")
     .replace(/^-|-$/g, "")
     .slice(0, 80);
 }
 
-const STOP_TITLES = new Set([
-  "introduction", "overview", "summary", "conclusion", "references",
-  "bibliography", "appendix", "abstract", "preface", "contents",
-  "table of contents", "index", "acknowledgments", "notes",
-]);
-
 export function cleanTitle(title: string): string {
   return title
     .replace(/^\s*(Chapter\s+)?\d+(\.\d+)*\s*/i, "")
     .replace(/\s+/g, " ")
     .trim();
 }
-
-export function chunkSections(sections: Section[], sourceId: number, store: Store, minWords = 30): number {
-  let count = 0;
-
-  for (const section of sections) {
-    const title = cleanTitle(section.title);
-    if (!title) continue;
-
-    const slug = slugify(title);
-    if (!slug) continue;
-
-    const htmlContent = section.htmlParts.join("\n");
-    if (!htmlContent.trim()) continue;
-
-    const content = turndown.turndown(htmlContent).trim();
-    const wordCount = content.split(/\s+/).length;
-
-    if (wordCount < minWords) continue;
-    if (STOP_TITLES.has(slug) || STOP_TITLES.has(title.toLowerCase())) {
-      if (wordCount < 100) continue;
-    }
-
-    const existing = store.getPage(slug);
-    if (existing) {
-      store.updatePageContent(existing.id, existing.content + "\n\n" + content);
-    } else {
-      store.addPage(slug, title, content, sourceId, slug);
-      count++;
-    }
-  }
-
-  return count;
-}
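The widened character class keeps Hangul syllables and jamo in slugs instead of stripping them. A rough sketch of the behavior the regexes above imply (outputs inferred from the code shown, not taken from the package's tests):

```ts
import { slugify, cleanTitle } from "./chunker";

// Hangul now survives slugification; punctuation is still dropped and spaces become hyphens.
slugify("3.2 양자역학 개요!");            // "32-양자역학-개요"
slugify("Quantum Mechanics: Overview");   // "quantum-mechanics-overview"

// cleanTitle still strips leading chapter numbering before slugify is applied.
cleanTitle("Chapter 3 양자역학 개요");     // "양자역학 개요"
```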
package/src/pipeline/llm-chunker.ts
CHANGED

@@ -1,14 +1,10 @@
-import {
+import { LLMClient } from "../llm-client";
 import type { Store } from "../store";
 import { slugify } from "./chunker";
 import type { Persona } from "../config";
 
 // ── Phase 1: Extract original document structure ──
 
-const STRUCTURE_SYSTEM = `You are a document analyzer. Extract the chapter/section structure from this textbook content, preserving the original order and hierarchy.
-
-Return valid JSON only. No markdown fences.`;
-
 const STRUCTURE_PROMPT = `Extract the document structure from this text. Preserve the original chapter/section ordering.
 
 Source: "{sourceTitle}"
@@ -89,6 +85,25 @@ interface ConceptPage {
   suggested_links?: Array<{ text: string; url: string }>;
 }
 
+async function parallelMap<T, R>(
+  items: T[],
+  concurrency: number,
+  fn: (item: T, index: number) => Promise<R>
+): Promise<R[]> {
+  const results: R[] = new Array(items.length);
+  let nextIndex = 0;
+
+  async function worker() {
+    while (nextIndex < items.length) {
+      const i = nextIndex++;
+      results[i] = await fn(items[i], i);
+    }
+  }
+
+  await Promise.all(Array.from({ length: Math.min(concurrency, items.length) }, () => worker()));
+  return results;
+}
+
 function splitByChapters(text: string): Array<{ chapterHint: string; text: string }> {
   const chapterPattern = /\n(?=(?:CHAPTER\s*\d+|Chapter\s+\d+)[A-Z\s])/g;
   const positions: number[] = [];
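The new `parallelMap` helper is a plain work-queue: up to `concurrency` workers keep pulling the next index until the list is exhausted, and `results[i]` always corresponds to `items[i]`. A usage sketch (the URLs are placeholders, not part of the package):

```ts
// Up to 3 requests in flight at a time; output order matches input order.
const urls = ["https://example.com/a", "https://example.com/b", "https://example.com/c"];
const bodies = await parallelMap(urls, 3, async (url) => {
  const res = await fetch(url);
  return res.text();
});
```

Note that one rejected callback rejects the whole `Promise.all`, which is presumably why the quiz phase further down wraps its callback body in its own try/catch.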
@@ -199,8 +214,17 @@ export async function llmChunkDocument(
   sourceId: number,
   store: Store,
   maxChunks: number = 0, // 0 = unlimited
-  persona: Persona | null = null
+  persona: Persona | null = null,
+  llmClient?: LLMClient
 ): Promise<{ sourceCount: number; conceptCount: number }> {
+  // Use provided client or fall back to deprecated global chatComplete
+  const chat = llmClient
+    ? (system: string, user: string, maxTokens?: number) => llmClient.chatComplete(system, user, maxTokens)
+    : async (system: string, user: string, maxTokens?: number) => {
+        const { chatComplete } = await import("../llm-client");
+        return chatComplete(system, user, maxTokens);
+      };
+
   let chunks = splitByChapters(rawText);
   if (maxChunks > 0 && chunks.length > maxChunks) {
     console.log(`\x1b[33m⚠ ${chunks.length}개 청크 중 ${maxChunks}개만 처리합니다\x1b[0m`);
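With the new optional `llmClient` parameter, callers can inject an `LLMClient` instead of relying on the module-level `chatComplete`. A hypothetical call site; the leading `rawText`/`sourceTitle` parameters are inferred from how they are used later in this function, and the `LLMClient` constructor arguments are not shown in this diff:

```ts
// Hypothetical call site: argument names are illustrative, not copied from the package.
const client = new LLMClient(/* provider/model config not shown in this diff */);
const { sourceCount, conceptCount } = await llmChunkDocument(
  rawText,     // full document text, split by splitByChapters() below
  sourceTitle, // fills {sourceTitle} in STRUCTURE_PROMPT
  sourceId,
  store,
  0,           // maxChunks: 0 = unlimited
  null,        // persona
  client,      // omit to fall back to the deprecated global chatComplete
);
```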
@@ -209,71 +233,69 @@ export async function llmChunkDocument(
   if (persona) {
     console.log(`\x1b[35m🎭 페르소나: ${persona.name}\x1b[0m`);
   }
-  console.log(`\x1b[34m
+  console.log(`\x1b[34m⏳ Phase 1: 원본 구조 추출 중... (${chunks.length}개 청크)\x1b[0m`);
 
-  // ── Phase 1: Extract source pages ──
-
-
+  // ── Phase 1: Extract source pages (parallel LLM calls) ──
+  const phase1Start = performance.now();
+  let completedCount = 0;
+  const structureSystem = getStructureSystem(persona);
 
-
-
-    console.log(`  [${i + 1}/${chunks.length}] ${chunk.chapterHint}`);
+  const chunkResults = await parallelMap(chunks, 3, async (chunk, i) => {
+    console.log(`  Phase 1: 처리 중 [${i + 1}/${chunks.length}] ${chunk.chapterHint}...`);
 
     const prompt = STRUCTURE_PROMPT
       .replace("{sourceTitle}", sourceTitle)
       .replace("{text}", chunk.text.slice(0, 80000));
 
-    const structureSystem = getStructureSystem(persona);
     try {
-
+      let raw = await chat(structureSystem, prompt, 16384);
       if (!raw || raw.trim().length < 10) {
         console.log(`  \x1b[33m⚠ 빈 응답, 재시도...\x1b[0m`);
-
-        if (!
+        raw = await chat(structureSystem, prompt, 16384);
+        if (!raw || raw.trim().length < 10) {
           console.log(`  \x1b[31m✗ 재시도도 빈 응답\x1b[0m`);
-
+          completedCount++;
+          return [] as StructurePage[];
         }
-        const sections = parseJSON<StructurePage[]>(retry).filter(s => s.title && s.content && s.content.length > 30);
-        // fall through to process sections below
-        for (const section of sections) {
-          const slug = slugify(section.title);
-          if (!slug) continue;
-          const existing = store.getPage(slug);
-          if (existing) {
-            store.updatePageContent(existing.id, existing.content + "\n\n" + section.content);
-          } else {
-            store.addPage(slug, section.title, section.content, sourceId, slug, "source", orderCounter++);
-            sourcePageSummaries.push(`- ${section.title}: ${section.content.slice(0, 150).replace(/\n/g, " ")}`);
-          }
-        }
-        console.log(`  → ${sections.length}개 섹션`);
-        continue;
       }
       const sections = parseJSON<StructurePage[]>(raw).filter(s => s.title && s.content && s.content.length > 30);
+      completedCount++;
+      console.log(`  → ${sections.length}개 섹션 (완료 ${completedCount}/${chunks.length})`);
+      return sections;
+    } catch (e: unknown) {
+      const message = e instanceof Error ? e.message : String(e);
+      console.log(`  \x1b[31m✗ 실패: ${message}\x1b[0m`);
+      completedCount++;
+      return [] as StructurePage[];
+    }
+  });
 
-
-
-
+  // Store results sequentially (SQLite writes must be sequential)
+  let orderCounter = 0;
+  const sourcePageSummaries: string[] = [];
 
-
-
-
-
-
-
-
+  for (const sections of chunkResults) {
+    for (const section of sections) {
+      const slug = slugify(section.title);
+      if (!slug) continue;
+
+      const existing = store.getPage(slug);
+      if (existing) {
+        store.updatePageContent(existing.id, existing.content + "\n\n" + section.content);
+      } else {
+        store.addPage(slug, section.title, section.content, sourceId, slug, "source", orderCounter++);
+        sourcePageSummaries.push(`- ${section.title}: ${section.content.slice(0, 150).replace(/\n/g, " ")}`);
       }
-      console.log(`  → ${sections.length}개 섹션`);
-    } catch (e: any) {
-      console.log(`  \x1b[31m✗ 실패: ${e.message}\x1b[0m`);
     }
   }
 
   const sourceCount = orderCounter;
-
+  const phase1Sec = ((performance.now() - phase1Start) / 1000).toFixed(1);
+  console.log(`\x1b[32m✅ Phase 1 완료 (${phase1Sec}초) — 📖 ${sourceCount}개 원본 페이지 생성\x1b[0m`);
 
   // ── Phase 2: Extract concept pages ──
-
+  const phase2Start = performance.now();
+  console.log(`\x1b[34m⏳ Phase 2: 개념 추출 중...\x1b[0m`);
 
   // Process source pages in small batches for concept extraction
   const batchSize = 5;
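The `.filter()` on the Phase 1 result implies each structure entry needs at least a `title` and a `content` longer than 30 characters. A hypothetical payload that would pass that filter (the full `StructurePage` interface and the exact schema requested by `STRUCTURE_PROMPT` are not visible in these hunks):

```ts
// Hypothetical Phase 1 response: only the fields the filter checks (title, content) are shown.
const parsed = parseJSON<StructurePage[]>(`[
  { "title": "1.1 파동함수", "content": "파동함수는 계의 양자 상태를 기술하는 복소함수이며, 확률 해석의 대상이 된다." },
  { "title": "1.2 측정 문제", "content": "측정이 중첩 상태를 하나의 고유 상태로 붕괴시킨다는 해석은 여전히 논쟁적이다." }
]`);
const usable = parsed.filter(s => s.title && s.content && s.content.length > 30);
```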
@@ -292,7 +314,7 @@ export async function llmChunkDocument(
     const conceptSystem = getConceptSystem(persona);
 
     try {
-      const raw = await
+      const raw = await chat(conceptSystem, prompt, 16384);
       const concepts = parseJSON<ConceptPage[]>(raw).filter(c => c.title && c.content && c.content.length > 50);
 
       for (const concept of concepts) {
@@ -315,12 +337,70 @@ export async function llmChunkDocument(
         conceptCount++;
       }
       console.log(`  → ${concepts.length}개 개념`);
-    } catch (e:
-
+    } catch (e: unknown) {
+      const message = e instanceof Error ? e.message : String(e);
+      console.log(`  \x1b[31m✗ 실패: ${message}\x1b[0m`);
     }
   }
 
-
+  const phase2Sec = ((performance.now() - phase2Start) / 1000).toFixed(1);
+  console.log(`\x1b[32m✅ Phase 2 완료 (${phase2Sec}초) — 📝 ${conceptCount}개 개념 페이지 생성\x1b[0m`);
+
+  // ── Phase 2.5: Generate quizzes from concept pages ──
+  let quizCount = 0;
+  try {
+    const conceptPagesForQuiz = store.listConceptPages();
+    if (conceptPagesForQuiz.length > 0) {
+      console.log(`\x1b[34m⏳ Phase 2.5: 퀴즈 생성 중... (${conceptPagesForQuiz.length}개 개념 페이지)\x1b[0m`);
+
+      const quizSystem = `You are a quiz generator for a study wiki. Generate quiz questions that test UNDERSTANDING, not just memorization.
+Focus on higher-order thinking: "왜?", "어떻게?", "비교하라", "설명하라" style questions.
+Return valid JSON only. No markdown fences.`;
+
+      await parallelMap(conceptPagesForQuiz, 3, async (page, i) => {
+        try {
+          const quizPrompt = `Based on this wiki content, generate 2-3 quiz questions that test UNDERSTANDING, not just memorization.
+Include questions that ask "왜?", "어떻게?", "비교하라" etc.
+Types: "fill_blank" (빈칸 채우기), "ox" (OX 퀴즈 - true/false), "short_answer" (단답형)
+
+Content title: ${page.title}
+Content:
+${page.content.slice(0, 3000)}
+
+Respond with a JSON array only:
+[{"question": "___은 양자역학에서 위치와 운동량을 동시에 측정할 수 없다는 원리이다.", "answer": "불확정성 원리", "explanation": "이 원리는 양자역학의 근본적 한계를 보여주며, 측정 행위 자체가 시스템에 영향을 주기 때문입니다.", "type": "fill_blank"}]
+
+Rules:
+- For fill_blank: use ___ to mark the blank in the question
+- For ox: question should be a statement, answer should be "O" or "X"
+- For short_answer: question should be answerable in 1-3 words
+- Include "explanation" field: a brief 1-2 sentence explanation of WHY the answer is correct
+- Questions should test understanding, application, or analysis — not just recall
+- Write questions in Korean when the content is in Korean`;
+
+          const raw = await chat(quizSystem, quizPrompt, 2048);
+          const quizzes = parseJSON<Array<{ question: string; answer: string; explanation?: string; type: string }>>(raw);
+
+          for (const q of quizzes) {
+            if (q.question && q.answer && q.type) {
+              store.addQuiz(page.id, q.question, q.answer, q.type, q.explanation || "");
+              quizCount++;
+            }
+          }
+        } catch (e: unknown) {
+          // Quiz generation is non-critical; silently skip failures
+          const message = e instanceof Error ? e.message : String(e);
+          console.log(`  \x1b[33m⚠ 퀴즈 생성 실패 (${page.title}): ${message}\x1b[0m`);
+        }
+      });
+
+      console.log(`\x1b[32m  🧩 ${quizCount}개 퀴즈 생성 완료\x1b[0m`);
+    }
+  } catch (e: unknown) {
+    // Phase 2.5 is optional — don't block the pipeline
+    const message = e instanceof Error ? e.message : String(e);
+    console.log(`\x1b[33m  ⚠ 퀴즈 생성 단계 건너뜀: ${message}\x1b[0m`);
+  }
 
   // ── Phase 3: Resolve wiki links + inject concept links into source pages ──
   console.log(`\x1b[34m🔗 위키 링크 해석 중...\x1b[0m`);
@@ -347,31 +427,36 @@
   const srcPages = allPages.filter(p => p.page_type === "source");
 
   // Build search terms: full title + key words from title (2+ words long)
-  const searchTerms: Array<{ term: string; concept: typeof conceptPages[0] }> = [];
+  const searchTerms: Array<{ term: string; concept: typeof conceptPages[0]; regex: RegExp | null }> = [];
   for (const concept of conceptPages) {
-    searchTerms.push({ term: concept.title, concept });
+    searchTerms.push({ term: concept.title, concept, regex: null });
     // Also try individual significant words from multi-word titles
     const words = concept.title.split(/\s+/).filter(w => w.length >= 4 && !/^(and|the|for|with|from|into)$/i.test(w));
     if (words.length >= 2) {
       // Try pairs of consecutive words
      for (let i = 0; i < words.length - 1; i++) {
-        searchTerms.push({ term: `${words[i]} ${words[i + 1]}`, concept });
+        searchTerms.push({ term: `${words[i]} ${words[i + 1]}`, concept, regex: null });
      }
    }
   }
   // Sort by term length descending for longest match first
   searchTerms.sort((a, b) => b.term.length - a.term.length);
 
+  // Pre-compile RegExp objects outside the page loop
+  for (const entry of searchTerms) {
+    if (entry.term.length < 3) continue;
+    const escaped = entry.term.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
+    entry.regex = new RegExp(`(?<!\\[)(?<![\\w/])(${escaped})(?![\\w])(?!\\])(?![^[]*\\])`, "i");
+  }
+
   for (const srcPage of srcPages) {
     let content = srcPage.content;
     let modified = false;
     const linkedConcepts = new Set<number>();
 
-    for (const { term, concept } of searchTerms) {
+    for (const { term, concept, regex } of searchTerms) {
       if (linkedConcepts.has(concept.id)) continue; // One link per concept per page
-      if (term.length < 3) continue;
-      const escaped = term.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
-      const regex = new RegExp(`(?<!\\[)(?<![\\w/])(${escaped})(?![\\w])(?!\\])(?![^[]*\\])`, "i");
+      if (term.length < 3 || !regex) continue;
       const match = regex.exec(content);
       if (match) {
         const replacement = `[${match[1]}](/wiki/${concept.slug})`;