@open330/kiwimu 0.4.1 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,63 +1,16 @@
1
- import TurndownService from "turndown";
2
- import type { Section } from "../ingest/web";
3
- import type { Store } from "../store";
4
-
5
- const turndown = new TurndownService({ headingStyle: "atx" });
6
- turndown.remove(["script", "style"]);
7
-
8
1
  export function slugify(text: string): string {
9
2
  return text
10
- .normalize("NFKD")
11
3
  .toLowerCase()
12
4
  .trim()
13
- .replace(/[^\w\s-]/g, "")
5
+ .replace(/[^\w\s가-힣ㄱ-ㅎㅏ-ㅣ-]/g, "")
14
6
  .replace(/[-\s]+/g, "-")
15
7
  .replace(/^-|-$/g, "")
16
8
  .slice(0, 80);
17
9
  }
18
10
 
19
- const STOP_TITLES = new Set([
20
- "introduction", "overview", "summary", "conclusion", "references",
21
- "bibliography", "appendix", "abstract", "preface", "contents",
22
- "table of contents", "index", "acknowledgments", "notes",
23
- ]);
24
-
25
11
  export function cleanTitle(title: string): string {
26
12
  return title
27
13
  .replace(/^\s*(Chapter\s+)?\d+(\.\d+)*\s*/i, "")
28
14
  .replace(/\s+/g, " ")
29
15
  .trim();
30
16
  }
31
-
32
- export function chunkSections(sections: Section[], sourceId: number, store: Store, minWords = 30): number {
33
- let count = 0;
34
-
35
- for (const section of sections) {
36
- const title = cleanTitle(section.title);
37
- if (!title) continue;
38
-
39
- const slug = slugify(title);
40
- if (!slug) continue;
41
-
42
- const htmlContent = section.htmlParts.join("\n");
43
- if (!htmlContent.trim()) continue;
44
-
45
- const content = turndown.turndown(htmlContent).trim();
46
- const wordCount = content.split(/\s+/).length;
47
-
48
- if (wordCount < minWords) continue;
49
- if (STOP_TITLES.has(slug) || STOP_TITLES.has(title.toLowerCase())) {
50
- if (wordCount < 100) continue;
51
- }
52
-
53
- const existing = store.getPage(slug);
54
- if (existing) {
55
- store.updatePageContent(existing.id, existing.content + "\n\n" + content);
56
- } else {
57
- store.addPage(slug, title, content, sourceId, slug);
58
- count++;
59
- }
60
- }
61
-
62
- return count;
63
- }
@@ -1,14 +1,10 @@
1
- import { chatComplete } from "../llm-client";
1
+ import { LLMClient } from "../llm-client";
2
2
  import type { Store } from "../store";
3
3
  import { slugify } from "./chunker";
4
4
  import type { Persona } from "../config";
5
5
 
6
6
  // ── Phase 1: Extract original document structure ──
7
7
 
8
- const STRUCTURE_SYSTEM = `You are a document analyzer. Extract the chapter/section structure from this textbook content, preserving the original order and hierarchy.
9
-
10
- Return valid JSON only. No markdown fences.`;
11
-
12
8
  const STRUCTURE_PROMPT = `Extract the document structure from this text. Preserve the original chapter/section ordering.
13
9
 
14
10
  Source: "{sourceTitle}"
@@ -89,6 +85,25 @@ interface ConceptPage {
89
85
  suggested_links?: Array<{ text: string; url: string }>;
90
86
  }
91
87
 
88
+ async function parallelMap<T, R>(
89
+ items: T[],
90
+ concurrency: number,
91
+ fn: (item: T, index: number) => Promise<R>
92
+ ): Promise<R[]> {
93
+ const results: R[] = new Array(items.length);
94
+ let nextIndex = 0;
95
+
96
+ async function worker() {
97
+ while (nextIndex < items.length) {
98
+ const i = nextIndex++;
99
+ results[i] = await fn(items[i], i);
100
+ }
101
+ }
102
+
103
+ await Promise.all(Array.from({ length: Math.min(concurrency, items.length) }, () => worker()));
104
+ return results;
105
+ }
106
+
92
107
  function splitByChapters(text: string): Array<{ chapterHint: string; text: string }> {
93
108
  const chapterPattern = /\n(?=(?:CHAPTER\s*\d+|Chapter\s+\d+)[A-Z\s])/g;
94
109
  const positions: number[] = [];
@@ -199,8 +214,17 @@ export async function llmChunkDocument(
199
214
  sourceId: number,
200
215
  store: Store,
201
216
  maxChunks: number = 0, // 0 = unlimited
202
- persona: Persona | null = null
217
+ persona: Persona | null = null,
218
+ llmClient?: LLMClient
203
219
  ): Promise<{ sourceCount: number; conceptCount: number }> {
220
+ // Use provided client or fall back to deprecated global chatComplete
221
+ const chat = llmClient
222
+ ? (system: string, user: string, maxTokens?: number) => llmClient.chatComplete(system, user, maxTokens)
223
+ : async (system: string, user: string, maxTokens?: number) => {
224
+ const { chatComplete } = await import("../llm-client");
225
+ return chatComplete(system, user, maxTokens);
226
+ };
227
+
204
228
  let chunks = splitByChapters(rawText);
205
229
  if (maxChunks > 0 && chunks.length > maxChunks) {
206
230
  console.log(`\x1b[33m⚠ ${chunks.length}개 청크 중 ${maxChunks}개만 처리합니다\x1b[0m`);
@@ -209,71 +233,69 @@ export async function llmChunkDocument(
209
233
  if (persona) {
210
234
  console.log(`\x1b[35m🎭 페르소나: ${persona.name}\x1b[0m`);
211
235
  }
212
- console.log(`\x1b[34m🧠 Phase 1: 원본 구조 추출 (${chunks.length}개 청크)...\x1b[0m`);
236
+ console.log(`\x1b[34m Phase 1: 원본 구조 추출 중... (${chunks.length}개 청크)\x1b[0m`);
213
237
 
214
- // ── Phase 1: Extract source pages ──
215
- let orderCounter = 0;
216
- const sourcePageSummaries: string[] = [];
238
+ // ── Phase 1: Extract source pages (parallel LLM calls) ──
239
+ const phase1Start = performance.now();
240
+ let completedCount = 0;
241
+ const structureSystem = getStructureSystem(persona);
217
242
 
218
- for (let i = 0; i < chunks.length; i++) {
219
- const chunk = chunks[i];
220
- console.log(` [${i + 1}/${chunks.length}] ${chunk.chapterHint}`);
243
+ const chunkResults = await parallelMap(chunks, 3, async (chunk, i) => {
244
+ console.log(` Phase 1: 처리 [${i + 1}/${chunks.length}] ${chunk.chapterHint}...`);
221
245
 
222
246
  const prompt = STRUCTURE_PROMPT
223
247
  .replace("{sourceTitle}", sourceTitle)
224
248
  .replace("{text}", chunk.text.slice(0, 80000));
225
249
 
226
- const structureSystem = getStructureSystem(persona);
227
250
  try {
228
- const raw = await chatComplete(structureSystem, prompt, 16384);
251
+ let raw = await chat(structureSystem, prompt, 16384);
229
252
  if (!raw || raw.trim().length < 10) {
230
253
  console.log(` \x1b[33m⚠ 빈 응답, 재시도...\x1b[0m`);
231
- const retry = await chatComplete(structureSystem, prompt, 16384);
232
- if (!retry || retry.trim().length < 10) {
254
+ raw = await chat(structureSystem, prompt, 16384);
255
+ if (!raw || raw.trim().length < 10) {
233
256
  console.log(` \x1b[31m✗ 재시도도 빈 응답\x1b[0m`);
234
- continue;
257
+ completedCount++;
258
+ return [] as StructurePage[];
235
259
  }
236
- const sections = parseJSON<StructurePage[]>(retry).filter(s => s.title && s.content && s.content.length > 30);
237
- // fall through to process sections below
238
- for (const section of sections) {
239
- const slug = slugify(section.title);
240
- if (!slug) continue;
241
- const existing = store.getPage(slug);
242
- if (existing) {
243
- store.updatePageContent(existing.id, existing.content + "\n\n" + section.content);
244
- } else {
245
- store.addPage(slug, section.title, section.content, sourceId, slug, "source", orderCounter++);
246
- sourcePageSummaries.push(`- ${section.title}: ${section.content.slice(0, 150).replace(/\n/g, " ")}`);
247
- }
248
- }
249
- console.log(` → ${sections.length}개 섹션`);
250
- continue;
251
260
  }
252
261
  const sections = parseJSON<StructurePage[]>(raw).filter(s => s.title && s.content && s.content.length > 30);
262
+ completedCount++;
263
+ console.log(` → ${sections.length}개 섹션 (완료 ${completedCount}/${chunks.length})`);
264
+ return sections;
265
+ } catch (e: unknown) {
266
+ const message = e instanceof Error ? e.message : String(e);
267
+ console.log(` \x1b[31m✗ 실패: ${message}\x1b[0m`);
268
+ completedCount++;
269
+ return [] as StructurePage[];
270
+ }
271
+ });
253
272
 
254
- for (const section of sections) {
255
- const slug = slugify(section.title);
256
- if (!slug) continue;
273
+ // Store results sequentially (SQLite writes must be sequential)
274
+ let orderCounter = 0;
275
+ const sourcePageSummaries: string[] = [];
257
276
 
258
- const existing = store.getPage(slug);
259
- if (existing) {
260
- store.updatePageContent(existing.id, existing.content + "\n\n" + section.content);
261
- } else {
262
- store.addPage(slug, section.title, section.content, sourceId, slug, "source", orderCounter++);
263
- sourcePageSummaries.push(`- ${section.title}: ${section.content.slice(0, 150).replace(/\n/g, " ")}`);
264
- }
277
+ for (const sections of chunkResults) {
278
+ for (const section of sections) {
279
+ const slug = slugify(section.title);
280
+ if (!slug) continue;
281
+
282
+ const existing = store.getPage(slug);
283
+ if (existing) {
284
+ store.updatePageContent(existing.id, existing.content + "\n\n" + section.content);
285
+ } else {
286
+ store.addPage(slug, section.title, section.content, sourceId, slug, "source", orderCounter++);
287
+ sourcePageSummaries.push(`- ${section.title}: ${section.content.slice(0, 150).replace(/\n/g, " ")}`);
265
288
  }
266
- console.log(` → ${sections.length}개 섹션`);
267
- } catch (e: any) {
268
- console.log(` \x1b[31m✗ 실패: ${e.message}\x1b[0m`);
269
289
  }
270
290
  }
271
291
 
272
292
  const sourceCount = orderCounter;
273
- console.log(`\x1b[32m 📖 ${sourceCount}개 원본 페이지 생성 완료\x1b[0m`);
293
+ const phase1Sec = ((performance.now() - phase1Start) / 1000).toFixed(1);
294
+ console.log(`\x1b[32m✅ Phase 1 완료 (${phase1Sec}초) — 📖 ${sourceCount}개 원본 페이지 생성\x1b[0m`);
274
295
 
275
296
  // ── Phase 2: Extract concept pages ──
276
- console.log(`\x1b[34m🧠 Phase 2: 개념 페이지 추출...\x1b[0m`);
297
+ const phase2Start = performance.now();
298
+ console.log(`\x1b[34m⏳ Phase 2: 개념 추출 중...\x1b[0m`);
277
299
 
278
300
  // Process source pages in small batches for concept extraction
279
301
  const batchSize = 5;
@@ -292,7 +314,7 @@ export async function llmChunkDocument(
292
314
  const conceptSystem = getConceptSystem(persona);
293
315
 
294
316
  try {
295
- const raw = await chatComplete(conceptSystem, prompt, 16384);
317
+ const raw = await chat(conceptSystem, prompt, 16384);
296
318
  const concepts = parseJSON<ConceptPage[]>(raw).filter(c => c.title && c.content && c.content.length > 50);
297
319
 
298
320
  for (const concept of concepts) {
@@ -315,12 +337,70 @@ export async function llmChunkDocument(
315
337
  conceptCount++;
316
338
  }
317
339
  console.log(` → ${concepts.length}개 개념`);
318
- } catch (e: any) {
319
- console.log(` \x1b[31m✗ 실패: ${e.message}\x1b[0m`);
340
+ } catch (e: unknown) {
341
+ const message = e instanceof Error ? e.message : String(e);
342
+ console.log(` \x1b[31m✗ 실패: ${message}\x1b[0m`);
320
343
  }
321
344
  }
322
345
 
323
- console.log(`\x1b[32m 📝 ${conceptCount}개 개념 페이지 생성 완료\x1b[0m`);
346
+ const phase2Sec = ((performance.now() - phase2Start) / 1000).toFixed(1);
347
+ console.log(`\x1b[32m✅ Phase 2 완료 (${phase2Sec}초) — 📝 ${conceptCount}개 개념 페이지 생성\x1b[0m`);
348
+
349
+ // ── Phase 2.5: Generate quizzes from concept pages ──
350
+ let quizCount = 0;
351
+ try {
352
+ const conceptPagesForQuiz = store.listConceptPages();
353
+ if (conceptPagesForQuiz.length > 0) {
354
+ console.log(`\x1b[34m⏳ Phase 2.5: 퀴즈 생성 중... (${conceptPagesForQuiz.length}개 개념 페이지)\x1b[0m`);
355
+
356
+ const quizSystem = `You are a quiz generator for a study wiki. Generate quiz questions that test UNDERSTANDING, not just memorization.
357
+ Focus on higher-order thinking: "왜?", "어떻게?", "비교하라", "설명하라" style questions.
358
+ Return valid JSON only. No markdown fences.`;
359
+
360
+ await parallelMap(conceptPagesForQuiz, 3, async (page, i) => {
361
+ try {
362
+ const quizPrompt = `Based on this wiki content, generate 2-3 quiz questions that test UNDERSTANDING, not just memorization.
363
+ Include questions that ask "왜?", "어떻게?", "비교하라" etc.
364
+ Types: "fill_blank" (빈칸 채우기), "ox" (OX 퀴즈 - true/false), "short_answer" (단답형)
365
+
366
+ Content title: ${page.title}
367
+ Content:
368
+ ${page.content.slice(0, 3000)}
369
+
370
+ Respond with a JSON array only:
371
+ [{"question": "___은 양자역학에서 위치와 운동량을 동시에 측정할 수 없다는 원리이다.", "answer": "불확정성 원리", "explanation": "이 원리는 양자역학의 근본적 한계를 보여주며, 측정 행위 자체가 시스템에 영향을 주기 때문입니다.", "type": "fill_blank"}]
372
+
373
+ Rules:
374
+ - For fill_blank: use ___ to mark the blank in the question
375
+ - For ox: question should be a statement, answer should be "O" or "X"
376
+ - For short_answer: question should be answerable in 1-3 words
377
+ - Include "explanation" field: a brief 1-2 sentence explanation of WHY the answer is correct
378
+ - Questions should test understanding, application, or analysis — not just recall
379
+ - Write questions in Korean when the content is in Korean`;
380
+
381
+ const raw = await chat(quizSystem, quizPrompt, 2048);
382
+ const quizzes = parseJSON<Array<{ question: string; answer: string; explanation?: string; type: string }>>(raw);
383
+
384
+ for (const q of quizzes) {
385
+ if (q.question && q.answer && q.type) {
386
+ store.addQuiz(page.id, q.question, q.answer, q.type, q.explanation || "");
387
+ quizCount++;
388
+ }
389
+ }
390
+ } catch (e: unknown) {
391
+ // Quiz generation is non-critical; log a warning and skip this page on failure
392
+ const message = e instanceof Error ? e.message : String(e);
393
+ console.log(` \x1b[33m⚠ 퀴즈 생성 실패 (${page.title}): ${message}\x1b[0m`);
394
+ }
395
+ });
396
+
397
+ console.log(`\x1b[32m 🧩 ${quizCount}개 퀴즈 생성 완료\x1b[0m`);
398
+ }
399
+ } catch (e: unknown) {
400
+ // Phase 2.5 is optional — don't block the pipeline
401
+ const message = e instanceof Error ? e.message : String(e);
402
+ console.log(`\x1b[33m ⚠ 퀴즈 생성 단계 건너뜀: ${message}\x1b[0m`);
403
+ }
324
404
 
325
405
  // ── Phase 3: Resolve wiki links + inject concept links into source pages ──
326
406
  console.log(`\x1b[34m🔗 위키 링크 해석 중...\x1b[0m`);
@@ -347,31 +427,36 @@ export async function llmChunkDocument(
347
427
  const srcPages = allPages.filter(p => p.page_type === "source");
348
428
 
349
429
  // Build search terms: the full concept title, plus pairs of consecutive significant words from the title
350
- const searchTerms: Array<{ term: string; concept: typeof conceptPages[0] }> = [];
430
+ const searchTerms: Array<{ term: string; concept: typeof conceptPages[0]; regex: RegExp | null }> = [];
351
431
  for (const concept of conceptPages) {
352
- searchTerms.push({ term: concept.title, concept });
432
+ searchTerms.push({ term: concept.title, concept, regex: null });
353
433
  // Also collect the significant words (4+ chars, excluding common stop words) of multi-word titles
354
434
  const words = concept.title.split(/\s+/).filter(w => w.length >= 4 && !/^(and|the|for|with|from|into)$/i.test(w));
355
435
  if (words.length >= 2) {
356
436
  // Try pairs of consecutive words
357
437
  for (let i = 0; i < words.length - 1; i++) {
358
- searchTerms.push({ term: `${words[i]} ${words[i + 1]}`, concept });
438
+ searchTerms.push({ term: `${words[i]} ${words[i + 1]}`, concept, regex: null });
359
439
  }
360
440
  }
361
441
  }
362
442
  // Sort by term length descending for longest match first
363
443
  searchTerms.sort((a, b) => b.term.length - a.term.length);
364
444
 
445
+ // Pre-compile RegExp objects outside the page loop
446
+ for (const entry of searchTerms) {
447
+ if (entry.term.length < 3) continue;
448
+ const escaped = entry.term.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
449
+ entry.regex = new RegExp(`(?<!\\[)(?<![\\w/])(${escaped})(?![\\w])(?!\\])(?![^[]*\\])`, "i");
450
+ }
451
+
365
452
  for (const srcPage of srcPages) {
366
453
  let content = srcPage.content;
367
454
  let modified = false;
368
455
  const linkedConcepts = new Set<number>();
369
456
 
370
- for (const { term, concept } of searchTerms) {
457
+ for (const { term, concept, regex } of searchTerms) {
371
458
  if (linkedConcepts.has(concept.id)) continue; // One link per concept per page
372
- if (term.length < 3) continue;
373
- const escaped = term.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
374
- const regex = new RegExp(`(?<!\\[)(?<![\\w/])(${escaped})(?![\\w])(?!\\])(?![^[]*\\])`, "i");
459
+ if (term.length < 3 || !regex) continue;
375
460
  const match = regex.exec(content);
376
461
  if (match) {
377
462
  const replacement = `[${match[1]}](/wiki/${concept.slug})`;