@open330/kiwimu 0.4.1 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +98 -49
- package/bin/kiwimu +1 -1
- package/package.json +4 -1
- package/personas/namuwiki.json +6 -0
- package/src/build/renderer.ts +50 -2
- package/src/build/static/search.js +33 -2
- package/src/build/static/style.css +84 -1
- package/src/build/templates.ts +353 -167
- package/src/config.ts +35 -29
- package/src/demo/sample-data.ts +70 -0
- package/src/demo/setup.ts +31 -0
- package/src/expand/llm.ts +1 -1
- package/src/index.ts +234 -458
- package/src/ingest/docx.ts +0 -8
- package/src/ingest/legacy.ts +4 -4
- package/src/ingest/pdf.ts +1 -1
- package/src/ingest/pptx.ts +0 -1
- package/src/ingest/web.test.ts +41 -0
- package/src/ingest/web.ts +61 -62
- package/src/llm-client.ts +203 -126
- package/src/pipeline/chunker.test.ts +42 -0
- package/src/pipeline/chunker.ts +1 -48
- package/src/pipeline/llm-chunker.ts +144 -59
- package/src/server.ts +327 -0
- package/src/services/ingest.ts +100 -0
- package/src/store.test.ts +132 -0
- package/src/store.ts +206 -2
- package/src/pipeline/llm-linker.ts +0 -84
package/src/pipeline/chunker.ts
CHANGED
@@ -1,63 +1,16 @@
-import TurndownService from "turndown";
-import type { Section } from "../ingest/web";
-import type { Store } from "../store";
-
-const turndown = new TurndownService({ headingStyle: "atx" });
-turndown.remove(["script", "style"]);
-
 export function slugify(text: string): string {
   return text
-    .normalize("NFKD")
     .toLowerCase()
     .trim()
-    .replace(/[^\w\s
+    .replace(/[^\w\s가-힣ㄱ-ㅎㅏ-ㅣ-]/g, "")
     .replace(/[-\s]+/g, "-")
     .replace(/^-|-$/g, "")
     .slice(0, 80);
 }
 
-const STOP_TITLES = new Set([
-  "introduction", "overview", "summary", "conclusion", "references",
-  "bibliography", "appendix", "abstract", "preface", "contents",
-  "table of contents", "index", "acknowledgments", "notes",
-]);
-
 export function cleanTitle(title: string): string {
   return title
     .replace(/^\s*(Chapter\s+)?\d+(\.\d+)*\s*/i, "")
     .replace(/\s+/g, " ")
     .trim();
 }
-
-export function chunkSections(sections: Section[], sourceId: number, store: Store, minWords = 30): number {
-  let count = 0;
-
-  for (const section of sections) {
-    const title = cleanTitle(section.title);
-    if (!title) continue;
-
-    const slug = slugify(title);
-    if (!slug) continue;
-
-    const htmlContent = section.htmlParts.join("\n");
-    if (!htmlContent.trim()) continue;
-
-    const content = turndown.turndown(htmlContent).trim();
-    const wordCount = content.split(/\s+/).length;
-
-    if (wordCount < minWords) continue;
-    if (STOP_TITLES.has(slug) || STOP_TITLES.has(title.toLowerCase())) {
-      if (wordCount < 100) continue;
-    }
-
-    const existing = store.getPage(slug);
-    if (existing) {
-      store.updatePageContent(existing.id, existing.content + "\n\n" + content);
-    } else {
-      store.addPage(slug, title, content, sourceId, slug);
-      count++;
-    }
-  }
-
-  return count;
-}
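The widened character class keeps Hangul syllables and jamo in slugs instead of stripping them. A rough sketch of the behavior the regexes above imply (outputs inferred from the code shown, not taken from the package's tests):

```ts
import { slugify, cleanTitle } from "./chunker";

// Hangul now survives slugification; punctuation is still dropped and spaces become hyphens.
slugify("3.2 양자역학 개요!");            // "32-양자역학-개요"
slugify("Quantum Mechanics: Overview");   // "quantum-mechanics-overview"

// cleanTitle still strips leading chapter numbering before slugify is applied.
cleanTitle("Chapter 3 양자역학 개요");     // "양자역학 개요"
```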
package/src/pipeline/llm-chunker.ts
CHANGED

@@ -1,14 +1,10 @@
-import {
+import { LLMClient } from "../llm-client";
 import type { Store } from "../store";
 import { slugify } from "./chunker";
 import type { Persona } from "../config";
 
 // ── Phase 1: Extract original document structure ──
 
-const STRUCTURE_SYSTEM = `You are a document analyzer. Extract the chapter/section structure from this textbook content, preserving the original order and hierarchy.
-
-Return valid JSON only. No markdown fences.`;
-
 const STRUCTURE_PROMPT = `Extract the document structure from this text. Preserve the original chapter/section ordering.
 
 Source: "{sourceTitle}"
@@ -89,6 +85,25 @@ interface ConceptPage {
   suggested_links?: Array<{ text: string; url: string }>;
 }
 
+async function parallelMap<T, R>(
+  items: T[],
+  concurrency: number,
+  fn: (item: T, index: number) => Promise<R>
+): Promise<R[]> {
+  const results: R[] = new Array(items.length);
+  let nextIndex = 0;
+
+  async function worker() {
+    while (nextIndex < items.length) {
+      const i = nextIndex++;
+      results[i] = await fn(items[i], i);
+    }
+  }
+
+  await Promise.all(Array.from({ length: Math.min(concurrency, items.length) }, () => worker()));
+  return results;
+}
+
 function splitByChapters(text: string): Array<{ chapterHint: string; text: string }> {
   const chapterPattern = /\n(?=(?:CHAPTER\s*\d+|Chapter\s+\d+)[A-Z\s])/g;
   const positions: number[] = [];
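The new `parallelMap` helper is a plain work-queue: up to `concurrency` workers keep pulling the next index until the list is exhausted, and `results[i]` always corresponds to `items[i]`. A usage sketch (the URLs are placeholders, not part of the package):

```ts
// Up to 3 requests in flight at a time; output order matches input order.
const urls = ["https://example.com/a", "https://example.com/b", "https://example.com/c"];
const bodies = await parallelMap(urls, 3, async (url) => {
  const res = await fetch(url);
  return res.text();
});
```

Note that one rejected callback rejects the whole `Promise.all`, which is presumably why the quiz phase further down wraps its callback body in its own try/catch.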
@@ -199,8 +214,17 @@ export async function llmChunkDocument(
   sourceId: number,
   store: Store,
   maxChunks: number = 0, // 0 = unlimited
-  persona: Persona | null = null
+  persona: Persona | null = null,
+  llmClient?: LLMClient
 ): Promise<{ sourceCount: number; conceptCount: number }> {
+  // Use provided client or fall back to deprecated global chatComplete
+  const chat = llmClient
+    ? (system: string, user: string, maxTokens?: number) => llmClient.chatComplete(system, user, maxTokens)
+    : async (system: string, user: string, maxTokens?: number) => {
+        const { chatComplete } = await import("../llm-client");
+        return chatComplete(system, user, maxTokens);
+      };
+
   let chunks = splitByChapters(rawText);
   if (maxChunks > 0 && chunks.length > maxChunks) {
     console.log(`\x1b[33m⚠ ${chunks.length}개 청크 중 ${maxChunks}개만 처리합니다\x1b[0m`);
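With the new optional `llmClient` parameter, callers can inject an `LLMClient` instead of relying on the module-level `chatComplete`. A hypothetical call site; the leading `rawText`/`sourceTitle` parameters are inferred from how they are used later in this function, and the `LLMClient` constructor arguments are not shown in this diff:

```ts
// Hypothetical call site: argument names are illustrative, not copied from the package.
const client = new LLMClient(/* provider/model config not shown in this diff */);
const { sourceCount, conceptCount } = await llmChunkDocument(
  rawText,     // full document text, split by splitByChapters() below
  sourceTitle, // fills {sourceTitle} in STRUCTURE_PROMPT
  sourceId,
  store,
  0,           // maxChunks: 0 = unlimited
  null,        // persona
  client,      // omit to fall back to the deprecated global chatComplete
);
```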
@@ -209,71 +233,69 @@ export async function llmChunkDocument(
   if (persona) {
     console.log(`\x1b[35m🎭 페르소나: ${persona.name}\x1b[0m`);
   }
-  console.log(`\x1b[34m
+  console.log(`\x1b[34m⏳ Phase 1: 원본 구조 추출 중... (${chunks.length}개 청크)\x1b[0m`);
 
-  // ── Phase 1: Extract source pages ──
-
-
+  // ── Phase 1: Extract source pages (parallel LLM calls) ──
+  const phase1Start = performance.now();
+  let completedCount = 0;
+  const structureSystem = getStructureSystem(persona);
 
-
-
-    console.log(`  [${i + 1}/${chunks.length}] ${chunk.chapterHint}`);
+  const chunkResults = await parallelMap(chunks, 3, async (chunk, i) => {
+    console.log(`  Phase 1: 처리 중 [${i + 1}/${chunks.length}] ${chunk.chapterHint}...`);
 
     const prompt = STRUCTURE_PROMPT
       .replace("{sourceTitle}", sourceTitle)
       .replace("{text}", chunk.text.slice(0, 80000));
 
-    const structureSystem = getStructureSystem(persona);
     try {
-
+      let raw = await chat(structureSystem, prompt, 16384);
       if (!raw || raw.trim().length < 10) {
         console.log(`  \x1b[33m⚠ 빈 응답, 재시도...\x1b[0m`);
-
-        if (!
+        raw = await chat(structureSystem, prompt, 16384);
+        if (!raw || raw.trim().length < 10) {
           console.log(`  \x1b[31m✗ 재시도도 빈 응답\x1b[0m`);
-
+          completedCount++;
+          return [] as StructurePage[];
         }
-        const sections = parseJSON<StructurePage[]>(retry).filter(s => s.title && s.content && s.content.length > 30);
-        // fall through to process sections below
-        for (const section of sections) {
-          const slug = slugify(section.title);
-          if (!slug) continue;
-          const existing = store.getPage(slug);
-          if (existing) {
-            store.updatePageContent(existing.id, existing.content + "\n\n" + section.content);
-          } else {
-            store.addPage(slug, section.title, section.content, sourceId, slug, "source", orderCounter++);
-            sourcePageSummaries.push(`- ${section.title}: ${section.content.slice(0, 150).replace(/\n/g, " ")}`);
-          }
-        }
-        console.log(`  → ${sections.length}개 섹션`);
-        continue;
       }
       const sections = parseJSON<StructurePage[]>(raw).filter(s => s.title && s.content && s.content.length > 30);
+      completedCount++;
+      console.log(`  → ${sections.length}개 섹션 (완료 ${completedCount}/${chunks.length})`);
+      return sections;
+    } catch (e: unknown) {
+      const message = e instanceof Error ? e.message : String(e);
+      console.log(`  \x1b[31m✗ 실패: ${message}\x1b[0m`);
+      completedCount++;
+      return [] as StructurePage[];
+    }
+  });
 
-
-
-
+  // Store results sequentially (SQLite writes must be sequential)
+  let orderCounter = 0;
+  const sourcePageSummaries: string[] = [];
 
-
-
-
-
-
-
-
+  for (const sections of chunkResults) {
+    for (const section of sections) {
+      const slug = slugify(section.title);
+      if (!slug) continue;
+
+      const existing = store.getPage(slug);
+      if (existing) {
+        store.updatePageContent(existing.id, existing.content + "\n\n" + section.content);
+      } else {
+        store.addPage(slug, section.title, section.content, sourceId, slug, "source", orderCounter++);
+        sourcePageSummaries.push(`- ${section.title}: ${section.content.slice(0, 150).replace(/\n/g, " ")}`);
       }
-      console.log(`  → ${sections.length}개 섹션`);
-    } catch (e: any) {
-      console.log(`  \x1b[31m✗ 실패: ${e.message}\x1b[0m`);
     }
   }
 
   const sourceCount = orderCounter;
-
+  const phase1Sec = ((performance.now() - phase1Start) / 1000).toFixed(1);
+  console.log(`\x1b[32m✅ Phase 1 완료 (${phase1Sec}초) — 📖 ${sourceCount}개 원본 페이지 생성\x1b[0m`);
 
   // ── Phase 2: Extract concept pages ──
-
+  const phase2Start = performance.now();
+  console.log(`\x1b[34m⏳ Phase 2: 개념 추출 중...\x1b[0m`);
 
   // Process source pages in small batches for concept extraction
   const batchSize = 5;
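The `.filter()` on the Phase 1 result implies each structure entry needs at least a `title` and a `content` longer than 30 characters. A hypothetical payload that would pass that filter (the full `StructurePage` interface and the exact schema requested by `STRUCTURE_PROMPT` are not visible in these hunks):

```ts
// Hypothetical Phase 1 response: only the fields the filter checks (title, content) are shown.
const parsed = parseJSON<StructurePage[]>(`[
  { "title": "1.1 파동함수", "content": "파동함수는 계의 양자 상태를 기술하는 복소함수이며, 확률 해석의 대상이 된다." },
  { "title": "1.2 측정 문제", "content": "측정이 중첩 상태를 하나의 고유 상태로 붕괴시킨다는 해석은 여전히 논쟁적이다." }
]`);
const usable = parsed.filter(s => s.title && s.content && s.content.length > 30);
```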
@@ -292,7 +314,7 @@ export async function llmChunkDocument(
     const conceptSystem = getConceptSystem(persona);
 
     try {
-      const raw = await
+      const raw = await chat(conceptSystem, prompt, 16384);
       const concepts = parseJSON<ConceptPage[]>(raw).filter(c => c.title && c.content && c.content.length > 50);
 
       for (const concept of concepts) {
@@ -315,12 +337,70 @@ export async function llmChunkDocument(
         conceptCount++;
       }
       console.log(`  → ${concepts.length}개 개념`);
-    } catch (e:
-
+    } catch (e: unknown) {
+      const message = e instanceof Error ? e.message : String(e);
+      console.log(`  \x1b[31m✗ 실패: ${message}\x1b[0m`);
     }
   }
 
-
+  const phase2Sec = ((performance.now() - phase2Start) / 1000).toFixed(1);
+  console.log(`\x1b[32m✅ Phase 2 완료 (${phase2Sec}초) — 📝 ${conceptCount}개 개념 페이지 생성\x1b[0m`);
+
+  // ── Phase 2.5: Generate quizzes from concept pages ──
+  let quizCount = 0;
+  try {
+    const conceptPagesForQuiz = store.listConceptPages();
+    if (conceptPagesForQuiz.length > 0) {
+      console.log(`\x1b[34m⏳ Phase 2.5: 퀴즈 생성 중... (${conceptPagesForQuiz.length}개 개념 페이지)\x1b[0m`);
+
+      const quizSystem = `You are a quiz generator for a study wiki. Generate quiz questions that test UNDERSTANDING, not just memorization.
+Focus on higher-order thinking: "왜?", "어떻게?", "비교하라", "설명하라" style questions.
+Return valid JSON only. No markdown fences.`;
+
+      await parallelMap(conceptPagesForQuiz, 3, async (page, i) => {
+        try {
+          const quizPrompt = `Based on this wiki content, generate 2-3 quiz questions that test UNDERSTANDING, not just memorization.
+Include questions that ask "왜?", "어떻게?", "비교하라" etc.
+Types: "fill_blank" (빈칸 채우기), "ox" (OX 퀴즈 - true/false), "short_answer" (단답형)
+
+Content title: ${page.title}
+Content:
+${page.content.slice(0, 3000)}
+
+Respond with a JSON array only:
+[{"question": "___은 양자역학에서 위치와 운동량을 동시에 측정할 수 없다는 원리이다.", "answer": "불확정성 원리", "explanation": "이 원리는 양자역학의 근본적 한계를 보여주며, 측정 행위 자체가 시스템에 영향을 주기 때문입니다.", "type": "fill_blank"}]
+
+Rules:
+- For fill_blank: use ___ to mark the blank in the question
+- For ox: question should be a statement, answer should be "O" or "X"
+- For short_answer: question should be answerable in 1-3 words
+- Include "explanation" field: a brief 1-2 sentence explanation of WHY the answer is correct
+- Questions should test understanding, application, or analysis — not just recall
+- Write questions in Korean when the content is in Korean`;
+
+          const raw = await chat(quizSystem, quizPrompt, 2048);
+          const quizzes = parseJSON<Array<{ question: string; answer: string; explanation?: string; type: string }>>(raw);
+
+          for (const q of quizzes) {
+            if (q.question && q.answer && q.type) {
+              store.addQuiz(page.id, q.question, q.answer, q.type, q.explanation || "");
+              quizCount++;
+            }
+          }
+        } catch (e: unknown) {
+          // Quiz generation is non-critical; silently skip failures
+          const message = e instanceof Error ? e.message : String(e);
+          console.log(`  \x1b[33m⚠ 퀴즈 생성 실패 (${page.title}): ${message}\x1b[0m`);
+        }
+      });
+
+      console.log(`\x1b[32m  🧩 ${quizCount}개 퀴즈 생성 완료\x1b[0m`);
+    }
+  } catch (e: unknown) {
+    // Phase 2.5 is optional — don't block the pipeline
+    const message = e instanceof Error ? e.message : String(e);
+    console.log(`\x1b[33m  ⚠ 퀴즈 생성 단계 건너뜀: ${message}\x1b[0m`);
+  }
 
   // ── Phase 3: Resolve wiki links + inject concept links into source pages ──
   console.log(`\x1b[34m🔗 위키 링크 해석 중...\x1b[0m`);
@@ -347,31 +427,36 @@
   const srcPages = allPages.filter(p => p.page_type === "source");
 
   // Build search terms: full title + key words from title (2+ words long)
-  const searchTerms: Array<{ term: string; concept: typeof conceptPages[0] }> = [];
+  const searchTerms: Array<{ term: string; concept: typeof conceptPages[0]; regex: RegExp | null }> = [];
   for (const concept of conceptPages) {
-    searchTerms.push({ term: concept.title, concept });
+    searchTerms.push({ term: concept.title, concept, regex: null });
     // Also try individual significant words from multi-word titles
     const words = concept.title.split(/\s+/).filter(w => w.length >= 4 && !/^(and|the|for|with|from|into)$/i.test(w));
     if (words.length >= 2) {
       // Try pairs of consecutive words
      for (let i = 0; i < words.length - 1; i++) {
-        searchTerms.push({ term: `${words[i]} ${words[i + 1]}`, concept });
+        searchTerms.push({ term: `${words[i]} ${words[i + 1]}`, concept, regex: null });
      }
    }
   }
   // Sort by term length descending for longest match first
   searchTerms.sort((a, b) => b.term.length - a.term.length);
 
+  // Pre-compile RegExp objects outside the page loop
+  for (const entry of searchTerms) {
+    if (entry.term.length < 3) continue;
+    const escaped = entry.term.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
+    entry.regex = new RegExp(`(?<!\\[)(?<![\\w/])(${escaped})(?![\\w])(?!\\])(?![^[]*\\])`, "i");
+  }
+
   for (const srcPage of srcPages) {
     let content = srcPage.content;
     let modified = false;
     const linkedConcepts = new Set<number>();
 
-    for (const { term, concept } of searchTerms) {
+    for (const { term, concept, regex } of searchTerms) {
       if (linkedConcepts.has(concept.id)) continue; // One link per concept per page
-      if (term.length < 3) continue;
-      const escaped = term.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
-      const regex = new RegExp(`(?<!\\[)(?<![\\w/])(${escaped})(?![\\w])(?!\\])(?![^[]*\\])`, "i");
+      if (term.length < 3 || !regex) continue;
       const match = regex.exec(content);
       if (match) {
         const replacement = `[${match[1]}](/wiki/${concept.slug})`;