@open330/kiwimu 0.8.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,190 @@
1
+ import { Store } from "../store";
2
+ import { LLMClient } from "../llm-client";
3
+ import { loadConfig, getActivePersona, type Persona, type LLMConfig } from "../config";
4
+ import { slugify } from "../pipeline/chunker";
5
+ import { stripJsonFences } from "../utils";
6
+
7
/**
 * Outcome of answering a follow-up question with a newly stored wiki page.
 * Returned by `generateDynamicPage`.
 */
export interface DynamicQAResult {
  /** Identifier returned by the store when the new page was saved. */
  pageId: number;
  /** Slug under which the page was stored (after collision handling). */
  slug: string;
  /** Page title taken from the model response (or derived from the question). */
  title: string;
  /** Markdown body of the generated page. */
  content: string;
  /** Whether the answer is substantial enough to become a standalone wiki page. */
  isPromotable: boolean;
  /** Title proposed for promotion; the generator currently fills it with `title`. */
  suggestedTitle: string;
  /** Key concept terms reported by the model; empty when none were provided. */
  keyConcepts: string[];
}
16
+
17
/** Fields extracted from the model's JSON reply for a dynamic page. */
interface DynamicPagePayload {
  title: string;
  content: string;
  isPromotable?: boolean;
  keyConcepts?: string[];
}

/**
 * Decode the JSON string escapes `\n`, `\t`, `\"` and `\\` in one left-to-right pass.
 *
 * A single pass is required: chained `replace` calls would treat the second
 * backslash of an escaped backslash as the start of a new escape, so an
 * escaped `\\nabla` would turn into a backslash, a line break and "abla".
 */
function unescapeJsonEscapes(text: string): string {
  return text.replace(/\\([nt"\\])/g, (_match: string, ch: string) => {
    if (ch === "n") return "\n";
    if (ch === "t") return "\t";
    return ch; // '"' or '\'
  });
}

/**
 * Remove leftover JSON escapes from page content without damaging LaTeX.
 *
 * Text that never went through `JSON.parse` is still escaped and is always decoded.
 * Text that was already decoded is only touched when it looks double-encoded
 * (no real line break, but a literal backslash-n); otherwise sequences such as
 * `\nabla`, `\neq` or `\theta` are legitimate LaTeX and must be kept verbatim.
 */
function normalizeContentEscapes(content: string, decodedByJsonParse: boolean): string {
  if (!decodedByJsonParse) return unescapeJsonEscapes(content);
  const looksDoubleEncoded = !content.includes("\n") && content.includes("\\n");
  return looksDoubleEncoded ? unescapeJsonEscapes(content) : content;
}

/**
 * Turn the raw model reply into a page payload.
 *
 * Tries, in order: strict JSON, a best-effort repair of truncated JSON, and
 * finally treating the whole reply as markdown with `userQuestion` as the title.
 * Field types are validated here; emptiness is checked by the caller.
 */
function parseDynamicPageResponse(raw: string, userQuestion: string): DynamicPagePayload {
  try {
    // Remove markdown code fences if present, then grab the outermost {...} span.
    const cleaned = stripJsonFences(raw);
    const jsonMatch = cleaned.match(/\{[\s\S]*\}/);
    if (!jsonMatch) throw new Error("No JSON found");

    let jsonStr = jsonMatch[0];
    let candidate: unknown;
    let repaired = false;
    try {
      candidate = JSON.parse(jsonStr);
    } catch {
      repaired = true;
      // Drop the trailing, incomplete field: cut right after the closing quote of the
      // last complete string value. The comma is NOT kept, because `",}` is invalid JSON.
      const lastQuote = jsonStr.lastIndexOf('"');
      const lastCompleteField = jsonStr.lastIndexOf('",');
      if (jsonStr.length > lastQuote + 1 && lastCompleteField !== -1) {
        jsonStr = jsonStr.slice(0, lastCompleteField + 1) + "}";
      }
      // Balance braces
      const openBraces = (jsonStr.match(/\{/g) || []).length;
      const closeBraces = (jsonStr.match(/\}/g) || []).length;
      for (let i = 0; i < openBraces - closeBraces; i++) jsonStr += "}";
      candidate = JSON.parse(jsonStr);
    }

    if (typeof candidate !== "object" || candidate === null) {
      throw new Error("JSON payload is not an object");
    }
    const fields = candidate as Record<string, unknown>;

    const rawContent = fields.content;
    if (typeof rawContent !== "string" && (rawContent || repaired)) {
      // A non-string body, or a repair that cut the body away: the markdown fallback
      // below recovers more of the answer than this object does.
      throw new Error("JSON payload has no usable content");
    }
    let content = typeof rawContent === "string" ? rawContent : "";

    // Ensure content is not JSON-encoded (sometimes the LLM double-encodes)
    if (content.startsWith("{")) {
      try {
        const inner: unknown = JSON.parse(content);
        if (inner && typeof inner === "object") {
          const innerContent = (inner as Record<string, unknown>).content;
          if (typeof innerContent === "string" && innerContent) content = innerContent;
        }
      } catch {
        /* not double-encoded, use as-is */
      }
    }

    return {
      title: typeof fields.title === "string" ? fields.title : "",
      content: normalizeContentEscapes(content, true),
      isPromotable: typeof fields.isPromotable === "boolean" ? fields.isPromotable : undefined,
      keyConcepts: Array.isArray(fields.keyConcepts)
        ? fields.keyConcepts.filter((c): c is string => typeof c === "string")
        : undefined,
    };
  } catch {
    // Fallback: treat the entire raw response as markdown content,
    // stripping JSON scaffolding from both ends.
    let fallbackContent = stripJsonFences(raw)
      .replace(/^\s*\{\s*"title"\s*:\s*"[^"]*"\s*,\s*"content"\s*:\s*"?/m, "")
      .replace(/"\s*\}\s*$/m, "")
      .trim();

    // If it still looks like JSON, try one more parse
    let decoded = false;
    if (fallbackContent.startsWith("{")) {
      try {
        const lastTry: unknown = JSON.parse(fallbackContent);
        const inner =
          lastTry && typeof lastTry === "object"
            ? (lastTry as Record<string, unknown>).content
            : undefined;
        if (typeof inner === "string" && inner) {
          fallbackContent = inner;
          decoded = true;
        }
      } catch {
        /* use as-is */
      }
    }

    return {
      title: userQuestion.slice(0, 50),
      content: normalizeContentEscapes(fallbackContent || raw, decoded),
    };
  }
}

/**
 * Answer a reader's follow-up question by generating and storing a new wiki page.
 *
 * Builds a prompt from the selected text, the parent page, linked pages and the
 * list of concept titles, asks the LLM for a JSON page, stores it under a
 * collision-free slug, links it from the parent page and records usage/activity.
 *
 * @param store        Wiki store used to read context and persist the new page.
 * @param llmClient    Client used for the completion and for usage statistics.
 * @param persona      Optional persona whose style text is appended to the system prompt.
 * @param parentPage   Page the reader was viewing when asking.
 * @param selectedText Text the reader highlighted (first 2000 characters are sent).
 * @param userQuestion The follow-up question.
 * @returns The stored page together with promotion hints.
 * @throws Error when the reply has no title or fewer than 20 characters of content.
 */
export async function generateDynamicPage(
  store: Store,
  llmClient: LLMClient,
  persona: Persona | null,
  parentPage: { id: number; slug: string; title: string; content: string },
  selectedText: string,
  userQuestion: string
): Promise<DynamicQAResult> {
  // 1. Build context hierarchy
  //    L1: selectedText (truncated to 2000 chars)
  //    L2: parentPage.content
  //    L3: related pages (backlinks + forward links, title + first 300 chars)
  //    L4: all concept titles
  const backlinks = store.getBacklinks(parentPage.id);
  const allLinks = store.getForwardLinks(parentPage.id);

  // Only the first ten related pages are used, so cut before formatting.
  const relatedSummaries = [...backlinks, ...allLinks]
    .slice(0, 10)
    .map(p => `- ${p.title}: ${(p as any).content?.slice(0, 300) || ''}`)
    .join('\n');

  const conceptTitles = store.listConceptPages()
    .slice(0, 50)
    .map(p => p.title)
    .join(', ');

  // 2. Build prompts
  const personaStyle = persona ? `\n\nStyle: ${persona.content_style || persona.system_prompt || ''}` : '';

  const systemPrompt = `You are a study wiki editor. A student is reading a wiki page and has selected some text. They have a follow-up question about it. Your job is to create a new, self-contained concept page that answers their question thoroughly.

Rules:
- Create a focused wiki page (2-4 paragraphs) that answers the question
- Use [[wiki links]] to reference existing concepts where relevant
- Include examples, formulas (LaTeX $..$ / $$...$$), and definitions as appropriate
- The page should be educational and self-contained
- Return valid JSON only${personaStyle}`;

  const userPrompt = `## Student's Question
${userQuestion}

## Selected Text
${selectedText.slice(0, 2000)}

## Current Page: ${parentPage.title}
${parentPage.content.slice(0, 5000)}

## Related Pages
${relatedSummaries || '(none)'}

## All Wiki Concepts
${conceptTitles || '(none)'}

Return a JSON object:
{"title": "Short concept title", "content": "Full markdown content with [[wiki links]]", "isPromotable": true, "keyConcepts": ["concept1", "concept2"]}

- "isPromotable": true if the answer has enough educational substance (2+ paragraphs, definitions, examples) to be a standalone wiki page, false if it's just a brief clarification
- "keyConcepts": array of 1-5 key concept terms mentioned in the answer`;

  // 3. Call LLM
  const raw = await llmClient.chatComplete(systemPrompt, userPrompt, 4096);

  // 4. Parse response (strict JSON -> repaired JSON -> markdown fallback)
  const parsed = parseDynamicPageResponse(raw, userQuestion);

  if (!parsed.title || !parsed.content || parsed.content.length < 20) {
    throw new Error("LLM 응답이 불충분합니다. 다시 시도해주세요.");
  }

  // 5. Generate slug, handle collision
  let slug = slugify(parsed.title);
  if (!slug) slug = slugify(userQuestion);
  if (!slug) slug = `dynamic-${Date.now()}`;

  let finalSlug = slug;
  let counter = 2;
  while (store.getPage(finalSlug)) {
    finalSlug = `${slug}-${counter++}`;
  }

  // 6. Store the page
  const pageId = store.addDynamicPage(finalSlug, parsed.title, parsed.content, parentPage.id, userQuestion);

  // 7. Add link from parent to new page
  store.addLink(parentPage.id, pageId, parsed.title);

  // 8. Log usage and activity
  const usage = llmClient.getUsageStats();
  const estimatedCostUsd = llmClient.getEstimatedCost();
  store.addUsageLog(null, usage.totalCalls, usage.promptTokens, usage.completionTokens, usage.totalTokens, estimatedCostUsd);

  store.addActivityLog('query', `Asked: ${userQuestion.slice(0, 80)}`, 'page', pageId, { parentSlug: parentPage.slug, selectedText: selectedText.slice(0, 200) });

  // Promotability: trust the model only when it returned a real boolean; otherwise
  // require substantial content (200+ chars and at least one paragraph break).
  const isPromotable = parsed.isPromotable
    ?? (parsed.content.length >= 200 && (parsed.content.match(/\n\n/g) || []).length >= 1);

  return {
    pageId,
    slug: finalSlug,
    title: parsed.title,
    content: parsed.content,
    isPromotable,
    suggestedTitle: parsed.title,
    keyConcepts: parsed.keyConcepts ?? [],
  };
}
@@ -0,0 +1,122 @@
1
+ import type { Store } from "../store";
2
+ import type { LLMConfig } from "../config";
3
+
4
/**
 * Cosine similarity between two embedding vectors.
 *
 * Returns 0 (rather than NaN) when the vectors have different lengths or when
 * either vector has zero magnitude, so callers can sort and threshold the
 * result safely.
 */
function cosineSimilarity(a: Float32Array, b: Float32Array): number {
  // Vectors of different dimensionality are not comparable.
  if (a.length !== b.length) return 0;

  let dot = 0, normA = 0, normB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }

  const similarity = dot / (Math.sqrt(normA) * Math.sqrt(normB));
  // A zero-magnitude vector makes the denominator 0 and the quotient NaN.
  return Number.isFinite(similarity) ? similarity : 0;
}
14
+
15
/**
 * Embed `text` using the backend named by `config.provider`.
 *
 * Only the first 8000 characters are sent. Providers other than
 * "gemini", "azure-openai" and "openai" are rejected with an Error.
 */
async function getEmbedding(text: string, config: LLMConfig): Promise<Float32Array> {
  const truncated = text.slice(0, 8000);

  switch (config.provider) {
    case "gemini":
      return await geminiEmbedding(truncated, config);
    case "azure-openai":
      return await azureEmbedding(truncated, config);
    case "openai":
      return await openaiEmbedding(truncated, config);
    default:
      throw new Error(`Embedding not supported for provider: ${config.provider}`);
  }
}
28
+
29
/**
 * Request an embedding for `text` from the Gemini `gemini-embedding-001`
 * `embedContent` endpoint, authenticating with `config.api_key`.
 *
 * @throws Error carrying the HTTP status when the response is not OK.
 */
async function geminiEmbedding(text: string, config: LLMConfig): Promise<Float32Array> {
  const endpoint = `https://generativelanguage.googleapis.com/v1beta/models/gemini-embedding-001:embedContent`;
  const payload = {
    model: "models/gemini-embedding-001",
    content: { parts: [{ text }] }
  };

  const response = await fetch(endpoint, {
    method: "POST",
    headers: { "Content-Type": "application/json", "x-goog-api-key": config.api_key },
    body: JSON.stringify(payload)
  });
  if (!response.ok) {
    throw new Error(`Gemini embedding error (${response.status})`);
  }

  const parsed = await response.json() as { embedding: { values: number[] } };
  return new Float32Array(parsed.embedding.values);
}
44
+
45
/**
 * Request an embedding for `text` from an Azure OpenAI resource.
 *
 * The URL is built from `config.endpoint` and targets a deployment named
 * `text-embedding-3-small` (API version 2024-06-01).
 *
 * @throws Error carrying the HTTP status when the response is not OK.
 */
async function azureEmbedding(text: string, config: LLMConfig): Promise<Float32Array> {
  const endpoint = `${config.endpoint}/openai/deployments/text-embedding-3-small/embeddings?api-version=2024-06-01`;
  const payload = { input: text, model: "text-embedding-3-small" };

  const response = await fetch(endpoint, {
    method: "POST",
    headers: { "Content-Type": "application/json", "api-key": config.api_key },
    body: JSON.stringify(payload)
  });
  if (!response.ok) {
    throw new Error(`Azure embedding error (${response.status})`);
  }

  const parsed = await response.json() as { data: Array<{ embedding: number[] }> };
  // The first entry corresponds to the single input string.
  return new Float32Array(parsed.data[0].embedding);
}
57
+
58
/**
 * Request an embedding for `text` from the OpenAI embeddings endpoint using
 * the `text-embedding-3-small` model and a bearer token from `config.api_key`.
 *
 * @throws Error carrying the HTTP status when the response is not OK.
 */
async function openaiEmbedding(text: string, config: LLMConfig): Promise<Float32Array> {
  const payload = { input: text, model: "text-embedding-3-small" };

  const response = await fetch("https://api.openai.com/v1/embeddings", {
    method: "POST",
    headers: { "Content-Type": "application/json", "Authorization": `Bearer ${config.api_key}` },
    body: JSON.stringify(payload)
  });
  if (!response.ok) {
    throw new Error(`OpenAI embedding error (${response.status})`);
  }

  const parsed = await response.json() as { data: Array<{ embedding: number[] }> };
  // The first entry corresponds to the single input string.
  return new Float32Array(parsed.data[0].embedding);
}
69
+
70
/**
 * Create and store an embedding for every page the store reports as lacking one.
 *
 * Pages are processed one at a time; a page whose embedding request fails is
 * reported through `onProgress` and skipped, and the remaining pages continue.
 *
 * @param store      Source of pages and destination for embeddings.
 * @param config     LLM configuration used to pick the embedding backend.
 * @param onProgress Optional sink for human-readable progress messages.
 * @returns Number of embeddings that were saved.
 */
export async function generateMissingEmbeddings(store: Store, config: LLMConfig, onProgress?: (msg: string) => void): Promise<number> {
  const pending = store.getPagesWithoutEmbeddings();
  if (pending.length === 0) return 0;

  onProgress?.(`⏳ ${pending.length}개 페이지 임베딩 생성 중...`);

  let saved = 0;
  for (const page of pending) {
    try {
      // Title plus the first 4000 characters of the body form the embedded text.
      const vector = await getEmbedding(`${page.title}\n\n${page.content.slice(0, 4000)}`, config);
      // NOTE(review): the model label is fixed even when the Gemini backend produced
      // the vector — confirm whether the store relies on this label.
      store.saveEmbedding(page.id, vector, "text-embedding-3-small");
      saved += 1;
      if (saved % 10 === 0) onProgress?.(` ${saved}/${pending.length} 완료`);
    } catch (e) {
      // Report the failure and move on to the next page.
      onProgress?.(` ⚠ ${page.title} 실패: ${e instanceof Error ? e.message : String(e)}`);
    }
  }

  onProgress?.(`✅ ${saved}개 임베딩 생성 완료`);
  return saved;
}
94
+
95
/**
 * Rank stored pages by cosine similarity to `query`.
 *
 * @param query  Free-text query to embed.
 * @param store  Store providing the saved page embeddings.
 * @param config LLM configuration used to embed the query.
 * @param limit  Maximum number of candidates considered (default 5).
 * @returns Up to `limit` best matches, most similar first, keeping only those
 *          with similarity above 0.3. Empty when no embeddings are stored.
 */
export async function semanticSearch(
  query: string,
  store: Store,
  config: LLMConfig,
  limit: number = 5
): Promise<Array<{slug: string; title: string; pageType: string; origin: string; similarity: number}>> {
  const stored = store.getAllEmbeddings();
  if (stored.length === 0) return [];

  const queryVector = await getEmbedding(query, config);

  const scored = stored.map(({ slug, title, pageType, origin, embedding }) => ({
    slug,
    title,
    pageType,
    origin,
    similarity: cosineSimilarity(queryVector, embedding)
  }));

  // Best matches first.
  scored.sort((left, right) => right.similarity - left.similarity);

  const top = scored.slice(0, limit);
  return top.filter(hit => hit.similarity > 0.3); // Minimum threshold
}
@@ -0,0 +1,185 @@
1
+ import type { Store } from "../store";
2
+ import type { LLMConfig } from "../config";
3
+
4
/** One wiki page as listed inside a content-index category. */
export interface IndexPage {
  /** Page identifier from the store. */
  id: number;
  /** Display title of the page. */
  title: string;
  /** URL-safe identifier of the page. */
  slug: string;
  /** Kind of page: derived from a source document or a concept page. */
  type: "source" | "concept";
  /** Number of links associated with the page; used to order pages within a category. */
  linkCount: number;
}
11
+
12
/** A named group of pages in the content index. */
export interface IndexCategory {
  /** Human-readable category name. */
  name: string;
  /** Slug derived from the category name. */
  slug: string;
  /** Optional short description of what the category contains. */
  description?: string;
  /** Pages that belong to this category. */
  pages: IndexPage[];
}
18
+
19
/** Structured index of the whole wiki, as produced by `generateContentIndex`. */
export interface ContentIndex {
  /** Categories with their pages. */
  categories: IndexCategory[];
  /** Page count reported by the store. */
  totalPages: number;
  /** Number of links reported by the store. */
  totalLinks: number;
  /** ISO-8601 timestamp of when the index was generated. */
  generatedAt: string;
}
25
+
26
/** Options controlling how the content index is grouped. */
interface IndexConfig {
  /** When true (and an API key is configured), pages are clustered by the LLM. */
  useLLM?: boolean;
  /** LLM settings used for clustering. */
  llmConfig?: LLMConfig;
}
30
+
31
/**
 * Convert free text into a lowercase, hyphen-separated slug.
 *
 * Keeps ASCII word characters, Hangul syllables and hyphens; every other
 * character is dropped. Whitespace runs become a single hyphen, and the
 * result never starts or ends with a hyphen.
 */
function slugify(text: string): string {
  return text
    .toLowerCase()
    .replace(/[^\w\s가-힣-]/g, "")
    // Trim BEFORE whitespace turns into hyphens; trimming afterwards is a no-op.
    .trim()
    .replace(/\s+/g, "-")
    .replace(/-+/g, "-")
    // Removed punctuation can still leave a hyphen at either edge.
    .replace(/^-+|-+$/g, "");
}
39
+
40
/**
 * Build one category per source-document group returned by the store.
 *
 * Categories are ordered by name (locale-aware); pages inside a category are
 * ordered by link count, highest first.
 */
function groupBySource(store: Store): IndexCategory[] {
  const byLinkCountDesc = (left: IndexPage, right: IndexPage): number =>
    right.linkCount - left.linkCount;

  const categories: IndexCategory[] = [];
  for (const { sourceTitle, pages: groupPages } of store.getPagesBySource()) {
    const pages: IndexPage[] = groupPages.map((entry) => ({
      id: entry.id,
      title: entry.title,
      slug: entry.slug,
      type: entry.page_type as "source" | "concept",
      linkCount: entry.linkCount,
    }));
    pages.sort(byLinkCountDesc);

    categories.push({
      name: sourceTitle,
      slug: slugify(sourceTitle),
      description: `${sourceTitle} 소스 문서에서 생성된 페이지`,
      pages,
    });
  }

  categories.sort((left, right) => left.name.localeCompare(right.name));
  return categories;
}
72
+
73
/**
 * Smart grouping: ask the LLM to cluster page titles into topic categories.
 *
 * Every page ends up in exactly one category: pages the model did not place
 * (or whose title it altered) are collected under "기타". Pages that share a
 * title are kept together, and a title repeated across clusters is only honoured
 * the first time. If the LLM call fails or its reply is not the expected JSON
 * array, the result of `groupBySource` is returned instead.
 *
 * @param store     Store providing pages and links.
 * @param llmConfig Configuration for the LLM client.
 * @returns Categories sorted by name, pages sorted by link count (descending).
 */
async function groupByLLM(store: Store, llmConfig: LLMConfig): Promise<IndexCategory[]> {
  const pages = store.listPages();
  // Nothing to categorize: skip the (billable) LLM round trip.
  if (pages.length === 0) return [];

  const { LLMClient } = await import("../llm-client");
  const client = new LLMClient(llmConfig);

  const links = store.getAllLinks();

  // Count links per page, counting both incoming and outgoing ends.
  const linkCountMap = new Map<number, number>();
  for (const link of links) {
    linkCountMap.set(link.to_page_id, (linkCountMap.get(link.to_page_id) || 0) + 1);
    linkCountMap.set(link.from_page_id, (linkCountMap.get(link.from_page_id) || 0) + 1);
  }

  type WikiPage = (typeof pages)[number];
  const toIndexPage = (page: WikiPage): IndexPage => ({
    id: page.id,
    title: page.title,
    slug: page.slug,
    type: page.page_type as "source" | "concept",
    linkCount: linkCountMap.get(page.id) ?? 0,
  });

  const pageTitles = pages.map((p) => `- ${p.title} (${p.page_type})`).join("\n");

  const system = `You are a knowledge organizer. Given a list of wiki page titles, categorize them into 5-10 meaningful topic clusters. Return ONLY a JSON array of objects with: name (category name), description (short description), pages (array of page titles belonging to this category). Every page must be assigned to exactly one category.`;

  const userMessage = `Categorize these wiki pages into topic clusters:\n\n${pageTitles}\n\nReturn JSON array only, no markdown fences.`;

  try {
    const response = await client.chatComplete(system, userMessage, 4096);

    // Parse the JSON response (tolerating markdown fences) and validate its shape.
    const cleaned = response.replace(/```json?\s*|```\s*/g, "").trim();
    const clusters: unknown = JSON.parse(cleaned);
    if (!Array.isArray(clusters)) {
      throw new Error("LLM response is not a JSON array");
    }

    // Several pages may share a title, so each title maps to a list of pages.
    const pagesByTitle = new Map<string, WikiPage[]>();
    for (const page of pages) {
      const sameTitle = pagesByTitle.get(page.title);
      if (sameTitle) sameTitle.push(page);
      else pagesByTitle.set(page.title, [page]);
    }

    // Track placement by page id so no page is lost or listed twice.
    const assignedIds = new Set<number>();
    const categories: IndexCategory[] = [];

    for (const cluster of clusters as Array<{ name: string; description?: string; pages: string[] }>) {
      if (!cluster || typeof cluster.name !== "string" || !Array.isArray(cluster.pages)) {
        throw new Error("LLM response contains a malformed cluster");
      }

      const members: IndexPage[] = [];
      for (const title of cluster.pages) {
        for (const page of pagesByTitle.get(title) ?? []) {
          if (assignedIds.has(page.id)) continue;
          assignedIds.add(page.id);
          members.push(toIndexPage(page));
        }
      }

      categories.push({
        name: cluster.name,
        slug: slugify(cluster.name),
        description: cluster.description,
        pages: members,
      });
    }

    // Add any uncategorized pages
    const uncategorized = pages.filter((p) => !assignedIds.has(p.id));
    if (uncategorized.length > 0) {
      categories.push({
        name: "기타",
        slug: "etc",
        description: "분류되지 않은 페이지",
        pages: uncategorized.map(toIndexPage),
      });
    }

    // Sort categories by name, pages by link count (descending).
    categories.sort((a, b) => a.name.localeCompare(b.name));
    for (const cat of categories) {
      cat.pages.sort((a, b) => b.linkCount - a.linkCount);
    }

    return categories;
  } catch (err) {
    // Fallback to simple grouping if the LLM fails; keep the cause visible for debugging.
    console.warn("LLM categorization failed, falling back to source-based grouping", err);
    return groupBySource(store);
  }
}
160
+
161
/**
 * Produce the structured content index for all wiki pages.
 *
 * LLM clustering is used only when `config.useLLM` is set and the supplied LLM
 * configuration has an API key; otherwise pages are grouped by source document.
 *
 * @param store  Store providing pages and links.
 * @param config Optional grouping options.
 * @returns Categories plus page/link totals and the generation timestamp.
 */
export async function generateContentIndex(
  store: Store,
  config?: IndexConfig
): Promise<ContentIndex> {
  const totalPages = store.countPages();
  const totalLinks = store.getAllLinks().length;

  const llmConfig = config?.useLLM ? config.llmConfig : undefined;
  const categories: IndexCategory[] = llmConfig?.api_key
    ? await groupByLLM(store, llmConfig)
    : groupBySource(store);

  const generatedAt = new Date().toISOString();
  return { categories, totalPages, totalLinks, generatedAt };
}
@@ -1,5 +1,5 @@
1
1
  import { Store } from "../store";
2
- import { type LLMConfig, type Persona } from "../config";
2
+ import { type LLMConfig, type Persona, type WikiSchema } from "../config";
3
3
  import { LLMClient, type UsageStats } from "../llm-client";
4
4
 
5
5
  export interface IngestResult {
@@ -15,10 +15,16 @@ export async function ingestUrl(
15
15
  url: string,
16
16
  llmConfig: LLMConfig,
17
17
  persona: Persona | null,
18
- onProgress?: (status: string) => void
18
+ onProgress?: (status: string) => void,
19
+ schema?: WikiSchema
19
20
  ): Promise<IngestResult> {
20
21
  const client = new LLMClient(llmConfig);
21
22
  client.resetUsageStats();
23
+ client.onRetry = (attempt, max, delayMs) => {
24
+ const delaySec = Math.round(delayMs / 1000);
25
+ onProgress?.(`⏳ Rate limit — 재시도 ${attempt}/${max}, ${delaySec}초 대기...`);
26
+ console.log(`\x1b[33m⏳ Rate limit — retry ${attempt}/${max}, waiting ${delaySec}s...\x1b[0m`);
27
+ };
22
28
 
23
29
  const { fetchPage } = await import("../ingest/web");
24
30
  const { llmChunkDocument, htmlToRawText } = await import("../pipeline/llm-chunker");
@@ -27,15 +33,30 @@ export async function ingestUrl(
27
33
  const { title, html } = await fetchPage(url);
28
34
 
29
35
  const source = store.addSource(url, "web", title, html);
30
- const rawText = htmlToRawText(html);
36
+ const rawText = await htmlToRawText(html);
37
+
38
+ if (!rawText || rawText.trim().length < 50) {
39
+ throw new Error("추출된 텍스트가 너무 짧습니다. 파일 내용을 확인해주세요.");
40
+ }
31
41
 
32
- onProgress?.("⏳ LLM 분석 시작...");
33
- const { sourceCount, conceptCount } = await llmChunkDocument(rawText, title, source.id, store, 0, persona, client);
42
+ // Only delete existing pages if NOT resuming (no checkpoints = fresh ingest)
43
+ if (!store.hasCheckpoints(source.id)) {
44
+ store.deletePagesBySource(source.id);
45
+ }
46
+
47
+ const isResume = store.hasCheckpoints(source.id);
48
+ onProgress?.(isResume ? "⏳ LLM 분석 재개..." : "⏳ LLM 분석 시작...");
49
+ const { sourceCount, conceptCount } = await llmChunkDocument(rawText, title, source.id, store, 0, persona, client, onProgress, schema);
50
+
51
+ // Pipeline completed successfully — clear checkpoints for clean future re-ingests
52
+ store.clearCheckpoints(source.id);
34
53
 
35
54
  const u = client.getUsageStats();
36
55
  const estimatedCostUsd = client.getEstimatedCost();
37
56
  store.addUsageLog(source.id, u.totalCalls, u.promptTokens, u.completionTokens, u.totalTokens, estimatedCostUsd);
38
57
 
58
+ store.addActivityLog('ingest', `Ingested ${title}`, 'source', source.id, { url, sourceCount, conceptCount });
59
+
39
60
  return {
40
61
  sourceCount,
41
62
  conceptCount,
@@ -51,10 +72,16 @@ export async function ingestFile(
51
72
  originalName: string,
52
73
  llmConfig: LLMConfig,
53
74
  persona: Persona | null,
54
- onProgress?: (status: string) => void
75
+ onProgress?: (status: string) => void,
76
+ schema?: WikiSchema
55
77
  ): Promise<IngestResult> {
56
78
  const client = new LLMClient(llmConfig);
57
79
  client.resetUsageStats();
80
+ client.onRetry = (attempt, max, delayMs) => {
81
+ const delaySec = Math.round(delayMs / 1000);
82
+ onProgress?.(`⏳ Rate limit — 재시도 ${attempt}/${max}, ${delaySec}초 대기...`);
83
+ console.log(`\x1b[33m⏳ Rate limit — retry ${attempt}/${max}, waiting ${delaySec}s...\x1b[0m`);
84
+ };
58
85
 
59
86
  const { llmChunkDocument } = await import("../pipeline/llm-chunker");
60
87
 
@@ -63,34 +90,65 @@ export async function ingestFile(
63
90
  let title: string;
64
91
  let text: string;
65
92
 
66
- if (ext === "pdf") {
67
- const { extractTextFromPdf } = await import("../ingest/pdf");
68
- onProgress?.("⏳ PDF 텍스트 추출 중...");
69
- ({ title, text } = await extractTextFromPdf(filePath));
70
- } else if (ext === "docx") {
71
- const { extractTextFromDocx } = await import("../ingest/docx");
72
- onProgress?.("⏳ DOCX 텍스트 추출 중...");
73
- ({ title, text } = await extractTextFromDocx(filePath));
74
- } else if (ext === "pptx") {
75
- const { extractTextFromPptx } = await import("../ingest/pptx");
76
- onProgress?.("⏳ PPTX 텍스트 추출 중...");
77
- ({ title, text } = await extractTextFromPptx(filePath));
78
- } else {
79
- const { extractWithTextutil } = await import("../ingest/legacy");
80
- onProgress?.(`⏳ ${ext.toUpperCase()} 텍스트 추출 중...`);
81
- ({ title, text } = await extractWithTextutil(filePath));
93
+ switch (ext) {
94
+ case "pdf": {
95
+ const { extractTextFromPdf } = await import("../ingest/pdf");
96
+ onProgress?.("⏳ PDF 텍스트 추출 중...");
97
+ ({ title, text } = await extractTextFromPdf(filePath));
98
+ break;
99
+ }
100
+ case "docx": {
101
+ const { extractTextFromDocx } = await import("../ingest/docx");
102
+ onProgress?.("⏳ DOCX 텍스트 추출 중...");
103
+ ({ title, text } = await extractTextFromDocx(filePath));
104
+ break;
105
+ }
106
+ case "pptx": {
107
+ const { extractTextFromPptx } = await import("../ingest/pptx");
108
+ onProgress?.("⏳ PPTX 텍스트 추출 중...");
109
+ ({ title, text } = await extractTextFromPptx(filePath));
110
+ break;
111
+ }
112
+ case "md": {
113
+ const { extractTextFromMarkdown } = await import("../ingest/markdown");
114
+ onProgress?.("⏳ MD 텍스트 추출 중...");
115
+ const result = extractTextFromMarkdown(filePath);
116
+ title = result.title;
117
+ text = result.text;
118
+ break;
119
+ }
120
+ default: {
121
+ const { extractWithTextutil } = await import("../ingest/legacy");
122
+ onProgress?.(`⏳ ${ext.toUpperCase()} 텍스트 추출 중...`);
123
+ ({ title, text } = await extractWithTextutil(filePath));
124
+ break;
125
+ }
126
+ }
127
+
128
+ if (!text || text.trim().length < 50) {
129
+ throw new Error("추출된 텍스트가 너무 짧습니다. 파일 내용을 확인해주세요.");
82
130
  }
83
131
 
84
132
  const source = store.addSource(filePath, ext, title, "(file)");
85
- store.deletePagesBySource(source.id);
86
133
 
87
- onProgress?.("⏳ LLM 분석 시작...");
88
- const { sourceCount, conceptCount } = await llmChunkDocument(text, title, source.id, store, 0, persona, client);
134
+ // Only delete existing pages if NOT resuming (no checkpoints = fresh ingest)
135
+ if (!store.hasCheckpoints(source.id)) {
136
+ store.deletePagesBySource(source.id);
137
+ }
138
+
139
+ const isResume = store.hasCheckpoints(source.id);
140
+ onProgress?.(isResume ? "⏳ LLM 분석 재개..." : "⏳ LLM 분석 시작...");
141
+ const { sourceCount, conceptCount } = await llmChunkDocument(text, title, source.id, store, 0, persona, client, onProgress, schema);
142
+
143
+ // Pipeline completed successfully — clear checkpoints for clean future re-ingests
144
+ store.clearCheckpoints(source.id);
89
145
 
90
146
  const u = client.getUsageStats();
91
147
  const estimatedCostUsd = client.getEstimatedCost();
92
148
  store.addUsageLog(source.id, u.totalCalls, u.promptTokens, u.completionTokens, u.totalTokens, estimatedCostUsd);
93
149
 
150
+ store.addActivityLog('ingest', `Ingested ${originalName}`, 'source', source.id, { filePath, sourceCount, conceptCount });
151
+
94
152
  return {
95
153
  sourceCount,
96
154
  conceptCount,