@open330/kiwimu 0.8.0 → 1.1.0
- package/README.md +105 -27
- package/package.json +1 -1
- package/src/build/renderer.ts +272 -32
- package/src/build/static/dynamic-qa.js +423 -0
- package/src/build/static/edit-page.js +58 -0
- package/src/build/static/peek-panel.css +201 -0
- package/src/build/static/peek-panel.js +470 -0
- package/src/build/static/search.js +30 -15
- package/src/build/static/style.css +821 -6
- package/src/build/templates.ts +700 -48
- package/src/config.ts +41 -3
- package/src/demo/sample-data.ts +69 -2
- package/src/demo/setup.ts +25 -6
- package/src/expand/llm.ts +2 -2
- package/src/index.ts +467 -60
- package/src/ingest/docx.ts +1 -1
- package/src/ingest/markdown.ts +21 -0
- package/src/ingest/pdf.ts +4 -2
- package/src/llm-client.ts +63 -69
- package/src/pipeline/citations.ts +107 -0
- package/src/pipeline/llm-chunker.ts +277 -131
- package/src/pipeline/standardizer.ts +41 -0
- package/src/server.ts +465 -32
- package/src/services/dynamic-qa.ts +190 -0
- package/src/services/embedding.ts +122 -0
- package/src/services/index-generator.ts +185 -0
- package/src/services/ingest.ts +83 -25
- package/src/services/lint.ts +249 -0
- package/src/services/promote.ts +150 -0
- package/src/store.test.ts +11 -0
- package/src/store.ts +561 -28
- package/src/utils.ts +30 -0
package/src/services/dynamic-qa.ts
ADDED

@@ -0,0 +1,190 @@
+import { Store } from "../store";
+import { LLMClient } from "../llm-client";
+import { loadConfig, getActivePersona, type Persona, type LLMConfig } from "../config";
+import { slugify } from "../pipeline/chunker";
+import { stripJsonFences } from "../utils";
+
+export interface DynamicQAResult {
+  pageId: number;
+  slug: string;
+  title: string;
+  content: string;
+  isPromotable: boolean;
+  suggestedTitle: string;
+  keyConcepts: string[];
+}
+
+export async function generateDynamicPage(
+  store: Store,
+  llmClient: LLMClient,
+  persona: Persona | null,
+  parentPage: { id: number; slug: string; title: string; content: string },
+  selectedText: string,
+  userQuestion: string
+): Promise<DynamicQAResult> {
+  // 1. Build context hierarchy
+  // L1: selectedText (truncate to 2000 chars)
+  // L2: parentPage.content
+  // L3: related pages (backlinks + forward links, title + first 300 chars)
+  // L4: all concept titles
+
+  const backlinks = store.getBacklinks(parentPage.id);
+  const allLinks = store.getForwardLinks(parentPage.id);
+
+  const relatedSummaries = [...backlinks, ...allLinks]
+    .map(p => `- ${p.title}: ${(p as any).content?.slice(0, 300) || ''}`)
+    .slice(0, 10)
+    .join('\n');
+
+  const conceptTitles = store.listConceptPages()
+    .slice(0, 50)
+    .map(p => p.title)
+    .join(', ');
+
+  // 2. Build system prompt
+  const personaStyle = persona ? `\n\nStyle: ${persona.content_style || persona.system_prompt || ''}` : '';
+
+  const systemPrompt = `You are a study wiki editor. A student is reading a wiki page and has selected some text. They have a follow-up question about it. Your job is to create a new, self-contained concept page that answers their question thoroughly.
+
+Rules:
+- Create a focused wiki page (2-4 paragraphs) that answers the question
+- Use [[wiki links]] to reference existing concepts where relevant
+- Include examples, formulas (LaTeX $..$ / $$...$$), and definitions as appropriate
+- The page should be educational and self-contained
+- Return valid JSON only${personaStyle}`;
+
+  const userPrompt = `## Student's Question
+${userQuestion}
+
+## Selected Text
+${selectedText.slice(0, 2000)}
+
+## Current Page: ${parentPage.title}
+${parentPage.content.slice(0, 5000)}
+
+## Related Pages
+${relatedSummaries || '(none)'}
+
+## All Wiki Concepts
+${conceptTitles || '(none)'}
+
+Return a JSON object:
+{"title": "Short concept title", "content": "Full markdown content with [[wiki links]]", "isPromotable": true, "keyConcepts": ["concept1", "concept2"]}
+
+- "isPromotable": true if the answer has enough educational substance (2+ paragraphs, definitions, examples) to be a standalone wiki page, false if it's just a brief clarification
+- "keyConcepts": array of 1-5 key concept terms mentioned in the answer`;
+
+  // 3. Call LLM
+  const raw = await llmClient.chatComplete(systemPrompt, userPrompt, 4096);
+
+  // 4. Parse response — robust JSON extraction with multiple fallbacks
+  let parsed: { title: string; content: string; isPromotable?: boolean; keyConcepts?: string[] };
+  try {
+    // Remove markdown code fences if present
+    let cleaned = stripJsonFences(raw);
+
+    // Try to extract JSON object from response
+    const jsonMatch = cleaned.match(/\{[\s\S]*\}/);
+    if (!jsonMatch) throw new Error("No JSON found");
+
+    let jsonStr = jsonMatch[0];
+    try {
+      parsed = JSON.parse(jsonStr);
+    } catch {
+      // Try repairs for truncated JSON
+      // Close unclosed strings
+      const lastQuote = jsonStr.lastIndexOf('"');
+      const afterQuote = jsonStr.slice(lastQuote + 1);
+      if (afterQuote.indexOf('"') === -1 && afterQuote.length > 0) {
+        jsonStr = jsonStr.slice(0, jsonStr.lastIndexOf('",') + 2) + '}';
+      }
+      // Balance braces
+      const openBraces = (jsonStr.match(/\{/g) || []).length;
+      const closeBraces = (jsonStr.match(/\}/g) || []).length;
+      for (let i = 0; i < openBraces - closeBraces; i++) jsonStr += "}";
+      parsed = JSON.parse(jsonStr);
+    }
+
+    // Ensure content is not JSON-encoded (sometimes LLM double-encodes)
+    if (parsed.content && parsed.content.startsWith('{')) {
+      try {
+        const inner = JSON.parse(parsed.content);
+        if (inner.content) parsed.content = inner.content;
+      } catch { /* not double-encoded, use as-is */ }
+    }
+  } catch {
+    // Fallback: treat the entire raw response as markdown content
+    // Strip any JSON artifacts from the beginning
+    let fallbackContent = stripJsonFences(raw)
+      .replace(/^\s*\{\s*"title"\s*:\s*"[^"]*"\s*,\s*"content"\s*:\s*"?/m, "")
+      .replace(/"\s*\}\s*$/m, "")
+      .trim();
+
+    // If it still looks like JSON, try one more parse
+    if (fallbackContent.startsWith('{')) {
+      try {
+        const lastTry = JSON.parse(fallbackContent);
+        if (lastTry.content) fallbackContent = lastTry.content;
+      } catch { /* use as-is */ }
+    }
+
+    parsed = {
+      title: userQuestion.slice(0, 50),
+      content: fallbackContent || raw
+    };
+  }
+
+  // Unescape JSON string escapes that might remain in content
+  if (parsed.content) {
+    parsed.content = parsed.content
+      .replace(/\\n/g, '\n')
+      .replace(/\\t/g, '\t')
+      .replace(/\\"/g, '"')
+      .replace(/\\\\/g, '\\');
+  }
+
+  if (!parsed.title || !parsed.content || parsed.content.length < 20) {
+    throw new Error("LLM 응답이 불충분합니다. 다시 시도해주세요.");
+  }
+
+  // 5. Generate slug, handle collision
+  let slug = slugify(parsed.title);
+  if (!slug) slug = slugify(userQuestion);
+  if (!slug) slug = `dynamic-${Date.now()}`;
+
+  let finalSlug = slug;
+  let counter = 2;
+  while (store.getPage(finalSlug)) {
+    finalSlug = `${slug}-${counter++}`;
+  }
+
+  // 6. Store the page
+  const pageId = store.addDynamicPage(finalSlug, parsed.title, parsed.content, parentPage.id, userQuestion);
+
+  // 7. Add link from parent to new page
+  store.addLink(parentPage.id, pageId, parsed.title);
+
+  // 8. Log usage
+  const usage = llmClient.getUsageStats();
+  const estimatedCostUsd = llmClient.getEstimatedCost();
+  store.addUsageLog(null, usage.totalCalls, usage.promptTokens, usage.completionTokens, usage.totalTokens, estimatedCostUsd);
+
+  store.addActivityLog('query', `Asked: ${userQuestion.slice(0, 80)}`, 'page', pageId, { parentSlug: parentPage.slug, selectedText: selectedText.slice(0, 200) });
+
+  // Determine promotability: content should be substantial (2+ paragraphs, 200+ chars)
+  const isPromotable = parsed.isPromotable !== undefined
+    ? parsed.isPromotable
+    : parsed.content.length >= 200 && (parsed.content.match(/\n\n/g) || []).length >= 1;
+
+  const keyConcepts = Array.isArray(parsed.keyConcepts) ? parsed.keyConcepts.filter(c => typeof c === 'string') : [];
+
+  return {
+    pageId,
+    slug: finalSlug,
+    title: parsed.title,
+    content: parsed.content,
+    isPromotable,
+    suggestedTitle: parsed.title,
+    keyConcepts,
+  };
+}
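
Read end to end, the new service builds a layered context (selection, parent page, linked neighbors, concept titles), asks the LLM for a JSON page, repairs malformed output, and persists the result into the wiki graph. A minimal calling sketch in TypeScript follows; only the generateDynamicPage signature comes from the diff above, while the Store/LLMClient construction, the config shape, and the sample page are illustrative assumptions:

// Hypothetical wiring; only generateDynamicPage's signature is taken from this diff.
import { Store } from "./store";                      // constructor shape assumed
import { LLMClient } from "./llm-client";
import { loadConfig, getActivePersona } from "./config";
import { generateDynamicPage } from "./services/dynamic-qa";

const config = loadConfig();                          // assumed to expose an LLM config
const store = new Store("wiki.db");                   // assumed constructor
const client = new LLMClient(config.llm);             // config.llm shape assumed
const persona = getActivePersona(config);             // signature assumed

const parent = store.getPage("gradient-descent");     // any existing page
const result = await generateDynamicPage(
  store,
  client,
  persona,
  { id: parent.id, slug: parent.slug, title: parent.title, content: parent.content },
  "the learning rate controls the step size",         // text the reader selected
  "Why does a large learning rate cause divergence?"  // their follow-up question
);
console.log(result.slug, result.isPromotable, result.keyConcepts);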
package/src/services/embedding.ts
ADDED

@@ -0,0 +1,122 @@
+import type { Store } from "../store";
+import type { LLMConfig } from "../config";
+
+// Cosine similarity between two vectors
+function cosineSimilarity(a: Float32Array, b: Float32Array): number {
+  let dot = 0, normA = 0, normB = 0;
+  for (let i = 0; i < a.length; i++) {
+    dot += a[i] * b[i];
+    normA += a[i] * a[i];
+    normB += b[i] * b[i];
+  }
+  return dot / (Math.sqrt(normA) * Math.sqrt(normB));
+}
+
+// Get embedding — auto-detect provider
+async function getEmbedding(text: string, config: LLMConfig): Promise<Float32Array> {
+  const input = text.slice(0, 8000);
+
+  if (config.provider === "gemini") {
+    return await geminiEmbedding(input, config);
+  } else if (config.provider === "azure-openai") {
+    return await azureEmbedding(input, config);
+  } else if (config.provider === "openai") {
+    return await openaiEmbedding(input, config);
+  }
+  throw new Error(`Embedding not supported for provider: ${config.provider}`);
+}
+
+// Gemini Embedding API (free)
+async function geminiEmbedding(text: string, config: LLMConfig): Promise<Float32Array> {
+  const url = `https://generativelanguage.googleapis.com/v1beta/models/gemini-embedding-001:embedContent`;
+  const resp = await fetch(url, {
+    method: "POST",
+    headers: { "Content-Type": "application/json", "x-goog-api-key": config.api_key },
+    body: JSON.stringify({
+      model: "models/gemini-embedding-001",
+      content: { parts: [{ text }] }
+    })
+  });
+  if (!resp.ok) throw new Error(`Gemini embedding error (${resp.status})`);
+  const data = await resp.json() as { embedding: { values: number[] } };
+  return new Float32Array(data.embedding.values);
+}
+
+// Azure OpenAI Embedding
+async function azureEmbedding(text: string, config: LLMConfig): Promise<Float32Array> {
+  const url = `${config.endpoint}/openai/deployments/text-embedding-3-small/embeddings?api-version=2024-06-01`;
+  const resp = await fetch(url, {
+    method: "POST",
+    headers: { "Content-Type": "application/json", "api-key": config.api_key },
+    body: JSON.stringify({ input: text, model: "text-embedding-3-small" })
+  });
+  if (!resp.ok) throw new Error(`Azure embedding error (${resp.status})`);
+  const data = await resp.json() as { data: Array<{ embedding: number[] }> };
+  return new Float32Array(data.data[0].embedding);
+}
+
+// OpenAI Embedding
+async function openaiEmbedding(text: string, config: LLMConfig): Promise<Float32Array> {
+  const resp = await fetch("https://api.openai.com/v1/embeddings", {
+    method: "POST",
+    headers: { "Content-Type": "application/json", "Authorization": `Bearer ${config.api_key}` },
+    body: JSON.stringify({ input: text, model: "text-embedding-3-small" })
+  });
+  if (!resp.ok) throw new Error(`OpenAI embedding error (${resp.status})`);
+  const data = await resp.json() as { data: Array<{ embedding: number[] }> };
+  return new Float32Array(data.data[0].embedding);
+}
+
+// Generate embeddings for all pages that don't have one
+export async function generateMissingEmbeddings(store: Store, config: LLMConfig, onProgress?: (msg: string) => void): Promise<number> {
+  const pages = store.getPagesWithoutEmbeddings();
+  if (!pages.length) return 0;
+
+  onProgress?.(`⏳ ${pages.length}개 페이지 임베딩 생성 중...`);
+  let count = 0;
+
+  for (const page of pages) {
+    try {
+      const text = `${page.title}\n\n${page.content.slice(0, 4000)}`;
+      const embedding = await getEmbedding(text, config);
+      store.saveEmbedding(page.id, embedding, "text-embedding-3-small");
+      count++;
+      if (count % 10 === 0) onProgress?.(` ${count}/${pages.length} 완료`);
+    } catch (e) {
+      // Skip failed pages silently
+      onProgress?.(` ⚠ ${page.title} 실패: ${e instanceof Error ? e.message : String(e)}`);
+    }
+  }
+
+  onProgress?.(`✅ ${count}개 임베딩 생성 완료`);
+  return count;
+}
+
+// Semantic search: find pages similar to query text
+export async function semanticSearch(
+  query: string,
+  store: Store,
+  config: LLMConfig,
+  limit: number = 5
+): Promise<Array<{slug: string; title: string; pageType: string; origin: string; similarity: number}>> {
+  const allEmbeddings = store.getAllEmbeddings();
+  if (!allEmbeddings.length) return [];
+
+  // Get query embedding
+  const queryEmbedding = await getEmbedding(query, config);
+
+  // Calculate similarities
+  const results = allEmbeddings.map(page => ({
+    slug: page.slug,
+    title: page.title,
+    pageType: page.pageType,
+    origin: page.origin,
+    similarity: cosineSimilarity(queryEmbedding, page.embedding)
+  }));
+
+  // Sort by similarity descending, return top N
+  return results
+    .sort((a, b) => b.similarity - a.similarity)
+    .slice(0, limit)
+    .filter(r => r.similarity > 0.3); // Minimum threshold
+}
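
Two details of semanticSearch are worth noting: the query is embedded with the same provider as the stored pages, and the 0.3 similarity floor is applied after the top-N slice, so sparse wikis can legitimately return fewer than limit results. A self-contained sketch of the same ranking math on toy vectors (cosineSimilarity is module-private, so its formula is restated here):

// Toy check of the ranking used by semanticSearch, on hand-made 2-D vectors.
function cosine(a: Float32Array, b: Float32Array): number {
  let dot = 0, na = 0, nb = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    na += a[i] * a[i];
    nb += b[i] * b[i];
  }
  return dot / (Math.sqrt(na) * Math.sqrt(nb));
}

const query = new Float32Array([1, 0]);
const docs = [
  { slug: "close", vec: new Float32Array([0.9, 0.1]) }, // similarity ≈ 0.994
  { slug: "far",   vec: new Float32Array([0, 1]) },     // similarity 0, dropped by the 0.3 floor
];
const ranked = docs
  .map(d => ({ slug: d.slug, similarity: cosine(query, d.vec) }))
  .sort((a, b) => b.similarity - a.similarity)
  .slice(0, 5)
  .filter(r => r.similarity > 0.3);
console.log(ranked); // [{ slug: "close", similarity: ≈0.994 }]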
package/src/services/index-generator.ts
ADDED

@@ -0,0 +1,185 @@
+import type { Store } from "../store";
+import type { LLMConfig } from "../config";
+
+export interface IndexPage {
+  id: number;
+  title: string;
+  slug: string;
+  type: "source" | "concept";
+  linkCount: number;
+}
+
+export interface IndexCategory {
+  name: string;
+  slug: string;
+  description?: string;
+  pages: IndexPage[];
+}
+
+export interface ContentIndex {
+  categories: IndexCategory[];
+  totalPages: number;
+  totalLinks: number;
+  generatedAt: string;
+}
+
+interface IndexConfig {
+  useLLM?: boolean;
+  llmConfig?: LLMConfig;
+}
+
+function slugify(text: string): string {
+  return text
+    .toLowerCase()
+    .replace(/[^\w\s가-힣-]/g, "")
+    .replace(/\s+/g, "-")
+    .replace(/-+/g, "-")
+    .trim();
+}
+
+/**
+ * Simple grouping: group pages by their source document, then orphan concepts separately.
+ */
+function groupBySource(store: Store): IndexCategory[] {
+  const grouped = store.getPagesBySource();
+  const categories: IndexCategory[] = [];
+
+  for (const group of grouped) {
+    categories.push({
+      name: group.sourceTitle,
+      slug: slugify(group.sourceTitle),
+      description: `${group.sourceTitle} 소스 문서에서 생성된 페이지`,
+      pages: group.pages.map((p) => ({
+        id: p.id,
+        title: p.title,
+        slug: p.slug,
+        type: p.page_type as "source" | "concept",
+        linkCount: p.linkCount,
+      })),
+    });
+  }
+
+  // Sort categories alphabetically
+  categories.sort((a, b) => a.name.localeCompare(b.name));
+
+  // Sort pages within each category by link count (descending)
+  for (const cat of categories) {
+    cat.pages.sort((a, b) => b.linkCount - a.linkCount);
+  }
+
+  return categories;
+}
+
+/**
+ * Smart grouping: use LLM to categorize pages into topic clusters.
+ */
+async function groupByLLM(store: Store, llmConfig: LLMConfig): Promise<IndexCategory[]> {
+  const { LLMClient } = await import("../llm-client");
+  const client = new LLMClient(llmConfig);
+
+  const pages = store.listPages();
+  const links = store.getAllLinks();
+
+  // Build link count map
+  const linkCountMap = new Map<number, number>();
+  for (const link of links) {
+    linkCountMap.set(link.to_page_id, (linkCountMap.get(link.to_page_id) || 0) + 1);
+    linkCountMap.set(link.from_page_id, (linkCountMap.get(link.from_page_id) || 0) + 1);
+  }
+
+  const pageTitles = pages.map((p) => `- ${p.title} (${p.page_type})`).join("\n");
+
+  const system = `You are a knowledge organizer. Given a list of wiki page titles, categorize them into 5-10 meaningful topic clusters. Return ONLY a JSON array of objects with: name (category name), description (short description), pages (array of page titles belonging to this category). Every page must be assigned to exactly one category.`;
+
+  const userMessage = `Categorize these wiki pages into topic clusters:\n\n${pageTitles}\n\nReturn JSON array only, no markdown fences.`;
+
+  try {
+    const response = await client.chatComplete(system, userMessage, 4096);
+
+    // Parse the JSON response
+    const cleaned = response.replace(/```json?\s*|```\s*/g, "").trim();
+    const clusters = JSON.parse(cleaned) as Array<{
+      name: string;
+      description?: string;
+      pages: string[];
+    }>;
+
+    // Build page lookup by title
+    const pageByTitle = new Map(pages.map((p) => [p.title, p]));
+
+    const categories: IndexCategory[] = clusters.map((cluster) => ({
+      name: cluster.name,
+      slug: slugify(cluster.name),
+      description: cluster.description,
+      pages: cluster.pages
+        .map((title) => {
+          const page = pageByTitle.get(title);
+          if (!page) return null;
+          return {
+            id: page.id,
+            title: page.title,
+            slug: page.slug,
+            type: page.page_type as "source" | "concept",
+            linkCount: linkCountMap.get(page.id) || 0,
+          };
+        })
+        .filter((p): p is IndexPage => p !== null),
+    }));

+    // Add any uncategorized pages
+    const categorizedTitles = new Set(clusters.flatMap((c) => c.pages));
+    const uncategorized = pages.filter((p) => !categorizedTitles.has(p.title));
+    if (uncategorized.length > 0) {
+      categories.push({
+        name: "기타",
+        slug: "etc",
+        description: "분류되지 않은 페이지",
+        pages: uncategorized.map((p) => ({
+          id: p.id,
+          title: p.title,
+          slug: p.slug,
+          type: p.page_type as "source" | "concept",
+          linkCount: linkCountMap.get(p.id) || 0,
+        })),
+      });
+    }
+
+    // Sort
+    categories.sort((a, b) => a.name.localeCompare(b.name));
+    for (const cat of categories) {
+      cat.pages.sort((a, b) => b.linkCount - a.linkCount);
+    }
+
+    return categories;
+  } catch {
+    // Fallback to simple grouping if LLM fails
+    console.warn("LLM categorization failed, falling back to source-based grouping");
+    return groupBySource(store);
+  }
+}
+
+/**
+ * Generate a structured content index for all wiki pages.
+ */
+export async function generateContentIndex(
+  store: Store,
+  config?: IndexConfig
+): Promise<ContentIndex> {
+  const totalPages = store.countPages();
+  const totalLinks = store.getAllLinks().length;
+
+  let categories: IndexCategory[];
+
+  if (config?.useLLM && config.llmConfig?.api_key) {
+    categories = await groupByLLM(store, config.llmConfig);
+  } else {
+    categories = groupBySource(store);
+  }
+
+  return {
+    categories,
+    totalPages,
+    totalLinks,
+    generatedAt: new Date().toISOString(),
+  };
+}
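
generateContentIndex picks its strategy at call time: LLM clustering runs only when both useLLM and an api_key are present, otherwise the deterministic source-based grouping is used, and groupByLLM itself falls back to groupBySource on any parse failure. A short usage sketch, with store and config wiring assumed as in the earlier example:

// Usage sketch; store/config construction is assumed, the API below is from this diff.
import { generateContentIndex } from "./services/index-generator";

// Deterministic grouping by source document; no LLM call is made.
const simple = await generateContentIndex(store);

// LLM topic clustering; silently falls back to source grouping on failure.
const smart = await generateContentIndex(store, { useLLM: true, llmConfig: config.llm });

for (const cat of smart.categories) {
  console.log(`${cat.name}: ${cat.pages.length} pages`); // pages pre-sorted by linkCount
}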
package/src/services/ingest.ts
CHANGED
@@ -1,5 +1,5 @@
 import { Store } from "../store";
-import { type LLMConfig, type Persona } from "../config";
+import { type LLMConfig, type Persona, type WikiSchema } from "../config";
 import { LLMClient, type UsageStats } from "../llm-client";

 export interface IngestResult {
@@ -15,10 +15,16 @@ export async function ingestUrl(
   url: string,
   llmConfig: LLMConfig,
   persona: Persona | null,
-  onProgress?: (status: string) => void
+  onProgress?: (status: string) => void,
+  schema?: WikiSchema
 ): Promise<IngestResult> {
   const client = new LLMClient(llmConfig);
   client.resetUsageStats();
+  client.onRetry = (attempt, max, delayMs) => {
+    const delaySec = Math.round(delayMs / 1000);
+    onProgress?.(`⏳ Rate limit — 재시도 ${attempt}/${max}, ${delaySec}초 대기...`);
+    console.log(`\x1b[33m⏳ Rate limit — retry ${attempt}/${max}, waiting ${delaySec}s...\x1b[0m`);
+  };

   const { fetchPage } = await import("../ingest/web");
   const { llmChunkDocument, htmlToRawText } = await import("../pipeline/llm-chunker");
@@ -27,15 +33,30 @@ export async function ingestUrl(
   const { title, html } = await fetchPage(url);

   const source = store.addSource(url, "web", title, html);
-  const rawText = htmlToRawText(html);
+  const rawText = await htmlToRawText(html);
+
+  if (!rawText || rawText.trim().length < 50) {
+    throw new Error("추출된 텍스트가 너무 짧습니다. 파일 내용을 확인해주세요.");
+  }

-
-
+  // Only delete existing pages if NOT resuming (no checkpoints = fresh ingest)
+  if (!store.hasCheckpoints(source.id)) {
+    store.deletePagesBySource(source.id);
+  }
+
+  const isResume = store.hasCheckpoints(source.id);
+  onProgress?.(isResume ? "⏳ LLM 분석 재개..." : "⏳ LLM 분석 시작...");
+  const { sourceCount, conceptCount } = await llmChunkDocument(rawText, title, source.id, store, 0, persona, client, onProgress, schema);
+
+  // Pipeline completed successfully — clear checkpoints for clean future re-ingests
+  store.clearCheckpoints(source.id);

   const u = client.getUsageStats();
   const estimatedCostUsd = client.getEstimatedCost();
   store.addUsageLog(source.id, u.totalCalls, u.promptTokens, u.completionTokens, u.totalTokens, estimatedCostUsd);

+  store.addActivityLog('ingest', `Ingested ${title}`, 'source', source.id, { url, sourceCount, conceptCount });
+
   return {
     sourceCount,
     conceptCount,
@@ -51,10 +72,16 @@ export async function ingestFile(
   originalName: string,
   llmConfig: LLMConfig,
   persona: Persona | null,
-  onProgress?: (status: string) => void
+  onProgress?: (status: string) => void,
+  schema?: WikiSchema
 ): Promise<IngestResult> {
   const client = new LLMClient(llmConfig);
   client.resetUsageStats();
+  client.onRetry = (attempt, max, delayMs) => {
+    const delaySec = Math.round(delayMs / 1000);
+    onProgress?.(`⏳ Rate limit — 재시도 ${attempt}/${max}, ${delaySec}초 대기...`);
+    console.log(`\x1b[33m⏳ Rate limit — retry ${attempt}/${max}, waiting ${delaySec}s...\x1b[0m`);
+  };

   const { llmChunkDocument } = await import("../pipeline/llm-chunker");

@@ -63,34 +90,65 @@ export async function ingestFile(
   let title: string;
   let text: string;

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+  switch (ext) {
+    case "pdf": {
+      const { extractTextFromPdf } = await import("../ingest/pdf");
+      onProgress?.("⏳ PDF 텍스트 추출 중...");
+      ({ title, text } = await extractTextFromPdf(filePath));
+      break;
+    }
+    case "docx": {
+      const { extractTextFromDocx } = await import("../ingest/docx");
+      onProgress?.("⏳ DOCX 텍스트 추출 중...");
+      ({ title, text } = await extractTextFromDocx(filePath));
+      break;
+    }
+    case "pptx": {
+      const { extractTextFromPptx } = await import("../ingest/pptx");
+      onProgress?.("⏳ PPTX 텍스트 추출 중...");
+      ({ title, text } = await extractTextFromPptx(filePath));
+      break;
+    }
+    case "md": {
+      const { extractTextFromMarkdown } = await import("../ingest/markdown");
+      onProgress?.("⏳ MD 텍스트 추출 중...");
+      const result = extractTextFromMarkdown(filePath);
+      title = result.title;
+      text = result.text;
+      break;
+    }
+    default: {
+      const { extractWithTextutil } = await import("../ingest/legacy");
+      onProgress?.(`⏳ ${ext.toUpperCase()} 텍스트 추출 중...`);
+      ({ title, text } = await extractWithTextutil(filePath));
+      break;
+    }
+  }
+
+  if (!text || text.trim().length < 50) {
+    throw new Error("추출된 텍스트가 너무 짧습니다. 파일 내용을 확인해주세요.");
   }

   const source = store.addSource(filePath, ext, title, "(file)");
-  store.deletePagesBySource(source.id);

-
-
+  // Only delete existing pages if NOT resuming (no checkpoints = fresh ingest)
+  if (!store.hasCheckpoints(source.id)) {
+    store.deletePagesBySource(source.id);
+  }
+
+  const isResume = store.hasCheckpoints(source.id);
+  onProgress?.(isResume ? "⏳ LLM 분석 재개..." : "⏳ LLM 분석 시작...");
+  const { sourceCount, conceptCount } = await llmChunkDocument(text, title, source.id, store, 0, persona, client, onProgress, schema);
+
+  // Pipeline completed successfully — clear checkpoints for clean future re-ingests
+  store.clearCheckpoints(source.id);

   const u = client.getUsageStats();
   const estimatedCostUsd = client.getEstimatedCost();
   store.addUsageLog(source.id, u.totalCalls, u.promptTokens, u.completionTokens, u.totalTokens, estimatedCostUsd);

+  store.addActivityLog('ingest', `Ingested ${originalName}`, 'source', source.id, { filePath, sourceCount, conceptCount });
+
   return {
     sourceCount,
     conceptCount,