@open330/kiwimu 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,14 @@
1
+ export async function extractTextFromPdf(pdfPath: string): Promise<{ title: string; text: string }> {
2
+ let pdfParse: any;
3
+ try {
4
+ pdfParse = require("pdf-parse");
5
+ } catch {
6
+ throw new Error("PDF support requires pdf-parse. Run: bun add pdf-parse");
7
+ }
8
+
9
+ const buffer = await Bun.file(pdfPath).arrayBuffer();
10
+ const data = await pdfParse(Buffer.from(buffer));
11
+
12
+ const title = data.info?.Title || pdfPath.split("/").pop()?.replace(".pdf", "") || "Untitled";
13
+ return { title, text: data.text };
14
+ }
@@ -0,0 +1,39 @@
1
+ import { readFileSync } from "fs";
2
+
3
+ export async function extractTextFromPptx(filePath: string): Promise<{ title: string; text: string }> {
4
+ // PPTX is a ZIP containing XML files
5
+ const { Decompress } = await import("bun");
6
+ const JSZip = (await import("jszip")).default;
7
+
8
+ const buffer = readFileSync(filePath);
9
+ const zip = await JSZip.loadAsync(buffer);
10
+
11
+ const slides: string[] = [];
12
+
13
+ // Parse each slide XML
14
+ const slideFiles = Object.keys(zip.files)
15
+ .filter((f) => f.match(/^ppt\/slides\/slide\d+\.xml$/))
16
+ .sort((a, b) => {
17
+ const numA = parseInt(a.match(/slide(\d+)/)?.[1] || "0");
18
+ const numB = parseInt(b.match(/slide(\d+)/)?.[1] || "0");
19
+ return numA - numB;
20
+ });
21
+
22
+ for (const slidePath of slideFiles) {
23
+ const xml = await zip.files[slidePath].async("text");
24
+ // Extract text from <a:t> tags
25
+ const texts: string[] = [];
26
+ const regex = /<a:t>([^<]*)<\/a:t>/g;
27
+ let match;
28
+ while ((match = regex.exec(xml))) {
29
+ if (match[1].trim()) texts.push(match[1]);
30
+ }
31
+ if (texts.length) {
32
+ slides.push(texts.join(" "));
33
+ }
34
+ }
35
+
36
+ const title = filePath.split("/").pop()?.replace(/\.pptx?$/i, "") || "Untitled";
37
+ const text = slides.map((s, i) => `Slide ${i + 1}:\n${s}`).join("\n\n");
38
+ return { title, text };
39
+ }
@@ -0,0 +1,77 @@
1
+ import * as cheerio from "cheerio";
2
+
3
// One logical document section: a heading plus the HTML fragments that follow it.
export interface Section {
  level: number;       // heading level (1-4), taken from the hN tag name
  title: string;       // heading text, trimmed
  htmlParts: string[]; // raw HTML of the content elements belonging to this section
}

// Headings that start a new section.
const HEADING_TAGS = new Set(["h1", "h2", "h3", "h4"]);
// Boilerplate/chrome elements whose entire subtree is ignored.
const SKIP_TAGS = new Set(["nav", "header", "footer", "script", "style", "noscript"]);
// Structural wrappers that are descended into rather than captured wholesale.
const CONTAINER_TAGS = new Set([
  "html", "head", "body", "div", "article", "main", "section", "aside", "details", "summary",
]);
14
+
15
+ export async function fetchPage(url: string): Promise<{ title: string; html: string }> {
16
+ const resp = await fetch(url, {
17
+ headers: { "User-Agent": "kiwimu/0.2 (learning wiki builder)" },
18
+ });
19
+ if (!resp.ok) throw new Error(`Failed to fetch ${url}: ${resp.status}`);
20
+ const html = await resp.text();
21
+ const $ = cheerio.load(html);
22
+ const title = $("title").text().trim() || url;
23
+ const body = $("body").html() || html;
24
+ return { title, html: body };
25
+ }
26
+
27
+ export function extractSections(html: string): Section[] {
28
+ const $ = cheerio.load(html, null, false);
29
+ const sections: Section[] = [];
30
+ let current: Section = { level: 1, title: "Introduction", htmlParts: [] };
31
+
32
+ function walk(el: cheerio.AnyNode): void {
33
+ if (el.type === "text") return;
34
+ if (el.type !== "tag") return;
35
+
36
+ const tagName = (el as cheerio.Element).tagName.toLowerCase();
37
+
38
+ if (SKIP_TAGS.has(tagName)) return;
39
+
40
+ if (HEADING_TAGS.has(tagName)) {
41
+ if (current.htmlParts.length > 0) {
42
+ sections.push(current);
43
+ }
44
+ current = {
45
+ level: parseInt(tagName[1]),
46
+ title: $(el).text().trim(),
47
+ htmlParts: [],
48
+ };
49
+ return;
50
+ }
51
+
52
+ if (CONTAINER_TAGS.has(tagName)) {
53
+ for (const child of (el as cheerio.Element).children) {
54
+ walk(child);
55
+ }
56
+ return;
57
+ }
58
+
59
+ // Content element
60
+ const html = $.html(el)?.trim();
61
+ if (html) {
62
+ current.htmlParts.push(html);
63
+ }
64
+ }
65
+
66
+ // Walk root children
67
+ const root = $.root();
68
+ for (const child of root.contents().toArray()) {
69
+ walk(child);
70
+ }
71
+
72
+ if (current.htmlParts.length > 0) {
73
+ sections.push(current);
74
+ }
75
+
76
+ return sections.filter((s) => s.htmlParts.length > 0);
77
+ }
@@ -0,0 +1,177 @@
1
+ import type { LLMConfig } from "./config";
2
+
3
// Token usage tracking
// Cumulative LLM usage counters for the current process.
export interface UsageStats {
  totalCalls: number;       // number of completed chatComplete calls
  promptTokens: number;     // tokens sent as input
  completionTokens: number; // tokens generated by the model
  totalTokens: number;      // provider-reported total
}

// Module-level accumulator; mutated by chatComplete, exposed via getUsageStats().
const _usage: UsageStats = {
  totalCalls: 0,
  promptTokens: 0,
  completionTokens: 0,
  totalTokens: 0,
};

// Active provider configuration; null until setLLMConfig() is called.
let _llmConfig: LLMConfig | null = null;
19
+
20
+ export function setLLMConfig(config: LLMConfig): void {
21
+ _llmConfig = config;
22
+ }
23
+
24
+ export function getLLMConfig(): LLMConfig {
25
+ if (!_llmConfig) throw new Error("LLM config not set. Call setLLMConfig() first.");
26
+ return _llmConfig;
27
+ }
28
+
29
+ export function getUsageStats(): UsageStats {
30
+ return { ..._usage };
31
+ }
32
+
33
+ export function resetUsageStats(): void {
34
+ _usage.totalCalls = 0;
35
+ _usage.promptTokens = 0;
36
+ _usage.completionTokens = 0;
37
+ _usage.totalTokens = 0;
38
+ }
39
+
40
+ export function getEstimatedCost(): number {
41
+ const config = _llmConfig;
42
+ if (!config) return 0;
43
+
44
+ // Pricing per 1M tokens (approximate)
45
+ const pricing: Record<string, { input: number; output: number }> = {
46
+ "gemini": { input: 0.075, output: 0.30 },
47
+ "azure-openai": { input: 0.10, output: 0.40 },
48
+ "openai": { input: 0.15, output: 0.60 },
49
+ "anthropic": { input: 3.00, output: 15.00 },
50
+ };
51
+ const p = pricing[config.provider] || pricing["gemini"];
52
+ return (_usage.promptTokens / 1_000_000) * p.input + (_usage.completionTokens / 1_000_000) * p.output;
53
+ }
54
+
55
// Print a human-readable usage report (Korean labels) to stdout:
// call count, prompt/completion/total tokens, and estimated cost.
export function printUsageSummary(): void {
  const u = _usage;
  const cost = getEstimatedCost();
  // Fall back to "unknown" before any config has been set.
  const provider = _llmConfig?.provider || "unknown";
  const model = _llmConfig?.model || "unknown";

  // \x1b[34m ... \x1b[0m renders the header line in ANSI blue.
  console.log(`\x1b[34m📊 LLM 사용량 (${provider}/${model}):\x1b[0m`);
  console.log(`  호출 횟수: ${u.totalCalls}회`);
  console.log(`  입력 토큰: ${u.promptTokens.toLocaleString()}`);
  console.log(`  출력 토큰: ${u.completionTokens.toLocaleString()}`);
  console.log(`  총 토큰: ${u.totalTokens.toLocaleString()}`);
  console.log(`  예상 비용: ~$${cost.toFixed(4)}`);
}
68
+
69
+ // ── Provider implementations ──
70
+
71
+ async function geminiComplete(system: string, userMessage: string, maxTokens: number): Promise<{ text: string; usage?: any }> {
72
+ const config = getLLMConfig();
73
+ const url = `https://generativelanguage.googleapis.com/v1beta/models/${config.model}:generateContent?key=${config.api_key}`;
74
+
75
+ const resp = await fetch(url, {
76
+ method: "POST",
77
+ headers: { "Content-Type": "application/json" },
78
+ body: JSON.stringify({
79
+ system_instruction: { parts: [{ text: system }] },
80
+ contents: [{ parts: [{ text: userMessage }] }],
81
+ generationConfig: { maxOutputTokens: maxTokens, temperature: 0.7 },
82
+ }),
83
+ });
84
+
85
+ if (!resp.ok) {
86
+ const err = await resp.text();
87
+ throw new Error(`Gemini API error (${resp.status}): ${err.slice(0, 200)}`);
88
+ }
89
+
90
+ const data = await resp.json();
91
+ const text = data.candidates?.[0]?.content?.parts?.[0]?.text || "";
92
+ const usage = data.usageMetadata;
93
+ return {
94
+ text,
95
+ usage: usage ? {
96
+ prompt_tokens: usage.promptTokenCount || 0,
97
+ completion_tokens: usage.candidatesTokenCount || 0,
98
+ total_tokens: usage.totalTokenCount || 0,
99
+ } : undefined,
100
+ };
101
+ }
102
+
103
// Chat completion via Azure OpenAI. Credentials come from config.api_key, or —
// when that is absent — from a key file at ~/keys/openai.azure.com/<model>.json
// whose first array entry supplies { key, endpoint, deployment }.
async function azureOpenAIComplete(system: string, userMessage: string, maxTokens: number): Promise<{ text: string; usage?: any }> {
  const config = getLLMConfig();

  // Try loading from ~/keys/openai.azure.com/ if no api_key in config
  let apiKey = config.api_key;
  let endpoint = config.endpoint;
  let model = config.model;

  if (!apiKey) {
    try {
      const keyFile = `${process.env.HOME}/keys/openai.azure.com/${config.model}.json`;
      const raw = require("fs").readFileSync(keyFile, "utf-8");
      // Key files hold an array of records; only the first is used.
      const keyConfig = JSON.parse(raw)[0];
      apiKey = keyConfig.key;
      // The stored endpoint includes an /openai/... path suffix; keep only the base URL.
      endpoint = keyConfig.endpoint.split("/openai/")[0];
      model = keyConfig.deployment;
    } catch {
      // Any failure (missing file, bad JSON, missing fields) is reported uniformly.
      throw new Error("Azure OpenAI API key not configured");
    }
  }

  // Lazy import keeps the openai package optional for non-Azure providers.
  const { AzureOpenAI } = await import("openai");
  const client = new AzureOpenAI({ endpoint, apiKey, deployment: model, apiVersion: "2024-12-01-preview" });

  const resp = await client.chat.completions.create({
    model,
    max_completion_tokens: maxTokens,
    messages: [
      { role: "system", content: system },
      { role: "user", content: userMessage },
    ],
  });

  // Normalize usage to the snake_case shape shared with the other providers.
  return {
    text: resp.choices[0]?.message?.content || "",
    usage: resp.usage ? {
      prompt_tokens: resp.usage.prompt_tokens || 0,
      completion_tokens: resp.usage.completion_tokens || 0,
      total_tokens: resp.usage.total_tokens || 0,
    } : undefined,
  };
}
145
+
146
+ // ── Main interface ──
147
+
148
+ export async function chatComplete(
149
+ system: string,
150
+ userMessage: string,
151
+ maxTokens = 8192
152
+ ): Promise<string> {
153
+ const config = getLLMConfig();
154
+
155
+ let result: { text: string; usage?: any };
156
+
157
+ switch (config.provider) {
158
+ case "gemini":
159
+ result = await geminiComplete(system, userMessage, maxTokens);
160
+ break;
161
+ case "azure-openai":
162
+ result = await azureOpenAIComplete(system, userMessage, maxTokens);
163
+ break;
164
+ default:
165
+ throw new Error(`Unknown LLM provider: ${config.provider}`);
166
+ }
167
+
168
+ // Track usage
169
+ if (result.usage) {
170
+ _usage.totalCalls++;
171
+ _usage.promptTokens += result.usage.prompt_tokens || 0;
172
+ _usage.completionTokens += result.usage.completion_tokens || 0;
173
+ _usage.totalTokens += result.usage.total_tokens || 0;
174
+ }
175
+
176
+ return result.text;
177
+ }
@@ -0,0 +1,63 @@
1
+ import TurndownService from "turndown";
2
+ import type { Section } from "../ingest/web";
3
+ import type { Store } from "../store";
4
+
5
// Shared HTML -> Markdown converter: ATX headings (#, ##, ...), with
// script/style elements removed from the output entirely.
const turndown = new TurndownService({ headingStyle: "atx" });
turndown.remove(["script", "style"]);
7
+
8
+ export function slugify(text: string): string {
9
+ return text
10
+ .normalize("NFKD")
11
+ .toLowerCase()
12
+ .trim()
13
+ .replace(/[^\w\s-]/g, "")
14
+ .replace(/[-\s]+/g, "-")
15
+ .replace(/^-|-$/g, "")
16
+ .slice(0, 80);
17
+ }
18
+
19
// Generic boilerplate headings that rarely deserve their own wiki page;
// chunkSections demands extra content before promoting one of these.
const STOP_TITLES = new Set([
  "introduction", "overview", "summary", "conclusion", "references",
  "bibliography", "appendix", "abstract", "preface", "contents",
  "table of contents", "index", "acknowledgments", "notes",
]);
24
+
25
+ export function cleanTitle(title: string): string {
26
+ return title
27
+ .replace(/^\s*(Chapter\s+)?\d+(\.\d+)*\s*/i, "")
28
+ .replace(/\s+/g, " ")
29
+ .trim();
30
+ }
31
+
32
+ export function chunkSections(sections: Section[], sourceId: number, store: Store, minWords = 30): number {
33
+ let count = 0;
34
+
35
+ for (const section of sections) {
36
+ const title = cleanTitle(section.title);
37
+ if (!title) continue;
38
+
39
+ const slug = slugify(title);
40
+ if (!slug) continue;
41
+
42
+ const htmlContent = section.htmlParts.join("\n");
43
+ if (!htmlContent.trim()) continue;
44
+
45
+ const content = turndown.turndown(htmlContent).trim();
46
+ const wordCount = content.split(/\s+/).length;
47
+
48
+ if (wordCount < minWords) continue;
49
+ if (STOP_TITLES.has(slug) || STOP_TITLES.has(title.toLowerCase())) {
50
+ if (wordCount < 100) continue;
51
+ }
52
+
53
+ const existing = store.getPage(slug);
54
+ if (existing) {
55
+ store.updatePageContent(existing.id, existing.content + "\n\n" + content);
56
+ } else {
57
+ store.addPage(slug, title, content, sourceId, slug);
58
+ count++;
59
+ }
60
+ }
61
+
62
+ return count;
63
+ }
@@ -0,0 +1,35 @@
1
+ import type { Store } from "../store";
2
+
3
// Shape consumed by the graph visualization: nodes keyed by page slug,
// links referencing those slugs as source/target.
export interface GraphData {
  nodes: Array<{ id: string; title: string; degree: number; type: string }>;
  links: Array<{ source: string; target: string }>;
}
7
+
8
+ export function buildGraphData(store: Store): GraphData {
9
+ const pages = store.listPages();
10
+ const links = store.getAllLinks();
11
+
12
+ const degree = new Map<number, number>();
13
+ for (const page of pages) degree.set(page.id, 0);
14
+ for (const link of links) {
15
+ degree.set(link.from_page_id, (degree.get(link.from_page_id) || 0) + 1);
16
+ degree.set(link.to_page_id, (degree.get(link.to_page_id) || 0) + 1);
17
+ }
18
+
19
+ const slugMap = new Map(pages.map((p) => [p.id, p.slug]));
20
+
21
+ return {
22
+ nodes: pages.map((p) => ({
23
+ id: p.slug,
24
+ title: p.title,
25
+ degree: degree.get(p.id) || 0,
26
+ type: p.page_type,
27
+ })),
28
+ links: links
29
+ .filter((l) => slugMap.has(l.from_page_id) && slugMap.has(l.to_page_id))
30
+ .map((l) => ({
31
+ source: slugMap.get(l.from_page_id)!,
32
+ target: slugMap.get(l.to_page_id)!,
33
+ })),
34
+ };
35
+ }
@@ -0,0 +1,49 @@
1
+ import type { Store, Page } from "../store";
2
+
3
+ export function autoLinkPages(store: Store): number {
4
+ const pages = store.listPages();
5
+ if (!pages.length) return 0;
6
+
7
+ store.clearLinks();
8
+ let totalLinks = 0;
9
+
10
+ // Sort targets by title length descending (longest match first)
11
+ const targets = [...pages].sort((a, b) => b.title.length - a.title.length);
12
+
13
+ // Precompile patterns
14
+ const patterns: Array<{ regex: RegExp; page: Page }> = [];
15
+ for (const target of targets) {
16
+ if (target.title.length < 3) continue;
17
+ const escaped = target.title.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
18
+ patterns.push({
19
+ regex: new RegExp(`(?<!\\[)(?<!\\w)(${escaped})(?!\\w)(?!\\])`, "i"),
20
+ page: target,
21
+ });
22
+ }
23
+
24
+ for (const page of pages) {
25
+ let content = page.content;
26
+ const linkedSlugs = new Set<string>();
27
+
28
+ for (const { regex, page: target } of patterns) {
29
+ if (target.id === page.id) continue;
30
+ if (linkedSlugs.has(target.slug)) continue;
31
+
32
+ const match = regex.exec(content);
33
+ if (match) {
34
+ const matched = match[1];
35
+ const replacement = `[${matched}](/wiki/${target.slug})`;
36
+ content = content.slice(0, match.index) + replacement + content.slice(match.index + match[0].length);
37
+ linkedSlugs.add(target.slug);
38
+ store.addLink(page.id, target.id, matched);
39
+ totalLinks++;
40
+ }
41
+ }
42
+
43
+ if (linkedSlugs.size > 0) {
44
+ store.updatePageContent(page.id, content);
45
+ }
46
+ }
47
+
48
+ return totalLinks;
49
+ }