@kibhq/core 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +40 -0
- package/src/compile/backlinks.test.ts +112 -0
- package/src/compile/backlinks.ts +80 -0
- package/src/compile/cache.test.ts +126 -0
- package/src/compile/cache.ts +125 -0
- package/src/compile/compiler.test.ts +278 -0
- package/src/compile/compiler.ts +305 -0
- package/src/compile/diff.test.ts +164 -0
- package/src/compile/diff.ts +121 -0
- package/src/compile/index-manager.test.ts +227 -0
- package/src/compile/index-manager.ts +148 -0
- package/src/compile/prompts.ts +124 -0
- package/src/constants.ts +40 -0
- package/src/errors.ts +66 -0
- package/src/hash.test.ts +21 -0
- package/src/hash.ts +24 -0
- package/src/index.ts +22 -0
- package/src/ingest/extractors/file.test.ts +129 -0
- package/src/ingest/extractors/file.ts +136 -0
- package/src/ingest/extractors/github.test.ts +47 -0
- package/src/ingest/extractors/github.ts +135 -0
- package/src/ingest/extractors/interface.ts +26 -0
- package/src/ingest/extractors/pdf.ts +130 -0
- package/src/ingest/extractors/web.test.ts +242 -0
- package/src/ingest/extractors/web.ts +163 -0
- package/src/ingest/extractors/youtube.test.ts +44 -0
- package/src/ingest/extractors/youtube.ts +166 -0
- package/src/ingest/ingest.test.ts +187 -0
- package/src/ingest/ingest.ts +179 -0
- package/src/ingest/normalize.test.ts +120 -0
- package/src/ingest/normalize.ts +83 -0
- package/src/ingest/router.test.ts +154 -0
- package/src/ingest/router.ts +119 -0
- package/src/lint/lint.test.ts +253 -0
- package/src/lint/lint.ts +43 -0
- package/src/lint/rules.ts +178 -0
- package/src/providers/anthropic.ts +107 -0
- package/src/providers/index.ts +4 -0
- package/src/providers/ollama.ts +101 -0
- package/src/providers/openai.ts +67 -0
- package/src/providers/router.ts +62 -0
- package/src/query/query.test.ts +165 -0
- package/src/query/query.ts +136 -0
- package/src/schemas.ts +193 -0
- package/src/search/engine.test.ts +230 -0
- package/src/search/engine.ts +390 -0
- package/src/skills/loader.ts +163 -0
- package/src/skills/runner.ts +139 -0
- package/src/skills/schema.ts +28 -0
- package/src/skills/skills.test.ts +134 -0
- package/src/types.ts +136 -0
- package/src/vault.test.ts +141 -0
- package/src/vault.ts +251 -0
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
import { describe, expect, test } from "bun:test";
|
|
2
|
+
import { extractVideoId } from "./youtube.js";
|
|
3
|
+
|
|
4
|
+
// Unit tests for extractVideoId — cover every supported YouTube URL shape
// plus negative cases (non-YouTube URLs, YouTube URLs with no video ID).
describe("youtube extractor", () => {
  describe("extractVideoId", () => {
    test("extracts from standard watch URL", () => {
      expect(extractVideoId("https://www.youtube.com/watch?v=dQw4w9WgXcQ")).toBe("dQw4w9WgXcQ");
    });

    test("extracts from short URL", () => {
      expect(extractVideoId("https://youtu.be/dQw4w9WgXcQ")).toBe("dQw4w9WgXcQ");
    });

    test("extracts from embed URL", () => {
      expect(extractVideoId("https://www.youtube.com/embed/dQw4w9WgXcQ")).toBe("dQw4w9WgXcQ");
    });

    test("extracts from URL with extra params", () => {
      // Extra query parameters (e.g. a timestamp) must not confuse parsing.
      expect(extractVideoId("https://www.youtube.com/watch?v=dQw4w9WgXcQ&t=120")).toBe(
        "dQw4w9WgXcQ",
      );
    });

    test("extracts from URL without www", () => {
      expect(extractVideoId("https://youtube.com/watch?v=dQw4w9WgXcQ")).toBe("dQw4w9WgXcQ");
    });

    test("extracts from mobile URL", () => {
      expect(extractVideoId("https://m.youtube.com/watch?v=dQw4w9WgXcQ")).toBe("dQw4w9WgXcQ");
    });

    test("returns null for invalid URL", () => {
      expect(extractVideoId("https://example.com")).toBeNull();
    });

    test("returns null for YouTube URL without video ID", () => {
      // Channel pages live on youtube.com but carry no 11-char video ID.
      expect(extractVideoId("https://www.youtube.com/channel/UCxyz")).toBeNull();
    });

    test("handles whitespace", () => {
      // Input is trimmed before parsing.
      expect(extractVideoId(" https://youtu.be/dQw4w9WgXcQ ")).toBe("dQw4w9WgXcQ");
    });
  });
});
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
import type { ExtractOptions, Extractor, ExtractResult } from "./interface.js";
|
|
2
|
+
|
|
3
|
+
export function createYoutubeExtractor(): Extractor {
|
|
4
|
+
return {
|
|
5
|
+
type: "youtube",
|
|
6
|
+
|
|
7
|
+
async extract(url: string, options?: ExtractOptions): Promise<ExtractResult> {
|
|
8
|
+
const videoId = extractVideoId(url);
|
|
9
|
+
if (!videoId) {
|
|
10
|
+
throw new Error(`Could not extract video ID from URL: ${url}`);
|
|
11
|
+
}
|
|
12
|
+
|
|
13
|
+
// Fetch video page to get title and metadata
|
|
14
|
+
const pageData = await fetchVideoPage(videoId);
|
|
15
|
+
|
|
16
|
+
// Attempt to fetch transcript
|
|
17
|
+
let transcript: string | null = null;
|
|
18
|
+
try {
|
|
19
|
+
transcript = await fetchTranscript(videoId);
|
|
20
|
+
} catch {
|
|
21
|
+
// Transcript not available — fall back to description
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
const title = options?.title ?? pageData.title ?? `YouTube Video ${videoId}`;
|
|
25
|
+
|
|
26
|
+
let content: string;
|
|
27
|
+
if (transcript) {
|
|
28
|
+
content = `# ${title}\n\n**Source:** https://www.youtube.com/watch?v=${videoId}\n\n## Transcript\n\n${transcript}`;
|
|
29
|
+
} else if (pageData.description) {
|
|
30
|
+
content = `# ${title}\n\n**Source:** https://www.youtube.com/watch?v=${videoId}\n\n## Description\n\n${pageData.description}\n\n*Note: Transcript was not available for this video.*`;
|
|
31
|
+
} else {
|
|
32
|
+
content = `# ${title}\n\n**Source:** https://www.youtube.com/watch?v=${videoId}\n\n*No transcript or description available.*`;
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
return {
|
|
36
|
+
title,
|
|
37
|
+
content,
|
|
38
|
+
metadata: {
|
|
39
|
+
videoId,
|
|
40
|
+
channelName: pageData.channelName,
|
|
41
|
+
url: `https://www.youtube.com/watch?v=${videoId}`,
|
|
42
|
+
hasTranscript: transcript !== null,
|
|
43
|
+
},
|
|
44
|
+
};
|
|
45
|
+
},
|
|
46
|
+
};
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
export function extractVideoId(url: string): string | null {
|
|
50
|
+
const trimmed = url.trim();
|
|
51
|
+
|
|
52
|
+
// youtu.be/VIDEO_ID
|
|
53
|
+
const shortMatch = trimmed.match(/youtu\.be\/([a-zA-Z0-9_-]{11})/);
|
|
54
|
+
if (shortMatch) return shortMatch[1]!;
|
|
55
|
+
|
|
56
|
+
// youtube.com/watch?v=VIDEO_ID
|
|
57
|
+
try {
|
|
58
|
+
const parsed = new URL(trimmed);
|
|
59
|
+
const v = parsed.searchParams.get("v");
|
|
60
|
+
if (v && v.length === 11) return v;
|
|
61
|
+
} catch {
|
|
62
|
+
// Not a valid URL
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
// youtube.com/embed/VIDEO_ID
|
|
66
|
+
const embedMatch = trimmed.match(/youtube\.com\/embed\/([a-zA-Z0-9_-]{11})/);
|
|
67
|
+
if (embedMatch) return embedMatch[1]!;
|
|
68
|
+
|
|
69
|
+
return null;
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
// Minimal metadata scraped for a video; each field is null when unavailable.
interface VideoPageData {
  // Video title, or null if the oembed lookup failed.
  title: string | null;
  // Always null today: the oembed endpoint does not expose descriptions.
  description: string | null;
  // Uploading channel's display name (oembed `author_name`).
  channelName: string | null;
}
|
|
77
|
+
|
|
78
|
+
async function fetchVideoPage(videoId: string): Promise<VideoPageData> {
|
|
79
|
+
// Use oembed API — no auth needed, returns JSON
|
|
80
|
+
try {
|
|
81
|
+
const response = await fetch(
|
|
82
|
+
`https://www.youtube.com/oembed?url=https://www.youtube.com/watch?v=${videoId}&format=json`,
|
|
83
|
+
);
|
|
84
|
+
if (response.ok) {
|
|
85
|
+
const data = (await response.json()) as any;
|
|
86
|
+
return {
|
|
87
|
+
title: data.title ?? null,
|
|
88
|
+
description: null, // oembed doesn't include description
|
|
89
|
+
channelName: data.author_name ?? null,
|
|
90
|
+
};
|
|
91
|
+
}
|
|
92
|
+
} catch {
|
|
93
|
+
// Fallback
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
return { title: null, description: null, channelName: null };
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
/**
 * Fetch the caption track for a video and flatten it to plain text.
 *
 * Scrapes the watch-page HTML for the inline `captionTracks` config, takes
 * the first track's `baseUrl`, downloads its timed-text XML, and converts
 * it to paragraphs. Throws on any failure (no captions, HTTP error) — the
 * caller interprets a throw as "transcript unavailable".
 *
 * @param videoId - 11-character YouTube video ID
 * @returns the transcript as paragraph-joined plain text
 * @throws when the page fetch fails, no captions exist, or the track
 *         URL/XML cannot be retrieved
 */
async function fetchTranscript(videoId: string): Promise<string> {
  // Fetch the video page to get the captions track URL. A desktop UA plus
  // an English Accept-Language header is sent so YouTube serves the page
  // variant that embeds the caption-track config.
  const response = await fetch(`https://www.youtube.com/watch?v=${videoId}`, {
    headers: {
      "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
      "Accept-Language": "en-US,en;q=0.9",
    },
  });

  if (!response.ok) {
    throw new Error(`Failed to fetch video page: ${response.status}`);
  }

  const html = await response.text();

  // Extract captions data from the page.
  // NOTE(review): the lazy `.*?` expands until the first `}]` — a literal
  // `}]` inside one of the track's string fields would truncate the JSON
  // and make the parse below throw. Confirm this never occurs in practice.
  const captionMatch = html.match(/"captionTracks":\[(\{.*?\})\]/);
  if (!captionMatch) {
    throw new Error("No captions available");
  }

  // Parse the first caption track URL
  const trackData = JSON.parse(`[${captionMatch[1]}]`);
  const track = trackData[0];
  if (!track?.baseUrl) {
    throw new Error("No caption track URL found");
  }

  // Fetch the transcript XML
  const transcriptResponse = await fetch(track.baseUrl);
  if (!transcriptResponse.ok) {
    throw new Error("Failed to fetch transcript");
  }

  const xml = await transcriptResponse.text();

  // Parse XML transcript into plain text
  return parseTranscriptXml(xml);
}
|
|
138
|
+
|
|
139
|
+
function parseTranscriptXml(xml: string): string {
|
|
140
|
+
const lines: string[] = [];
|
|
141
|
+
const textRegex = /<text[^>]*>([\s\S]*?)<\/text>/g;
|
|
142
|
+
let match: RegExpExecArray | null;
|
|
143
|
+
|
|
144
|
+
while ((match = textRegex.exec(xml)) !== null) {
|
|
145
|
+
const text = match[1]!
|
|
146
|
+
.replace(/&/g, "&")
|
|
147
|
+
.replace(/</g, "<")
|
|
148
|
+
.replace(/>/g, ">")
|
|
149
|
+
.replace(/"/g, '"')
|
|
150
|
+
.replace(/'/g, "'")
|
|
151
|
+
.replace(/<[^>]+>/g, "") // strip any HTML tags
|
|
152
|
+
.trim();
|
|
153
|
+
|
|
154
|
+
if (text) {
|
|
155
|
+
lines.push(text);
|
|
156
|
+
}
|
|
157
|
+
}
|
|
158
|
+
|
|
159
|
+
// Join into paragraphs — group ~5 lines together
|
|
160
|
+
const paragraphs: string[] = [];
|
|
161
|
+
for (let i = 0; i < lines.length; i += 5) {
|
|
162
|
+
paragraphs.push(lines.slice(i, i + 5).join(" "));
|
|
163
|
+
}
|
|
164
|
+
|
|
165
|
+
return paragraphs.join("\n\n");
|
|
166
|
+
}
|
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
import { afterEach, describe, expect, test } from "bun:test";
|
|
2
|
+
import { existsSync } from "node:fs";
|
|
3
|
+
import { mkdtemp, rm, writeFile } from "node:fs/promises";
|
|
4
|
+
import { tmpdir } from "node:os";
|
|
5
|
+
import { join } from "node:path";
|
|
6
|
+
import { initVault, loadManifest } from "../vault.js";
|
|
7
|
+
import { ingestSource } from "./ingest.js";
|
|
8
|
+
|
|
9
|
+
// Shared temp-vault path; each test creates its own vault and the
// afterEach hook removes it.
let tempDir: string;

afterEach(async () => {
  // `force: true` makes cleanup a no-op if the test never created a vault.
  if (tempDir) await rm(tempDir, { recursive: true, force: true });
});
|
|
14
|
+
|
|
15
|
+
// Create a fresh vault in a unique temp directory; remembers the path in
// `tempDir` so afterEach can clean it up.
async function makeTempVault() {
  tempDir = await mkdtemp(join(tmpdir(), "kib-ingest-test-"));
  await initVault(tempDir, { name: "test" });
  return tempDir;
}
|
|
20
|
+
|
|
21
|
+
// Integration tests for ingestSource — each test runs against a fresh temp
// vault and asserts on the returned IngestResult plus the on-disk manifest.
describe("ingestSource", () => {
  test("ingests a local markdown file", async () => {
    const root = await makeTempVault();

    // Create a test file
    const testFile = join(root, "test-article.md");
    await writeFile(testFile, "# Test Article\n\nThis is some test content for ingestion.");

    const result = await ingestSource(root, testFile);

    expect(result.skipped).toBe(false);
    expect(result.sourceType).toBe("file");
    expect(result.title).toBe("Test Article");
    expect(result.wordCount).toBeGreaterThan(0);
    expect(result.path).toMatch(/^raw\/articles\//);

    // Verify file was written
    const rawPath = join(root, result.path);
    expect(existsSync(rawPath)).toBe(true);

    // Verify manifest was updated
    const manifest = await loadManifest(root);
    expect(manifest.stats.totalSources).toBe(1);
    expect(Object.keys(manifest.sources)).toHaveLength(1);

    const source = Object.values(manifest.sources)[0]!;
    expect(source.sourceType).toBe("file");
    expect(source.lastCompiled).toBeNull();
    expect(source.metadata.title).toBe("Test Article");
  });

  test("ingests a local text file", async () => {
    const root = await makeTempVault();

    const testFile = join(root, "notes.txt");
    await writeFile(testFile, "Some plain text notes about a topic.");

    const result = await ingestSource(root, testFile);

    expect(result.skipped).toBe(false);
    expect(result.sourceType).toBe("file");
  });

  test("deduplicates identical content", async () => {
    const root = await makeTempVault();

    const testFile = join(root, "article.md");
    await writeFile(testFile, "# Unique Content\n\nThis exact content should only appear once.");

    const result1 = await ingestSource(root, testFile);
    expect(result1.skipped).toBe(false);

    // Second ingest of byte-identical content must be skipped (same hash).
    const result2 = await ingestSource(root, testFile);
    expect(result2.skipped).toBe(true);
    expect(result2.skipReason).toContain("Duplicate");

    // Manifest should still have only 1 source
    const manifest = await loadManifest(root);
    expect(manifest.stats.totalSources).toBe(1);
  });

  test("allows different content even from same path", async () => {
    const root = await makeTempVault();

    const testFile = join(root, "article.md");
    await writeFile(testFile, "# Version 1\n\nOriginal content.");
    const result1 = await ingestSource(root, testFile);
    expect(result1.skipped).toBe(false);

    // Same file path, new content hash — treated as a brand-new source.
    await writeFile(testFile, "# Version 2\n\nUpdated content that is different.");
    const result2 = await ingestSource(root, testFile);
    expect(result2.skipped).toBe(false);

    const manifest = await loadManifest(root);
    expect(manifest.stats.totalSources).toBe(2);
  });

  test("uses custom category when specified", async () => {
    const root = await makeTempVault();

    const testFile = join(root, "notes.md");
    await writeFile(testFile, "# Notes\n\nContent.");

    const result = await ingestSource(root, testFile, { category: "papers" });
    expect(result.path).toMatch(/^raw\/papers\//);
  });

  test("uses custom title when specified", async () => {
    const root = await makeTempVault();

    const testFile = join(root, "data.md");
    await writeFile(testFile, "Some data.");

    const result = await ingestSource(root, testFile, { title: "My Custom Title" });
    expect(result.title).toBe("My Custom Title");
  });

  test("routes PDF files to papers category", async () => {
    const root = await makeTempVault();

    // We can't easily test actual PDF extraction without a real PDF,
    // but we can verify the source type detection routes correctly
    // by using a .md file with forced sourceType
    const testFile = join(root, "test.md");
    await writeFile(testFile, "# PDF Content\n\nExtracted from a PDF.");

    const result = await ingestSource(root, testFile, { sourceType: "file", category: "papers" });
    expect(result.path).toMatch(/^raw\/papers\//);
  });

  test("ingests multiple sources and tracks them all", async () => {
    const root = await makeTempVault();

    const file1 = join(root, "first.md");
    const file2 = join(root, "second.md");
    const file3 = join(root, "third.md");
    await writeFile(file1, "# First Article\n\nContent one.");
    await writeFile(file2, "# Second Article\n\nContent two.");
    await writeFile(file3, "# Third Article\n\nContent three.");

    await ingestSource(root, file1);
    await ingestSource(root, file2);
    await ingestSource(root, file3);

    const manifest = await loadManifest(root);
    expect(manifest.stats.totalSources).toBe(3);
    expect(Object.keys(manifest.sources)).toHaveLength(3);

    // All sources should be pending compilation
    const pending = Object.values(manifest.sources).filter((s) => s.lastCompiled === null);
    expect(pending).toHaveLength(3);
  });

  test("ingests code files wrapped in code blocks", async () => {
    const root = await makeTempVault();

    const testFile = join(root, "example.ts");
    await writeFile(testFile, "const greeting = 'hello world';\nconsole.log(greeting);");

    const result = await ingestSource(root, testFile);

    expect(result.skipped).toBe(false);
    expect(result.sourceType).toBe("file");

    // Read the raw file and verify it contains code block
    const { readRaw } = await import("../vault.js");
    // The path is raw/articles/example.md, we need to strip "raw/" prefix
    const rawContent = await readRaw(root, result.path.replace(/^raw\//, ""));
    expect(rawContent).toContain("```typescript");
  });

  test("normalized content includes frontmatter", async () => {
    const root = await makeTempVault();

    const testFile = join(root, "article.md");
    await writeFile(testFile, "# My Great Article\n\nAmazing content here.");

    const result = await ingestSource(root, testFile);

    const { readRaw } = await import("../vault.js");
    const rawContent = await readRaw(root, result.path.replace(/^raw\//, ""));
    expect(rawContent).toContain("---");
    expect(rawContent).toContain('title: "My Great Article"');
    expect(rawContent).toContain("source_type: file");
    expect(rawContent).toContain("word_count:");
  });
});
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
import { basename, extname, resolve } from "node:path";
|
|
2
|
+
import { hash } from "../hash.js";
|
|
3
|
+
import type { IngestResult, Manifest, SourceEntry, SourceType } from "../types.js";
|
|
4
|
+
import { loadManifest, saveManifest, writeRaw } from "../vault.js";
|
|
5
|
+
import type { Extractor } from "./extractors/interface.js";
|
|
6
|
+
import { countWords, normalizeSource, slugify } from "./normalize.js";
|
|
7
|
+
import { detectSourceType } from "./router.js";
|
|
8
|
+
|
|
9
|
+
/** Options for {@link ingestSource}. All fields are optional overrides. */
interface IngestOptions {
  /** Override the detected source type */
  sourceType?: SourceType;
  /** Override category (raw/ subdirectory) */
  category?: string;
  /** Additional tags */
  tags?: string[];
  /** Custom title */
  title?: string;
}
|
|
19
|
+
|
|
20
|
+
/**
 * Ingest a single source into the vault.
 *
 * 1. Detect source type
 * 2. Route to the correct extractor
 * 3. Extract content
 * 4. Hash content (dedup check)
 * 5. Normalize with frontmatter
 * 6. Write to raw/
 * 7. Update manifest
 *
 * @param root - vault root directory
 * @param uri - local file path or http(s) URL to ingest
 * @param options - optional overrides (source type, category, tags, title)
 * @returns summary of the ingest; `skipped: true` on duplicate content
 */
export async function ingestSource(
  root: string,
  uri: string,
  options: IngestOptions = {},
): Promise<IngestResult> {
  const sourceType = options.sourceType ?? detectSourceType(uri);

  // Get the extractor for this source type
  const extractor = await getExtractor(sourceType);

  // Extract content
  const extracted = await extractor.extract(uri, { title: options.title, tags: options.tags });

  // Hash the extracted content for dedup
  const contentHash = await hash(extracted.content);

  // Load manifest and check for duplicates
  const manifest = await loadManifest(root);

  // Check if we already have this exact content
  const existingSource = findExistingSource(manifest, uri, contentHash);
  if (existingSource) {
    // NOTE(review): `existingSource.path` comes from producedArticles[0],
    // which stays empty until compilation — so `path` here is usually "".
    // Confirm callers of the skipped result don't rely on it.
    return {
      sourceId: existingSource.id,
      path: existingSource.path,
      sourceType,
      title: extracted.title,
      wordCount: countWords(extracted.content),
      skipped: true,
      skipReason: "Duplicate content (same hash already ingested)",
    };
  }

  // Normalize content with frontmatter
  const normalizedContent = normalizeSource({
    title: extracted.title,
    content: extracted.content,
    sourceType,
    originalUrl: isUrl(uri) ? uri : undefined,
    metadata: extracted.metadata,
  });

  // Determine file path within raw/
  const category = options.category ?? categoryForType(sourceType);
  const slug = slugify(extracted.title);
  // NOTE(review): two different sources with the same title slug write to
  // the same raw/ path and silently overwrite — confirm this is intended.
  const relativePath = `${category}/${slug}.md`;

  // Write to raw/
  await writeRaw(root, relativePath, normalizedContent);

  // Generate a source ID (stable: derived from the content hash)
  const sourceId = `src_${contentHash.slice(0, 12)}`;

  // Update manifest
  const now = new Date().toISOString();
  const wordCount = countWords(extracted.content);

  const sourceEntry: SourceEntry = {
    hash: contentHash,
    ingestedAt: now,
    lastCompiled: null, // pending compilation
    sourceType,
    originalUrl: isUrl(uri) ? uri : undefined,
    producedArticles: [],
    metadata: {
      title: extracted.title,
      author: extracted.metadata.author as string | undefined,
      date: extracted.metadata.date as string | undefined,
      wordCount,
    },
  };

  manifest.sources[sourceId] = sourceEntry;
  manifest.stats.totalSources = Object.keys(manifest.sources).length;

  await saveManifest(root, manifest);

  return {
    sourceId,
    path: `raw/${relativePath}`,
    sourceType,
    title: extracted.title,
    wordCount,
    skipped: false,
  };
}
|
|
117
|
+
|
|
118
|
+
async function getExtractor(sourceType: SourceType): Promise<Extractor> {
|
|
119
|
+
switch (sourceType) {
|
|
120
|
+
case "web": {
|
|
121
|
+
const { createWebExtractor } = await import("./extractors/web.js");
|
|
122
|
+
return createWebExtractor();
|
|
123
|
+
}
|
|
124
|
+
case "pdf": {
|
|
125
|
+
const { createPdfExtractor } = await import("./extractors/pdf.js");
|
|
126
|
+
return createPdfExtractor();
|
|
127
|
+
}
|
|
128
|
+
case "youtube": {
|
|
129
|
+
const { createYoutubeExtractor } = await import("./extractors/youtube.js");
|
|
130
|
+
return createYoutubeExtractor();
|
|
131
|
+
}
|
|
132
|
+
case "github": {
|
|
133
|
+
const { createGithubExtractor } = await import("./extractors/github.js");
|
|
134
|
+
return createGithubExtractor();
|
|
135
|
+
}
|
|
136
|
+
case "file": {
|
|
137
|
+
const { createFileExtractor } = await import("./extractors/file.js");
|
|
138
|
+
return createFileExtractor();
|
|
139
|
+
}
|
|
140
|
+
case "image":
|
|
141
|
+
throw new Error("Image extraction requires vision model support (coming soon)");
|
|
142
|
+
default:
|
|
143
|
+
throw new Error(`Unsupported source type: ${sourceType}`);
|
|
144
|
+
}
|
|
145
|
+
}
|
|
146
|
+
|
|
147
|
+
function categoryForType(sourceType: SourceType): string {
|
|
148
|
+
switch (sourceType) {
|
|
149
|
+
case "pdf":
|
|
150
|
+
return "papers";
|
|
151
|
+
case "youtube":
|
|
152
|
+
return "transcripts";
|
|
153
|
+
case "github":
|
|
154
|
+
return "repos";
|
|
155
|
+
case "image":
|
|
156
|
+
return "images";
|
|
157
|
+
default:
|
|
158
|
+
return "articles";
|
|
159
|
+
}
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
function findExistingSource(
|
|
163
|
+
manifest: Manifest,
|
|
164
|
+
uri: string,
|
|
165
|
+
contentHash: string,
|
|
166
|
+
): { id: string; path: string } | null {
|
|
167
|
+
for (const [id, source] of Object.entries(manifest.sources)) {
|
|
168
|
+
// Same content hash = same content regardless of URL
|
|
169
|
+
if (source.hash === contentHash) {
|
|
170
|
+
return { id, path: source.producedArticles[0] ?? "" };
|
|
171
|
+
}
|
|
172
|
+
// Same URL but different hash = content changed, allow re-ingest
|
|
173
|
+
}
|
|
174
|
+
return null;
|
|
175
|
+
}
|
|
176
|
+
|
|
177
|
+
function isUrl(str: string): boolean {
|
|
178
|
+
return str.startsWith("http://") || str.startsWith("https://");
|
|
179
|
+
}
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
import { describe, expect, test } from "bun:test";
|
|
2
|
+
import { countWords, normalizeSource, slugify } from "./normalize.js";
|
|
3
|
+
|
|
4
|
+
// slugify turns an arbitrary title into a filesystem-safe kebab-case slug.
describe("slugify", () => {
  test("converts title to kebab-case", () => {
    expect(slugify("Transformer Architecture")).toBe("transformer-architecture");
  });

  test("strips special characters", () => {
    expect(slugify("What's New in React 19?")).toBe("whats-in-react-19".length ? "whats-new-in-react-19" : "");
  });

  test("collapses multiple dashes", () => {
    expect(slugify("foo -- bar")).toBe("foo-bar");
  });

  test("strips leading/trailing dashes", () => {
    expect(slugify("--hello--")).toBe("hello");
  });

  test("truncates at 80 chars", () => {
    const long = "a".repeat(100);
    expect(slugify(long).length).toBeLessThanOrEqual(80);
  });

  test("handles empty string", () => {
    expect(slugify("")).toBe("");
  });

  test("handles unicode by stripping", () => {
    expect(slugify("Vaswani et al. (2017)")).toBe("vaswani-et-al-2017");
  });
});
|
|
34
|
+
|
|
35
|
+
// countWords counts prose words only: fenced code blocks, inline code, and
// markdown syntax are excluded from the count.
describe("countWords", () => {
  test("counts plain text words", () => {
    expect(countWords("hello world foo bar")).toBe(4);
  });

  test("ignores code blocks", () => {
    const text = "before\n```\nconst x = 1;\n```\nafter";
    expect(countWords(text)).toBe(2); // "before" and "after"
  });

  test("ignores inline code", () => {
    // `const` is inline code, leaving "use", "to", "declare".
    expect(countWords("use `const` to declare")).toBe(3);
  });

  test("strips markdown syntax", () => {
    expect(countWords("# Hello **World**")).toBe(2);
  });

  test("handles empty string", () => {
    expect(countWords("")).toBe(0);
  });

  test("handles whitespace only", () => {
    expect(countWords(" \n\n ")).toBe(0);
  });
});
|
|
61
|
+
|
|
62
|
+
// normalizeSource wraps extracted content in YAML frontmatter and cleans
// whitespace (CRLF → LF, collapsed blank lines).
describe("normalizeSource", () => {
  test("generates frontmatter with required fields", () => {
    const result = normalizeSource({
      title: "Test Article",
      content: "# Test\n\nSome content here with words.",
      sourceType: "web",
      originalUrl: "https://example.com",
    });

    expect(result).toContain('title: "Test Article"');
    expect(result).toContain("source_type: web");
    expect(result).toContain('url: "https://example.com"');
    expect(result).toContain("word_count:");
    expect(result).toContain("ingested:");
  });

  test("includes author and date when present", () => {
    const result = normalizeSource({
      title: "Paper",
      content: "Content.",
      sourceType: "pdf",
      metadata: { author: "John Doe", date: "2024-01-01" },
    });

    expect(result).toContain('author: "John Doe"');
    expect(result).toContain('date: "2024-01-01"');
  });

  test("escapes quotes in title", () => {
    // Double quotes inside a title must be backslash-escaped in the YAML.
    const result = normalizeSource({
      title: 'He said "hello"',
      content: "Content.",
      sourceType: "file",
    });

    expect(result).toContain('title: "He said \\"hello\\""');
  });

  test("removes excessive blank lines", () => {
    const result = normalizeSource({
      title: "Test",
      content: "Line 1\n\n\n\n\nLine 2",
      sourceType: "file",
    });

    expect(result).not.toContain("\n\n\n");
    expect(result).toContain("Line 1\n\nLine 2");
  });

  test("normalizes CRLF to LF", () => {
    const result = normalizeSource({
      title: "Test",
      content: "Line 1\r\nLine 2",
      sourceType: "file",
    });

    expect(result).not.toContain("\r");
  });
});
|