@kibhq/core 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. package/package.json +40 -0
  2. package/src/compile/backlinks.test.ts +112 -0
  3. package/src/compile/backlinks.ts +80 -0
  4. package/src/compile/cache.test.ts +126 -0
  5. package/src/compile/cache.ts +125 -0
  6. package/src/compile/compiler.test.ts +278 -0
  7. package/src/compile/compiler.ts +305 -0
  8. package/src/compile/diff.test.ts +164 -0
  9. package/src/compile/diff.ts +121 -0
  10. package/src/compile/index-manager.test.ts +227 -0
  11. package/src/compile/index-manager.ts +148 -0
  12. package/src/compile/prompts.ts +124 -0
  13. package/src/constants.ts +40 -0
  14. package/src/errors.ts +66 -0
  15. package/src/hash.test.ts +21 -0
  16. package/src/hash.ts +24 -0
  17. package/src/index.ts +22 -0
  18. package/src/ingest/extractors/file.test.ts +129 -0
  19. package/src/ingest/extractors/file.ts +136 -0
  20. package/src/ingest/extractors/github.test.ts +47 -0
  21. package/src/ingest/extractors/github.ts +135 -0
  22. package/src/ingest/extractors/interface.ts +26 -0
  23. package/src/ingest/extractors/pdf.ts +130 -0
  24. package/src/ingest/extractors/web.test.ts +242 -0
  25. package/src/ingest/extractors/web.ts +163 -0
  26. package/src/ingest/extractors/youtube.test.ts +44 -0
  27. package/src/ingest/extractors/youtube.ts +166 -0
  28. package/src/ingest/ingest.test.ts +187 -0
  29. package/src/ingest/ingest.ts +179 -0
  30. package/src/ingest/normalize.test.ts +120 -0
  31. package/src/ingest/normalize.ts +83 -0
  32. package/src/ingest/router.test.ts +154 -0
  33. package/src/ingest/router.ts +119 -0
  34. package/src/lint/lint.test.ts +253 -0
  35. package/src/lint/lint.ts +43 -0
  36. package/src/lint/rules.ts +178 -0
  37. package/src/providers/anthropic.ts +107 -0
  38. package/src/providers/index.ts +4 -0
  39. package/src/providers/ollama.ts +101 -0
  40. package/src/providers/openai.ts +67 -0
  41. package/src/providers/router.ts +62 -0
  42. package/src/query/query.test.ts +165 -0
  43. package/src/query/query.ts +136 -0
  44. package/src/schemas.ts +193 -0
  45. package/src/search/engine.test.ts +230 -0
  46. package/src/search/engine.ts +390 -0
  47. package/src/skills/loader.ts +163 -0
  48. package/src/skills/runner.ts +139 -0
  49. package/src/skills/schema.ts +28 -0
  50. package/src/skills/skills.test.ts +134 -0
  51. package/src/types.ts +136 -0
  52. package/src/vault.test.ts +141 -0
  53. package/src/vault.ts +251 -0
@@ -0,0 +1,44 @@
1
+ import { describe, expect, test } from "bun:test";
2
+ import { extractVideoId } from "./youtube.js";
3
+
4
// Unit tests for extractVideoId: every supported YouTube URL shape should
// yield the 11-character video ID, and non-video URLs should yield null.
describe("youtube extractor", () => {
  describe("extractVideoId", () => {
    test("extracts from standard watch URL", () => {
      expect(extractVideoId("https://www.youtube.com/watch?v=dQw4w9WgXcQ")).toBe("dQw4w9WgXcQ");
    });

    test("extracts from short URL", () => {
      expect(extractVideoId("https://youtu.be/dQw4w9WgXcQ")).toBe("dQw4w9WgXcQ");
    });

    test("extracts from embed URL", () => {
      expect(extractVideoId("https://www.youtube.com/embed/dQw4w9WgXcQ")).toBe("dQw4w9WgXcQ");
    });

    // Trailing query params (timestamps, playlists) must not leak into the ID.
    test("extracts from URL with extra params", () => {
      expect(extractVideoId("https://www.youtube.com/watch?v=dQw4w9WgXcQ&t=120")).toBe(
        "dQw4w9WgXcQ",
      );
    });

    test("extracts from URL without www", () => {
      expect(extractVideoId("https://youtube.com/watch?v=dQw4w9WgXcQ")).toBe("dQw4w9WgXcQ");
    });

    test("extracts from mobile URL", () => {
      expect(extractVideoId("https://m.youtube.com/watch?v=dQw4w9WgXcQ")).toBe("dQw4w9WgXcQ");
    });

    test("returns null for invalid URL", () => {
      expect(extractVideoId("https://example.com")).toBeNull();
    });

    test("returns null for YouTube URL without video ID", () => {
      expect(extractVideoId("https://www.youtube.com/channel/UCxyz")).toBeNull();
    });

    // Users often paste URLs with surrounding whitespace; the extractor trims.
    test("handles whitespace", () => {
      expect(extractVideoId(" https://youtu.be/dQw4w9WgXcQ ")).toBe("dQw4w9WgXcQ");
    });
  });
});
@@ -0,0 +1,166 @@
1
+ import type { ExtractOptions, Extractor, ExtractResult } from "./interface.js";
2
+
3
+ export function createYoutubeExtractor(): Extractor {
4
+ return {
5
+ type: "youtube",
6
+
7
+ async extract(url: string, options?: ExtractOptions): Promise<ExtractResult> {
8
+ const videoId = extractVideoId(url);
9
+ if (!videoId) {
10
+ throw new Error(`Could not extract video ID from URL: ${url}`);
11
+ }
12
+
13
+ // Fetch video page to get title and metadata
14
+ const pageData = await fetchVideoPage(videoId);
15
+
16
+ // Attempt to fetch transcript
17
+ let transcript: string | null = null;
18
+ try {
19
+ transcript = await fetchTranscript(videoId);
20
+ } catch {
21
+ // Transcript not available — fall back to description
22
+ }
23
+
24
+ const title = options?.title ?? pageData.title ?? `YouTube Video ${videoId}`;
25
+
26
+ let content: string;
27
+ if (transcript) {
28
+ content = `# ${title}\n\n**Source:** https://www.youtube.com/watch?v=${videoId}\n\n## Transcript\n\n${transcript}`;
29
+ } else if (pageData.description) {
30
+ content = `# ${title}\n\n**Source:** https://www.youtube.com/watch?v=${videoId}\n\n## Description\n\n${pageData.description}\n\n*Note: Transcript was not available for this video.*`;
31
+ } else {
32
+ content = `# ${title}\n\n**Source:** https://www.youtube.com/watch?v=${videoId}\n\n*No transcript or description available.*`;
33
+ }
34
+
35
+ return {
36
+ title,
37
+ content,
38
+ metadata: {
39
+ videoId,
40
+ channelName: pageData.channelName,
41
+ url: `https://www.youtube.com/watch?v=${videoId}`,
42
+ hasTranscript: transcript !== null,
43
+ },
44
+ };
45
+ },
46
+ };
47
+ }
48
+
49
+ export function extractVideoId(url: string): string | null {
50
+ const trimmed = url.trim();
51
+
52
+ // youtu.be/VIDEO_ID
53
+ const shortMatch = trimmed.match(/youtu\.be\/([a-zA-Z0-9_-]{11})/);
54
+ if (shortMatch) return shortMatch[1]!;
55
+
56
+ // youtube.com/watch?v=VIDEO_ID
57
+ try {
58
+ const parsed = new URL(trimmed);
59
+ const v = parsed.searchParams.get("v");
60
+ if (v && v.length === 11) return v;
61
+ } catch {
62
+ // Not a valid URL
63
+ }
64
+
65
+ // youtube.com/embed/VIDEO_ID
66
+ const embedMatch = trimmed.match(/youtube\.com\/embed\/([a-zA-Z0-9_-]{11})/);
67
+ if (embedMatch) return embedMatch[1]!;
68
+
69
+ return null;
70
+ }
71
+
72
/** Best-effort metadata for a video; every field may be null on failure. */
interface VideoPageData {
  // Video title, or null when the oEmbed lookup failed.
  title: string | null;
  // Long-form description; always null when sourced from oEmbed (it omits descriptions).
  description: string | null;
  // Channel ("author") name, or null when unavailable.
  channelName: string | null;
}
77
+
78
+ async function fetchVideoPage(videoId: string): Promise<VideoPageData> {
79
+ // Use oembed API — no auth needed, returns JSON
80
+ try {
81
+ const response = await fetch(
82
+ `https://www.youtube.com/oembed?url=https://www.youtube.com/watch?v=${videoId}&format=json`,
83
+ );
84
+ if (response.ok) {
85
+ const data = (await response.json()) as any;
86
+ return {
87
+ title: data.title ?? null,
88
+ description: null, // oembed doesn't include description
89
+ channelName: data.author_name ?? null,
90
+ };
91
+ }
92
+ } catch {
93
+ // Fallback
94
+ }
95
+
96
+ return { title: null, description: null, channelName: null };
97
+ }
98
+
99
+ async function fetchTranscript(videoId: string): Promise<string> {
100
+ // Fetch the video page to get the captions track URL
101
+ const response = await fetch(`https://www.youtube.com/watch?v=${videoId}`, {
102
+ headers: {
103
+ "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
104
+ "Accept-Language": "en-US,en;q=0.9",
105
+ },
106
+ });
107
+
108
+ if (!response.ok) {
109
+ throw new Error(`Failed to fetch video page: ${response.status}`);
110
+ }
111
+
112
+ const html = await response.text();
113
+
114
+ // Extract captions data from the page
115
+ const captionMatch = html.match(/"captionTracks":\[(\{.*?\})\]/);
116
+ if (!captionMatch) {
117
+ throw new Error("No captions available");
118
+ }
119
+
120
+ // Parse the first caption track URL
121
+ const trackData = JSON.parse(`[${captionMatch[1]}]`);
122
+ const track = trackData[0];
123
+ if (!track?.baseUrl) {
124
+ throw new Error("No caption track URL found");
125
+ }
126
+
127
+ // Fetch the transcript XML
128
+ const transcriptResponse = await fetch(track.baseUrl);
129
+ if (!transcriptResponse.ok) {
130
+ throw new Error("Failed to fetch transcript");
131
+ }
132
+
133
+ const xml = await transcriptResponse.text();
134
+
135
+ // Parse XML transcript into plain text
136
+ return parseTranscriptXml(xml);
137
+ }
138
+
139
+ function parseTranscriptXml(xml: string): string {
140
+ const lines: string[] = [];
141
+ const textRegex = /<text[^>]*>([\s\S]*?)<\/text>/g;
142
+ let match: RegExpExecArray | null;
143
+
144
+ while ((match = textRegex.exec(xml)) !== null) {
145
+ const text = match[1]!
146
+ .replace(/&amp;/g, "&")
147
+ .replace(/&lt;/g, "<")
148
+ .replace(/&gt;/g, ">")
149
+ .replace(/&quot;/g, '"')
150
+ .replace(/&#39;/g, "'")
151
+ .replace(/<[^>]+>/g, "") // strip any HTML tags
152
+ .trim();
153
+
154
+ if (text) {
155
+ lines.push(text);
156
+ }
157
+ }
158
+
159
+ // Join into paragraphs — group ~5 lines together
160
+ const paragraphs: string[] = [];
161
+ for (let i = 0; i < lines.length; i += 5) {
162
+ paragraphs.push(lines.slice(i, i + 5).join(" "));
163
+ }
164
+
165
+ return paragraphs.join("\n\n");
166
+ }
@@ -0,0 +1,187 @@
1
+ import { afterEach, describe, expect, test } from "bun:test";
2
+ import { existsSync } from "node:fs";
3
+ import { mkdtemp, rm, writeFile } from "node:fs/promises";
4
+ import { tmpdir } from "node:os";
5
+ import { join } from "node:path";
6
+ import { initVault, loadManifest } from "../vault.js";
7
+ import { ingestSource } from "./ingest.js";
8
+
9
// Integration tests for ingestSource against a real temp-directory vault.
// Each test builds a fresh vault via makeTempVault; afterEach removes it.
let tempDir: string;

afterEach(async () => {
  if (tempDir) await rm(tempDir, { recursive: true, force: true });
});

// Creates (and records for cleanup) a temporary initialized vault.
async function makeTempVault() {
  tempDir = await mkdtemp(join(tmpdir(), "kib-ingest-test-"));
  await initVault(tempDir, { name: "test" });
  return tempDir;
}

describe("ingestSource", () => {
  test("ingests a local markdown file", async () => {
    const root = await makeTempVault();

    // Create a test file
    const testFile = join(root, "test-article.md");
    await writeFile(testFile, "# Test Article\n\nThis is some test content for ingestion.");

    const result = await ingestSource(root, testFile);

    expect(result.skipped).toBe(false);
    expect(result.sourceType).toBe("file");
    expect(result.title).toBe("Test Article");
    expect(result.wordCount).toBeGreaterThan(0);
    expect(result.path).toMatch(/^raw\/articles\//);

    // Verify file was written
    const rawPath = join(root, result.path);
    expect(existsSync(rawPath)).toBe(true);

    // Verify manifest was updated
    const manifest = await loadManifest(root);
    expect(manifest.stats.totalSources).toBe(1);
    expect(Object.keys(manifest.sources)).toHaveLength(1);

    const source = Object.values(manifest.sources)[0]!;
    expect(source.sourceType).toBe("file");
    expect(source.lastCompiled).toBeNull();
    expect(source.metadata.title).toBe("Test Article");
  });

  test("ingests a local text file", async () => {
    const root = await makeTempVault();

    const testFile = join(root, "notes.txt");
    await writeFile(testFile, "Some plain text notes about a topic.");

    const result = await ingestSource(root, testFile);

    expect(result.skipped).toBe(false);
    expect(result.sourceType).toBe("file");
  });

  // Dedup is hash-based: re-ingesting byte-identical content is a no-op.
  test("deduplicates identical content", async () => {
    const root = await makeTempVault();

    const testFile = join(root, "article.md");
    await writeFile(testFile, "# Unique Content\n\nThis exact content should only appear once.");

    const result1 = await ingestSource(root, testFile);
    expect(result1.skipped).toBe(false);

    const result2 = await ingestSource(root, testFile);
    expect(result2.skipped).toBe(true);
    expect(result2.skipReason).toContain("Duplicate");

    // Manifest should still have only 1 source
    const manifest = await loadManifest(root);
    expect(manifest.stats.totalSources).toBe(1);
  });

  // Same on-disk path but changed content hashes differently -> new source.
  test("allows different content even from same path", async () => {
    const root = await makeTempVault();

    const testFile = join(root, "article.md");
    await writeFile(testFile, "# Version 1\n\nOriginal content.");
    const result1 = await ingestSource(root, testFile);
    expect(result1.skipped).toBe(false);

    await writeFile(testFile, "# Version 2\n\nUpdated content that is different.");
    const result2 = await ingestSource(root, testFile);
    expect(result2.skipped).toBe(false);

    const manifest = await loadManifest(root);
    expect(manifest.stats.totalSources).toBe(2);
  });

  test("uses custom category when specified", async () => {
    const root = await makeTempVault();

    const testFile = join(root, "notes.md");
    await writeFile(testFile, "# Notes\n\nContent.");

    const result = await ingestSource(root, testFile, { category: "papers" });
    expect(result.path).toMatch(/^raw\/papers\//);
  });

  test("uses custom title when specified", async () => {
    const root = await makeTempVault();

    const testFile = join(root, "data.md");
    await writeFile(testFile, "Some data.");

    const result = await ingestSource(root, testFile, { title: "My Custom Title" });
    expect(result.title).toBe("My Custom Title");
  });

  test("routes PDF files to papers category", async () => {
    const root = await makeTempVault();

    // We can't easily test actual PDF extraction without a real PDF,
    // but we can verify the source type detection routes correctly
    // by using a .md file with forced sourceType
    const testFile = join(root, "test.md");
    await writeFile(testFile, "# PDF Content\n\nExtracted from a PDF.");

    const result = await ingestSource(root, testFile, { sourceType: "file", category: "papers" });
    expect(result.path).toMatch(/^raw\/papers\//);
  });

  test("ingests multiple sources and tracks them all", async () => {
    const root = await makeTempVault();

    const file1 = join(root, "first.md");
    const file2 = join(root, "second.md");
    const file3 = join(root, "third.md");
    await writeFile(file1, "# First Article\n\nContent one.");
    await writeFile(file2, "# Second Article\n\nContent two.");
    await writeFile(file3, "# Third Article\n\nContent three.");

    await ingestSource(root, file1);
    await ingestSource(root, file2);
    await ingestSource(root, file3);

    const manifest = await loadManifest(root);
    expect(manifest.stats.totalSources).toBe(3);
    expect(Object.keys(manifest.sources)).toHaveLength(3);

    // All sources should be pending compilation
    const pending = Object.values(manifest.sources).filter((s) => s.lastCompiled === null);
    expect(pending).toHaveLength(3);
  });

  test("ingests code files wrapped in code blocks", async () => {
    const root = await makeTempVault();

    const testFile = join(root, "example.ts");
    await writeFile(testFile, "const greeting = 'hello world';\nconsole.log(greeting);");

    const result = await ingestSource(root, testFile);

    expect(result.skipped).toBe(false);
    expect(result.sourceType).toBe("file");

    // Read the raw file and verify it contains code block
    const { readRaw } = await import("../vault.js");
    // The path is raw/articles/example.md, we need to strip "raw/" prefix
    const rawContent = await readRaw(root, result.path.replace(/^raw\//, ""));
    expect(rawContent).toContain("```typescript");
  });

  test("normalized content includes frontmatter", async () => {
    const root = await makeTempVault();

    const testFile = join(root, "article.md");
    await writeFile(testFile, "# My Great Article\n\nAmazing content here.");

    const result = await ingestSource(root, testFile);

    const { readRaw } = await import("../vault.js");
    const rawContent = await readRaw(root, result.path.replace(/^raw\//, ""));
    expect(rawContent).toContain("---");
    expect(rawContent).toContain('title: "My Great Article"');
    expect(rawContent).toContain("source_type: file");
    expect(rawContent).toContain("word_count:");
  });
});
@@ -0,0 +1,179 @@
1
+ import { basename, extname, resolve } from "node:path";
2
+ import { hash } from "../hash.js";
3
+ import type { IngestResult, Manifest, SourceEntry, SourceType } from "../types.js";
4
+ import { loadManifest, saveManifest, writeRaw } from "../vault.js";
5
+ import type { Extractor } from "./extractors/interface.js";
6
+ import { countWords, normalizeSource, slugify } from "./normalize.js";
7
+ import { detectSourceType } from "./router.js";
8
+
9
/** Caller-supplied overrides for a single ingest operation; all optional. */
interface IngestOptions {
  /** Override the detected source type */
  sourceType?: SourceType;
  /** Override category (raw/ subdirectory) */
  category?: string;
  /** Additional tags */
  tags?: string[];
  /** Custom title */
  title?: string;
}
19
+
20
/**
 * Ingest a single source into the vault.
 *
 * 1. Detect source type
 * 2. Route to the correct extractor
 * 3. Extract content
 * 4. Hash content (dedup check)
 * 5. Normalize with frontmatter
 * 6. Write to raw/
 * 7. Update manifest
 *
 * @param root - vault root directory (must already be initialized)
 * @param uri - local file path or http(s) URL of the source
 * @param options - optional overrides (source type, category, tags, title)
 * @returns an IngestResult; `skipped` is true when identical content
 *          (by hash) was already ingested.
 */
export async function ingestSource(
  root: string,
  uri: string,
  options: IngestOptions = {},
): Promise<IngestResult> {
  const sourceType = options.sourceType ?? detectSourceType(uri);

  // Get the extractor for this source type (each is lazily imported)
  const extractor = await getExtractor(sourceType);

  // Extract content
  const extracted = await extractor.extract(uri, { title: options.title, tags: options.tags });

  // Hash the extracted content for dedup
  const contentHash = await hash(extracted.content);

  // Load manifest and check for duplicates
  const manifest = await loadManifest(root);

  // Check if we already have this exact content (hash-based, URL-agnostic)
  const existingSource = findExistingSource(manifest, uri, contentHash);
  if (existingSource) {
    // NOTE(review): existingSource.path is producedArticles[0] and degrades
    // to "" for sources that were never compiled — confirm callers tolerate
    // an empty path on a skipped result.
    return {
      sourceId: existingSource.id,
      path: existingSource.path,
      sourceType,
      title: extracted.title,
      wordCount: countWords(extracted.content),
      skipped: true,
      skipReason: "Duplicate content (same hash already ingested)",
    };
  }

  // Normalize content with frontmatter
  const normalizedContent = normalizeSource({
    title: extracted.title,
    content: extracted.content,
    sourceType,
    originalUrl: isUrl(uri) ? uri : undefined,
    metadata: extracted.metadata,
  });

  // Determine file path within raw/
  // NOTE(review): two distinct sources with the same title slug map to the
  // same relativePath and will overwrite each other's raw file — consider
  // suffixing the slug with part of the content hash.
  const category = options.category ?? categoryForType(sourceType);
  const slug = slugify(extracted.title);
  const relativePath = `${category}/${slug}.md`;

  // Write to raw/
  await writeRaw(root, relativePath, normalizedContent);

  // Generate a source ID (stable: derived from the content hash)
  const sourceId = `src_${contentHash.slice(0, 12)}`;

  // Update manifest
  const now = new Date().toISOString();
  const wordCount = countWords(extracted.content);

  const sourceEntry: SourceEntry = {
    hash: contentHash,
    ingestedAt: now,
    lastCompiled: null, // null marks the source as pending compilation
    sourceType,
    originalUrl: isUrl(uri) ? uri : undefined,
    producedArticles: [],
    metadata: {
      title: extracted.title,
      author: extracted.metadata.author as string | undefined,
      date: extracted.metadata.date as string | undefined,
      wordCount,
    },
  };

  manifest.sources[sourceId] = sourceEntry;
  manifest.stats.totalSources = Object.keys(manifest.sources).length;

  await saveManifest(root, manifest);

  return {
    sourceId,
    path: `raw/${relativePath}`,
    sourceType,
    title: extracted.title,
    wordCount,
    skipped: false,
  };
}
117
+
118
+ async function getExtractor(sourceType: SourceType): Promise<Extractor> {
119
+ switch (sourceType) {
120
+ case "web": {
121
+ const { createWebExtractor } = await import("./extractors/web.js");
122
+ return createWebExtractor();
123
+ }
124
+ case "pdf": {
125
+ const { createPdfExtractor } = await import("./extractors/pdf.js");
126
+ return createPdfExtractor();
127
+ }
128
+ case "youtube": {
129
+ const { createYoutubeExtractor } = await import("./extractors/youtube.js");
130
+ return createYoutubeExtractor();
131
+ }
132
+ case "github": {
133
+ const { createGithubExtractor } = await import("./extractors/github.js");
134
+ return createGithubExtractor();
135
+ }
136
+ case "file": {
137
+ const { createFileExtractor } = await import("./extractors/file.js");
138
+ return createFileExtractor();
139
+ }
140
+ case "image":
141
+ throw new Error("Image extraction requires vision model support (coming soon)");
142
+ default:
143
+ throw new Error(`Unsupported source type: ${sourceType}`);
144
+ }
145
+ }
146
+
147
+ function categoryForType(sourceType: SourceType): string {
148
+ switch (sourceType) {
149
+ case "pdf":
150
+ return "papers";
151
+ case "youtube":
152
+ return "transcripts";
153
+ case "github":
154
+ return "repos";
155
+ case "image":
156
+ return "images";
157
+ default:
158
+ return "articles";
159
+ }
160
+ }
161
+
162
+ function findExistingSource(
163
+ manifest: Manifest,
164
+ uri: string,
165
+ contentHash: string,
166
+ ): { id: string; path: string } | null {
167
+ for (const [id, source] of Object.entries(manifest.sources)) {
168
+ // Same content hash = same content regardless of URL
169
+ if (source.hash === contentHash) {
170
+ return { id, path: source.producedArticles[0] ?? "" };
171
+ }
172
+ // Same URL but different hash = content changed, allow re-ingest
173
+ }
174
+ return null;
175
+ }
176
+
177
+ function isUrl(str: string): boolean {
178
+ return str.startsWith("http://") || str.startsWith("https://");
179
+ }
@@ -0,0 +1,120 @@
1
+ import { describe, expect, test } from "bun:test";
2
+ import { countWords, normalizeSource, slugify } from "./normalize.js";
3
+
4
// slugify: title -> filesystem-safe kebab-case slug, capped at 80 chars.
describe("slugify", () => {
  test("converts title to kebab-case", () => {
    expect(slugify("Transformer Architecture")).toBe("transformer-architecture");
  });

  test("strips special characters", () => {
    expect(slugify("What's New in React 19?")).toBe("whats-new-in-react-19");
  });

  test("collapses multiple dashes", () => {
    expect(slugify("foo -- bar")).toBe("foo-bar");
  });

  test("strips leading/trailing dashes", () => {
    expect(slugify("--hello--")).toBe("hello");
  });

  test("truncates at 80 chars", () => {
    const long = "a".repeat(100);
    expect(slugify(long).length).toBeLessThanOrEqual(80);
  });

  test("handles empty string", () => {
    expect(slugify("")).toBe("");
  });

  test("handles unicode by stripping", () => {
    expect(slugify("Vaswani et al. (2017)")).toBe("vaswani-et-al-2017");
  });
});
34
+
35
// countWords: word count over prose only — code blocks, inline code, and
// markdown syntax are excluded from the count.
describe("countWords", () => {
  test("counts plain text words", () => {
    expect(countWords("hello world foo bar")).toBe(4);
  });

  test("ignores code blocks", () => {
    const text = "before\n```\nconst x = 1;\n```\nafter";
    expect(countWords(text)).toBe(2); // "before" and "after"
  });

  test("ignores inline code", () => {
    expect(countWords("use `const` to declare")).toBe(3);
  });

  test("strips markdown syntax", () => {
    expect(countWords("# Hello **World**")).toBe(2);
  });

  test("handles empty string", () => {
    expect(countWords("")).toBe(0);
  });

  test("handles whitespace only", () => {
    expect(countWords("  \n\n  ")).toBe(0);
  });
});
61
+
62
// normalizeSource: wraps extracted content in YAML frontmatter and cleans up
// whitespace (CRLF -> LF, blank-line runs collapsed).
describe("normalizeSource", () => {
  test("generates frontmatter with required fields", () => {
    const result = normalizeSource({
      title: "Test Article",
      content: "# Test\n\nSome content here with words.",
      sourceType: "web",
      originalUrl: "https://example.com",
    });

    expect(result).toContain('title: "Test Article"');
    expect(result).toContain("source_type: web");
    expect(result).toContain('url: "https://example.com"');
    expect(result).toContain("word_count:");
    expect(result).toContain("ingested:");
  });

  test("includes author and date when present", () => {
    const result = normalizeSource({
      title: "Paper",
      content: "Content.",
      sourceType: "pdf",
      metadata: { author: "John Doe", date: "2024-01-01" },
    });

    expect(result).toContain('author: "John Doe"');
    expect(result).toContain('date: "2024-01-01"');
  });

  // Titles go into double-quoted YAML scalars, so embedded quotes must escape.
  test("escapes quotes in title", () => {
    const result = normalizeSource({
      title: 'He said "hello"',
      content: "Content.",
      sourceType: "file",
    });

    expect(result).toContain('title: "He said \\"hello\\""');
  });

  test("removes excessive blank lines", () => {
    const result = normalizeSource({
      title: "Test",
      content: "Line 1\n\n\n\n\nLine 2",
      sourceType: "file",
    });

    expect(result).not.toContain("\n\n\n");
    expect(result).toContain("Line 1\n\nLine 2");
  });

  test("normalizes CRLF to LF", () => {
    const result = normalizeSource({
      title: "Test",
      content: "Line 1\r\nLine 2",
      sourceType: "file",
    });

    expect(result).not.toContain("\r");
  });
});