@kibhq/core 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. package/package.json +40 -0
  2. package/src/compile/backlinks.test.ts +112 -0
  3. package/src/compile/backlinks.ts +80 -0
  4. package/src/compile/cache.test.ts +126 -0
  5. package/src/compile/cache.ts +125 -0
  6. package/src/compile/compiler.test.ts +278 -0
  7. package/src/compile/compiler.ts +305 -0
  8. package/src/compile/diff.test.ts +164 -0
  9. package/src/compile/diff.ts +121 -0
  10. package/src/compile/index-manager.test.ts +227 -0
  11. package/src/compile/index-manager.ts +148 -0
  12. package/src/compile/prompts.ts +124 -0
  13. package/src/constants.ts +40 -0
  14. package/src/errors.ts +66 -0
  15. package/src/hash.test.ts +21 -0
  16. package/src/hash.ts +24 -0
  17. package/src/index.ts +22 -0
  18. package/src/ingest/extractors/file.test.ts +129 -0
  19. package/src/ingest/extractors/file.ts +136 -0
  20. package/src/ingest/extractors/github.test.ts +47 -0
  21. package/src/ingest/extractors/github.ts +135 -0
  22. package/src/ingest/extractors/interface.ts +26 -0
  23. package/src/ingest/extractors/pdf.ts +130 -0
  24. package/src/ingest/extractors/web.test.ts +242 -0
  25. package/src/ingest/extractors/web.ts +163 -0
  26. package/src/ingest/extractors/youtube.test.ts +44 -0
  27. package/src/ingest/extractors/youtube.ts +166 -0
  28. package/src/ingest/ingest.test.ts +187 -0
  29. package/src/ingest/ingest.ts +179 -0
  30. package/src/ingest/normalize.test.ts +120 -0
  31. package/src/ingest/normalize.ts +83 -0
  32. package/src/ingest/router.test.ts +154 -0
  33. package/src/ingest/router.ts +119 -0
  34. package/src/lint/lint.test.ts +253 -0
  35. package/src/lint/lint.ts +43 -0
  36. package/src/lint/rules.ts +178 -0
  37. package/src/providers/anthropic.ts +107 -0
  38. package/src/providers/index.ts +4 -0
  39. package/src/providers/ollama.ts +101 -0
  40. package/src/providers/openai.ts +67 -0
  41. package/src/providers/router.ts +62 -0
  42. package/src/query/query.test.ts +165 -0
  43. package/src/query/query.ts +136 -0
  44. package/src/schemas.ts +193 -0
  45. package/src/search/engine.test.ts +230 -0
  46. package/src/search/engine.ts +390 -0
  47. package/src/skills/loader.ts +163 -0
  48. package/src/skills/runner.ts +139 -0
  49. package/src/skills/schema.ts +28 -0
  50. package/src/skills/skills.test.ts +134 -0
  51. package/src/types.ts +136 -0
  52. package/src/vault.test.ts +141 -0
  53. package/src/vault.ts +251 -0
@@ -0,0 +1,135 @@
1
+ import type { ExtractOptions, Extractor, ExtractResult } from "./interface.js";
2
+
3
+ export function createGithubExtractor(): Extractor {
4
+ return {
5
+ type: "github",
6
+
7
+ async extract(url: string, options?: ExtractOptions): Promise<ExtractResult> {
8
+ const parsed = parseGithubUrl(url);
9
+ if (!parsed) {
10
+ throw new Error(`Could not parse GitHub URL: ${url}`);
11
+ }
12
+
13
+ const { owner, repo, branch } = parsed;
14
+ const apiBase = `https://api.github.com/repos/${owner}/${repo}`;
15
+
16
+ const headers: Record<string, string> = {
17
+ Accept: "application/vnd.github.v3+json",
18
+ "User-Agent": "kib/0.1",
19
+ };
20
+
21
+ // Use GITHUB_TOKEN if available for higher rate limits
22
+ if (process.env.GITHUB_TOKEN) {
23
+ headers.Authorization = `Bearer ${process.env.GITHUB_TOKEN}`;
24
+ }
25
+
26
+ // Fetch repo metadata
27
+ const repoResponse = await fetch(apiBase, { headers });
28
+ if (!repoResponse.ok) {
29
+ throw new Error(
30
+ `Failed to fetch repo info: ${repoResponse.status} ${repoResponse.statusText}`,
31
+ );
32
+ }
33
+ const repoData = (await repoResponse.json()) as any;
34
+
35
+ // Fetch README
36
+ let readme = "";
37
+ try {
38
+ const readmeResponse = await fetch(`${apiBase}/readme`, {
39
+ headers: { ...headers, Accept: "application/vnd.github.v3.raw" },
40
+ });
41
+ if (readmeResponse.ok) {
42
+ readme = await readmeResponse.text();
43
+ }
44
+ } catch {
45
+ // No README
46
+ }
47
+
48
+ // Fetch file tree (top level only)
49
+ let fileTree = "";
50
+ try {
51
+ const ref = branch ?? repoData.default_branch ?? "main";
52
+ const treeResponse = await fetch(`${apiBase}/git/trees/${ref}`, { headers });
53
+ if (treeResponse.ok) {
54
+ const treeData = (await treeResponse.json()) as any;
55
+ const files = (treeData.tree ?? [])
56
+ .map((f: any) => `${f.type === "tree" ? "📁" : "📄"} ${f.path}`)
57
+ .slice(0, 50); // Cap at 50 entries
58
+ fileTree = files.join("\n");
59
+ }
60
+ } catch {
61
+ // No tree
62
+ }
63
+
64
+ const title = options?.title ?? `${owner}/${repo}`;
65
+ const description = repoData.description ?? "";
66
+ const stars = repoData.stargazers_count ?? 0;
67
+ const language = repoData.language ?? "Unknown";
68
+ const topics = repoData.topics ?? [];
69
+
70
+ const sections: string[] = [
71
+ `# ${title}`,
72
+ "",
73
+ description ? `> ${description}` : "",
74
+ "",
75
+ `**Language:** ${language} | **Stars:** ${stars.toLocaleString()} | **URL:** ${url}`,
76
+ ];
77
+
78
+ if (topics.length > 0) {
79
+ sections.push(`**Topics:** ${topics.join(", ")}`);
80
+ }
81
+
82
+ if (fileTree) {
83
+ sections.push("", "## File Structure", "", "```", fileTree, "```");
84
+ }
85
+
86
+ if (readme) {
87
+ sections.push("", "## README", "", readme);
88
+ }
89
+
90
+ return {
91
+ title,
92
+ content: sections.filter((s) => s !== undefined).join("\n"),
93
+ metadata: {
94
+ owner,
95
+ repo,
96
+ stars,
97
+ language,
98
+ topics,
99
+ url,
100
+ },
101
+ };
102
+ },
103
+ };
104
+ }
105
+
106
+ interface ParsedGithubUrl {
107
+ owner: string;
108
+ repo: string;
109
+ branch?: string;
110
+ }
111
+
112
+ export function parseGithubUrl(url: string): ParsedGithubUrl | null {
113
+ try {
114
+ const parsed = new URL(url.trim());
115
+ if (parsed.hostname !== "github.com" && parsed.hostname !== "www.github.com") {
116
+ return null;
117
+ }
118
+
119
+ const parts = parsed.pathname.split("/").filter(Boolean);
120
+ if (parts.length < 2) return null;
121
+
122
+ const owner = parts[0]!;
123
+ const repo = parts[1]!;
124
+
125
+ // Check for /tree/branch pattern
126
+ let branch: string | undefined;
127
+ if (parts[2] === "tree" && parts[3]) {
128
+ branch = parts[3];
129
+ }
130
+
131
+ return { owner, repo, branch };
132
+ } catch {
133
+ return null;
134
+ }
135
+ }
@@ -0,0 +1,26 @@
1
import type { SourceType } from "../../types.js";

/** Options callers can pass to any extractor to influence extraction. */
export interface ExtractOptions {
  /** Override the detected title */
  title?: string;
  /** Additional tags to attach */
  tags?: string[];
  /** Whether to download images referenced in the content */
  downloadImages?: boolean;
}

/** Normalized result every extractor produces, regardless of source. */
export interface ExtractResult {
  /** Extracted/detected title */
  title: string;
  /** Cleaned markdown content */
  content: string;
  /** Source-specific metadata (shape varies by extractor) */
  metadata: Record<string, unknown>;
}

/** Contract implemented by each source-specific extractor (web, pdf, github, ...). */
export interface Extractor {
  /** Which source type this extractor handles */
  type: SourceType;
  /** Extract content from the given input (URL or file path) */
  extract(input: string, options?: ExtractOptions): Promise<ExtractResult>;
}
@@ -0,0 +1,130 @@
1
+ import { readFile } from "node:fs/promises";
2
+ import { basename, extname } from "node:path";
3
+ import type { ExtractOptions, Extractor, ExtractResult } from "./interface.js";
4
+
5
+ // Lazy-load pdf-parse (it's heavy)
6
+ let pdfParse: any = null;
7
+
8
+ async function getPdfParse() {
9
+ if (!pdfParse) {
10
+ const mod = await import("pdf-parse");
11
+ // pdf-parse exports default as the function in some builds
12
+ pdfParse = mod.default ?? mod;
13
+ }
14
+ return pdfParse;
15
+ }
16
+
17
+ export function createPdfExtractor(): Extractor {
18
+ return {
19
+ type: "pdf",
20
+
21
+ async extract(input: string, options?: ExtractOptions): Promise<ExtractResult> {
22
+ const parse = await getPdfParse();
23
+
24
+ let buffer: Buffer;
25
+ if (input.startsWith("http://") || input.startsWith("https://")) {
26
+ // Fetch PDF from URL
27
+ const response = await fetch(input, {
28
+ headers: {
29
+ "User-Agent": "Mozilla/5.0 (compatible; kib/0.1)",
30
+ },
31
+ redirect: "follow",
32
+ });
33
+ if (!response.ok) {
34
+ throw new Error(
35
+ `Failed to fetch PDF from ${input}: ${response.status} ${response.statusText}`,
36
+ );
37
+ }
38
+ buffer = Buffer.from(await response.arrayBuffer());
39
+ } else {
40
+ // Read local PDF file
41
+ buffer = await readFile(input);
42
+ }
43
+
44
+ const data = await parse(buffer);
45
+
46
+ const title =
47
+ options?.title ??
48
+ data.info?.Title ??
49
+ extractTitleFromText(data.text) ??
50
+ formatFilename(input);
51
+
52
+ const author = data.info?.Author ?? undefined;
53
+ const date = data.info?.CreationDate ? parsePdfDate(data.info.CreationDate) : undefined;
54
+
55
+ // Clean up the extracted text into readable markdown
56
+ const content = formatPdfText(data.text, title);
57
+
58
+ return {
59
+ title,
60
+ content,
61
+ metadata: {
62
+ author,
63
+ date,
64
+ pageCount: data.numpages,
65
+ fileType: ".pdf",
66
+ },
67
+ };
68
+ },
69
+ };
70
+ }
71
+
72
+ /**
73
+ * Try to extract a title from the first few lines of PDF text.
74
+ * Academic papers often have the title as the first prominent line.
75
+ */
76
+ function extractTitleFromText(text: string): string | undefined {
77
+ const lines = text
78
+ .split("\n")
79
+ .map((l) => l.trim())
80
+ .filter(Boolean);
81
+ // First non-empty line that's between 10-200 chars and doesn't look like metadata
82
+ for (const line of lines.slice(0, 5)) {
83
+ if (
84
+ line.length >= 10 &&
85
+ line.length <= 200 &&
86
+ !line.match(/^(abstract|introduction|page|copyright|\d)/i)
87
+ ) {
88
+ return line;
89
+ }
90
+ }
91
+ return undefined;
92
+ }
93
+
94
+ function formatFilename(input: string): string {
95
+ if (input.startsWith("http")) {
96
+ const url = new URL(input);
97
+ const parts = url.pathname.split("/");
98
+ const last = parts[parts.length - 1] ?? "document";
99
+ return last.replace(/\.pdf$/i, "").replace(/[-_]/g, " ");
100
+ }
101
+ const name = basename(input, extname(input));
102
+ return name.replace(/[-_]/g, " ");
103
+ }
104
+
105
+ function formatPdfText(text: string, title: string): string {
106
+ // Split into paragraphs (double newline or more)
107
+ const cleaned = text
108
+ // Normalize whitespace
109
+ .replace(/\r\n/g, "\n")
110
+ // Remove form feeds and other control chars
111
+ .replace(/[\f\v]/g, "\n")
112
+ // Collapse 3+ newlines to 2
113
+ .replace(/\n{3,}/g, "\n\n")
114
+ // Remove lines that are just page numbers
115
+ .replace(/^\s*\d+\s*$/gm, "")
116
+ .trim();
117
+
118
+ return `# ${title}\n\n${cleaned}`;
119
+ }
120
+
121
+ /**
122
+ * Parse PDF date format (D:20240315120000+00'00') to ISO date string.
123
+ */
124
+ function parsePdfDate(dateStr: string): string | undefined {
125
+ const match = dateStr.match(/D:(\d{4})(\d{2})(\d{2})/);
126
+ if (match) {
127
+ return `${match[1]}-${match[2]}-${match[3]}`;
128
+ }
129
+ return undefined;
130
+ }
@@ -0,0 +1,242 @@
1
// Unit tests for the web extractor's HTML parsing, exercised through
// extractFromHtml so no network access is needed. Runs under bun:test.
import { describe, expect, test } from "bun:test";
import { extractFromHtml } from "./web.js";

describe("web extractor", () => {
  describe("extractFromHtml", () => {
    // --- title extraction ---
    test("extracts title from <title> tag", () => {
      const html = `
<html>
<head><title>Test Article - Example Blog</title></head>
<body><article><p>Content here.</p></article></body>
</html>
`;
      const result = extractFromHtml(html, "https://example.com");
      expect(result.title).toBe("Test Article");
    });

    test("prefers og:title over <title>", () => {
      const html = `
<html>
<head>
<title>Fallback Title</title>
<meta property="og:title" content="OG Title">
</head>
<body><article><p>Some long content that makes this valid content for extraction.</p></article></body>
</html>
`;
      const result = extractFromHtml(html, "https://example.com");
      expect(result.title).toBe("OG Title");
    });

    test("uses custom title from options", () => {
      const html = `<html><head><title>Original</title></head><body><p>Content.</p></body></html>`;
      const result = extractFromHtml(html, "https://example.com", {
        title: "Custom Title",
      });
      expect(result.title).toBe("Custom Title");
    });

    // --- content container selection ---
    test("extracts content from <article> tag", () => {
      const html = `
<html><body>
<nav>Navigation stuff that should be removed</nav>
<article>
<h1>Main Article</h1>
<p>This is the important article content that we want to extract from the page.</p>
</article>
<footer>Footer stuff that should be removed</footer>
</body></html>
`;
      const result = extractFromHtml(html, "https://example.com");
      expect(result.content).toContain("Main Article");
      expect(result.content).toContain("important article content");
      expect(result.content).not.toContain("Navigation stuff");
      expect(result.content).not.toContain("Footer stuff");
    });

    test("falls back to <main> when no <article>", () => {
      const html = `
<html><body>
<nav>Skip this</nav>
<main>
<h1>Main Content</h1>
<p>This is the main content area with enough text to pass the threshold for extraction.</p>
</main>
</body></html>
`;
      const result = extractFromHtml(html, "https://example.com");
      expect(result.content).toContain("Main Content");
      expect(result.content).not.toContain("Skip this");
    });

    test("falls back to body when no semantic elements", () => {
      const html = `
<html><body>
<div>
<h1>Simple Page</h1>
<p>Just a simple page with some content.</p>
</div>
</body></html>
`;
      const result = extractFromHtml(html, "https://example.com");
      expect(result.content).toContain("Simple Page");
    });

    // --- boilerplate removal ---
    test("removes script and style tags", () => {
      const html = `
<html><body>
<script>alert('xss')</script>
<style>.foo { color: red; }</style>
<article><p>Clean content that should be extracted without any script or style artifacts.</p></article>
</body></html>
`;
      const result = extractFromHtml(html, "https://example.com");
      expect(result.content).not.toContain("alert");
      expect(result.content).not.toContain("color: red");
      expect(result.content).toContain("Clean content");
    });

    // --- markdown conversion ---
    test("converts HTML headings to markdown", () => {
      const html = `
<html><body><article>
<h1>Heading 1</h1>
<h2>Heading 2</h2>
<p>Paragraph text underneath the headings with enough length to be valid content for extraction.</p>
</article></body></html>
`;
      const result = extractFromHtml(html, "https://example.com");
      expect(result.content).toContain("# Heading 1");
      expect(result.content).toContain("## Heading 2");
    });

    test("converts HTML lists to markdown", () => {
      const html = `
<html><body><article>
<ul>
<li>Item one</li>
<li>Item two</li>
<li>Item three</li>
</ul>
<p>Some additional content to make this a valid extraction with enough length.</p>
</article></body></html>
`;
      const result = extractFromHtml(html, "https://example.com");
      expect(result.content).toContain("Item one");
      expect(result.content).toContain("Item two");
      // Turndown uses "-" bullet markers
      expect(result.content).toMatch(/-\s+Item one/);
    });

    test("converts code blocks to fenced markdown", () => {
      const html = `
<html><body><article>
<p>Here is some code that demonstrates the concept with enough context for extraction:</p>
<pre><code>const x = 1;
const y = 2;</code></pre>
</article></body></html>
`;
      const result = extractFromHtml(html, "https://example.com");
      expect(result.content).toContain("```");
      expect(result.content).toContain("const x = 1;");
    });

    test("replaces images with alt text description", () => {
      const html = `
<html><body><article>
<p>Content about diagrams and explanations with enough text for extraction threshold.</p>
<img src="diagram.png" alt="Architecture diagram showing the system layout">
</article></body></html>
`;
      const result = extractFromHtml(html, "https://example.com");
      expect(result.content).toContain("[Image: Architecture diagram");
      expect(result.content).not.toContain("diagram.png");
    });

    // --- metadata extraction ---
    test("extracts author metadata", () => {
      const html = `
<html>
<head><meta name="author" content="Jane Doe"></head>
<body><article><p>Article content with enough words for extraction threshold to be met.</p></article></body>
</html>
`;
      const result = extractFromHtml(html, "https://example.com");
      expect(result.metadata.author).toBe("Jane Doe");
    });

    test("extracts date metadata", () => {
      const html = `
<html>
<head><meta property="article:published_time" content="2024-03-15"></head>
<body><article><p>Article content with enough words for extraction.</p></article></body>
</html>
`;
      const result = extractFromHtml(html, "https://example.com");
      expect(result.metadata.date).toBe("2024-03-15");
    });

    test("extracts date from time element", () => {
      const html = `
<html><body><article>
<time datetime="2024-06-01">June 1, 2024</time>
<p>Article content with enough words for extraction threshold to be met by the extractor.</p>
</article></body></html>
`;
      const result = extractFromHtml(html, "https://example.com");
      expect(result.metadata.date).toBe("2024-06-01");
    });

    test("preserves URL in metadata", () => {
      const result = extractFromHtml(
        "<html><body><p>Content.</p></body></html>",
        "https://example.com/article",
      );
      expect(result.metadata.url).toBe("https://example.com/article");
    });

    // --- title cleanup ---
    test("strips common title suffixes", () => {
      const html = `<html><head><title>Article Title | My Blog</title></head><body><p>C</p></body></html>`;
      const result = extractFromHtml(html, "https://example.com");
      expect(result.title).toBe("Article Title");
    });

    test("strips title suffix with dash separator", () => {
      const html = `<html><head><title>Great Post - The Newsletter</title></head><body><p>C</p></body></html>`;
      const result = extractFromHtml(html, "https://example.com");
      expect(result.title).toBe("Great Post");
    });

    test("handles title that is only a suffix pattern", () => {
      const html = `<html><head><title></title></head><body><p>Content.</p></body></html>`;
      const result = extractFromHtml(html, "https://example.com");
      expect(result.title).toBe("Untitled");
    });

    // --- ad / banner removal ---
    test("removes ad-related elements", () => {
      const html = `
<html><body>
<article>
<p>Real content that we want to keep in the extraction with enough words to matter.</p>
<div class="ad">Buy our product!</div>
<div class="advertisement">Sponsor content</div>
</article>
</body></html>
`;
      const result = extractFromHtml(html, "https://example.com");
      expect(result.content).toContain("Real content");
      expect(result.content).not.toContain("Buy our product");
      expect(result.content).not.toContain("Sponsor content");
    });

    test("removes cookie banners", () => {
      const html = `
<html><body>
<div class="cookie-banner">Accept cookies?</div>
<article><p>The actual interesting content of the page that we want to extract properly.</p></article>
</body></html>
`;
      const result = extractFromHtml(html, "https://example.com");
      expect(result.content).not.toContain("cookie");
      expect(result.content).toContain("actual interesting content");
    });
  });
});
@@ -0,0 +1,163 @@
1
+ import * as cheerio from "cheerio";
2
+ import TurndownService from "turndown";
3
+ import type { ExtractOptions, Extractor, ExtractResult } from "./interface.js";
4
+
5
// Boilerplate elements stripped from the page before content extraction
// (scripts, chrome, ads, overlays). Joined into one comma-separated selector
// string so it can be passed to cheerio in a single $(...).remove() call.
const REMOVE_SELECTORS = [
  "script",
  "style",
  "noscript",
  "iframe",
  "nav",
  "footer",
  "header:not(article header)",
  "aside",
  ".sidebar",
  ".nav",
  ".navigation",
  ".menu",
  ".footer",
  ".header",
  ".ad",
  ".ads",
  ".advertisement",
  ".cookie-banner",
  ".popup",
  ".modal",
  ".comments",
  ".comment-section",
  "[role='navigation']",
  "[role='banner']",
  "[role='contentinfo']",
  "[role='complementary']",
].join(", ");

// Candidate containers for a page's main content, tried in order from most
// to least specific — the first match with enough text wins, so ordering matters.
const CONTENT_SELECTORS = [
  "article",
  "[role='main']",
  "main",
  ".post-content",
  ".article-content",
  ".entry-content",
  ".content",
  "#content",
  ".post",
  ".article",
  ".blog-post",
];
47
+
48
+ export function createWebExtractor(): Extractor {
49
+ return {
50
+ type: "web",
51
+
52
+ async extract(url: string, options?: ExtractOptions): Promise<ExtractResult> {
53
+ const response = await fetch(url, {
54
+ headers: {
55
+ "User-Agent": "Mozilla/5.0 (compatible; kib/0.1; +https://github.com/kib-cli/kib)",
56
+ Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
57
+ },
58
+ redirect: "follow",
59
+ });
60
+
61
+ if (!response.ok) {
62
+ throw new Error(`Failed to fetch ${url}: ${response.status} ${response.statusText}`);
63
+ }
64
+
65
+ const html = await response.text();
66
+ return extractFromHtml(html, url, options);
67
+ },
68
+ };
69
+ }
70
+
71
+ /**
72
+ * Extract content from raw HTML. Exported for testing without network.
73
+ */
74
+ export function extractFromHtml(
75
+ html: string,
76
+ url: string,
77
+ options?: ExtractOptions,
78
+ ): ExtractResult {
79
+ const $ = cheerio.load(html);
80
+
81
+ // Extract metadata before removing elements
82
+ const title =
83
+ options?.title ??
84
+ $('meta[property="og:title"]').attr("content") ??
85
+ $("title").first().text().trim() ??
86
+ $("h1").first().text().trim() ??
87
+ "Untitled";
88
+
89
+ const author =
90
+ $('meta[name="author"]').attr("content") ??
91
+ $('meta[property="article:author"]').attr("content") ??
92
+ ($('[rel="author"]').first().text().trim() || undefined);
93
+
94
+ const date =
95
+ $('meta[property="article:published_time"]').attr("content") ??
96
+ $("time[datetime]").first().attr("datetime") ??
97
+ $('meta[name="date"]').attr("content") ??
98
+ undefined;
99
+
100
+ const description =
101
+ $('meta[property="og:description"]').attr("content") ??
102
+ $('meta[name="description"]').attr("content") ??
103
+ undefined;
104
+
105
+ // Remove unwanted elements
106
+ $(REMOVE_SELECTORS).remove();
107
+
108
+ // Find main content
109
+ let contentHtml = "";
110
+ for (const selector of CONTENT_SELECTORS) {
111
+ const el = $(selector).first();
112
+ if (el.length && el.text().trim().length > 100) {
113
+ contentHtml = el.html() ?? "";
114
+ break;
115
+ }
116
+ }
117
+
118
+ // Fallback to body
119
+ if (!contentHtml) {
120
+ contentHtml = $("body").html() ?? $.html();
121
+ }
122
+
123
+ // Convert to markdown
124
+ const turndown = new TurndownService({
125
+ headingStyle: "atx",
126
+ codeBlockStyle: "fenced",
127
+ bulletListMarker: "-",
128
+ });
129
+
130
+ // Remove image tags by default (they'd be broken links)
131
+ turndown.addRule("images", {
132
+ filter: "img",
133
+ replacement: (_content, node) => {
134
+ const alt = (node as HTMLElement).getAttribute("alt");
135
+ if (alt && alt.length > 5) {
136
+ return `[Image: ${alt}]`;
137
+ }
138
+ return "";
139
+ },
140
+ });
141
+
142
+ const markdown = turndown.turndown(contentHtml);
143
+
144
+ return {
145
+ title: cleanTitle(title),
146
+ content: markdown,
147
+ metadata: {
148
+ author,
149
+ date,
150
+ description,
151
+ url,
152
+ },
153
+ };
154
+ }
155
+
156
+ function cleanTitle(title: string): string {
157
+ return (
158
+ title
159
+ // Remove common suffixes like "| Site Name" or "- Blog Name"
160
+ .replace(/\s*[|–—-]\s*[^|–—-]+$/, "")
161
+ .trim() || "Untitled"
162
+ );
163
+ }