@oh-my-pi/pi-coding-agent 3.25.0 → 3.31.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157)
  1. package/CHANGELOG.md +90 -0
  2. package/package.json +5 -5
  3. package/src/cli/args.ts +4 -0
  4. package/src/core/agent-session.ts +29 -2
  5. package/src/core/bash-executor.ts +2 -1
  6. package/src/core/custom-commands/bundled/review/index.ts +369 -14
  7. package/src/core/custom-commands/bundled/wt/index.ts +1 -1
  8. package/src/core/session-manager.ts +158 -246
  9. package/src/core/session-storage.ts +379 -0
  10. package/src/core/settings-manager.ts +155 -4
  11. package/src/core/system-prompt.ts +62 -64
  12. package/src/core/tools/ask.ts +5 -4
  13. package/src/core/tools/bash-interceptor.ts +26 -61
  14. package/src/core/tools/bash.ts +13 -8
  15. package/src/core/tools/complete.ts +2 -4
  16. package/src/core/tools/edit-diff.ts +11 -4
  17. package/src/core/tools/edit.ts +7 -13
  18. package/src/core/tools/find.ts +111 -50
  19. package/src/core/tools/gemini-image.ts +128 -147
  20. package/src/core/tools/grep.ts +397 -415
  21. package/src/core/tools/index.test.ts +5 -1
  22. package/src/core/tools/index.ts +6 -8
  23. package/src/core/tools/jtd-to-json-schema.ts +174 -196
  24. package/src/core/tools/ls.ts +12 -10
  25. package/src/core/tools/lsp/client.ts +58 -9
  26. package/src/core/tools/lsp/config.ts +205 -656
  27. package/src/core/tools/lsp/defaults.json +465 -0
  28. package/src/core/tools/lsp/index.ts +55 -32
  29. package/src/core/tools/lsp/rust-analyzer.ts +49 -10
  30. package/src/core/tools/lsp/types.ts +1 -0
  31. package/src/core/tools/lsp/utils.ts +1 -1
  32. package/src/core/tools/read.ts +152 -76
  33. package/src/core/tools/render-utils.ts +70 -10
  34. package/src/core/tools/review.ts +38 -126
  35. package/src/core/tools/task/artifacts.ts +5 -4
  36. package/src/core/tools/task/executor.ts +204 -67
  37. package/src/core/tools/task/index.ts +129 -92
  38. package/src/core/tools/task/name-generator.ts +1544 -214
  39. package/src/core/tools/task/parallel.ts +30 -3
  40. package/src/core/tools/task/render.ts +85 -39
  41. package/src/core/tools/task/types.ts +34 -11
  42. package/src/core/tools/task/worker.ts +152 -27
  43. package/src/core/tools/web-fetch.ts +220 -1657
  44. package/src/core/tools/web-scrapers/academic.test.ts +239 -0
  45. package/src/core/tools/web-scrapers/artifacthub.ts +215 -0
  46. package/src/core/tools/web-scrapers/arxiv.ts +88 -0
  47. package/src/core/tools/web-scrapers/aur.ts +175 -0
  48. package/src/core/tools/web-scrapers/biorxiv.ts +141 -0
  49. package/src/core/tools/web-scrapers/bluesky.ts +284 -0
  50. package/src/core/tools/web-scrapers/brew.ts +177 -0
  51. package/src/core/tools/web-scrapers/business.test.ts +82 -0
  52. package/src/core/tools/web-scrapers/cheatsh.ts +78 -0
  53. package/src/core/tools/web-scrapers/chocolatey.ts +158 -0
  54. package/src/core/tools/web-scrapers/choosealicense.ts +110 -0
  55. package/src/core/tools/web-scrapers/cisa-kev.ts +100 -0
  56. package/src/core/tools/web-scrapers/clojars.ts +180 -0
  57. package/src/core/tools/web-scrapers/coingecko.ts +184 -0
  58. package/src/core/tools/web-scrapers/crates-io.ts +128 -0
  59. package/src/core/tools/web-scrapers/crossref.ts +149 -0
  60. package/src/core/tools/web-scrapers/dev-platforms.test.ts +254 -0
  61. package/src/core/tools/web-scrapers/devto.ts +177 -0
  62. package/src/core/tools/web-scrapers/discogs.ts +308 -0
  63. package/src/core/tools/web-scrapers/discourse.ts +221 -0
  64. package/src/core/tools/web-scrapers/dockerhub.ts +160 -0
  65. package/src/core/tools/web-scrapers/documentation.test.ts +85 -0
  66. package/src/core/tools/web-scrapers/fdroid.ts +158 -0
  67. package/src/core/tools/web-scrapers/finance-media.test.ts +144 -0
  68. package/src/core/tools/web-scrapers/firefox-addons.ts +214 -0
  69. package/src/core/tools/web-scrapers/flathub.ts +239 -0
  70. package/src/core/tools/web-scrapers/git-hosting.test.ts +272 -0
  71. package/src/core/tools/web-scrapers/github-gist.ts +68 -0
  72. package/src/core/tools/web-scrapers/github.ts +455 -0
  73. package/src/core/tools/web-scrapers/gitlab.ts +456 -0
  74. package/src/core/tools/web-scrapers/go-pkg.ts +275 -0
  75. package/src/core/tools/web-scrapers/hackage.ts +94 -0
  76. package/src/core/tools/web-scrapers/hackernews.ts +208 -0
  77. package/src/core/tools/web-scrapers/hex.ts +121 -0
  78. package/src/core/tools/web-scrapers/huggingface.ts +385 -0
  79. package/src/core/tools/web-scrapers/iacr.ts +86 -0
  80. package/src/core/tools/web-scrapers/index.ts +250 -0
  81. package/src/core/tools/web-scrapers/jetbrains-marketplace.ts +169 -0
  82. package/src/core/tools/web-scrapers/lemmy.ts +220 -0
  83. package/src/core/tools/web-scrapers/lobsters.ts +186 -0
  84. package/src/core/tools/web-scrapers/mastodon.ts +310 -0
  85. package/src/core/tools/web-scrapers/maven.ts +152 -0
  86. package/src/core/tools/web-scrapers/mdn.ts +174 -0
  87. package/src/core/tools/web-scrapers/media.test.ts +138 -0
  88. package/src/core/tools/web-scrapers/metacpan.ts +253 -0
  89. package/src/core/tools/web-scrapers/musicbrainz.ts +273 -0
  90. package/src/core/tools/web-scrapers/npm.ts +114 -0
  91. package/src/core/tools/web-scrapers/nuget.ts +205 -0
  92. package/src/core/tools/web-scrapers/nvd.ts +243 -0
  93. package/src/core/tools/web-scrapers/ollama.ts +267 -0
  94. package/src/core/tools/web-scrapers/open-vsx.ts +119 -0
  95. package/src/core/tools/web-scrapers/opencorporates.ts +275 -0
  96. package/src/core/tools/web-scrapers/openlibrary.ts +319 -0
  97. package/src/core/tools/web-scrapers/orcid.ts +299 -0
  98. package/src/core/tools/web-scrapers/osv.ts +189 -0
  99. package/src/core/tools/web-scrapers/package-managers-2.test.ts +199 -0
  100. package/src/core/tools/web-scrapers/package-managers.test.ts +171 -0
  101. package/src/core/tools/web-scrapers/package-registries.test.ts +259 -0
  102. package/src/core/tools/web-scrapers/packagist.ts +174 -0
  103. package/src/core/tools/web-scrapers/pub-dev.ts +185 -0
  104. package/src/core/tools/web-scrapers/pubmed.ts +178 -0
  105. package/src/core/tools/web-scrapers/pypi.ts +129 -0
  106. package/src/core/tools/web-scrapers/rawg.ts +124 -0
  107. package/src/core/tools/web-scrapers/readthedocs.ts +126 -0
  108. package/src/core/tools/web-scrapers/reddit.ts +104 -0
  109. package/src/core/tools/web-scrapers/repology.ts +262 -0
  110. package/src/core/tools/web-scrapers/research.test.ts +107 -0
  111. package/src/core/tools/web-scrapers/rfc.ts +209 -0
  112. package/src/core/tools/web-scrapers/rubygems.ts +117 -0
  113. package/src/core/tools/web-scrapers/searchcode.ts +217 -0
  114. package/src/core/tools/web-scrapers/sec-edgar.ts +274 -0
  115. package/src/core/tools/web-scrapers/security.test.ts +103 -0
  116. package/src/core/tools/web-scrapers/semantic-scholar.ts +190 -0
  117. package/src/core/tools/web-scrapers/snapcraft.ts +200 -0
  118. package/src/core/tools/web-scrapers/social-extended.test.ts +192 -0
  119. package/src/core/tools/web-scrapers/social.test.ts +259 -0
  120. package/src/core/tools/web-scrapers/sourcegraph.ts +373 -0
  121. package/src/core/tools/web-scrapers/spdx.ts +121 -0
  122. package/src/core/tools/web-scrapers/spotify.ts +218 -0
  123. package/src/core/tools/web-scrapers/stackexchange.test.ts +120 -0
  124. package/src/core/tools/web-scrapers/stackoverflow.ts +124 -0
  125. package/src/core/tools/web-scrapers/standards.test.ts +122 -0
  126. package/src/core/tools/web-scrapers/terraform.ts +304 -0
  127. package/src/core/tools/web-scrapers/tldr.ts +51 -0
  128. package/src/core/tools/web-scrapers/twitter.ts +96 -0
  129. package/src/core/tools/web-scrapers/types.ts +234 -0
  130. package/src/core/tools/web-scrapers/utils.ts +162 -0
  131. package/src/core/tools/web-scrapers/vimeo.ts +152 -0
  132. package/src/core/tools/web-scrapers/vscode-marketplace.ts +195 -0
  133. package/src/core/tools/web-scrapers/w3c.ts +163 -0
  134. package/src/core/tools/web-scrapers/wikidata.ts +357 -0
  135. package/src/core/tools/web-scrapers/wikipedia.test.ts +73 -0
  136. package/src/core/tools/web-scrapers/wikipedia.ts +95 -0
  137. package/src/core/tools/web-scrapers/youtube.test.ts +198 -0
  138. package/src/core/tools/web-scrapers/youtube.ts +371 -0
  139. package/src/core/tools/write.ts +21 -18
  140. package/src/core/voice.ts +3 -2
  141. package/src/lib/worktree/collapse.ts +2 -1
  142. package/src/lib/worktree/git.ts +2 -18
  143. package/src/main.ts +59 -3
  144. package/src/modes/interactive/components/extensions/extension-dashboard.ts +33 -19
  145. package/src/modes/interactive/components/extensions/extension-list.ts +15 -8
  146. package/src/modes/interactive/components/hook-editor.ts +2 -1
  147. package/src/modes/interactive/components/model-selector.ts +19 -4
  148. package/src/modes/interactive/interactive-mode.ts +41 -38
  149. package/src/modes/interactive/theme/theme.ts +58 -58
  150. package/src/modes/rpc/rpc-mode.ts +10 -9
  151. package/src/prompts/review-request.md +27 -0
  152. package/src/prompts/reviewer.md +64 -68
  153. package/src/prompts/tools/output.md +22 -3
  154. package/src/prompts/tools/task.md +32 -33
  155. package/src/utils/clipboard.ts +2 -1
  156. package/src/utils/tools-manager.ts +110 -8
  157. package/examples/extensions/subagent/agents/reviewer.md +0 -35
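
The diff below is the centerpiece of this release: `web-fetch.ts` sheds roughly 1,400 lines as its site-specific handlers (GitHub, Twitter/Nitter, Stack Overflow, Wikipedia, Reddit, npm) move into per-site modules under `src/core/tools/web-scrapers/`, dispatched through a shared `specialHandlers` registry. A minimal sketch of the contract this implies — `RenderResult` is copied from the interface the diff removes, the handler signature is inferred from `handleSpecialUrls` in the last hunk, and the `dispatch` name and registry contents are illustrative:

```ts
// Sketch only: RenderResult matches the interface removed from web-fetch.ts;
// the handler signature is inferred from handleSpecialUrls in the last hunk.
interface RenderResult {
  url: string;
  finalUrl: string;
  contentType: string;
  method: string; // e.g. "github-repo", "stackoverflow", "feed"
  content: string;
  fetchedAt: string;
  truncated: boolean;
  notes: string[];
}

type SpecialHandler = (url: string, timeout: number, signal?: AbortSignal) => Promise<RenderResult | null>;

// Hypothetical registry shape for web-scrapers/index.ts: each handler returns
// null for URLs it does not recognize, so dispatch is a first-match scan.
declare const specialHandlers: SpecialHandler[];

async function dispatch(url: string, timeout: number, signal?: AbortSignal): Promise<RenderResult | null> {
  for (const handler of specialHandlers) {
    if (signal?.aborted) throw new Error("Operation aborted");
    const result = await handler(url, timeout, signal); // first handler that claims the URL wins
    if (result) return result;
  }
  return null; // fall through to the generic HTML pipeline
}
```

Keeping every scraper behind the same nullable-result signature is what lets the registry grow to the dozens of new `web-scrapers/*.ts` modules listed above without touching the dispatch loop.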
@@ -1,30 +1,26 @@
  import { tmpdir } from "node:os";
  import * as path from "node:path";
  import type { AgentTool } from "@oh-my-pi/pi-agent-core";
+ import type { Component } from "@oh-my-pi/pi-tui";
+ import { Text } from "@oh-my-pi/pi-tui";
  import { Type } from "@sinclair/typebox";
+ import { nanoid } from "nanoid";
  import { parse as parseHtml } from "node-html-parser";
+ import { type Theme, theme } from "../../modes/interactive/theme/theme";
  import webFetchDescription from "../../prompts/tools/web-fetch.md" with { type: "text" };
- import { logger } from "../logger";
+ import { ensureTool } from "../../utils/tools-manager";
+ import type { RenderResultOptions } from "../custom-tools/types";
  import type { ToolSession } from "./index";
+ import { specialHandlers } from "./web-scrapers/index";
+ import type { RenderResult } from "./web-scrapers/types";
+ import { finalizeOutput, loadPage } from "./web-scrapers/types";
+ import { convertWithMarkitdown, fetchBinary } from "./web-scrapers/utils";

  // =============================================================================
  // Types and Constants
  // =============================================================================

- interface RenderResult {
-   url: string;
-   finalUrl: string;
-   contentType: string;
-   method: string;
-   content: string;
-   fetchedAt: string;
-   truncated: boolean;
-   notes: string[];
- }
-
  const DEFAULT_TIMEOUT = 20;
- const MAX_BYTES = 50 * 1024 * 1024; // 50MB for binary files
- const MAX_OUTPUT_CHARS = 500_000;

  // Convertible document types (markitdown supported)
  const CONVERTIBLE_MIMES = new Set([
@@ -67,124 +63,11 @@ const CONVERTIBLE_EXTENSIONS = new Set([
    ".ogg",
  ]);

- const USER_AGENTS = [
-   "curl/8.0",
-   "Mozilla/5.0 (compatible; TextBot/1.0)",
-   "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
- ];
-
  // =============================================================================
  // Utilities
  // =============================================================================

- interface LoadPageResult {
-   content: string;
-   contentType: string;
-   finalUrl: string;
-   ok: boolean;
-   status?: number;
- }
-
- interface LoadPageOptions {
-   timeout?: number;
-   headers?: Record<string, string>;
-   maxBytes?: number;
- }
-
- /**
-  * Check if response indicates bot blocking (Cloudflare, etc.)
-  */
- function isBotBlocked(status: number, content: string): boolean {
-   if (status === 403 || status === 503) {
-     const lower = content.toLowerCase();
-     return (
-       lower.includes("cloudflare") ||
-       lower.includes("captcha") ||
-       lower.includes("challenge") ||
-       lower.includes("blocked") ||
-       lower.includes("access denied") ||
-       lower.includes("bot detection")
-     );
-   }
-   return false;
- }
-
- /**
-  * Fetch a page with timeout, size limit, and automatic retry with browser UA if blocked
-  */
- async function loadPage(url: string, options: LoadPageOptions = {}): Promise<LoadPageResult> {
-   const { timeout = 20, headers = {}, maxBytes = MAX_BYTES } = options;
-
-   for (let attempt = 0; attempt < USER_AGENTS.length; attempt++) {
-     const userAgent = USER_AGENTS[attempt];
-
-     try {
-       const controller = new AbortController();
-       const timeoutId = setTimeout(() => controller.abort(), timeout * 1000);
-
-       const response = await fetch(url, {
-         signal: controller.signal,
-         headers: {
-           "User-Agent": userAgent,
-           Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
-           "Accept-Language": "en-US,en;q=0.5",
-           ...headers,
-         },
-         redirect: "follow",
-       });
-
-       clearTimeout(timeoutId);
-
-       const contentType = response.headers.get("content-type")?.split(";")[0]?.trim().toLowerCase() ?? "";
-       const finalUrl = response.url;
-
-       // Read with size limit
-       const reader = response.body?.getReader();
-       if (!reader) {
-         return { content: "", contentType, finalUrl, ok: false, status: response.status };
-       }
-
-       const chunks: Uint8Array[] = [];
-       let totalSize = 0;
-
-       while (true) {
-         const { done, value } = await reader.read();
-         if (done) break;
-
-         chunks.push(value);
-         totalSize += value.length;
-
-         if (totalSize > maxBytes) {
-           reader.cancel();
-           break;
-         }
-       }
-
-       const decoder = new TextDecoder();
-       const content = decoder.decode(Buffer.concat(chunks));
-
-       // Check if we got blocked and should retry with browser UA
-       if (isBotBlocked(response.status, content) && attempt < USER_AGENTS.length - 1) {
-         continue;
-       }
-
-       if (!response.ok) {
-         return { content, contentType, finalUrl, ok: false, status: response.status };
-       }
-
-       return { content, contentType, finalUrl, ok: true, status: response.status };
-     } catch (err) {
-       // On last attempt, return failure
-       if (attempt === USER_AGENTS.length - 1) {
-         logger.debug("Web fetch failed after retries", { url, error: String(err) });
-         return { content: "", contentType: "", finalUrl: url, ok: false };
-       }
-       // Otherwise retry with next UA
-     }
-   }
-
-   return { content: "", contentType: "", finalUrl: url, ok: false };
- }
+ type SpawnSyncOptions = NonNullable<Parameters<typeof Bun.spawnSync>[1]>;

  /**
   * Execute a command and return stdout
@@ -194,8 +77,9 @@ function exec(
    args: string[],
    options?: { timeout?: number; input?: string | Buffer },
  ): { stdout: string; stderr: string; ok: boolean } {
+   const stdin = (options?.input ?? "ignore") as SpawnSyncOptions["stdin"];
    const result = Bun.spawnSync([cmd, ...args], {
-     stdin: options?.input ? (options.input as any) : "ignore",
+     stdin,
      stdout: "pipe",
      stderr: "pipe",
    });
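
The `exec` change above replaces an `as any` cast with an options type derived from `Bun.spawnSync` itself. A hedged sketch of the general trick, with a made-up `run` function standing in for any library call whose option types are not exported:

```ts
// Derive an options type from an existing function signature instead of
// casting through `any`. `run` is hypothetical; the diff applies the same
// pattern to Bun.spawnSync via the SpawnSyncOptions alias.
type OptionsOf<F extends (...args: any[]) => any> = NonNullable<Parameters<F>[1]>;

declare function run(cmd: string[], opts?: { stdin?: "ignore" | "pipe" | Uint8Array }): void;

type RunOptions = OptionsOf<typeof run>;
const stdin: RunOptions["stdin"] = "ignore"; // typed without `as any`
```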
@@ -288,38 +172,10 @@ function looksLikeHtml(content: string): boolean {
    );
  }

- /**
-  * Convert binary file to markdown using markitdown
-  */
- async function convertWithMarkitdown(
-   content: Buffer,
-   extensionHint: string,
-   timeout: number,
- ): Promise<{ content: string; ok: boolean }> {
-   if (!hasCommand("markitdown")) {
-     return { content: "", ok: false };
-   }
-
-   // Write to temp file with extension hint
-   const ext = extensionHint || ".bin";
-   const tmpDir = tmpdir();
-   const tmpFile = path.join(tmpDir, `omp-convert-${Date.now()}${ext}`);
-
-   try {
-     await Bun.write(tmpFile, content);
-     const result = exec("markitdown", [tmpFile], { timeout });
-     return { content: result.stdout, ok: result.ok };
-   } finally {
-     try {
-       await Bun.$`rm ${tmpFile}`.quiet();
-     } catch {}
-   }
- }
-
  /**
   * Try fetching URL with .md appended (llms.txt convention)
   */
- async function tryMdSuffix(url: string, timeout: number): Promise<string | null> {
+ async function tryMdSuffix(url: string, timeout: number, signal?: AbortSignal): Promise<string | null> {
    const candidates: string[] = [];

    try {
@@ -340,8 +196,15 @@ async function tryMdSuffix(url: string, timeout: number): Promise<string | null>
      return null;
    }

+   if (signal?.aborted) {
+     return null;
+   }
+
    for (const candidate of candidates) {
-     const result = await loadPage(candidate, { timeout: Math.min(timeout, 5) });
+     if (signal?.aborted) {
+       return null;
+     }
+     const result = await loadPage(candidate, { timeout: Math.min(timeout, 5), signal });
      if (result.ok && result.content.trim().length > 100 && !looksLikeHtml(result.content)) {
        return result.content;
      }
@@ -353,11 +216,18 @@ async function tryMdSuffix(url: string, timeout: number): Promise<string | null>
  /**
   * Try to fetch LLM-friendly endpoints
   */
- async function tryLlmEndpoints(origin: string, timeout: number): Promise<string | null> {
+ async function tryLlmEndpoints(origin: string, timeout: number, signal?: AbortSignal): Promise<string | null> {
    const endpoints = [`${origin}/.well-known/llms.txt`, `${origin}/llms.txt`, `${origin}/llms.md`];

+   if (signal?.aborted) {
+     return null;
+   }
+
    for (const endpoint of endpoints) {
-     const result = await loadPage(endpoint, { timeout: Math.min(timeout, 5) });
+     if (signal?.aborted) {
+       return null;
+     }
+     const result = await loadPage(endpoint, { timeout: Math.min(timeout, 5), signal });
      if (result.ok && result.content.trim().length > 100 && !looksLikeHtml(result.content)) {
        return result.content;
      }
@@ -368,10 +238,19 @@ async function tryLlmEndpoints(origin: string, timeout: number): Promise<string
  /**
   * Try content negotiation for markdown/plain
   */
- async function tryContentNegotiation(url: string, timeout: number): Promise<{ content: string; type: string } | null> {
+ async function tryContentNegotiation(
+   url: string,
+   timeout: number,
+   signal?: AbortSignal,
+ ): Promise<{ content: string; type: string } | null> {
+   if (signal?.aborted) {
+     return null;
+   }
+
    const result = await loadPage(url, {
      timeout,
      headers: { Accept: "text/markdown, text/plain;q=0.9, text/html;q=0.8" },
+     signal,
    });

    if (!result.ok) return null;
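
The three hunks above thread an optional `AbortSignal` through `tryMdSuffix`, `tryLlmEndpoints`, and `tryContentNegotiation`, checking `signal?.aborted` between network round-trips. A sketch of how a caller might wire this up — the session-level cancel wiring is an assumption, and `AbortSignal.any` requires Bun or Node 20.3+:

```ts
// Combine a user-driven cancel with an overall deadline; the helpers bail
// out cooperatively at their signal?.aborted checkpoints.
const controller = new AbortController(); // e.g. wired to the agent's cancel action
const signal = AbortSignal.any([controller.signal, AbortSignal.timeout(20_000)]);

const md = await tryMdSuffix("https://example.com/docs/page", 20, signal);
if (md === null) {
  // aborted, timed out, or no usable .md variant — fall back to full HTML rendering
}
```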
@@ -522,18 +401,39 @@ function parseFeedToMarkdown(content: string, maxItems = 10): string {
    }
  }

  /**
-  * Render HTML to text using lynx
+  * Render HTML to text using lynx or html2text fallback
   */
- async function renderWithLynx(html: string, timeout: number): Promise<{ content: string; ok: boolean }> {
+ async function renderHtmlToText(
+   html: string,
+   timeout: number,
+ ): Promise<{ content: string; ok: boolean; method: string }> {
    const tmpDir = tmpdir();
-   const tmpFile = path.join(tmpDir, `omp-render-${Date.now()}.html`);
+   const tmpFile = path.join(tmpDir, `omp-${nanoid()}.html`);
+
    try {
      await Bun.write(tmpFile, html);
-     // Convert path to file URL (handles Windows paths correctly)
-     const normalizedPath = tmpFile.replace(/\\/g, "/");
-     const fileUrl = normalizedPath.startsWith("/") ? `file://${normalizedPath}` : `file:///${normalizedPath}`;
-     const result = exec("lynx", ["-dump", "-nolist", "-width", "120", fileUrl], { timeout });
-     return { content: result.stdout, ok: result.ok };
+
+     // Try lynx first (can't auto-install, system package)
+     const lynx = hasCommand("lynx");
+     if (lynx) {
+       const normalizedPath = tmpFile.replace(/\\/g, "/");
+       const fileUrl = normalizedPath.startsWith("/") ? `file://${normalizedPath}` : `file:///${normalizedPath}`;
+       const result = exec("lynx", ["-dump", "-nolist", "-width", "120", fileUrl], { timeout });
+       if (result.ok) {
+         return { content: result.stdout, ok: true, method: "lynx" };
+       }
+     }
+
+     // Fall back to html2text (auto-install via uv/pip)
+     const html2text = await ensureTool("html2text", true);
+     if (html2text) {
+       const result = exec(html2text, [tmpFile], { timeout });
+       if (result.ok) {
+         return { content: result.stdout, ok: true, method: "html2text" };
+       }
+     }
+
+     return { content: "", ok: false, method: "none" };
    } finally {
      try {
        await Bun.$`rm ${tmpFile}`.quiet();
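
`renderWithLynx` becomes `renderHtmlToText` above: a two-tier fallback that uses lynx when it is already installed and otherwise auto-installs `html2text` through `ensureTool`, reporting which converter ran in the new `method` field. A hedged usage sketch, with illustrative inputs:

```ts
// Exercise the fallback chain shown above; "none" means neither converter
// was available and the caller should degrade to its own HTML stripping.
const { content, ok, method } = await renderHtmlToText("<h1>Title</h1><p>Body text.</p>", 20);
if (ok) {
  console.log(`rendered via ${method}`); // "lynx" or "html2text"
}
```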
@@ -580,1488 +480,157 @@ function formatJson(content: string): string {
580
480
  }
581
481
  }
582
482
 
583
- /**
584
- * Truncate and cleanup output
585
- */
586
- function finalizeOutput(content: string): { content: string; truncated: boolean } {
587
- const cleaned = content.replace(/\n{3,}/g, "\n\n").trim();
588
- const truncated = cleaned.length > MAX_OUTPUT_CHARS;
589
- return {
590
- content: cleaned.slice(0, MAX_OUTPUT_CHARS),
591
- truncated,
592
- };
593
- }
483
+ // =============================================================================
484
+ // Unified Special Handler Dispatch
485
+ // =============================================================================
594
486
 
595
487
  /**
596
- * Fetch page as binary buffer (for convertible files)
488
+ * Try all special handlers
597
489
  */
598
- async function fetchBinary(
599
- url: string,
600
- timeout: number,
601
- ): Promise<{ buffer: Buffer; contentType: string; contentDisposition?: string; ok: boolean }> {
602
- try {
603
- const controller = new AbortController();
604
- const timeoutId = setTimeout(() => controller.abort(), timeout * 1000);
605
-
606
- const response = await fetch(url, {
607
- signal: controller.signal,
608
- headers: {
609
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/131.0.0.0",
610
- },
611
- redirect: "follow",
612
- });
613
-
614
- clearTimeout(timeoutId);
615
-
616
- if (!response.ok) {
617
- return { buffer: Buffer.alloc(0), contentType: "", ok: false };
618
- }
619
-
620
- const contentType = response.headers.get("content-type") ?? "";
621
- const contentDisposition = response.headers.get("content-disposition") ?? undefined;
622
- const contentLength = response.headers.get("content-length");
623
- if (contentLength) {
624
- const size = Number.parseInt(contentLength, 10);
625
- if (Number.isFinite(size) && size > MAX_BYTES) {
626
- return { buffer: Buffer.alloc(0), contentType, contentDisposition, ok: false };
627
- }
490
+ async function handleSpecialUrls(url: string, timeout: number, signal?: AbortSignal): Promise<RenderResult | null> {
491
+ for (const handler of specialHandlers) {
492
+ if (signal?.aborted) {
493
+ throw new Error("Operation aborted");
628
494
  }
629
-
630
- const buffer = Buffer.from(await response.arrayBuffer());
631
- if (buffer.length > MAX_BYTES) {
632
- return { buffer: Buffer.alloc(0), contentType, contentDisposition, ok: false };
633
- }
634
-
635
- return { buffer, contentType, contentDisposition, ok: true };
636
- } catch {
637
- return { buffer: Buffer.alloc(0), contentType: "", ok: false };
495
+ const result = await handler(url, timeout, signal);
496
+ if (result) return result;
638
497
  }
498
+ return null;
639
499
  }
640
500
 
641
501
  // =============================================================================
642
- // GitHub Special Handling
502
+ // Main Render Function
643
503
  // =============================================================================
644
504
 
645
- interface GitHubUrl {
646
- type: "blob" | "tree" | "repo" | "issue" | "issues" | "pull" | "pulls" | "discussion" | "discussions" | "other";
647
- owner: string;
648
- repo: string;
649
- ref?: string;
650
- path?: string;
651
- number?: number;
652
- }
653
-
654
505
  /**
655
- * Parse GitHub URL into components
506
+ * Main render function implementing the full pipeline
656
507
  */
657
- function parseGitHubUrl(url: string): GitHubUrl | null {
658
- try {
659
- const parsed = new URL(url);
660
- if (parsed.hostname !== "github.com") return null;
661
-
662
- const parts = parsed.pathname.split("/").filter(Boolean);
663
- if (parts.length < 2) return null;
664
-
665
- const [owner, repo, ...rest] = parts;
666
-
667
- if (rest.length === 0) {
668
- return { type: "repo", owner, repo };
669
- }
670
-
671
- const [section, ...subParts] = rest;
672
-
673
- switch (section) {
674
- case "blob":
675
- case "tree": {
676
- const [ref, ...pathParts] = subParts;
677
- return { type: section, owner, repo, ref, path: pathParts.join("/") };
678
- }
679
- case "issues":
680
- if (subParts.length > 0 && /^\d+$/.test(subParts[0])) {
681
- return { type: "issue", owner, repo, number: parseInt(subParts[0], 10) };
682
- }
683
- return { type: "issues", owner, repo };
684
- case "pull":
685
- if (subParts.length > 0 && /^\d+$/.test(subParts[0])) {
686
- return { type: "pull", owner, repo, number: parseInt(subParts[0], 10) };
687
- }
688
- return { type: "pulls", owner, repo };
689
- case "pulls":
690
- return { type: "pulls", owner, repo };
691
- case "discussions":
692
- if (subParts.length > 0 && /^\d+$/.test(subParts[0])) {
693
- return { type: "discussion", owner, repo, number: parseInt(subParts[0], 10) };
694
- }
695
- return { type: "discussions", owner, repo };
696
- default:
697
- return { type: "other", owner, repo };
698
- }
699
- } catch {
700
- return null;
508
+ async function renderUrl(
509
+ url: string,
510
+ timeout: number,
511
+ raw: boolean = false,
512
+ signal?: AbortSignal,
513
+ ): Promise<RenderResult> {
514
+ const notes: string[] = [];
515
+ const fetchedAt = new Date().toISOString();
516
+ if (signal?.aborted) {
517
+ throw new Error("Operation aborted");
701
518
  }
702
- }
703
519
 
704
- /**
705
- * Convert GitHub blob URL to raw URL
706
- */
707
- function toRawGitHubUrl(gh: GitHubUrl): string {
708
- return `https://raw.githubusercontent.com/${gh.owner}/${gh.repo}/refs/heads/${gh.ref}/${gh.path}`;
709
- }
520
+ // Step 0: Normalize URL (ensure scheme for special handlers)
521
+ url = normalizeUrl(url);
522
+ const origin = getOrigin(url);
710
523
 
711
- /**
712
- * Fetch from GitHub API
713
- */
714
- async function fetchGitHubApi(endpoint: string, timeout: number): Promise<{ data: unknown; ok: boolean }> {
715
- try {
716
- const controller = new AbortController();
717
- const timeoutId = setTimeout(() => controller.abort(), timeout * 1000);
524
+ // Step 1: Try special handlers for known sites (unless raw mode)
525
+ if (!raw) {
526
+ const specialResult = await handleSpecialUrls(url, timeout, signal);
527
+ if (specialResult) return specialResult;
528
+ }
718
529
 
719
- const headers: Record<string, string> = {
720
- Accept: "application/vnd.github.v3+json",
721
- "User-Agent": "omp-web-fetch/1.0",
530
+ // Step 2: Fetch page
531
+ const response = await loadPage(url, { timeout, signal });
532
+ if (signal?.aborted) {
533
+ throw new Error("Operation aborted");
534
+ }
535
+ if (!response.ok) {
536
+ return {
537
+ url,
538
+ finalUrl: url,
539
+ contentType: "unknown",
540
+ method: "failed",
541
+ content: "",
542
+ fetchedAt,
543
+ truncated: false,
544
+ notes: ["Failed to fetch URL"],
722
545
  };
723
-
724
- // Use GITHUB_TOKEN if available
725
- const token = process.env.GITHUB_TOKEN || process.env.GH_TOKEN;
726
- if (token) {
727
- headers.Authorization = `Bearer ${token}`;
728
- }
729
-
730
- const response = await fetch(`https://api.github.com${endpoint}`, {
731
- signal: controller.signal,
732
- headers,
733
- });
734
-
735
- clearTimeout(timeoutId);
736
-
737
- if (!response.ok) {
738
- return { data: null, ok: false };
739
- }
740
-
741
- return { data: await response.json(), ok: true };
742
- } catch {
743
- return { data: null, ok: false };
744
546
  }
745
- }
746
547
 
747
- /**
748
- * Render GitHub issue/PR to markdown
749
- */
750
- async function renderGitHubIssue(gh: GitHubUrl, timeout: number): Promise<{ content: string; ok: boolean }> {
751
- const endpoint =
752
- gh.type === "pull"
753
- ? `/repos/${gh.owner}/${gh.repo}/pulls/${gh.number}`
754
- : `/repos/${gh.owner}/${gh.repo}/issues/${gh.number}`;
755
-
756
- const result = await fetchGitHubApi(endpoint, timeout);
757
- if (!result.ok || !result.data) return { content: "", ok: false };
758
-
759
- const issue = result.data as {
760
- title: string;
761
- number: number;
762
- state: string;
763
- user: { login: string };
764
- created_at: string;
765
- updated_at: string;
766
- body: string | null;
767
- labels: Array<{ name: string }>;
768
- comments: number;
769
- html_url: string;
770
- };
548
+ const { finalUrl, content: rawContent } = response;
549
+ const mime = normalizeMime(response.contentType);
550
+ const extHint = getExtensionHint(finalUrl);
771
551
 
772
- let md = `# ${issue.title}\n\n`;
773
- md += `**#${issue.number}** · ${issue.state} · opened by @${issue.user.login}\n`;
774
- md += `Created: ${issue.created_at} · Updated: ${issue.updated_at}\n`;
775
- if (issue.labels.length > 0) {
776
- md += `Labels: ${issue.labels.map((l) => l.name).join(", ")}\n`;
777
- }
778
- md += `\n---\n\n`;
779
- md += issue.body || "*No description provided.*";
780
- md += `\n\n---\n\n`;
781
-
782
- // Fetch comments if any
783
- if (issue.comments > 0) {
784
- const commentsResult = await fetchGitHubApi(
785
- `/repos/${gh.owner}/${gh.repo}/issues/${gh.number}/comments?per_page=50`,
786
- timeout,
787
- );
788
- if (commentsResult.ok && Array.isArray(commentsResult.data)) {
789
- md += `## Comments (${issue.comments})\n\n`;
790
- for (const comment of commentsResult.data as Array<{
791
- user: { login: string };
792
- created_at: string;
793
- body: string;
794
- }>) {
795
- md += `### @${comment.user.login} · ${comment.created_at}\n\n`;
796
- md += `${comment.body}\n\n---\n\n`;
552
+ // Step 3: Handle convertible binary files (PDF, DOCX, etc.)
553
+ if (isConvertible(mime, extHint)) {
554
+ const binary = await fetchBinary(finalUrl, timeout, signal);
555
+ if (binary.ok) {
556
+ const ext = getExtensionHint(finalUrl, binary.contentDisposition) || extHint;
557
+ const converted = await convertWithMarkitdown(binary.buffer, ext, timeout, signal);
558
+ if (converted.ok) {
559
+ if (converted.content.trim().length > 50) {
560
+ notes.push("Converted with markitdown");
561
+ const output = finalizeOutput(converted.content);
562
+ return {
563
+ url,
564
+ finalUrl,
565
+ contentType: mime,
566
+ method: "markitdown",
567
+ content: output.content,
568
+ fetchedAt,
569
+ truncated: output.truncated,
570
+ notes,
571
+ };
572
+ }
573
+ notes.push("markitdown conversion produced no usable output");
574
+ } else if (converted.error) {
575
+ notes.push(`markitdown conversion failed: ${converted.error}`);
576
+ } else {
577
+ notes.push("markitdown conversion failed");
797
578
  }
579
+ } else if (binary.error) {
580
+ notes.push(`Binary fetch failed: ${binary.error}`);
581
+ } else {
582
+ notes.push("Binary fetch failed");
798
583
  }
799
584
  }
800
585
 
801
- return { content: md, ok: true };
802
- }
586
+ // Step 4: Handle non-HTML text content
587
+ const isHtml = mime.includes("html") || mime.includes("xhtml");
588
+ const isJson = mime.includes("json");
589
+ const isXml = mime.includes("xml") && !isHtml;
590
+ const isText = mime.includes("text/plain") || mime.includes("text/markdown");
591
+ const isFeed = mime.includes("rss") || mime.includes("atom") || mime.includes("feed");
803
592
 
804
- /**
805
- * Render GitHub issues list to markdown
806
- */
807
- async function renderGitHubIssuesList(gh: GitHubUrl, timeout: number): Promise<{ content: string; ok: boolean }> {
808
- const result = await fetchGitHubApi(`/repos/${gh.owner}/${gh.repo}/issues?state=open&per_page=30`, timeout);
809
- if (!result.ok || !Array.isArray(result.data)) return { content: "", ok: false };
810
-
811
- const issues = result.data as Array<{
812
- number: number;
813
- title: string;
814
- state: string;
815
- user: { login: string };
816
- created_at: string;
817
- comments: number;
818
- labels: Array<{ name: string }>;
819
- pull_request?: unknown;
820
- }>;
821
-
822
- let md = `# ${gh.owner}/${gh.repo} - Open Issues\n\n`;
823
-
824
- for (const issue of issues) {
825
- if (issue.pull_request) continue; // Skip PRs in issues list
826
- const labels = issue.labels.length > 0 ? ` [${issue.labels.map((l) => l.name).join(", ")}]` : "";
827
- md += `- **#${issue.number}** ${issue.title}${labels}\n`;
828
- md += ` by @${issue.user.login} · ${issue.comments} comments · ${issue.created_at}\n\n`;
593
+ if (isJson) {
594
+ const output = finalizeOutput(formatJson(rawContent));
595
+ return {
596
+ url,
597
+ finalUrl,
598
+ contentType: mime,
599
+ method: "json",
600
+ content: output.content,
601
+ fetchedAt,
602
+ truncated: output.truncated,
603
+ notes,
604
+ };
829
605
  }
830
606
 
831
- return { content: md, ok: true };
832
- }
833
-
834
- /**
835
- * Render GitHub tree (directory) to markdown
836
- */
837
- async function renderGitHubTree(gh: GitHubUrl, timeout: number): Promise<{ content: string; ok: boolean }> {
838
- // Fetch repo info first to get default branch if ref not specified
839
- const repoResult = await fetchGitHubApi(`/repos/${gh.owner}/${gh.repo}`, timeout);
840
- if (!repoResult.ok) return { content: "", ok: false };
841
-
842
- const repo = repoResult.data as {
843
- full_name: string;
844
- default_branch: string;
845
- };
846
-
847
- const ref = gh.ref || repo.default_branch;
848
- const dirPath = gh.path || "";
849
-
850
- let md = `# ${repo.full_name}/${dirPath || "(root)"}\n\n`;
851
- md += `**Branch:** ${ref}\n\n`;
852
-
853
- // Fetch directory contents
854
- const contentsResult = await fetchGitHubApi(`/repos/${gh.owner}/${gh.repo}/contents/${dirPath}?ref=${ref}`, timeout);
855
-
856
- if (contentsResult.ok && Array.isArray(contentsResult.data)) {
857
- const items = contentsResult.data as Array<{
858
- name: string;
859
- type: "file" | "dir" | "symlink" | "submodule";
860
- size?: number;
861
- path: string;
862
- }>;
863
-
864
- // Sort: directories first, then files, alphabetically
865
- items.sort((a, b) => {
866
- if (a.type === "dir" && b.type !== "dir") return -1;
867
- if (a.type !== "dir" && b.type === "dir") return 1;
868
- return a.name.localeCompare(b.name);
869
- });
870
-
871
- md += `## Contents\n\n`;
872
- md += "```\n";
873
- for (const item of items) {
874
- const prefix = item.type === "dir" ? "[dir] " : " ";
875
- const size = item.size ? ` (${item.size} bytes)` : "";
876
- md += `${prefix}${item.name}${item.type === "file" ? size : ""}\n`;
877
- }
878
- md += "```\n\n";
879
-
880
- // Look for README in this directory
881
- const readmeFile = items.find((item) => item.type === "file" && /^readme\.md$/i.test(item.name));
882
- if (readmeFile) {
883
- const readmePath = dirPath ? `${dirPath}/${readmeFile.name}` : readmeFile.name;
884
- const rawUrl = `https://raw.githubusercontent.com/${gh.owner}/${gh.repo}/refs/heads/${ref}/${readmePath}`;
885
- const readmeResult = await loadPage(rawUrl, { timeout });
886
- if (readmeResult.ok) {
887
- md += `---\n\n## README\n\n${readmeResult.content}`;
888
- }
889
- }
607
+ if (isFeed || (isXml && (rawContent.includes("<rss") || rawContent.includes("<feed")))) {
608
+ const parsed = parseFeedToMarkdown(rawContent);
609
+ const output = finalizeOutput(parsed);
610
+ return {
611
+ url,
612
+ finalUrl,
613
+ contentType: mime,
614
+ method: "feed",
615
+ content: output.content,
616
+ fetchedAt,
617
+ truncated: output.truncated,
618
+ notes,
619
+ };
890
620
  }
891
621
 
892
- return { content: md, ok: true };
893
- }
894
-
895
- /**
896
- * Render GitHub repo to markdown (file list + README)
897
- */
898
- async function renderGitHubRepo(gh: GitHubUrl, timeout: number): Promise<{ content: string; ok: boolean }> {
899
- // Fetch repo info
900
- const repoResult = await fetchGitHubApi(`/repos/${gh.owner}/${gh.repo}`, timeout);
901
- if (!repoResult.ok) return { content: "", ok: false };
902
-
903
- const repo = repoResult.data as {
904
- full_name: string;
905
- description: string | null;
906
- stargazers_count: number;
907
- forks_count: number;
908
- open_issues_count: number;
909
- default_branch: string;
910
- language: string | null;
911
- license: { name: string } | null;
912
- };
913
-
914
- let md = `# ${repo.full_name}\n\n`;
915
- if (repo.description) md += `${repo.description}\n\n`;
916
- md += `Stars: ${repo.stargazers_count} · Forks: ${repo.forks_count} · Issues: ${repo.open_issues_count}\n`;
917
- if (repo.language) md += `Language: ${repo.language}\n`;
918
- if (repo.license) md += `License: ${repo.license.name}\n`;
919
- md += `\n---\n\n`;
920
-
921
- // Fetch file tree
922
- const treeResult = await fetchGitHubApi(
923
- `/repos/${gh.owner}/${gh.repo}/git/trees/${repo.default_branch}?recursive=1`,
924
- timeout,
925
- );
926
- if (treeResult.ok && treeResult.data) {
927
- const tree = (treeResult.data as { tree: Array<{ path: string; type: string }> }).tree;
928
- md += `## Files\n\n`;
929
- md += "```\n";
930
- for (const item of tree.slice(0, 100)) {
931
- const prefix = item.type === "tree" ? "[dir] " : " ";
932
- md += `${prefix}${item.path}\n`;
933
- }
934
- if (tree.length > 100) {
935
- md += `... and ${tree.length - 100} more files\n`;
936
- }
937
- md += "```\n\n";
938
- }
939
-
940
- // Fetch README
941
- const readmeResult = await fetchGitHubApi(`/repos/${gh.owner}/${gh.repo}/readme`, timeout);
942
- if (readmeResult.ok && readmeResult.data) {
943
- const readme = readmeResult.data as { content: string; encoding: string };
944
- if (readme.encoding === "base64") {
945
- const decoded = Buffer.from(readme.content, "base64").toString("utf-8");
946
- md += `## README\n\n${decoded}`;
947
- }
948
- }
949
-
950
- return { content: md, ok: true };
951
- }
952
-
953
- /**
954
- * Handle GitHub URLs specially
955
- */
956
- async function handleGitHub(url: string, timeout: number): Promise<RenderResult | null> {
957
- const gh = parseGitHubUrl(url);
958
- if (!gh) return null;
959
-
960
- const fetchedAt = new Date().toISOString();
961
- const notes: string[] = [];
962
-
963
- switch (gh.type) {
964
- case "blob": {
965
- // Convert to raw URL and fetch
966
- const rawUrl = toRawGitHubUrl(gh);
967
- notes.push(`Fetched raw: ${rawUrl}`);
968
- const result = await loadPage(rawUrl, { timeout });
969
- if (result.ok) {
970
- const output = finalizeOutput(result.content);
971
- return {
972
- url,
973
- finalUrl: rawUrl,
974
- contentType: "text/plain",
975
- method: "github-raw",
976
- content: output.content,
977
- fetchedAt,
978
- truncated: output.truncated,
979
- notes,
980
- };
981
- }
982
- break;
983
- }
984
-
985
- case "tree": {
986
- notes.push(`Fetched via GitHub API`);
987
- const result = await renderGitHubTree(gh, timeout);
988
- if (result.ok) {
989
- const output = finalizeOutput(result.content);
990
- return {
991
- url,
992
- finalUrl: url,
993
- contentType: "text/markdown",
994
- method: "github-tree",
995
- content: output.content,
996
- fetchedAt,
997
- truncated: output.truncated,
998
- notes,
999
- };
1000
- }
1001
- break;
1002
- }
1003
-
1004
- case "issue":
1005
- case "pull": {
1006
- notes.push(`Fetched via GitHub API`);
1007
- const result = await renderGitHubIssue(gh, timeout);
1008
- if (result.ok) {
1009
- const output = finalizeOutput(result.content);
1010
- return {
1011
- url,
1012
- finalUrl: url,
1013
- contentType: "text/markdown",
1014
- method: gh.type === "pull" ? "github-pr" : "github-issue",
1015
- content: output.content,
1016
- fetchedAt,
1017
- truncated: output.truncated,
1018
- notes,
1019
- };
1020
- }
1021
- break;
1022
- }
1023
-
1024
- case "issues": {
1025
- notes.push(`Fetched via GitHub API`);
1026
- const result = await renderGitHubIssuesList(gh, timeout);
1027
- if (result.ok) {
1028
- const output = finalizeOutput(result.content);
1029
- return {
1030
- url,
1031
- finalUrl: url,
1032
- contentType: "text/markdown",
1033
- method: "github-issues",
1034
- content: output.content,
1035
- fetchedAt,
1036
- truncated: output.truncated,
1037
- notes,
1038
- };
1039
- }
1040
- break;
1041
- }
1042
-
1043
- case "repo": {
1044
- notes.push(`Fetched via GitHub API`);
1045
- const result = await renderGitHubRepo(gh, timeout);
1046
- if (result.ok) {
1047
- const output = finalizeOutput(result.content);
1048
- return {
1049
- url,
1050
- finalUrl: url,
1051
- contentType: "text/markdown",
1052
- method: "github-repo",
1053
- content: output.content,
1054
- fetchedAt,
1055
- truncated: output.truncated,
1056
- notes,
1057
- };
1058
- }
1059
- break;
1060
- }
1061
- }
1062
-
1063
- // Fall back to null (let normal rendering handle it)
1064
- return null;
1065
- }
1066
-
1067
- // =============================================================================
1068
- // Twitter/X Special Handling (via Nitter)
1069
- // =============================================================================
1070
-
1071
- // Active Nitter instances - check https://status.d420.de/instances for current status
1072
- const NITTER_INSTANCES = [
1073
- "nitter.privacyredirect.com",
1074
- "nitter.tiekoetter.com",
1075
- "nitter.poast.org",
1076
- "nitter.woodland.cafe",
1077
- ];
1078
-
1079
- /**
1080
- * Handle Twitter/X URLs via Nitter
1081
- */
1082
- async function handleTwitter(url: string, timeout: number): Promise<RenderResult | null> {
1083
- try {
1084
- const parsed = new URL(url);
1085
- if (!["twitter.com", "x.com", "www.twitter.com", "www.x.com"].includes(parsed.hostname)) {
1086
- return null;
1087
- }
1088
-
1089
- const fetchedAt = new Date().toISOString();
1090
-
1091
- // Try Nitter instances
1092
- for (const instance of NITTER_INSTANCES) {
1093
- const nitterUrl = `https://${instance}${parsed.pathname}`;
1094
- const result = await loadPage(nitterUrl, { timeout: Math.min(timeout, 10) });
1095
-
1096
- if (result.ok && result.content.length > 500) {
1097
- // Parse the Nitter HTML
1098
- const doc = parseHtml(result.content);
1099
-
1100
- // Extract tweet content
1101
- const tweetContent = doc.querySelector(".tweet-content")?.text?.trim();
1102
- const fullname = doc.querySelector(".fullname")?.text?.trim();
1103
- const username = doc.querySelector(".username")?.text?.trim();
1104
- const date = doc.querySelector(".tweet-date a")?.text?.trim();
1105
- const stats = doc.querySelector(".tweet-stats")?.text?.trim();
1106
-
1107
- if (tweetContent) {
1108
- let md = `# Tweet by ${fullname || "Unknown"} (${username || "@?"})\n\n`;
1109
- if (date) md += `*${date}*\n\n`;
1110
- md += `${tweetContent}\n\n`;
1111
- if (stats) md += `---\n${stats.replace(/\s+/g, " ")}\n`;
1112
-
1113
- // Check for replies/thread
1114
- const replies = doc.querySelectorAll(".timeline-item .tweet-content");
1115
- if (replies.length > 1) {
1116
- md += `\n---\n\n## Thread/Replies\n\n`;
1117
- for (const reply of Array.from(replies).slice(1, 10)) {
1118
- const replyUser = reply.parentNode?.querySelector(".username")?.text?.trim();
1119
- md += `**${replyUser || "@?"}**: ${reply.text?.trim()}\n\n`;
1120
- }
1121
- }
1122
-
1123
- const output = finalizeOutput(md);
1124
- return {
1125
- url,
1126
- finalUrl: nitterUrl,
1127
- contentType: "text/markdown",
1128
- method: "twitter-nitter",
1129
- content: output.content,
1130
- fetchedAt,
1131
- truncated: output.truncated,
1132
- notes: [`Via Nitter: ${instance}`],
1133
- };
1134
- }
1135
- }
1136
- }
1137
- } catch {}
1138
-
1139
- // X.com blocks all bots - return a helpful error instead of falling through
1140
- return {
1141
- url,
1142
- finalUrl: url,
1143
- contentType: "text/plain",
1144
- method: "twitter-blocked",
1145
- content:
1146
- "Twitter/X blocks automated access. Nitter instances were unavailable.\n\nTry:\n- Opening the link in a browser\n- Using a different Nitter instance manually\n- Checking if the tweet is available via an archive service",
1147
- fetchedAt: new Date().toISOString(),
1148
- truncated: false,
1149
- notes: ["X.com blocks bots; Nitter instances unavailable"],
1150
- };
1151
- }
1152
-
1153
- // =============================================================================
1154
- // Stack Overflow Special Handling
1155
- // =============================================================================
1156
-
1157
- interface SOQuestion {
1158
- title: string;
1159
- body: string;
1160
- score: number;
1161
- owner: { display_name: string };
1162
- creation_date: number;
1163
- tags: string[];
1164
- answer_count: number;
1165
- is_answered: boolean;
1166
- }
1167
-
1168
- interface SOAnswer {
1169
- body: string;
1170
- score: number;
1171
- is_accepted: boolean;
1172
- owner: { display_name: string };
1173
- creation_date: number;
1174
- }
1175
-
1176
- /**
1177
- * Convert basic HTML to markdown (for SO bodies)
1178
- */
1179
- function htmlToBasicMarkdown(html: string): string {
1180
- return html
1181
- .replace(/<pre><code[^>]*>/g, "\n```\n")
1182
- .replace(/<\/code><\/pre>/g, "\n```\n")
1183
- .replace(/<code>/g, "`")
1184
- .replace(/<\/code>/g, "`")
1185
- .replace(/<strong>/g, "**")
1186
- .replace(/<\/strong>/g, "**")
1187
- .replace(/<em>/g, "*")
1188
- .replace(/<\/em>/g, "*")
1189
- .replace(/<a href="([^"]+)"[^>]*>([^<]+)<\/a>/g, "[$2]($1)")
1190
- .replace(/<p>/g, "\n\n")
1191
- .replace(/<\/p>/g, "")
1192
- .replace(/<br\s*\/?>/g, "\n")
1193
- .replace(/<li>/g, "- ")
1194
- .replace(/<\/li>/g, "\n")
1195
- .replace(/<\/?[uo]l>/g, "\n")
1196
- .replace(/<h(\d)>/g, (_, n) => `\n${"#".repeat(parseInt(n, 10))} `)
1197
- .replace(/<\/h\d>/g, "\n")
1198
- .replace(/<blockquote>/g, "\n> ")
1199
- .replace(/<\/blockquote>/g, "\n")
1200
- .replace(/<[^>]+>/g, "") // Strip remaining tags
1201
- .replace(/&lt;/g, "<")
1202
- .replace(/&gt;/g, ">")
1203
- .replace(/&amp;/g, "&")
1204
- .replace(/&quot;/g, '"')
1205
- .replace(/&#39;/g, "'")
1206
- .replace(/\n{3,}/g, "\n\n")
1207
- .trim();
1208
- }
1209
-
1210
- /**
1211
- * Handle Stack Overflow URLs via API
1212
- */
1213
- async function handleStackOverflow(url: string, timeout: number): Promise<RenderResult | null> {
1214
- try {
1215
- const parsed = new URL(url);
1216
- if (!parsed.hostname.includes("stackoverflow.com") && !parsed.hostname.includes("stackexchange.com")) {
1217
- return null;
1218
- }
1219
-
1220
- // Extract question ID from URL patterns like /questions/12345/...
1221
- const match = parsed.pathname.match(/\/questions\/(\d+)/);
1222
- if (!match) return null;
1223
-
1224
- const questionId = match[1];
1225
- const site = parsed.hostname.includes("stackoverflow") ? "stackoverflow" : parsed.hostname.split(".")[0];
1226
- const fetchedAt = new Date().toISOString();
1227
-
1228
- // Fetch question with answers
1229
- const apiUrl = `https://api.stackexchange.com/2.3/questions/${questionId}?order=desc&sort=votes&site=${site}&filter=withbody`;
1230
- const qResult = await loadPage(apiUrl, { timeout });
1231
-
1232
- if (!qResult.ok) return null;
1233
-
1234
- const qData = JSON.parse(qResult.content) as { items: SOQuestion[] };
1235
- if (!qData.items?.length) return null;
1236
-
1237
- const question = qData.items[0];
1238
-
1239
- let md = `# ${question.title}\n\n`;
1240
- md += `**Score:** ${question.score} · **Answers:** ${question.answer_count}`;
1241
- md += question.is_answered ? " (Answered)" : "";
1242
- md += `\n**Tags:** ${question.tags.join(", ")}\n`;
1243
- md += `**Asked by:** ${question.owner.display_name} · ${new Date(question.creation_date * 1000).toISOString().split("T")[0]}\n\n`;
1244
- md += `---\n\n## Question\n\n${htmlToBasicMarkdown(question.body)}\n\n`;
1245
-
1246
- // Fetch answers
1247
- const aUrl = `https://api.stackexchange.com/2.3/questions/${questionId}/answers?order=desc&sort=votes&site=${site}&filter=withbody`;
1248
- const aResult = await loadPage(aUrl, { timeout });
1249
-
1250
- if (aResult.ok) {
1251
- const aData = JSON.parse(aResult.content) as { items: SOAnswer[] };
1252
- if (aData.items?.length) {
1253
- md += `---\n\n## Answers\n\n`;
1254
- for (const answer of aData.items.slice(0, 5)) {
1255
- const accepted = answer.is_accepted ? " (Accepted)" : "";
1256
- md += `### Score: ${answer.score}${accepted} · by ${answer.owner.display_name}\n\n`;
1257
- md += `${htmlToBasicMarkdown(answer.body)}\n\n---\n\n`;
1258
- }
1259
- }
1260
- }
1261
-
1262
- const output = finalizeOutput(md);
1263
- return {
1264
- url,
1265
- finalUrl: url,
1266
- contentType: "text/markdown",
1267
- method: "stackoverflow",
1268
- content: output.content,
1269
- fetchedAt,
1270
- truncated: output.truncated,
1271
- notes: ["Fetched via Stack Exchange API"],
1272
- };
1273
- } catch {}
1274
-
1275
- return null;
1276
- }
1277
-
1278
- // =============================================================================
1279
- // Wikipedia Special Handling
1280
- // =============================================================================
1281
-
1282
- /**
1283
- * Handle Wikipedia URLs via API
1284
- */
1285
- async function handleWikipedia(url: string, timeout: number): Promise<RenderResult | null> {
1286
- try {
1287
- const parsed = new URL(url);
1288
- // Match *.wikipedia.org
1289
- const wikiMatch = parsed.hostname.match(/^(\w+)\.wikipedia\.org$/);
1290
- if (!wikiMatch) return null;
1291
-
1292
- const lang = wikiMatch[1];
1293
- const titleMatch = parsed.pathname.match(/\/wiki\/(.+)/);
1294
- if (!titleMatch) return null;
1295
-
1296
- const title = decodeURIComponent(titleMatch[1]);
1297
- const fetchedAt = new Date().toISOString();
1298
-
1299
- // Use Wikipedia API to get plain text extract
1300
- const apiUrl = `https://${lang}.wikipedia.org/api/rest_v1/page/summary/${encodeURIComponent(title)}`;
1301
- const summaryResult = await loadPage(apiUrl, { timeout });
1302
-
1303
- let md = "";
1304
-
1305
- if (summaryResult.ok) {
1306
- const summary = JSON.parse(summaryResult.content) as {
1307
- title: string;
1308
- description?: string;
1309
- extract: string;
1310
- };
1311
- md = `# ${summary.title}\n\n`;
1312
- if (summary.description) md += `*${summary.description}*\n\n`;
1313
- md += `${summary.extract}\n\n---\n\n`;
1314
- }
1315
-
1316
- // Get full article content via mobile-html or parse API
1317
- const contentUrl = `https://${lang}.wikipedia.org/api/rest_v1/page/mobile-html/${encodeURIComponent(title)}`;
1318
- const contentResult = await loadPage(contentUrl, { timeout });
1319
-
1320
- if (contentResult.ok) {
1321
- const doc = parseHtml(contentResult.content);
1322
-
1323
- // Extract main content sections
1324
- const sections = doc.querySelectorAll("section");
1325
- for (const section of sections) {
1326
- const heading = section.querySelector("h2, h3, h4");
1327
- const headingText = heading?.text?.trim();
1328
-
1329
- // Skip certain sections
1330
- if (
1331
- headingText &&
1332
- ["References", "External links", "See also", "Notes", "Further reading"].includes(headingText)
1333
- ) {
1334
- continue;
1335
- }
1336
-
1337
- if (headingText) {
1338
- const level = heading?.tagName === "H2" ? "##" : "###";
1339
- md += `${level} ${headingText}\n\n`;
1340
- }
1341
-
1342
- const paragraphs = section.querySelectorAll("p");
1343
- for (const p of paragraphs) {
1344
- const text = p.text?.trim();
1345
- if (text && text.length > 20) {
1346
- md += `${text}\n\n`;
1347
- }
1348
- }
1349
- }
1350
- }
1351
-
1352
- if (!md) return null;
1353
-
1354
- const output = finalizeOutput(md);
1355
- return {
1356
- url,
1357
- finalUrl: url,
1358
- contentType: "text/markdown",
1359
- method: "wikipedia",
1360
- content: output.content,
1361
- fetchedAt,
1362
- truncated: output.truncated,
1363
- notes: ["Fetched via Wikipedia API"],
1364
- };
1365
- } catch {}
1366
-
1367
- return null;
1368
- }
1369
-
1370
- // =============================================================================
1371
- // Reddit Special Handling
1372
- // =============================================================================
1373
-
1374
- interface RedditPost {
1375
- title: string;
1376
- selftext: string;
1377
- author: string;
1378
- score: number;
1379
- num_comments: number;
1380
- created_utc: number;
1381
- subreddit: string;
1382
- url: string;
1383
- is_self: boolean;
1384
- }
1385
-
1386
- interface RedditComment {
1387
- body: string;
1388
- author: string;
1389
- score: number;
1390
- created_utc: number;
1391
- replies?: { data: { children: Array<{ data: RedditComment }> } };
1392
- }
1393
-
1394
- /**
1395
- * Handle Reddit URLs via JSON API
1396
- */
1397
- async function handleReddit(url: string, timeout: number): Promise<RenderResult | null> {
1398
- try {
1399
- const parsed = new URL(url);
1400
- if (!parsed.hostname.includes("reddit.com")) return null;
1401
-
1402
- const fetchedAt = new Date().toISOString();
1403
-
1404
- // Append .json to get JSON response
1405
- let jsonUrl = `${url.replace(/\/$/, "")}.json`;
1406
- if (parsed.search) {
1407
- jsonUrl = `${url.replace(/\/$/, "").replace(parsed.search, "")}.json${parsed.search}`;
1408
- }
1409
-
1410
- const result = await loadPage(jsonUrl, { timeout });
1411
- if (!result.ok) return null;
1412
-
1413
- const data = JSON.parse(result.content);
1414
- let md = "";
1415
-
1416
- // Handle different Reddit URL types
1417
- if (Array.isArray(data) && data.length >= 1) {
1418
- // Post page (with comments)
1419
- const postData = data[0]?.data?.children?.[0]?.data as RedditPost | undefined;
1420
- if (postData) {
1421
- md = `# ${postData.title}\n\n`;
1422
- md += `**r/${postData.subreddit}** · u/${postData.author} · ${postData.score} points · ${postData.num_comments} comments\n`;
1423
- md += `*${new Date(postData.created_utc * 1000).toISOString().split("T")[0]}*\n\n`;
1424
-
1425
- if (postData.is_self && postData.selftext) {
1426
- md += `---\n\n${postData.selftext}\n\n`;
1427
- } else if (!postData.is_self) {
1428
- md += `**Link:** ${postData.url}\n\n`;
1429
- }
1430
-
1431
- // Add comments if available
1432
- if (data.length >= 2 && data[1]?.data?.children) {
1433
- md += `---\n\n## Top Comments\n\n`;
1434
- const comments = data[1].data.children.filter((c: { kind: string }) => c.kind === "t1").slice(0, 10);
1435
-
1436
- for (const { data: comment } of comments as Array<{ data: RedditComment }>) {
1437
- md += `### u/${comment.author} · ${comment.score} points\n\n`;
1438
- md += `${comment.body}\n\n---\n\n`;
1439
- }
1440
- }
1441
- }
1442
- } else if (data?.data?.children) {
1443
- // Subreddit or listing page
1444
- const posts = data.data.children.slice(0, 20) as Array<{ data: RedditPost }>;
1445
- const subreddit = posts[0]?.data?.subreddit;
1446
-
1447
- md = `# r/${subreddit || "Reddit"}\n\n`;
1448
- for (const { data: post } of posts) {
1449
- md += `- **${post.title}** (${post.score} pts, ${post.num_comments} comments)\n`;
1450
- md += ` by u/${post.author}\n\n`;
1451
- }
1452
- }
1453
-
1454
- if (!md) return null;
1455
-
1456
- const output = finalizeOutput(md);
1457
- return {
1458
- url,
1459
- finalUrl: url,
1460
- contentType: "text/markdown",
1461
- method: "reddit",
1462
- content: output.content,
1463
- fetchedAt,
1464
- truncated: output.truncated,
1465
- notes: ["Fetched via Reddit JSON API"],
1466
- };
1467
- } catch {}
1468
-
1469
- return null;
1470
- }
-
- // =============================================================================
- // NPM Special Handling
- // =============================================================================
-
- /**
- * Handle NPM URLs via registry API
- */
- async function handleNpm(url: string, timeout: number): Promise<RenderResult | null> {
- try {
- const parsed = new URL(url);
- if (parsed.hostname !== "www.npmjs.com" && parsed.hostname !== "npmjs.com") return null;
-
- // Extract package name from /package/[scope/]name
- const match = parsed.pathname.match(/^\/package\/(.+?)(?:\/|$)/);
- if (!match) return null;
-
- let packageName = decodeURIComponent(match[1]);
- // Handle scoped packages: /package/@scope/name
- if (packageName.startsWith("@")) {
- const scopeMatch = parsed.pathname.match(/^\/package\/(@[^/]+\/[^/]+)/);
- if (scopeMatch) packageName = decodeURIComponent(scopeMatch[1]);
- }
-
- const fetchedAt = new Date().toISOString();
-
- // Fetch from npm registry - use /latest endpoint for smaller response
- const latestUrl = `https://registry.npmjs.org/${packageName}/latest`;
- const downloadsUrl = `https://api.npmjs.org/downloads/point/last-week/${encodeURIComponent(packageName)}`;
-
- // Fetch package info and download stats in parallel
- const [result, downloadsResult] = await Promise.all([
- loadPage(latestUrl, { timeout }),
- loadPage(downloadsUrl, { timeout: Math.min(timeout, 5) }),
- ]);
-
- if (!result.ok) return null;
-
- // Parse download stats
- let weeklyDownloads: number | null = null;
- if (downloadsResult.ok) {
- try {
- const dlData = JSON.parse(downloadsResult.content) as { downloads?: number };
- weeklyDownloads = dlData.downloads ?? null;
- } catch {}
- }
-
- let pkg: {
- name: string;
- version: string;
- description?: string;
- license?: string;
- homepage?: string;
- repository?: { url: string } | string;
- keywords?: string[];
- maintainers?: Array<{ name: string }>;
- dependencies?: Record<string, string>;
- readme?: string;
- };
-
- try {
- pkg = JSON.parse(result.content);
- } catch {
- return null; // JSON parse failed (truncated response)
- }
-
- let md = `# ${pkg.name}\n\n`;
- if (pkg.description) md += `${pkg.description}\n\n`;
-
- md += `**Latest:** ${pkg.version || "unknown"}`;
- if (pkg.license) md += ` · **License:** ${typeof pkg.license === "string" ? pkg.license : pkg.license}`;
- md += "\n";
- if (weeklyDownloads !== null) {
- const formatted =
- weeklyDownloads >= 1_000_000
- ? `${(weeklyDownloads / 1_000_000).toFixed(1)}M`
- : weeklyDownloads >= 1_000
- ? `${(weeklyDownloads / 1_000).toFixed(1)}K`
- : String(weeklyDownloads);
- md += `**Weekly Downloads:** ${formatted}\n`;
- }
- md += "\n";
-
- if (pkg.homepage) md += `**Homepage:** ${pkg.homepage}\n`;
- const repoUrl = typeof pkg.repository === "string" ? pkg.repository : pkg.repository?.url;
- if (repoUrl) md += `**Repository:** ${repoUrl.replace(/^git\+/, "").replace(/\.git$/, "")}\n`;
- if (pkg.keywords?.length) md += `**Keywords:** ${pkg.keywords.join(", ")}\n`;
- if (pkg.maintainers?.length) md += `**Maintainers:** ${pkg.maintainers.map((m) => m.name).join(", ")}\n`;
-
- if (pkg.dependencies && Object.keys(pkg.dependencies).length > 0) {
- md += `\n## Dependencies\n\n`;
- for (const [dep, version] of Object.entries(pkg.dependencies)) {
- md += `- ${dep}: ${version}\n`;
- }
- }
-
- if (pkg.readme) {
- md += `\n---\n\n## README\n\n${pkg.readme}\n`;
- }
-
- const output = finalizeOutput(md);
- return {
- url,
- finalUrl: url,
- contentType: "text/markdown",
- method: "npm",
- content: output.content,
- fetchedAt,
- truncated: output.truncated,
- notes: ["Fetched via npm registry"],
- };
- } catch {}
-
- return null;
- }
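
Both endpoints used here are public and unauthenticated: `registry.npmjs.org/<pkg>/latest` for metadata and `api.npmjs.org/downloads/point/last-week/<pkg>` for download counts. A reduced sketch of the same parallel fetch, assuming global `fetch`; `getNpmSummary` is illustrative, not part of this package:

```ts
async function getNpmSummary(name: string): Promise<{ version: string; weeklyDownloads: number | null } | null> {
  // Fire both requests at once, as the handler above does with Promise.all.
  const [pkgRes, dlRes] = await Promise.all([
    fetch(`https://registry.npmjs.org/${name}/latest`),
    fetch(`https://api.npmjs.org/downloads/point/last-week/${encodeURIComponent(name)}`),
  ]);
  if (!pkgRes.ok) return null;
  const pkg = (await pkgRes.json()) as { version: string };
  // Download stats are best-effort; a failed stats call should not sink the lookup.
  const weeklyDownloads = dlRes.ok ? ((await dlRes.json()) as { downloads?: number }).downloads ?? null : null;
  return { version: pkg.version, weeklyDownloads };
}
```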
-
- // =============================================================================
- // Crates.io Special Handling
- // =============================================================================
-
- /**
- * Handle crates.io URLs via API
- */
- async function handleCratesIo(url: string, timeout: number): Promise<RenderResult | null> {
- try {
- const parsed = new URL(url);
- if (parsed.hostname !== "crates.io" && parsed.hostname !== "www.crates.io") return null;
-
- // Extract crate name from /crates/name or /crates/name/version
- const match = parsed.pathname.match(/^\/crates\/([^/]+)/);
- if (!match) return null;
-
- const crateName = decodeURIComponent(match[1]);
- const fetchedAt = new Date().toISOString();
-
- // Fetch from crates.io API
- const apiUrl = `https://crates.io/api/v1/crates/${crateName}`;
- const result = await loadPage(apiUrl, {
- timeout,
- headers: { "User-Agent": "omp-web-fetch/1.0 (https://github.com/anthropics)" },
- });
-
- if (!result.ok) return null;
-
- let data: {
- crate: {
- name: string;
- description: string | null;
- downloads: number;
- recent_downloads: number;
- max_version: string;
- repository: string | null;
- homepage: string | null;
- documentation: string | null;
- categories: string[];
- keywords: string[];
- created_at: string;
- updated_at: string;
- };
- versions: Array<{
- num: string;
- downloads: number;
- created_at: string;
- license: string | null;
- rust_version: string | null;
- }>;
- };
-
- try {
- data = JSON.parse(result.content);
- } catch {
- return null;
- }
-
- const crate = data.crate;
- const latestVersion = data.versions?.[0];
-
- // Format download counts
- const formatDownloads = (n: number): string =>
- n >= 1_000_000 ? `${(n / 1_000_000).toFixed(1)}M` : n >= 1_000 ? `${(n / 1_000).toFixed(1)}K` : String(n);
-
- let md = `# ${crate.name}\n\n`;
- if (crate.description) md += `${crate.description}\n\n`;
-
- md += `**Latest:** ${crate.max_version}`;
- if (latestVersion?.license) md += ` · **License:** ${latestVersion.license}`;
- if (latestVersion?.rust_version) md += ` · **MSRV:** ${latestVersion.rust_version}`;
- md += "\n";
- md += `**Downloads:** ${formatDownloads(crate.downloads)} total · ${formatDownloads(crate.recent_downloads)} recent\n\n`;
-
- if (crate.repository) md += `**Repository:** ${crate.repository}\n`;
- if (crate.homepage && crate.homepage !== crate.repository) md += `**Homepage:** ${crate.homepage}\n`;
- if (crate.documentation) md += `**Docs:** ${crate.documentation}\n`;
- if (crate.keywords?.length) md += `**Keywords:** ${crate.keywords.join(", ")}\n`;
- if (crate.categories?.length) md += `**Categories:** ${crate.categories.join(", ")}\n`;
-
- // Show recent versions
- if (data.versions?.length > 0) {
- md += `\n## Recent Versions\n\n`;
- for (const ver of data.versions.slice(0, 5)) {
- const date = ver.created_at.split("T")[0];
- md += `- **${ver.num}** (${date}) - ${formatDownloads(ver.downloads)} downloads\n`;
- }
- }
-
- // Try to fetch README from docs.rs or repository
- const docsRsUrl = `https://docs.rs/crate/${crateName}/${crate.max_version}/source/README.md`;
- const readmeResult = await loadPage(docsRsUrl, { timeout: Math.min(timeout, 5) });
- if (readmeResult.ok && readmeResult.content.length > 100 && !looksLikeHtml(readmeResult.content)) {
- md += `\n---\n\n## README\n\n${readmeResult.content}\n`;
- }
-
- const output = finalizeOutput(md);
- return {
- url,
- finalUrl: url,
- contentType: "text/markdown",
- method: "crates.io",
- content: output.content,
- fetchedAt,
- truncated: output.truncated,
- notes: ["Fetched via crates.io API"],
- };
- } catch {}
-
- return null;
- }
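
Note the explicit `User-Agent` in the removed handler: crates.io's crawler policy asks API clients to identify themselves, and anonymous requests may be rejected. A reduced sketch of the same lookup, assuming global `fetch`; `getCrateLatest` is illustrative, not part of this package:

```ts
async function getCrateLatest(name: string): Promise<string | null> {
  const res = await fetch(`https://crates.io/api/v1/crates/${name}`, {
    // crates.io expects an identifying User-Agent on API requests.
    headers: { "User-Agent": "example-fetcher/0.1 (contact@example.com)" },
  });
  if (!res.ok) return null;
  const body = (await res.json()) as { crate: { max_version: string } };
  return body.crate.max_version;
}
```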
-
- // =============================================================================
- // arXiv Special Handling
- // =============================================================================
-
- /**
- * Handle arXiv URLs - fetch abstract + optionally PDF
- */
- async function handleArxiv(url: string, timeout: number): Promise<RenderResult | null> {
- try {
- const parsed = new URL(url);
- if (parsed.hostname !== "arxiv.org") return null;
-
- // Extract paper ID from various URL formats
- // /abs/1234.56789, /pdf/1234.56789, /abs/cs/0123456
- const match = parsed.pathname.match(/\/(abs|pdf)\/(.+?)(?:\.pdf)?$/);
- if (!match) return null;
-
- const paperId = match[2];
- const fetchedAt = new Date().toISOString();
- const notes: string[] = [];
-
- // Fetch metadata via arXiv API
- const apiUrl = `https://export.arxiv.org/api/query?id_list=${paperId}`;
- const result = await loadPage(apiUrl, { timeout });
-
- if (!result.ok) return null;
-
- // Parse the Atom feed response
- const doc = parseHtml(result.content, { parseNoneClosedTags: true });
- const entry = doc.querySelector("entry");
-
- if (!entry) return null;
-
- const title = entry.querySelector("title")?.text?.trim()?.replace(/\s+/g, " ");
- const summary = entry.querySelector("summary")?.text?.trim();
- const authors = entry
- .querySelectorAll("author name")
- .map((n) => n.text?.trim())
- .filter(Boolean);
- const published = entry.querySelector("published")?.text?.trim()?.split("T")[0];
- const categories = entry
- .querySelectorAll("category")
- .map((c) => c.getAttribute("term"))
- .filter(Boolean);
- const pdfLink = entry.querySelector('link[title="pdf"]')?.getAttribute("href");
-
- let md = `# ${title || "arXiv Paper"}\n\n`;
- if (authors.length) md += `**Authors:** ${authors.join(", ")}\n`;
- if (published) md += `**Published:** ${published}\n`;
- if (categories.length) md += `**Categories:** ${categories.join(", ")}\n`;
- md += `**arXiv:** ${paperId}\n\n`;
- md += `---\n\n## Abstract\n\n${summary || "No abstract available."}\n\n`;
-
- // If it was a PDF link or we want full content, try to fetch and convert PDF
- if (match[1] === "pdf" || parsed.pathname.includes(".pdf")) {
- if (pdfLink) {
- notes.push("Fetching PDF for full content...");
- const pdfResult = await fetchBinary(pdfLink, timeout);
- if (pdfResult.ok) {
- const converted = await convertWithMarkitdown(pdfResult.buffer, ".pdf", timeout);
- if (converted.ok && converted.content.length > 500) {
- md += `---\n\n## Full Paper\n\n${converted.content}\n`;
- notes.push("PDF converted via markitdown");
- }
- }
- }
- }
-
- const output = finalizeOutput(md);
- return {
- url,
- finalUrl: url,
- contentType: "text/markdown",
- method: "arxiv",
- content: output.content,
- fetchedAt,
- truncated: output.truncated,
- notes: notes.length ? notes : ["Fetched via arXiv API"],
- };
- } catch {}
-
- return null;
- }
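
The arXiv export API (`export.arxiv.org/api/query?id_list=<id>`) returns an Atom feed, which the removed handler parses with the same HTML parser used elsewhere in this file. A dependency-free sketch, assuming global `fetch`; the regex is a stand-in for a proper feed parser, and `getArxivTitle` is illustrative:

```ts
async function getArxivTitle(paperId: string): Promise<string | null> {
  const res = await fetch(`https://export.arxiv.org/api/query?id_list=${paperId}`);
  if (!res.ok) return null;
  const atom = await res.text();
  // Take the <title> inside <entry>, skipping the feed-level <title>.
  const m = atom.match(/<entry>[\s\S]*?<title>([\s\S]*?)<\/title>/);
  return m ? m[1].replace(/\s+/g, " ").trim() : null;
}
```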
-
- // =============================================================================
- // IACR ePrint Special Handling
- // =============================================================================
-
- /**
- * Handle IACR Cryptology ePrint Archive URLs
- */
- async function handleIacr(url: string, timeout: number): Promise<RenderResult | null> {
- try {
- const parsed = new URL(url);
- if (parsed.hostname !== "eprint.iacr.org") return null;
-
- // Extract paper ID from /year/number or /year/number.pdf
- const match = parsed.pathname.match(/\/(\d{4})\/(\d+)(?:\.pdf)?$/);
- if (!match) return null;
-
- const [, year, number] = match;
- const paperId = `${year}/${number}`;
- const fetchedAt = new Date().toISOString();
- const notes: string[] = [];
-
- // Fetch the HTML page for metadata
- const pageUrl = `https://eprint.iacr.org/${paperId}`;
- const result = await loadPage(pageUrl, { timeout });
-
- if (!result.ok) return null;
-
- const doc = parseHtml(result.content);
-
- // Extract metadata from the page
- const title =
- doc.querySelector("h3.mb-3")?.text?.trim() ||
- doc.querySelector('meta[name="citation_title"]')?.getAttribute("content");
- const authors = doc
- .querySelectorAll('meta[name="citation_author"]')
- .map((m) => m.getAttribute("content"))
- .filter(Boolean);
- // Abstract is in <p> after <h5>Abstract</h5>
- const abstractHeading = doc.querySelectorAll("h5").find((h) => h.text?.includes("Abstract"));
- const abstract =
- abstractHeading?.parentNode?.querySelector("p")?.text?.trim() ||
- doc.querySelector('meta[name="description"]')?.getAttribute("content");
- const keywords = doc.querySelector(".keywords")?.text?.replace("Keywords:", "").trim();
- const pubDate = doc.querySelector('meta[name="citation_publication_date"]')?.getAttribute("content");
-
- let md = `# ${title || "IACR ePrint Paper"}\n\n`;
- if (authors.length) md += `**Authors:** ${authors.join(", ")}\n`;
- if (pubDate) md += `**Date:** ${pubDate}\n`;
- md += `**ePrint:** ${paperId}\n`;
- if (keywords) md += `**Keywords:** ${keywords}\n`;
- md += `\n---\n\n## Abstract\n\n${abstract || "No abstract available."}\n\n`;
-
- // If it was a PDF link, try to fetch and convert PDF
- if (parsed.pathname.endsWith(".pdf")) {
- const pdfUrl = `https://eprint.iacr.org/${paperId}.pdf`;
- notes.push("Fetching PDF for full content...");
- const pdfResult = await fetchBinary(pdfUrl, timeout);
- if (pdfResult.ok) {
- const converted = await convertWithMarkitdown(pdfResult.buffer, ".pdf", timeout);
- if (converted.ok && converted.content.length > 500) {
- md += `---\n\n## Full Paper\n\n${converted.content}\n`;
- notes.push("PDF converted via markitdown");
- }
- }
- }
-
- const output = finalizeOutput(md);
- return {
- url,
- finalUrl: url,
- contentType: "text/markdown",
- method: "iacr",
- content: output.content,
- fetchedAt,
- truncated: output.truncated,
- notes: notes.length ? notes : ["Fetched from IACR ePrint Archive"],
- };
- } catch {}
-
- return null;
- }
-
- // =============================================================================
- // GitHub Gist Special Handling
- // =============================================================================
-
- /**
- * Handle GitHub Gist URLs via API
- */
- async function handleGitHubGist(url: string, timeout: number): Promise<RenderResult | null> {
- try {
- const parsed = new URL(url);
- if (parsed.hostname !== "gist.github.com") return null;
-
- // Extract gist ID from /username/gistId or just /gistId
- const parts = parsed.pathname.split("/").filter(Boolean);
- if (parts.length === 0) return null;
-
- // Gist ID is always the last path segment (or only segment for anonymous gists)
- const gistId = parts[parts.length - 1];
- if (!gistId || !/^[a-f0-9]+$/i.test(gistId)) return null;
-
- const fetchedAt = new Date().toISOString();
-
- // Fetch via GitHub API
- const result = await fetchGitHubApi(`/gists/${gistId}`, timeout);
- if (!result.ok || !result.data) return null;
-
- const gist = result.data as {
- description: string | null;
- owner?: { login: string };
- created_at: string;
- updated_at: string;
- files: Record<string, { filename: string; language: string | null; size: number; content: string }>;
- html_url: string;
- };
-
- const files = Object.values(gist.files);
- const owner = gist.owner?.login || "anonymous";
-
- let md = `# Gist by ${owner}\n\n`;
- if (gist.description) md += `${gist.description}\n\n`;
- md += `**Created:** ${gist.created_at} · **Updated:** ${gist.updated_at}\n`;
- md += `**Files:** ${files.length}\n\n`;
-
- for (const file of files) {
- const lang = file.language?.toLowerCase() || "";
- md += `---\n\n## ${file.filename}\n\n`;
- md += `\`\`\`${lang}\n${file.content}\n\`\`\`\n\n`;
- }
-
- const output = finalizeOutput(md);
- return {
- url,
- finalUrl: url,
- contentType: "text/markdown",
- method: "github-gist",
- content: output.content,
- fetchedAt,
- truncated: output.truncated,
- notes: ["Fetched via GitHub API"],
- };
- } catch {}
-
- return null;
- }
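
Under the hood this is GitHub's `GET /gists/{gist_id}` REST endpoint; the handler routes it through the shared `fetchGitHubApi` helper, which presumably adds auth and rate-limit handling. A direct, unauthenticated sketch assuming global `fetch`; `listGistFiles` is illustrative, not part of this package:

```ts
async function listGistFiles(gistId: string): Promise<string[] | null> {
  const res = await fetch(`https://api.github.com/gists/${gistId}`, {
    headers: { Accept: "application/vnd.github+json" },
  });
  if (!res.ok) return null;
  const gist = (await res.json()) as { files: Record<string, { filename: string }> };
  return Object.values(gist.files).map((f) => f.filename);
}
```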
-
- // =============================================================================
- // Unified Special Handler Dispatch
- // =============================================================================
-
- /**
- * Try all special handlers
- */
- async function handleSpecialUrls(url: string, timeout: number): Promise<RenderResult | null> {
- // Order matters - more specific first
- return (
- (await handleGitHubGist(url, timeout)) ||
- (await handleGitHub(url, timeout)) ||
- (await handleTwitter(url, timeout)) ||
- (await handleStackOverflow(url, timeout)) ||
- (await handleWikipedia(url, timeout)) ||
- (await handleNpm(url, timeout)) ||
- (await handleReddit(url, timeout)) ||
- (await handleCratesIo(url, timeout)) ||
- (await handleArxiv(url, timeout)) ||
- (await handleIacr(url, timeout))
- );
- }
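
The `||` chain evaluates handlers strictly in sequence; each returns `null` cheaply after its hostname check, so a miss costs little more than a `URL` parse. Per the "more specific first" comment, `handleGitHubGist` runs before `handleGitHub`, presumably so a broad github.com match cannot swallow gist.github.com URLs. An equivalent table-driven sketch using the same handler names and `RenderResult` type from this file (`dispatchSpecial` is illustrative):

```ts
type SpecialHandler = (url: string, timeout: number) => Promise<RenderResult | null>;

// Same ordering as the || chain above: more specific hosts first.
const specialHandlers: SpecialHandler[] = [
  handleGitHubGist,
  handleGitHub,
  handleTwitter,
  handleStackOverflow,
  handleWikipedia,
  handleReddit,
  handleNpm,
  handleCratesIo,
  handleArxiv,
  handleIacr,
];

async function dispatchSpecial(url: string, timeout: number): Promise<RenderResult | null> {
  for (const handler of specialHandlers) {
    const result = await handler(url, timeout);
    if (result) return result; // first non-null wins, exactly like ||
  }
  return null;
}
```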
-
- // =============================================================================
- // Main Render Function
- // =============================================================================
-
- /**
- * Main render function implementing the full pipeline
- */
- async function renderUrl(url: string, timeout: number, raw: boolean = false): Promise<RenderResult> {
- const notes: string[] = [];
- const fetchedAt = new Date().toISOString();
-
- // Step 0: Normalize URL (ensure scheme for special handlers)
- url = normalizeUrl(url);
- const origin = getOrigin(url);
-
- // Step 1: Try special handlers for known sites (unless raw mode)
- if (!raw) {
- const specialResult = await handleSpecialUrls(url, timeout);
- if (specialResult) return specialResult;
- }
-
- // Step 2: Fetch page
- const response = await loadPage(url, { timeout });
- if (!response.ok) {
- return {
- url,
- finalUrl: url,
- contentType: "unknown",
- method: "failed",
- content: "",
- fetchedAt,
- truncated: false,
- notes: ["Failed to fetch URL"],
- };
- }
-
- const { finalUrl, content: rawContent } = response;
- const mime = normalizeMime(response.contentType);
- const extHint = getExtensionHint(finalUrl);
-
- // Step 3: Handle convertible binary files (PDF, DOCX, etc.)
- if (isConvertible(mime, extHint)) {
- const binary = await fetchBinary(finalUrl, timeout);
- if (binary.ok) {
- const ext = getExtensionHint(finalUrl, binary.contentDisposition) || extHint;
- const converted = await convertWithMarkitdown(binary.buffer, ext, timeout);
- if (converted.ok && converted.content.trim().length > 50) {
- notes.push(`Converted with markitdown`);
- const output = finalizeOutput(converted.content);
- return {
- url,
- finalUrl,
- contentType: mime,
- method: "markitdown",
- content: output.content,
- fetchedAt,
- truncated: output.truncated,
- notes,
- };
- }
- }
- notes.push("markitdown conversion failed");
- }
-
- // Step 4: Handle non-HTML text content
- const isHtml = mime.includes("html") || mime.includes("xhtml");
- const isJson = mime.includes("json");
- const isXml = mime.includes("xml") && !isHtml;
- const isText = mime.includes("text/plain") || mime.includes("text/markdown");
- const isFeed = mime.includes("rss") || mime.includes("atom") || mime.includes("feed");
-
- if (isJson) {
- const output = finalizeOutput(formatJson(rawContent));
- return {
- url,
- finalUrl,
- contentType: mime,
- method: "json",
- content: output.content,
- fetchedAt,
- truncated: output.truncated,
- notes,
- };
- }
-
- if (isFeed || (isXml && (rawContent.includes("<rss") || rawContent.includes("<feed")))) {
- const parsed = parseFeedToMarkdown(rawContent);
- const output = finalizeOutput(parsed);
- return {
- url,
- finalUrl,
- contentType: mime,
- method: "feed",
- content: output.content,
- fetchedAt,
- truncated: output.truncated,
- notes,
- };
- }
-
- if (isText && !looksLikeHtml(rawContent)) {
- const output = finalizeOutput(rawContent);
- return {
- url,
- finalUrl,
- contentType: mime,
- method: "text",
- content: output.content,
- fetchedAt,
- truncated: output.truncated,
- notes,
- };
+ if (isText && !looksLikeHtml(rawContent)) {
+ const output = finalizeOutput(rawContent);
+ return {
+ url,
+ finalUrl,
+ contentType: mime,
+ method: "text",
+ content: output.content,
+ fetchedAt,
+ truncated: output.truncated,
+ notes,
+ };
  }
 
  // Step 5: For HTML, try digestible formats first (unless raw mode)
@@ -2071,7 +640,7 @@ async function renderUrl(url: string, timeout: number, raw: boolean = false): Pr
  const markdownAlt = alternates.find((alt) => alt.endsWith(".md") || alt.includes("markdown"));
  if (markdownAlt) {
  const resolved = markdownAlt.startsWith("http") ? markdownAlt : new URL(markdownAlt, finalUrl).href;
- const altResult = await loadPage(resolved, { timeout });
+ const altResult = await loadPage(resolved, { timeout, signal });
  if (altResult.ok && altResult.content.trim().length > 100 && !looksLikeHtml(altResult.content)) {
  notes.push(`Used markdown alternate: ${resolved}`);
  const output = finalizeOutput(altResult.content);
@@ -2089,7 +658,7 @@ async function renderUrl(url: string, timeout: number, raw: boolean = false): Pr
  }
 
  // 5B: Try URL.md suffix (llms.txt convention)
- const mdSuffix = await tryMdSuffix(finalUrl, timeout);
+ const mdSuffix = await tryMdSuffix(finalUrl, timeout, signal);
  if (mdSuffix) {
  notes.push("Found .md suffix version");
  const output = finalizeOutput(mdSuffix);
@@ -2106,7 +675,7 @@ async function renderUrl(url: string, timeout: number, raw: boolean = false): Pr
  }
 
  // 5C: LLM-friendly endpoints
- const llmContent = await tryLlmEndpoints(origin, timeout);
+ const llmContent = await tryLlmEndpoints(origin, timeout, signal);
  if (llmContent) {
  notes.push("Found llms.txt");
  const output = finalizeOutput(llmContent);
@@ -2123,7 +692,7 @@ async function renderUrl(url: string, timeout: number, raw: boolean = false): Pr
  }
 
  // 5D: Content negotiation
- const negotiated = await tryContentNegotiation(url, timeout);
+ const negotiated = await tryContentNegotiation(url, timeout, signal);
  if (negotiated) {
  notes.push(`Content negotiation returned ${negotiated.type}`);
  const output = finalizeOutput(negotiated.content);
@@ -2143,7 +712,7 @@ async function renderUrl(url: string, timeout: number, raw: boolean = false): Pr
  const feedAlternates = alternates.filter((alt) => !alt.endsWith(".md") && !alt.includes("markdown"));
  for (const altUrl of feedAlternates.slice(0, 2)) {
  const resolved = altUrl.startsWith("http") ? altUrl : new URL(altUrl, finalUrl).href;
- const altResult = await loadPage(resolved, { timeout });
+ const altResult = await loadPage(resolved, { timeout, signal });
  if (altResult.ok && altResult.content.trim().length > 200) {
  notes.push(`Used feed alternate: ${resolved}`);
  const parsed = parseFeedToMarkdown(altResult.content);
@@ -2161,25 +730,14 @@ async function renderUrl(url: string, timeout: number, raw: boolean = false): Pr
  }
  }
 
- // Step 6: Render HTML with lynx
- if (!hasCommand("lynx")) {
- notes.push("lynx not installed");
- const output = finalizeOutput(rawContent);
- return {
- url,
- finalUrl,
- contentType: mime,
- method: "raw-html",
- content: output.content,
- fetchedAt,
- truncated: output.truncated,
- notes,
- };
+ if (signal?.aborted) {
+ throw new Error("Operation aborted");
  }
 
- const lynxResult = await renderWithLynx(rawContent, timeout);
- if (!lynxResult.ok) {
- notes.push("lynx failed");
+ // Step 6: Render HTML with lynx or html2text
+ const htmlResult = await renderHtmlToText(rawContent, timeout);
+ if (!htmlResult.ok) {
+ notes.push("html rendering failed (lynx/html2text unavailable)");
  const output = finalizeOutput(rawContent);
  return {
  url,
@@ -2194,15 +752,15 @@ async function renderUrl(url: string, timeout: number, raw: boolean = false): Pr
  }
 
  // Step 7: If lynx output is low quality, try extracting document links
- if (isLowQualityOutput(lynxResult.content)) {
+ if (isLowQualityOutput(htmlResult.content)) {
  const docLinks = extractDocumentLinks(rawContent, finalUrl);
  if (docLinks.length > 0) {
  const docUrl = docLinks[0];
- const binary = await fetchBinary(docUrl, timeout);
+ const binary = await fetchBinary(docUrl, timeout, signal);
  if (binary.ok) {
  const ext = getExtensionHint(docUrl, binary.contentDisposition);
- const converted = await convertWithMarkitdown(binary.buffer, ext, timeout);
- if (converted.ok && converted.content.trim().length > lynxResult.content.length) {
+ const converted = await convertWithMarkitdown(binary.buffer, ext, timeout, signal);
+ if (converted.ok && converted.content.trim().length > htmlResult.content.length) {
  notes.push(`Extracted and converted document: ${docUrl}`);
  const output = finalizeOutput(converted.content);
  return {
@@ -2216,17 +774,22 @@ async function renderUrl(url: string, timeout: number, raw: boolean = false): Pr
  notes,
  };
  }
+ if (!converted.ok && converted.error) {
+ notes.push(`markitdown conversion failed: ${converted.error}`);
+ }
+ } else if (binary.error) {
+ notes.push(`Binary fetch failed: ${binary.error}`);
  }
  }
  notes.push("Page appears to require JavaScript or is mostly navigation");
  }
 
- const output = finalizeOutput(lynxResult.content);
+ const output = finalizeOutput(htmlResult.content);
  return {
  url,
  finalUrl,
  contentType: mime,
- method: "lynx",
+ method: htmlResult.method,
  content: output.content,
  fetchedAt,
  truncated: output.truncated,
@@ -2278,11 +841,16 @@ export function createWebFetchTool(_session: ToolSession): AgentTool<typeof webF
  execute: async (
  _toolCallId: string,
  { url, timeout = DEFAULT_TIMEOUT, raw = false }: { url: string; timeout?: number; raw?: boolean },
+ signal?: AbortSignal,
  ) => {
+ if (signal?.aborted) {
+ throw new Error("Operation aborted");
+ }
+
  // Clamp timeout
  const effectiveTimeout = Math.min(Math.max(timeout, 1), 120);
 
- const result = await renderUrl(url, effectiveTimeout, raw);
+ const result = await renderUrl(url, effectiveTimeout, raw, signal);
 
  // Format output
  let output = "";
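
This hunk is where the new cancellation support enters: `execute` now accepts an optional `AbortSignal`, checks it before starting, and threads it through `renderUrl` into every network helper (the `loadPage`/`fetchBinary`/`tryMdSuffix` changes above). A sketch of the pattern, assuming Node 18+ `fetch` and Node 20.3+ for `AbortSignal.any`; `fetchTextWithSignal` is illustrative, not part of this package:

```ts
async function fetchTextWithSignal(url: string, timeoutSec: number, signal?: AbortSignal): Promise<string> {
  // Cheap early exit at each pipeline checkpoint, mirroring the diff's aborted checks.
  if (signal?.aborted) {
    throw new Error("Operation aborted");
  }
  // Combine the caller's signal with a per-request timeout.
  const combined = AbortSignal.any([AbortSignal.timeout(timeoutSec * 1000), ...(signal ? [signal] : [])]);
  const res = await fetch(url, { signal: combined });
  return res.text();
}
```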
@@ -2319,11 +887,6 @@ export function createWebFetchTool(_session: ToolSession): AgentTool<typeof webF
  // TUI Rendering
  // =============================================================================
 
- import type { Component } from "@oh-my-pi/pi-tui";
- import { Text } from "@oh-my-pi/pi-tui";
- import { type Theme, theme } from "../../modes/interactive/theme/theme";
- import type { RenderResultOptions } from "../custom-tools/types";
-
  /** Truncate text to max length with ellipsis */
  function truncate(text: string, maxLen: number, ellipsis: string): string {
  if (text.length <= maxLen) return text;