@oh-my-pi/pi-coding-agent 3.25.0 → 3.30.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. package/CHANGELOG.md +19 -0
  2. package/package.json +4 -4
  3. package/src/core/tools/complete.ts +2 -4
  4. package/src/core/tools/jtd-to-json-schema.ts +174 -196
  5. package/src/core/tools/read.ts +4 -4
  6. package/src/core/tools/task/executor.ts +146 -20
  7. package/src/core/tools/task/name-generator.ts +1544 -214
  8. package/src/core/tools/task/types.ts +19 -5
  9. package/src/core/tools/task/worker.ts +103 -13
  10. package/src/core/tools/web-fetch-handlers/academic.test.ts +239 -0
  11. package/src/core/tools/web-fetch-handlers/artifacthub.ts +210 -0
  12. package/src/core/tools/web-fetch-handlers/arxiv.ts +84 -0
  13. package/src/core/tools/web-fetch-handlers/aur.ts +171 -0
  14. package/src/core/tools/web-fetch-handlers/biorxiv.ts +136 -0
  15. package/src/core/tools/web-fetch-handlers/bluesky.ts +277 -0
  16. package/src/core/tools/web-fetch-handlers/brew.ts +173 -0
  17. package/src/core/tools/web-fetch-handlers/business.test.ts +82 -0
  18. package/src/core/tools/web-fetch-handlers/cheatsh.ts +73 -0
  19. package/src/core/tools/web-fetch-handlers/chocolatey.ts +153 -0
  20. package/src/core/tools/web-fetch-handlers/coingecko.ts +179 -0
  21. package/src/core/tools/web-fetch-handlers/crates-io.ts +123 -0
  22. package/src/core/tools/web-fetch-handlers/dev-platforms.test.ts +254 -0
  23. package/src/core/tools/web-fetch-handlers/devto.ts +173 -0
  24. package/src/core/tools/web-fetch-handlers/discogs.ts +303 -0
  25. package/src/core/tools/web-fetch-handlers/dockerhub.ts +156 -0
  26. package/src/core/tools/web-fetch-handlers/documentation.test.ts +85 -0
  27. package/src/core/tools/web-fetch-handlers/finance-media.test.ts +144 -0
  28. package/src/core/tools/web-fetch-handlers/git-hosting.test.ts +272 -0
  29. package/src/core/tools/web-fetch-handlers/github-gist.ts +64 -0
  30. package/src/core/tools/web-fetch-handlers/github.ts +424 -0
  31. package/src/core/tools/web-fetch-handlers/gitlab.ts +444 -0
  32. package/src/core/tools/web-fetch-handlers/go-pkg.ts +271 -0
  33. package/src/core/tools/web-fetch-handlers/hackage.ts +89 -0
  34. package/src/core/tools/web-fetch-handlers/hackernews.ts +208 -0
  35. package/src/core/tools/web-fetch-handlers/hex.ts +121 -0
  36. package/src/core/tools/web-fetch-handlers/huggingface.ts +385 -0
  37. package/src/core/tools/web-fetch-handlers/iacr.ts +82 -0
  38. package/src/core/tools/web-fetch-handlers/index.ts +69 -0
  39. package/src/core/tools/web-fetch-handlers/lobsters.ts +186 -0
  40. package/src/core/tools/web-fetch-handlers/mastodon.ts +302 -0
  41. package/src/core/tools/web-fetch-handlers/maven.ts +147 -0
  42. package/src/core/tools/web-fetch-handlers/mdn.ts +174 -0
  43. package/src/core/tools/web-fetch-handlers/media.test.ts +138 -0
  44. package/src/core/tools/web-fetch-handlers/metacpan.ts +247 -0
  45. package/src/core/tools/web-fetch-handlers/npm.ts +107 -0
  46. package/src/core/tools/web-fetch-handlers/nuget.ts +201 -0
  47. package/src/core/tools/web-fetch-handlers/nvd.ts +238 -0
  48. package/src/core/tools/web-fetch-handlers/opencorporates.ts +273 -0
  49. package/src/core/tools/web-fetch-handlers/openlibrary.ts +313 -0
  50. package/src/core/tools/web-fetch-handlers/osv.ts +184 -0
  51. package/src/core/tools/web-fetch-handlers/package-managers-2.test.ts +199 -0
  52. package/src/core/tools/web-fetch-handlers/package-managers.test.ts +171 -0
  53. package/src/core/tools/web-fetch-handlers/package-registries.test.ts +259 -0
  54. package/src/core/tools/web-fetch-handlers/packagist.ts +170 -0
  55. package/src/core/tools/web-fetch-handlers/pub-dev.ts +185 -0
  56. package/src/core/tools/web-fetch-handlers/pubmed.ts +174 -0
  57. package/src/core/tools/web-fetch-handlers/pypi.ts +125 -0
  58. package/src/core/tools/web-fetch-handlers/readthedocs.ts +122 -0
  59. package/src/core/tools/web-fetch-handlers/reddit.ts +100 -0
  60. package/src/core/tools/web-fetch-handlers/repology.ts +257 -0
  61. package/src/core/tools/web-fetch-handlers/research.test.ts +107 -0
  62. package/src/core/tools/web-fetch-handlers/rfc.ts +205 -0
  63. package/src/core/tools/web-fetch-handlers/rubygems.ts +112 -0
  64. package/src/core/tools/web-fetch-handlers/sec-edgar.ts +269 -0
  65. package/src/core/tools/web-fetch-handlers/security.test.ts +103 -0
  66. package/src/core/tools/web-fetch-handlers/semantic-scholar.ts +190 -0
  67. package/src/core/tools/web-fetch-handlers/social-extended.test.ts +192 -0
  68. package/src/core/tools/web-fetch-handlers/social.test.ts +259 -0
  69. package/src/core/tools/web-fetch-handlers/spotify.ts +218 -0
  70. package/src/core/tools/web-fetch-handlers/stackexchange.test.ts +120 -0
  71. package/src/core/tools/web-fetch-handlers/stackoverflow.ts +123 -0
  72. package/src/core/tools/web-fetch-handlers/standards.test.ts +122 -0
  73. package/src/core/tools/web-fetch-handlers/terraform.ts +296 -0
  74. package/src/core/tools/web-fetch-handlers/tldr.ts +47 -0
  75. package/src/core/tools/web-fetch-handlers/twitter.ts +84 -0
  76. package/src/core/tools/web-fetch-handlers/types.ts +163 -0
  77. package/src/core/tools/web-fetch-handlers/utils.ts +91 -0
  78. package/src/core/tools/web-fetch-handlers/vimeo.ts +152 -0
  79. package/src/core/tools/web-fetch-handlers/wikidata.ts +349 -0
  80. package/src/core/tools/web-fetch-handlers/wikipedia.test.ts +73 -0
  81. package/src/core/tools/web-fetch-handlers/wikipedia.ts +91 -0
  82. package/src/core/tools/web-fetch-handlers/youtube.test.ts +198 -0
  83. package/src/core/tools/web-fetch-handlers/youtube.ts +319 -0
  84. package/src/core/tools/web-fetch.ts +152 -1324
  85. package/src/utils/tools-manager.ts +110 -8
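The bulk of this release moves the site-specific logic out of web-fetch.ts (+152 −1324) into a new web-fetch-handlers/ directory of per-service modules and tests, and routes external tool lookup through the expanded tools-manager. For orientation, here is a rough sketch of the per-handler contract implied by the handleSpecialUrls dispatch further down; the real definitions live in web-fetch-handlers/types.ts, which this diff does not show, and the RenderResult fields are read off the return objects in the removed code, so treat this as an approximation rather than the package's actual API:

// Sketch only: handler contract inferred from the dispatch chain and the removed handlers.
interface RenderResult {
  url: string;
  finalUrl: string;
  contentType: string;
  method: string;
  content: string;
  fetchedAt: string;
  truncated: boolean;
  notes: string[];
}

type WebFetchHandler = (url: string, timeout: number) => Promise<RenderResult | null>;

// Each module claims its own URLs and returns null for everything else, so the
// dispatcher can chain handlers with ||. "handleExample" and "example.com" are hypothetical.
const handleExample: WebFetchHandler = async (url, _timeout) => {
  if (new URL(url).hostname !== "example.com") return null;
  return {
    url,
    finalUrl: url,
    contentType: "text/markdown",
    method: "example",
    content: "# Example\n",
    fetchedAt: new Date().toISOString(),
    truncated: false,
    notes: ["hypothetical handler, for illustration only"],
  };
};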
@@ -4,8 +4,64 @@ import type { AgentTool } from "@oh-my-pi/pi-agent-core";
  import { Type } from "@sinclair/typebox";
  import { parse as parseHtml } from "node-html-parser";
  import webFetchDescription from "../../prompts/tools/web-fetch.md" with { type: "text" };
+ import { ensureTool } from "../../utils/tools-manager";
  import { logger } from "../logger";
  import type { ToolSession } from "./index";
+ import {
+ handleArtifactHub,
+ handleArxiv,
+ handleAur,
+ handleBiorxiv,
+ handleBluesky,
+ handleBrew,
+ handleCheatSh,
+ handleChocolatey,
+ handleCoinGecko,
+ handleCratesIo,
+ handleDevTo,
+ handleDiscogs,
+ handleDockerHub,
+ handleGitHub,
+ handleGitHubGist,
+ handleGitLab,
+ handleGoPkg,
+ handleHackage,
+ handleHackerNews,
+ handleHex,
+ handleHuggingFace,
+ handleIacr,
+ handleLobsters,
+ handleMastodon,
+ handleMaven,
+ handleMDN,
+ handleMetaCPAN,
+ handleNpm,
+ handleNuGet,
+ handleNvd,
+ handleOpenCorporates,
+ handleOpenLibrary,
+ handleOsv,
+ handlePackagist,
+ handlePubDev,
+ handlePubMed,
+ handlePyPI,
+ handleReadTheDocs,
+ handleReddit,
+ handleRepology,
+ handleRfc,
+ handleRubyGems,
+ handleSecEdgar,
+ handleSemanticScholar,
+ handleSpotify,
+ handleStackOverflow,
+ handleTerraform,
+ handleTldr,
+ handleTwitter,
+ handleVimeo,
+ handleWikidata,
+ handleWikipedia,
+ handleYouTube,
+ } from "./web-fetch-handlers/index";

  // =============================================================================
  // Types and Constants
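The single import above pulls everything from web-fetch-handlers/index.ts (+69 lines), which is not shown in this diff. Presumably it is a plain barrel module along these lines (hypothetical sketch; module paths taken from the files-changed list above, export names from the import block):

// Hypothetical shape of web-fetch-handlers/index.ts, not taken from the package source.
export { handleArtifactHub } from "./artifacthub";
export { handleArxiv } from "./arxiv";
export { handleGitHub } from "./github";
export { handleGitHubGist } from "./github-gist";
export { handleNpm } from "./npm";
export { handleWikipedia } from "./wikipedia";
export { handleYouTube } from "./youtube";
// ...and so on, one export per handler module listed in the files-changed summary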
@@ -296,7 +352,8 @@ async function convertWithMarkitdown(
  extensionHint: string,
  timeout: number,
  ): Promise<{ content: string; ok: boolean }> {
- if (!hasCommand("markitdown")) {
+ const markitdown = await ensureTool("markitdown", true);
+ if (!markitdown) {
  return { content: "", ok: false };
  }

@@ -307,7 +364,7 @@ async function convertWithMarkitdown(

  try {
  await Bun.write(tmpFile, content);
- const result = exec("markitdown", [tmpFile], { timeout });
+ const result = exec(markitdown, [tmpFile], { timeout });
  return { content: result.stdout, ok: result.ok };
  } finally {
  try {
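convertWithMarkitdown no longer just probes PATH with hasCommand; it resolves the binary through ensureTool from src/utils/tools-manager.ts, which grows by 110 lines in this release. Judging only from the call sites in this diff, ensureTool returns a runnable command string or null, and the second argument opts in to auto-installation (the renderHtmlToText hunk below mentions "auto-install via uv/pip" for Python tools). A minimal sketch of that assumed contract, not the actual implementation:

// Assumed contract for ensureTool, inferred from its call sites in this diff;
// the real helper lives in src/utils/tools-manager.ts and is not shown here.
async function ensureToolSketch(name: string, autoInstall: boolean): Promise<string | null> {
  const found = Bun.which(name); // look up the executable on PATH
  if (found) return found;
  if (!autoInstall) return null;
  // The real implementation reportedly falls back to installing Python-based
  // tools (markitdown, html2text) via uv or pip; omitted in this sketch.
  return null;
}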
@@ -522,18 +579,39 @@ function parseFeedToMarkdown(content: string, maxItems = 10): string {
  }

  /**
- * Render HTML to text using lynx
+ * Render HTML to text using lynx or html2text fallback
  */
- async function renderWithLynx(html: string, timeout: number): Promise<{ content: string; ok: boolean }> {
+ async function renderHtmlToText(
+ html: string,
+ timeout: number,
+ ): Promise<{ content: string; ok: boolean; method: string }> {
  const tmpDir = tmpdir();
  const tmpFile = path.join(tmpDir, `omp-render-${Date.now()}.html`);
+
  try {
  await Bun.write(tmpFile, html);
- // Convert path to file URL (handles Windows paths correctly)
- const normalizedPath = tmpFile.replace(/\\/g, "/");
- const fileUrl = normalizedPath.startsWith("/") ? `file://${normalizedPath}` : `file:///${normalizedPath}`;
- const result = exec("lynx", ["-dump", "-nolist", "-width", "120", fileUrl], { timeout });
- return { content: result.stdout, ok: result.ok };
+
+ // Try lynx first (can't auto-install, system package)
+ const lynx = hasCommand("lynx");
+ if (lynx) {
+ const normalizedPath = tmpFile.replace(/\\/g, "/");
+ const fileUrl = normalizedPath.startsWith("/") ? `file://${normalizedPath}` : `file:///${normalizedPath}`;
+ const result = exec("lynx", ["-dump", "-nolist", "-width", "120", fileUrl], { timeout });
+ if (result.ok) {
+ return { content: result.stdout, ok: true, method: "lynx" };
+ }
+ }
+
+ // Fall back to html2text (auto-install via uv/pip)
+ const html2text = await ensureTool("html2text", true);
+ if (html2text) {
+ const result = exec(html2text, [tmpFile], { timeout });
+ if (result.ok) {
+ return { content: result.stdout, ok: true, method: "html2text" };
+ }
+ }
+
+ return { content: "", ok: false, method: "none" };
  } finally {
  try {
  await Bun.$`rm ${tmpFile}`.quiet();
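The renamed helper keeps lynx as the preferred renderer (a system package the tools manager cannot install) and adds html2text as an installable fallback, reporting which renderer produced the output. A sketch of how the new result shape is consumed downstream (names match the renderUrl hunk at the end of this diff; the surrounding plumbing is elided):

// Sketch of a call site; "lynx", "html2text", or "none" ends up in the method
// field of the final RenderResult, as the renderUrl hunk below shows.
const htmlResult = await renderHtmlToText(rawContent, timeout);
if (htmlResult.ok) {
  // htmlResult.method is "lynx" or "html2text" and is surfaced to the caller
} else {
  // method === "none": neither renderer was available; renderUrl falls back to raw HTML
}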
@@ -638,1295 +716,6 @@ async function fetchBinary(
638
716
  }
639
717
  }
640
718
 
641
- // =============================================================================
642
- // GitHub Special Handling
643
- // =============================================================================
644
-
645
- interface GitHubUrl {
646
- type: "blob" | "tree" | "repo" | "issue" | "issues" | "pull" | "pulls" | "discussion" | "discussions" | "other";
647
- owner: string;
648
- repo: string;
649
- ref?: string;
650
- path?: string;
651
- number?: number;
652
- }
653
-
654
- /**
655
- * Parse GitHub URL into components
656
- */
657
- function parseGitHubUrl(url: string): GitHubUrl | null {
658
- try {
659
- const parsed = new URL(url);
660
- if (parsed.hostname !== "github.com") return null;
661
-
662
- const parts = parsed.pathname.split("/").filter(Boolean);
663
- if (parts.length < 2) return null;
664
-
665
- const [owner, repo, ...rest] = parts;
666
-
667
- if (rest.length === 0) {
668
- return { type: "repo", owner, repo };
669
- }
670
-
671
- const [section, ...subParts] = rest;
672
-
673
- switch (section) {
674
- case "blob":
675
- case "tree": {
676
- const [ref, ...pathParts] = subParts;
677
- return { type: section, owner, repo, ref, path: pathParts.join("/") };
678
- }
679
- case "issues":
680
- if (subParts.length > 0 && /^\d+$/.test(subParts[0])) {
681
- return { type: "issue", owner, repo, number: parseInt(subParts[0], 10) };
682
- }
683
- return { type: "issues", owner, repo };
684
- case "pull":
685
- if (subParts.length > 0 && /^\d+$/.test(subParts[0])) {
686
- return { type: "pull", owner, repo, number: parseInt(subParts[0], 10) };
687
- }
688
- return { type: "pulls", owner, repo };
689
- case "pulls":
690
- return { type: "pulls", owner, repo };
691
- case "discussions":
692
- if (subParts.length > 0 && /^\d+$/.test(subParts[0])) {
693
- return { type: "discussion", owner, repo, number: parseInt(subParts[0], 10) };
694
- }
695
- return { type: "discussions", owner, repo };
696
- default:
697
- return { type: "other", owner, repo };
698
- }
699
- } catch {
700
- return null;
701
- }
702
- }
703
-
704
- /**
705
- * Convert GitHub blob URL to raw URL
706
- */
707
- function toRawGitHubUrl(gh: GitHubUrl): string {
708
- return `https://raw.githubusercontent.com/${gh.owner}/${gh.repo}/refs/heads/${gh.ref}/${gh.path}`;
709
- }
710
-
711
- /**
712
- * Fetch from GitHub API
713
- */
714
- async function fetchGitHubApi(endpoint: string, timeout: number): Promise<{ data: unknown; ok: boolean }> {
715
- try {
716
- const controller = new AbortController();
717
- const timeoutId = setTimeout(() => controller.abort(), timeout * 1000);
718
-
719
- const headers: Record<string, string> = {
720
- Accept: "application/vnd.github.v3+json",
721
- "User-Agent": "omp-web-fetch/1.0",
722
- };
723
-
724
- // Use GITHUB_TOKEN if available
725
- const token = process.env.GITHUB_TOKEN || process.env.GH_TOKEN;
726
- if (token) {
727
- headers.Authorization = `Bearer ${token}`;
728
- }
729
-
730
- const response = await fetch(`https://api.github.com${endpoint}`, {
731
- signal: controller.signal,
732
- headers,
733
- });
734
-
735
- clearTimeout(timeoutId);
736
-
737
- if (!response.ok) {
738
- return { data: null, ok: false };
739
- }
740
-
741
- return { data: await response.json(), ok: true };
742
- } catch {
743
- return { data: null, ok: false };
744
- }
745
- }
746
-
747
- /**
748
- * Render GitHub issue/PR to markdown
749
- */
750
- async function renderGitHubIssue(gh: GitHubUrl, timeout: number): Promise<{ content: string; ok: boolean }> {
751
- const endpoint =
752
- gh.type === "pull"
753
- ? `/repos/${gh.owner}/${gh.repo}/pulls/${gh.number}`
754
- : `/repos/${gh.owner}/${gh.repo}/issues/${gh.number}`;
755
-
756
- const result = await fetchGitHubApi(endpoint, timeout);
757
- if (!result.ok || !result.data) return { content: "", ok: false };
758
-
759
- const issue = result.data as {
760
- title: string;
761
- number: number;
762
- state: string;
763
- user: { login: string };
764
- created_at: string;
765
- updated_at: string;
766
- body: string | null;
767
- labels: Array<{ name: string }>;
768
- comments: number;
769
- html_url: string;
770
- };
771
-
772
- let md = `# ${issue.title}\n\n`;
773
- md += `**#${issue.number}** · ${issue.state} · opened by @${issue.user.login}\n`;
774
- md += `Created: ${issue.created_at} · Updated: ${issue.updated_at}\n`;
775
- if (issue.labels.length > 0) {
776
- md += `Labels: ${issue.labels.map((l) => l.name).join(", ")}\n`;
777
- }
778
- md += `\n---\n\n`;
779
- md += issue.body || "*No description provided.*";
780
- md += `\n\n---\n\n`;
781
-
782
- // Fetch comments if any
783
- if (issue.comments > 0) {
784
- const commentsResult = await fetchGitHubApi(
785
- `/repos/${gh.owner}/${gh.repo}/issues/${gh.number}/comments?per_page=50`,
786
- timeout,
787
- );
788
- if (commentsResult.ok && Array.isArray(commentsResult.data)) {
789
- md += `## Comments (${issue.comments})\n\n`;
790
- for (const comment of commentsResult.data as Array<{
791
- user: { login: string };
792
- created_at: string;
793
- body: string;
794
- }>) {
795
- md += `### @${comment.user.login} · ${comment.created_at}\n\n`;
796
- md += `${comment.body}\n\n---\n\n`;
797
- }
798
- }
799
- }
800
-
801
- return { content: md, ok: true };
802
- }
803
-
804
- /**
805
- * Render GitHub issues list to markdown
806
- */
807
- async function renderGitHubIssuesList(gh: GitHubUrl, timeout: number): Promise<{ content: string; ok: boolean }> {
808
- const result = await fetchGitHubApi(`/repos/${gh.owner}/${gh.repo}/issues?state=open&per_page=30`, timeout);
809
- if (!result.ok || !Array.isArray(result.data)) return { content: "", ok: false };
810
-
811
- const issues = result.data as Array<{
812
- number: number;
813
- title: string;
814
- state: string;
815
- user: { login: string };
816
- created_at: string;
817
- comments: number;
818
- labels: Array<{ name: string }>;
819
- pull_request?: unknown;
820
- }>;
821
-
822
- let md = `# ${gh.owner}/${gh.repo} - Open Issues\n\n`;
823
-
824
- for (const issue of issues) {
825
- if (issue.pull_request) continue; // Skip PRs in issues list
826
- const labels = issue.labels.length > 0 ? ` [${issue.labels.map((l) => l.name).join(", ")}]` : "";
827
- md += `- **#${issue.number}** ${issue.title}${labels}\n`;
828
- md += ` by @${issue.user.login} · ${issue.comments} comments · ${issue.created_at}\n\n`;
829
- }
830
-
831
- return { content: md, ok: true };
832
- }
833
-
834
- /**
835
- * Render GitHub tree (directory) to markdown
836
- */
837
- async function renderGitHubTree(gh: GitHubUrl, timeout: number): Promise<{ content: string; ok: boolean }> {
838
- // Fetch repo info first to get default branch if ref not specified
839
- const repoResult = await fetchGitHubApi(`/repos/${gh.owner}/${gh.repo}`, timeout);
840
- if (!repoResult.ok) return { content: "", ok: false };
841
-
842
- const repo = repoResult.data as {
843
- full_name: string;
844
- default_branch: string;
845
- };
846
-
847
- const ref = gh.ref || repo.default_branch;
848
- const dirPath = gh.path || "";
849
-
850
- let md = `# ${repo.full_name}/${dirPath || "(root)"}\n\n`;
851
- md += `**Branch:** ${ref}\n\n`;
852
-
853
- // Fetch directory contents
854
- const contentsResult = await fetchGitHubApi(`/repos/${gh.owner}/${gh.repo}/contents/${dirPath}?ref=${ref}`, timeout);
855
-
856
- if (contentsResult.ok && Array.isArray(contentsResult.data)) {
857
- const items = contentsResult.data as Array<{
858
- name: string;
859
- type: "file" | "dir" | "symlink" | "submodule";
860
- size?: number;
861
- path: string;
862
- }>;
863
-
864
- // Sort: directories first, then files, alphabetically
865
- items.sort((a, b) => {
866
- if (a.type === "dir" && b.type !== "dir") return -1;
867
- if (a.type !== "dir" && b.type === "dir") return 1;
868
- return a.name.localeCompare(b.name);
869
- });
870
-
871
- md += `## Contents\n\n`;
872
- md += "```\n";
873
- for (const item of items) {
874
- const prefix = item.type === "dir" ? "[dir] " : " ";
875
- const size = item.size ? ` (${item.size} bytes)` : "";
876
- md += `${prefix}${item.name}${item.type === "file" ? size : ""}\n`;
877
- }
878
- md += "```\n\n";
879
-
880
- // Look for README in this directory
881
- const readmeFile = items.find((item) => item.type === "file" && /^readme\.md$/i.test(item.name));
882
- if (readmeFile) {
883
- const readmePath = dirPath ? `${dirPath}/${readmeFile.name}` : readmeFile.name;
884
- const rawUrl = `https://raw.githubusercontent.com/${gh.owner}/${gh.repo}/refs/heads/${ref}/${readmePath}`;
885
- const readmeResult = await loadPage(rawUrl, { timeout });
886
- if (readmeResult.ok) {
887
- md += `---\n\n## README\n\n${readmeResult.content}`;
888
- }
889
- }
890
- }
891
-
892
- return { content: md, ok: true };
893
- }
894
-
895
- /**
896
- * Render GitHub repo to markdown (file list + README)
897
- */
898
- async function renderGitHubRepo(gh: GitHubUrl, timeout: number): Promise<{ content: string; ok: boolean }> {
899
- // Fetch repo info
900
- const repoResult = await fetchGitHubApi(`/repos/${gh.owner}/${gh.repo}`, timeout);
901
- if (!repoResult.ok) return { content: "", ok: false };
902
-
903
- const repo = repoResult.data as {
904
- full_name: string;
905
- description: string | null;
906
- stargazers_count: number;
907
- forks_count: number;
908
- open_issues_count: number;
909
- default_branch: string;
910
- language: string | null;
911
- license: { name: string } | null;
912
- };
913
-
914
- let md = `# ${repo.full_name}\n\n`;
915
- if (repo.description) md += `${repo.description}\n\n`;
916
- md += `Stars: ${repo.stargazers_count} · Forks: ${repo.forks_count} · Issues: ${repo.open_issues_count}\n`;
917
- if (repo.language) md += `Language: ${repo.language}\n`;
918
- if (repo.license) md += `License: ${repo.license.name}\n`;
919
- md += `\n---\n\n`;
920
-
921
- // Fetch file tree
922
- const treeResult = await fetchGitHubApi(
923
- `/repos/${gh.owner}/${gh.repo}/git/trees/${repo.default_branch}?recursive=1`,
924
- timeout,
925
- );
926
- if (treeResult.ok && treeResult.data) {
927
- const tree = (treeResult.data as { tree: Array<{ path: string; type: string }> }).tree;
928
- md += `## Files\n\n`;
929
- md += "```\n";
930
- for (const item of tree.slice(0, 100)) {
931
- const prefix = item.type === "tree" ? "[dir] " : " ";
932
- md += `${prefix}${item.path}\n`;
933
- }
934
- if (tree.length > 100) {
935
- md += `... and ${tree.length - 100} more files\n`;
936
- }
937
- md += "```\n\n";
938
- }
939
-
940
- // Fetch README
941
- const readmeResult = await fetchGitHubApi(`/repos/${gh.owner}/${gh.repo}/readme`, timeout);
942
- if (readmeResult.ok && readmeResult.data) {
943
- const readme = readmeResult.data as { content: string; encoding: string };
944
- if (readme.encoding === "base64") {
945
- const decoded = Buffer.from(readme.content, "base64").toString("utf-8");
946
- md += `## README\n\n${decoded}`;
947
- }
948
- }
949
-
950
- return { content: md, ok: true };
951
- }
952
-
953
- /**
954
- * Handle GitHub URLs specially
955
- */
956
- async function handleGitHub(url: string, timeout: number): Promise<RenderResult | null> {
957
- const gh = parseGitHubUrl(url);
958
- if (!gh) return null;
959
-
960
- const fetchedAt = new Date().toISOString();
961
- const notes: string[] = [];
962
-
963
- switch (gh.type) {
964
- case "blob": {
965
- // Convert to raw URL and fetch
966
- const rawUrl = toRawGitHubUrl(gh);
967
- notes.push(`Fetched raw: ${rawUrl}`);
968
- const result = await loadPage(rawUrl, { timeout });
969
- if (result.ok) {
970
- const output = finalizeOutput(result.content);
971
- return {
972
- url,
973
- finalUrl: rawUrl,
974
- contentType: "text/plain",
975
- method: "github-raw",
976
- content: output.content,
977
- fetchedAt,
978
- truncated: output.truncated,
979
- notes,
980
- };
981
- }
982
- break;
983
- }
984
-
985
- case "tree": {
986
- notes.push(`Fetched via GitHub API`);
987
- const result = await renderGitHubTree(gh, timeout);
988
- if (result.ok) {
989
- const output = finalizeOutput(result.content);
990
- return {
991
- url,
992
- finalUrl: url,
993
- contentType: "text/markdown",
994
- method: "github-tree",
995
- content: output.content,
996
- fetchedAt,
997
- truncated: output.truncated,
998
- notes,
999
- };
1000
- }
1001
- break;
1002
- }
1003
-
1004
- case "issue":
1005
- case "pull": {
1006
- notes.push(`Fetched via GitHub API`);
1007
- const result = await renderGitHubIssue(gh, timeout);
1008
- if (result.ok) {
1009
- const output = finalizeOutput(result.content);
1010
- return {
1011
- url,
1012
- finalUrl: url,
1013
- contentType: "text/markdown",
1014
- method: gh.type === "pull" ? "github-pr" : "github-issue",
1015
- content: output.content,
1016
- fetchedAt,
1017
- truncated: output.truncated,
1018
- notes,
1019
- };
1020
- }
1021
- break;
1022
- }
1023
-
1024
- case "issues": {
1025
- notes.push(`Fetched via GitHub API`);
1026
- const result = await renderGitHubIssuesList(gh, timeout);
1027
- if (result.ok) {
1028
- const output = finalizeOutput(result.content);
1029
- return {
1030
- url,
1031
- finalUrl: url,
1032
- contentType: "text/markdown",
1033
- method: "github-issues",
1034
- content: output.content,
1035
- fetchedAt,
1036
- truncated: output.truncated,
1037
- notes,
1038
- };
1039
- }
1040
- break;
1041
- }
1042
-
1043
- case "repo": {
1044
- notes.push(`Fetched via GitHub API`);
1045
- const result = await renderGitHubRepo(gh, timeout);
1046
- if (result.ok) {
1047
- const output = finalizeOutput(result.content);
1048
- return {
1049
- url,
1050
- finalUrl: url,
1051
- contentType: "text/markdown",
1052
- method: "github-repo",
1053
- content: output.content,
1054
- fetchedAt,
1055
- truncated: output.truncated,
1056
- notes,
1057
- };
1058
- }
1059
- break;
1060
- }
1061
- }
1062
-
1063
- // Fall back to null (let normal rendering handle it)
1064
- return null;
1065
- }
1066
-
1067
- // =============================================================================
1068
- // Twitter/X Special Handling (via Nitter)
1069
- // =============================================================================
1070
-
1071
- // Active Nitter instances - check https://status.d420.de/instances for current status
1072
- const NITTER_INSTANCES = [
1073
- "nitter.privacyredirect.com",
1074
- "nitter.tiekoetter.com",
1075
- "nitter.poast.org",
1076
- "nitter.woodland.cafe",
1077
- ];
1078
-
1079
- /**
1080
- * Handle Twitter/X URLs via Nitter
1081
- */
1082
- async function handleTwitter(url: string, timeout: number): Promise<RenderResult | null> {
1083
- try {
1084
- const parsed = new URL(url);
1085
- if (!["twitter.com", "x.com", "www.twitter.com", "www.x.com"].includes(parsed.hostname)) {
1086
- return null;
1087
- }
1088
-
1089
- const fetchedAt = new Date().toISOString();
1090
-
1091
- // Try Nitter instances
1092
- for (const instance of NITTER_INSTANCES) {
1093
- const nitterUrl = `https://${instance}${parsed.pathname}`;
1094
- const result = await loadPage(nitterUrl, { timeout: Math.min(timeout, 10) });
1095
-
1096
- if (result.ok && result.content.length > 500) {
1097
- // Parse the Nitter HTML
1098
- const doc = parseHtml(result.content);
1099
-
1100
- // Extract tweet content
1101
- const tweetContent = doc.querySelector(".tweet-content")?.text?.trim();
1102
- const fullname = doc.querySelector(".fullname")?.text?.trim();
1103
- const username = doc.querySelector(".username")?.text?.trim();
1104
- const date = doc.querySelector(".tweet-date a")?.text?.trim();
1105
- const stats = doc.querySelector(".tweet-stats")?.text?.trim();
1106
-
1107
- if (tweetContent) {
1108
- let md = `# Tweet by ${fullname || "Unknown"} (${username || "@?"})\n\n`;
1109
- if (date) md += `*${date}*\n\n`;
1110
- md += `${tweetContent}\n\n`;
1111
- if (stats) md += `---\n${stats.replace(/\s+/g, " ")}\n`;
1112
-
1113
- // Check for replies/thread
1114
- const replies = doc.querySelectorAll(".timeline-item .tweet-content");
1115
- if (replies.length > 1) {
1116
- md += `\n---\n\n## Thread/Replies\n\n`;
1117
- for (const reply of Array.from(replies).slice(1, 10)) {
1118
- const replyUser = reply.parentNode?.querySelector(".username")?.text?.trim();
1119
- md += `**${replyUser || "@?"}**: ${reply.text?.trim()}\n\n`;
1120
- }
1121
- }
1122
-
1123
- const output = finalizeOutput(md);
1124
- return {
1125
- url,
1126
- finalUrl: nitterUrl,
1127
- contentType: "text/markdown",
1128
- method: "twitter-nitter",
1129
- content: output.content,
1130
- fetchedAt,
1131
- truncated: output.truncated,
1132
- notes: [`Via Nitter: ${instance}`],
1133
- };
1134
- }
1135
- }
1136
- }
1137
- } catch {}
1138
-
1139
- // X.com blocks all bots - return a helpful error instead of falling through
1140
- return {
1141
- url,
1142
- finalUrl: url,
1143
- contentType: "text/plain",
1144
- method: "twitter-blocked",
1145
- content:
1146
- "Twitter/X blocks automated access. Nitter instances were unavailable.\n\nTry:\n- Opening the link in a browser\n- Using a different Nitter instance manually\n- Checking if the tweet is available via an archive service",
1147
- fetchedAt: new Date().toISOString(),
1148
- truncated: false,
1149
- notes: ["X.com blocks bots; Nitter instances unavailable"],
1150
- };
1151
- }
1152
-
1153
- // =============================================================================
1154
- // Stack Overflow Special Handling
1155
- // =============================================================================
1156
-
1157
- interface SOQuestion {
1158
- title: string;
1159
- body: string;
1160
- score: number;
1161
- owner: { display_name: string };
1162
- creation_date: number;
1163
- tags: string[];
1164
- answer_count: number;
1165
- is_answered: boolean;
1166
- }
1167
-
1168
- interface SOAnswer {
1169
- body: string;
1170
- score: number;
1171
- is_accepted: boolean;
1172
- owner: { display_name: string };
1173
- creation_date: number;
1174
- }
1175
-
1176
- /**
1177
- * Convert basic HTML to markdown (for SO bodies)
1178
- */
1179
- function htmlToBasicMarkdown(html: string): string {
1180
- return html
1181
- .replace(/<pre><code[^>]*>/g, "\n```\n")
1182
- .replace(/<\/code><\/pre>/g, "\n```\n")
1183
- .replace(/<code>/g, "`")
1184
- .replace(/<\/code>/g, "`")
1185
- .replace(/<strong>/g, "**")
1186
- .replace(/<\/strong>/g, "**")
1187
- .replace(/<em>/g, "*")
1188
- .replace(/<\/em>/g, "*")
1189
- .replace(/<a href="([^"]+)"[^>]*>([^<]+)<\/a>/g, "[$2]($1)")
1190
- .replace(/<p>/g, "\n\n")
1191
- .replace(/<\/p>/g, "")
1192
- .replace(/<br\s*\/?>/g, "\n")
1193
- .replace(/<li>/g, "- ")
1194
- .replace(/<\/li>/g, "\n")
1195
- .replace(/<\/?[uo]l>/g, "\n")
1196
- .replace(/<h(\d)>/g, (_, n) => `\n${"#".repeat(parseInt(n, 10))} `)
1197
- .replace(/<\/h\d>/g, "\n")
1198
- .replace(/<blockquote>/g, "\n> ")
1199
- .replace(/<\/blockquote>/g, "\n")
1200
- .replace(/<[^>]+>/g, "") // Strip remaining tags
1201
- .replace(/&lt;/g, "<")
1202
- .replace(/&gt;/g, ">")
1203
- .replace(/&amp;/g, "&")
1204
- .replace(/&quot;/g, '"')
1205
- .replace(/&#39;/g, "'")
1206
- .replace(/\n{3,}/g, "\n\n")
1207
- .trim();
1208
- }
1209
-
1210
- /**
1211
- * Handle Stack Overflow URLs via API
1212
- */
1213
- async function handleStackOverflow(url: string, timeout: number): Promise<RenderResult | null> {
1214
- try {
1215
- const parsed = new URL(url);
1216
- if (!parsed.hostname.includes("stackoverflow.com") && !parsed.hostname.includes("stackexchange.com")) {
1217
- return null;
1218
- }
1219
-
1220
- // Extract question ID from URL patterns like /questions/12345/...
1221
- const match = parsed.pathname.match(/\/questions\/(\d+)/);
1222
- if (!match) return null;
1223
-
1224
- const questionId = match[1];
1225
- const site = parsed.hostname.includes("stackoverflow") ? "stackoverflow" : parsed.hostname.split(".")[0];
1226
- const fetchedAt = new Date().toISOString();
1227
-
1228
- // Fetch question with answers
1229
- const apiUrl = `https://api.stackexchange.com/2.3/questions/${questionId}?order=desc&sort=votes&site=${site}&filter=withbody`;
1230
- const qResult = await loadPage(apiUrl, { timeout });
1231
-
1232
- if (!qResult.ok) return null;
1233
-
1234
- const qData = JSON.parse(qResult.content) as { items: SOQuestion[] };
1235
- if (!qData.items?.length) return null;
1236
-
1237
- const question = qData.items[0];
1238
-
1239
- let md = `# ${question.title}\n\n`;
1240
- md += `**Score:** ${question.score} · **Answers:** ${question.answer_count}`;
1241
- md += question.is_answered ? " (Answered)" : "";
1242
- md += `\n**Tags:** ${question.tags.join(", ")}\n`;
1243
- md += `**Asked by:** ${question.owner.display_name} · ${new Date(question.creation_date * 1000).toISOString().split("T")[0]}\n\n`;
1244
- md += `---\n\n## Question\n\n${htmlToBasicMarkdown(question.body)}\n\n`;
1245
-
1246
- // Fetch answers
1247
- const aUrl = `https://api.stackexchange.com/2.3/questions/${questionId}/answers?order=desc&sort=votes&site=${site}&filter=withbody`;
1248
- const aResult = await loadPage(aUrl, { timeout });
1249
-
1250
- if (aResult.ok) {
1251
- const aData = JSON.parse(aResult.content) as { items: SOAnswer[] };
1252
- if (aData.items?.length) {
1253
- md += `---\n\n## Answers\n\n`;
1254
- for (const answer of aData.items.slice(0, 5)) {
1255
- const accepted = answer.is_accepted ? " (Accepted)" : "";
1256
- md += `### Score: ${answer.score}${accepted} · by ${answer.owner.display_name}\n\n`;
1257
- md += `${htmlToBasicMarkdown(answer.body)}\n\n---\n\n`;
1258
- }
1259
- }
1260
- }
1261
-
1262
- const output = finalizeOutput(md);
1263
- return {
1264
- url,
1265
- finalUrl: url,
1266
- contentType: "text/markdown",
1267
- method: "stackoverflow",
1268
- content: output.content,
1269
- fetchedAt,
1270
- truncated: output.truncated,
1271
- notes: ["Fetched via Stack Exchange API"],
1272
- };
1273
- } catch {}
1274
-
1275
- return null;
1276
- }
1277
-
1278
- // =============================================================================
1279
- // Wikipedia Special Handling
1280
- // =============================================================================
1281
-
1282
- /**
1283
- * Handle Wikipedia URLs via API
1284
- */
1285
- async function handleWikipedia(url: string, timeout: number): Promise<RenderResult | null> {
1286
- try {
1287
- const parsed = new URL(url);
1288
- // Match *.wikipedia.org
1289
- const wikiMatch = parsed.hostname.match(/^(\w+)\.wikipedia\.org$/);
1290
- if (!wikiMatch) return null;
1291
-
1292
- const lang = wikiMatch[1];
1293
- const titleMatch = parsed.pathname.match(/\/wiki\/(.+)/);
1294
- if (!titleMatch) return null;
1295
-
1296
- const title = decodeURIComponent(titleMatch[1]);
1297
- const fetchedAt = new Date().toISOString();
1298
-
1299
- // Use Wikipedia API to get plain text extract
1300
- const apiUrl = `https://${lang}.wikipedia.org/api/rest_v1/page/summary/${encodeURIComponent(title)}`;
1301
- const summaryResult = await loadPage(apiUrl, { timeout });
1302
-
1303
- let md = "";
1304
-
1305
- if (summaryResult.ok) {
1306
- const summary = JSON.parse(summaryResult.content) as {
1307
- title: string;
1308
- description?: string;
1309
- extract: string;
1310
- };
1311
- md = `# ${summary.title}\n\n`;
1312
- if (summary.description) md += `*${summary.description}*\n\n`;
1313
- md += `${summary.extract}\n\n---\n\n`;
1314
- }
1315
-
1316
- // Get full article content via mobile-html or parse API
1317
- const contentUrl = `https://${lang}.wikipedia.org/api/rest_v1/page/mobile-html/${encodeURIComponent(title)}`;
1318
- const contentResult = await loadPage(contentUrl, { timeout });
1319
-
1320
- if (contentResult.ok) {
1321
- const doc = parseHtml(contentResult.content);
1322
-
1323
- // Extract main content sections
1324
- const sections = doc.querySelectorAll("section");
1325
- for (const section of sections) {
1326
- const heading = section.querySelector("h2, h3, h4");
1327
- const headingText = heading?.text?.trim();
1328
-
1329
- // Skip certain sections
1330
- if (
1331
- headingText &&
1332
- ["References", "External links", "See also", "Notes", "Further reading"].includes(headingText)
1333
- ) {
1334
- continue;
1335
- }
1336
-
1337
- if (headingText) {
1338
- const level = heading?.tagName === "H2" ? "##" : "###";
1339
- md += `${level} ${headingText}\n\n`;
1340
- }
1341
-
1342
- const paragraphs = section.querySelectorAll("p");
1343
- for (const p of paragraphs) {
1344
- const text = p.text?.trim();
1345
- if (text && text.length > 20) {
1346
- md += `${text}\n\n`;
1347
- }
1348
- }
1349
- }
1350
- }
1351
-
1352
- if (!md) return null;
1353
-
1354
- const output = finalizeOutput(md);
1355
- return {
1356
- url,
1357
- finalUrl: url,
1358
- contentType: "text/markdown",
1359
- method: "wikipedia",
1360
- content: output.content,
1361
- fetchedAt,
1362
- truncated: output.truncated,
1363
- notes: ["Fetched via Wikipedia API"],
1364
- };
1365
- } catch {}
1366
-
1367
- return null;
1368
- }
1369
-
1370
- // =============================================================================
1371
- // Reddit Special Handling
1372
- // =============================================================================
1373
-
1374
- interface RedditPost {
1375
- title: string;
1376
- selftext: string;
1377
- author: string;
1378
- score: number;
1379
- num_comments: number;
1380
- created_utc: number;
1381
- subreddit: string;
1382
- url: string;
1383
- is_self: boolean;
1384
- }
1385
-
1386
- interface RedditComment {
1387
- body: string;
1388
- author: string;
1389
- score: number;
1390
- created_utc: number;
1391
- replies?: { data: { children: Array<{ data: RedditComment }> } };
1392
- }
1393
-
1394
- /**
1395
- * Handle Reddit URLs via JSON API
1396
- */
1397
- async function handleReddit(url: string, timeout: number): Promise<RenderResult | null> {
1398
- try {
1399
- const parsed = new URL(url);
1400
- if (!parsed.hostname.includes("reddit.com")) return null;
1401
-
1402
- const fetchedAt = new Date().toISOString();
1403
-
1404
- // Append .json to get JSON response
1405
- let jsonUrl = `${url.replace(/\/$/, "")}.json`;
1406
- if (parsed.search) {
1407
- jsonUrl = `${url.replace(/\/$/, "").replace(parsed.search, "")}.json${parsed.search}`;
1408
- }
1409
-
1410
- const result = await loadPage(jsonUrl, { timeout });
1411
- if (!result.ok) return null;
1412
-
1413
- const data = JSON.parse(result.content);
1414
- let md = "";
1415
-
1416
- // Handle different Reddit URL types
1417
- if (Array.isArray(data) && data.length >= 1) {
1418
- // Post page (with comments)
1419
- const postData = data[0]?.data?.children?.[0]?.data as RedditPost | undefined;
1420
- if (postData) {
1421
- md = `# ${postData.title}\n\n`;
1422
- md += `**r/${postData.subreddit}** · u/${postData.author} · ${postData.score} points · ${postData.num_comments} comments\n`;
1423
- md += `*${new Date(postData.created_utc * 1000).toISOString().split("T")[0]}*\n\n`;
1424
-
1425
- if (postData.is_self && postData.selftext) {
1426
- md += `---\n\n${postData.selftext}\n\n`;
1427
- } else if (!postData.is_self) {
1428
- md += `**Link:** ${postData.url}\n\n`;
1429
- }
1430
-
1431
- // Add comments if available
1432
- if (data.length >= 2 && data[1]?.data?.children) {
1433
- md += `---\n\n## Top Comments\n\n`;
1434
- const comments = data[1].data.children.filter((c: { kind: string }) => c.kind === "t1").slice(0, 10);
1435
-
1436
- for (const { data: comment } of comments as Array<{ data: RedditComment }>) {
1437
- md += `### u/${comment.author} · ${comment.score} points\n\n`;
1438
- md += `${comment.body}\n\n---\n\n`;
1439
- }
1440
- }
1441
- }
1442
- } else if (data?.data?.children) {
1443
- // Subreddit or listing page
1444
- const posts = data.data.children.slice(0, 20) as Array<{ data: RedditPost }>;
1445
- const subreddit = posts[0]?.data?.subreddit;
1446
-
1447
- md = `# r/${subreddit || "Reddit"}\n\n`;
1448
- for (const { data: post } of posts) {
1449
- md += `- **${post.title}** (${post.score} pts, ${post.num_comments} comments)\n`;
1450
- md += ` by u/${post.author}\n\n`;
1451
- }
1452
- }
1453
-
1454
- if (!md) return null;
1455
-
1456
- const output = finalizeOutput(md);
1457
- return {
1458
- url,
1459
- finalUrl: url,
1460
- contentType: "text/markdown",
1461
- method: "reddit",
1462
- content: output.content,
1463
- fetchedAt,
1464
- truncated: output.truncated,
1465
- notes: ["Fetched via Reddit JSON API"],
1466
- };
1467
- } catch {}
1468
-
1469
- return null;
1470
- }
1471
-
1472
- // =============================================================================
1473
- // NPM Special Handling
1474
- // =============================================================================
1475
-
1476
- /**
1477
- * Handle NPM URLs via registry API
1478
- */
1479
- async function handleNpm(url: string, timeout: number): Promise<RenderResult | null> {
1480
- try {
1481
- const parsed = new URL(url);
1482
- if (parsed.hostname !== "www.npmjs.com" && parsed.hostname !== "npmjs.com") return null;
1483
-
1484
- // Extract package name from /package/[scope/]name
1485
- const match = parsed.pathname.match(/^\/package\/(.+?)(?:\/|$)/);
1486
- if (!match) return null;
1487
-
1488
- let packageName = decodeURIComponent(match[1]);
1489
- // Handle scoped packages: /package/@scope/name
1490
- if (packageName.startsWith("@")) {
1491
- const scopeMatch = parsed.pathname.match(/^\/package\/(@[^/]+\/[^/]+)/);
1492
- if (scopeMatch) packageName = decodeURIComponent(scopeMatch[1]);
1493
- }
1494
-
1495
- const fetchedAt = new Date().toISOString();
1496
-
1497
- // Fetch from npm registry - use /latest endpoint for smaller response
1498
- const latestUrl = `https://registry.npmjs.org/${packageName}/latest`;
1499
- const downloadsUrl = `https://api.npmjs.org/downloads/point/last-week/${encodeURIComponent(packageName)}`;
1500
-
1501
- // Fetch package info and download stats in parallel
1502
- const [result, downloadsResult] = await Promise.all([
1503
- loadPage(latestUrl, { timeout }),
1504
- loadPage(downloadsUrl, { timeout: Math.min(timeout, 5) }),
1505
- ]);
1506
-
1507
- if (!result.ok) return null;
1508
-
1509
- // Parse download stats
1510
- let weeklyDownloads: number | null = null;
1511
- if (downloadsResult.ok) {
1512
- try {
1513
- const dlData = JSON.parse(downloadsResult.content) as { downloads?: number };
1514
- weeklyDownloads = dlData.downloads ?? null;
1515
- } catch {}
1516
- }
1517
-
1518
- let pkg: {
1519
- name: string;
1520
- version: string;
1521
- description?: string;
1522
- license?: string;
1523
- homepage?: string;
1524
- repository?: { url: string } | string;
1525
- keywords?: string[];
1526
- maintainers?: Array<{ name: string }>;
1527
- dependencies?: Record<string, string>;
1528
- readme?: string;
1529
- };
1530
-
1531
- try {
1532
- pkg = JSON.parse(result.content);
1533
- } catch {
1534
- return null; // JSON parse failed (truncated response)
1535
- }
1536
-
1537
- let md = `# ${pkg.name}\n\n`;
1538
- if (pkg.description) md += `${pkg.description}\n\n`;
1539
-
1540
- md += `**Latest:** ${pkg.version || "unknown"}`;
1541
- if (pkg.license) md += ` · **License:** ${typeof pkg.license === "string" ? pkg.license : pkg.license}`;
1542
- md += "\n";
1543
- if (weeklyDownloads !== null) {
1544
- const formatted =
1545
- weeklyDownloads >= 1_000_000
1546
- ? `${(weeklyDownloads / 1_000_000).toFixed(1)}M`
1547
- : weeklyDownloads >= 1_000
1548
- ? `${(weeklyDownloads / 1_000).toFixed(1)}K`
1549
- : String(weeklyDownloads);
1550
- md += `**Weekly Downloads:** ${formatted}\n`;
1551
- }
1552
- md += "\n";
1553
-
1554
- if (pkg.homepage) md += `**Homepage:** ${pkg.homepage}\n`;
1555
- const repoUrl = typeof pkg.repository === "string" ? pkg.repository : pkg.repository?.url;
1556
- if (repoUrl) md += `**Repository:** ${repoUrl.replace(/^git\+/, "").replace(/\.git$/, "")}\n`;
1557
- if (pkg.keywords?.length) md += `**Keywords:** ${pkg.keywords.join(", ")}\n`;
1558
- if (pkg.maintainers?.length) md += `**Maintainers:** ${pkg.maintainers.map((m) => m.name).join(", ")}\n`;
1559
-
1560
- if (pkg.dependencies && Object.keys(pkg.dependencies).length > 0) {
1561
- md += `\n## Dependencies\n\n`;
1562
- for (const [dep, version] of Object.entries(pkg.dependencies)) {
1563
- md += `- ${dep}: ${version}\n`;
1564
- }
1565
- }
1566
-
1567
- if (pkg.readme) {
1568
- md += `\n---\n\n## README\n\n${pkg.readme}\n`;
1569
- }
1570
-
1571
- const output = finalizeOutput(md);
1572
- return {
1573
- url,
1574
- finalUrl: url,
1575
- contentType: "text/markdown",
1576
- method: "npm",
1577
- content: output.content,
1578
- fetchedAt,
1579
- truncated: output.truncated,
1580
- notes: ["Fetched via npm registry"],
1581
- };
1582
- } catch {}
1583
-
1584
- return null;
1585
- }
1586
-
1587
- // =============================================================================
1588
- // Crates.io Special Handling
1589
- // =============================================================================
1590
-
1591
- /**
1592
- * Handle crates.io URLs via API
1593
- */
1594
- async function handleCratesIo(url: string, timeout: number): Promise<RenderResult | null> {
1595
- try {
1596
- const parsed = new URL(url);
1597
- if (parsed.hostname !== "crates.io" && parsed.hostname !== "www.crates.io") return null;
1598
-
1599
- // Extract crate name from /crates/name or /crates/name/version
1600
- const match = parsed.pathname.match(/^\/crates\/([^/]+)/);
1601
- if (!match) return null;
1602
-
1603
- const crateName = decodeURIComponent(match[1]);
1604
- const fetchedAt = new Date().toISOString();
1605
-
1606
- // Fetch from crates.io API
1607
- const apiUrl = `https://crates.io/api/v1/crates/${crateName}`;
1608
- const result = await loadPage(apiUrl, {
1609
- timeout,
1610
- headers: { "User-Agent": "omp-web-fetch/1.0 (https://github.com/anthropics)" },
1611
- });
1612
-
1613
- if (!result.ok) return null;
1614
-
1615
- let data: {
1616
- crate: {
1617
- name: string;
1618
- description: string | null;
1619
- downloads: number;
1620
- recent_downloads: number;
1621
- max_version: string;
1622
- repository: string | null;
1623
- homepage: string | null;
1624
- documentation: string | null;
1625
- categories: string[];
1626
- keywords: string[];
1627
- created_at: string;
1628
- updated_at: string;
1629
- };
1630
- versions: Array<{
1631
- num: string;
1632
- downloads: number;
1633
- created_at: string;
1634
- license: string | null;
1635
- rust_version: string | null;
1636
- }>;
1637
- };
1638
-
1639
- try {
1640
- data = JSON.parse(result.content);
1641
- } catch {
1642
- return null;
1643
- }
1644
-
1645
- const crate = data.crate;
1646
- const latestVersion = data.versions?.[0];
1647
-
1648
- // Format download counts
1649
- const formatDownloads = (n: number): string =>
1650
- n >= 1_000_000 ? `${(n / 1_000_000).toFixed(1)}M` : n >= 1_000 ? `${(n / 1_000).toFixed(1)}K` : String(n);
1651
-
1652
- let md = `# ${crate.name}\n\n`;
1653
- if (crate.description) md += `${crate.description}\n\n`;
1654
-
1655
- md += `**Latest:** ${crate.max_version}`;
1656
- if (latestVersion?.license) md += ` · **License:** ${latestVersion.license}`;
1657
- if (latestVersion?.rust_version) md += ` · **MSRV:** ${latestVersion.rust_version}`;
1658
- md += "\n";
1659
- md += `**Downloads:** ${formatDownloads(crate.downloads)} total · ${formatDownloads(crate.recent_downloads)} recent\n\n`;
1660
-
1661
- if (crate.repository) md += `**Repository:** ${crate.repository}\n`;
1662
- if (crate.homepage && crate.homepage !== crate.repository) md += `**Homepage:** ${crate.homepage}\n`;
1663
- if (crate.documentation) md += `**Docs:** ${crate.documentation}\n`;
1664
- if (crate.keywords?.length) md += `**Keywords:** ${crate.keywords.join(", ")}\n`;
1665
- if (crate.categories?.length) md += `**Categories:** ${crate.categories.join(", ")}\n`;
1666
-
1667
- // Show recent versions
1668
- if (data.versions?.length > 0) {
1669
- md += `\n## Recent Versions\n\n`;
1670
- for (const ver of data.versions.slice(0, 5)) {
1671
- const date = ver.created_at.split("T")[0];
1672
- md += `- **${ver.num}** (${date}) - ${formatDownloads(ver.downloads)} downloads\n`;
1673
- }
1674
- }
1675
-
1676
- // Try to fetch README from docs.rs or repository
1677
- const docsRsUrl = `https://docs.rs/crate/${crateName}/${crate.max_version}/source/README.md`;
1678
- const readmeResult = await loadPage(docsRsUrl, { timeout: Math.min(timeout, 5) });
1679
- if (readmeResult.ok && readmeResult.content.length > 100 && !looksLikeHtml(readmeResult.content)) {
1680
- md += `\n---\n\n## README\n\n${readmeResult.content}\n`;
1681
- }
1682
-
1683
- const output = finalizeOutput(md);
1684
- return {
1685
- url,
1686
- finalUrl: url,
1687
- contentType: "text/markdown",
1688
- method: "crates.io",
1689
- content: output.content,
1690
- fetchedAt,
1691
- truncated: output.truncated,
1692
- notes: ["Fetched via crates.io API"],
1693
- };
1694
- } catch {}
1695
-
1696
- return null;
1697
- }
1698
-
1699
- // =============================================================================
1700
- // arXiv Special Handling
1701
- // =============================================================================
1702
-
1703
- /**
1704
- * Handle arXiv URLs - fetch abstract + optionally PDF
1705
- */
1706
- async function handleArxiv(url: string, timeout: number): Promise<RenderResult | null> {
1707
- try {
1708
- const parsed = new URL(url);
1709
- if (parsed.hostname !== "arxiv.org") return null;
1710
-
1711
- // Extract paper ID from various URL formats
1712
- // /abs/1234.56789, /pdf/1234.56789, /abs/cs/0123456
1713
- const match = parsed.pathname.match(/\/(abs|pdf)\/(.+?)(?:\.pdf)?$/);
1714
- if (!match) return null;
1715
-
1716
- const paperId = match[2];
1717
- const fetchedAt = new Date().toISOString();
1718
- const notes: string[] = [];
1719
-
1720
- // Fetch metadata via arXiv API
1721
- const apiUrl = `https://export.arxiv.org/api/query?id_list=${paperId}`;
1722
- const result = await loadPage(apiUrl, { timeout });
1723
-
1724
- if (!result.ok) return null;
1725
-
1726
- // Parse the Atom feed response
1727
- const doc = parseHtml(result.content, { parseNoneClosedTags: true });
1728
- const entry = doc.querySelector("entry");
1729
-
1730
- if (!entry) return null;
1731
-
1732
- const title = entry.querySelector("title")?.text?.trim()?.replace(/\s+/g, " ");
1733
- const summary = entry.querySelector("summary")?.text?.trim();
1734
- const authors = entry
1735
- .querySelectorAll("author name")
1736
- .map((n) => n.text?.trim())
1737
- .filter(Boolean);
1738
- const published = entry.querySelector("published")?.text?.trim()?.split("T")[0];
1739
- const categories = entry
1740
- .querySelectorAll("category")
1741
- .map((c) => c.getAttribute("term"))
1742
- .filter(Boolean);
1743
- const pdfLink = entry.querySelector('link[title="pdf"]')?.getAttribute("href");
1744
-
1745
- let md = `# ${title || "arXiv Paper"}\n\n`;
1746
- if (authors.length) md += `**Authors:** ${authors.join(", ")}\n`;
1747
- if (published) md += `**Published:** ${published}\n`;
1748
- if (categories.length) md += `**Categories:** ${categories.join(", ")}\n`;
1749
- md += `**arXiv:** ${paperId}\n\n`;
1750
- md += `---\n\n## Abstract\n\n${summary || "No abstract available."}\n\n`;
1751
-
1752
- // If it was a PDF link or we want full content, try to fetch and convert PDF
1753
- if (match[1] === "pdf" || parsed.pathname.includes(".pdf")) {
1754
- if (pdfLink) {
1755
- notes.push("Fetching PDF for full content...");
1756
- const pdfResult = await fetchBinary(pdfLink, timeout);
1757
- if (pdfResult.ok) {
1758
- const converted = await convertWithMarkitdown(pdfResult.buffer, ".pdf", timeout);
1759
- if (converted.ok && converted.content.length > 500) {
1760
- md += `---\n\n## Full Paper\n\n${converted.content}\n`;
1761
- notes.push("PDF converted via markitdown");
1762
- }
1763
- }
1764
- }
1765
- }
1766
-
1767
- const output = finalizeOutput(md);
1768
- return {
1769
- url,
1770
- finalUrl: url,
1771
- contentType: "text/markdown",
1772
- method: "arxiv",
1773
- content: output.content,
1774
- fetchedAt,
1775
- truncated: output.truncated,
1776
- notes: notes.length ? notes : ["Fetched via arXiv API"],
1777
- };
1778
- } catch {}
1779
-
1780
- return null;
1781
- }
1782
-
1783
- // =============================================================================
1784
- // IACR ePrint Special Handling
1785
- // =============================================================================
1786
-
1787
- /**
1788
- * Handle IACR Cryptology ePrint Archive URLs
1789
- */
1790
- async function handleIacr(url: string, timeout: number): Promise<RenderResult | null> {
1791
- try {
1792
- const parsed = new URL(url);
1793
- if (parsed.hostname !== "eprint.iacr.org") return null;
1794
-
1795
- // Extract paper ID from /year/number or /year/number.pdf
1796
- const match = parsed.pathname.match(/\/(\d{4})\/(\d+)(?:\.pdf)?$/);
1797
- if (!match) return null;
1798
-
1799
- const [, year, number] = match;
1800
- const paperId = `${year}/${number}`;
1801
- const fetchedAt = new Date().toISOString();
1802
- const notes: string[] = [];
1803
-
1804
- // Fetch the HTML page for metadata
1805
- const pageUrl = `https://eprint.iacr.org/${paperId}`;
1806
- const result = await loadPage(pageUrl, { timeout });
1807
-
1808
- if (!result.ok) return null;
1809
-
1810
- const doc = parseHtml(result.content);
1811
-
1812
- // Extract metadata from the page
1813
- const title =
1814
- doc.querySelector("h3.mb-3")?.text?.trim() ||
1815
- doc.querySelector('meta[name="citation_title"]')?.getAttribute("content");
1816
- const authors = doc
1817
- .querySelectorAll('meta[name="citation_author"]')
1818
- .map((m) => m.getAttribute("content"))
1819
- .filter(Boolean);
1820
- // Abstract is in <p> after <h5>Abstract</h5>
1821
- const abstractHeading = doc.querySelectorAll("h5").find((h) => h.text?.includes("Abstract"));
1822
- const abstract =
1823
- abstractHeading?.parentNode?.querySelector("p")?.text?.trim() ||
1824
- doc.querySelector('meta[name="description"]')?.getAttribute("content");
1825
- const keywords = doc.querySelector(".keywords")?.text?.replace("Keywords:", "").trim();
1826
- const pubDate = doc.querySelector('meta[name="citation_publication_date"]')?.getAttribute("content");
1827
-
1828
- let md = `# ${title || "IACR ePrint Paper"}\n\n`;
1829
- if (authors.length) md += `**Authors:** ${authors.join(", ")}\n`;
1830
- if (pubDate) md += `**Date:** ${pubDate}\n`;
1831
- md += `**ePrint:** ${paperId}\n`;
1832
- if (keywords) md += `**Keywords:** ${keywords}\n`;
1833
- md += `\n---\n\n## Abstract\n\n${abstract || "No abstract available."}\n\n`;
1834
-
1835
- // If it was a PDF link, try to fetch and convert PDF
1836
- if (parsed.pathname.endsWith(".pdf")) {
1837
- const pdfUrl = `https://eprint.iacr.org/${paperId}.pdf`;
1838
- notes.push("Fetching PDF for full content...");
1839
- const pdfResult = await fetchBinary(pdfUrl, timeout);
1840
- if (pdfResult.ok) {
1841
- const converted = await convertWithMarkitdown(pdfResult.buffer, ".pdf", timeout);
1842
- if (converted.ok && converted.content.length > 500) {
1843
- md += `---\n\n## Full Paper\n\n${converted.content}\n`;
1844
- notes.push("PDF converted via markitdown");
1845
- }
1846
- }
1847
- }
1848
-
1849
- const output = finalizeOutput(md);
1850
- return {
1851
- url,
1852
- finalUrl: url,
1853
- contentType: "text/markdown",
1854
- method: "iacr",
1855
- content: output.content,
1856
- fetchedAt,
1857
- truncated: output.truncated,
1858
- notes: notes.length ? notes : ["Fetched from IACR ePrint Archive"],
1859
- };
1860
- } catch {}
1861
-
1862
- return null;
1863
- }
1864
-
1865
- // =============================================================================
1866
- // GitHub Gist Special Handling
1867
- // =============================================================================
1868
-
1869
- /**
1870
- * Handle GitHub Gist URLs via API
1871
- */
1872
- async function handleGitHubGist(url: string, timeout: number): Promise<RenderResult | null> {
1873
- try {
1874
- const parsed = new URL(url);
1875
- if (parsed.hostname !== "gist.github.com") return null;
1876
-
1877
- // Extract gist ID from /username/gistId or just /gistId
1878
- const parts = parsed.pathname.split("/").filter(Boolean);
1879
- if (parts.length === 0) return null;
1880
-
1881
- // Gist ID is always the last path segment (or only segment for anonymous gists)
1882
- const gistId = parts[parts.length - 1];
1883
- if (!gistId || !/^[a-f0-9]+$/i.test(gistId)) return null;
1884
-
1885
- const fetchedAt = new Date().toISOString();
1886
-
1887
- // Fetch via GitHub API
1888
- const result = await fetchGitHubApi(`/gists/${gistId}`, timeout);
1889
- if (!result.ok || !result.data) return null;
1890
-
1891
- const gist = result.data as {
1892
- description: string | null;
1893
- owner?: { login: string };
1894
- created_at: string;
1895
- updated_at: string;
1896
- files: Record<string, { filename: string; language: string | null; size: number; content: string }>;
1897
- html_url: string;
1898
- };
1899
-
1900
- const files = Object.values(gist.files);
1901
- const owner = gist.owner?.login || "anonymous";
1902
-
1903
- let md = `# Gist by ${owner}\n\n`;
1904
- if (gist.description) md += `${gist.description}\n\n`;
1905
- md += `**Created:** ${gist.created_at} · **Updated:** ${gist.updated_at}\n`;
1906
- md += `**Files:** ${files.length}\n\n`;
1907
-
1908
- for (const file of files) {
1909
- const lang = file.language?.toLowerCase() || "";
1910
- md += `---\n\n## ${file.filename}\n\n`;
1911
- md += `\`\`\`${lang}\n${file.content}\n\`\`\`\n\n`;
1912
- }
1913
-
1914
- const output = finalizeOutput(md);
1915
- return {
1916
- url,
1917
- finalUrl: url,
1918
- contentType: "text/markdown",
1919
- method: "github-gist",
1920
- content: output.content,
1921
- fetchedAt,
1922
- truncated: output.truncated,
1923
- notes: ["Fetched via GitHub API"],
1924
- };
1925
- } catch {}
1926
-
1927
- return null;
1928
- }
1929
-
1930
719
  // =============================================================================
1931
720
  // Unified Special Handler Dispatch
1932
721
  // =============================================================================
@@ -1937,16 +726,70 @@ async function handleGitHubGist(url: string, timeout: number): Promise<RenderRes
  async function handleSpecialUrls(url: string, timeout: number): Promise<RenderResult | null> {
  // Order matters - more specific first
  return (
+ // Git hosting
  (await handleGitHubGist(url, timeout)) ||
  (await handleGitHub(url, timeout)) ||
+ (await handleGitLab(url, timeout)) ||
+ // Video/Media
+ (await handleYouTube(url, timeout)) ||
+ (await handleVimeo(url, timeout)) ||
+ (await handleSpotify(url, timeout)) ||
+ (await handleDiscogs(url, timeout)) ||
+ // Social/News
  (await handleTwitter(url, timeout)) ||
- (await handleStackOverflow(url, timeout)) ||
- (await handleWikipedia(url, timeout)) ||
+ (await handleBluesky(url, timeout)) ||
+ (await handleMastodon(url, timeout)) ||
+ (await handleHackerNews(url, timeout)) ||
+ (await handleLobsters(url, timeout)) ||
  (await handleReddit(url, timeout)) ||
+ // Developer content
+ (await handleStackOverflow(url, timeout)) ||
+ (await handleDevTo(url, timeout)) ||
+ (await handleMDN(url, timeout)) ||
+ (await handleReadTheDocs(url, timeout)) ||
+ (await handleTldr(url, timeout)) ||
+ (await handleCheatSh(url, timeout)) ||
+ // Package registries
  (await handleNpm(url, timeout)) ||
+ (await handleNuGet(url, timeout)) ||
+ (await handleChocolatey(url, timeout)) ||
+ (await handleBrew(url, timeout)) ||
+ (await handlePyPI(url, timeout)) ||
  (await handleCratesIo(url, timeout)) ||
+ (await handleDockerHub(url, timeout)) ||
+ (await handleGoPkg(url, timeout)) ||
+ (await handleHex(url, timeout)) ||
+ (await handlePackagist(url, timeout)) ||
+ (await handlePubDev(url, timeout)) ||
+ (await handleMaven(url, timeout)) ||
+ (await handleArtifactHub(url, timeout)) ||
+ (await handleRubyGems(url, timeout)) ||
+ (await handleTerraform(url, timeout)) ||
+ (await handleAur(url, timeout)) ||
+ (await handleHackage(url, timeout)) ||
+ (await handleMetaCPAN(url, timeout)) ||
+ (await handleRepology(url, timeout)) ||
+ // ML/AI
+ (await handleHuggingFace(url, timeout)) ||
+ // Academic
  (await handleArxiv(url, timeout)) ||
- (await handleIacr(url, timeout))
+ (await handleBiorxiv(url, timeout)) ||
+ (await handleIacr(url, timeout)) ||
+ (await handleSemanticScholar(url, timeout)) ||
+ (await handlePubMed(url, timeout)) ||
+ (await handleRfc(url, timeout)) ||
+ // Security
+ (await handleNvd(url, timeout)) ||
+ (await handleOsv(url, timeout)) ||
+ // Crypto
+ (await handleCoinGecko(url, timeout)) ||
+ // Business
+ (await handleOpenCorporates(url, timeout)) ||
+ (await handleSecEdgar(url, timeout)) ||
+ // Reference
+ (await handleOpenLibrary(url, timeout)) ||
+ (await handleWikidata(url, timeout)) ||
+ (await handleWikipedia(url, timeout))
  );
  }
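The dispatcher stays a single short-circuiting || chain, now grouped by category and ordered from most to least specific, so the first handler that recognizes a URL wins and the rest never run; adding a service means one new module plus one line here. A sketch of how it is presumably consumed (the actual call site in renderUrl lies outside this diff's hunks):

// Assumed call site, for illustration; first-match-wins, null means "not special".
const special = await handleSpecialUrls(url, timeout);
if (special) {
  return special; // a service-specific handler (GitHub, npm, arXiv, ...) claimed the URL
}
// otherwise continue with the generic fetch + markitdown/lynx/html2text pipeline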
 
@@ -2161,25 +1004,10 @@ async function renderUrl(url: string, timeout: number, raw: boolean = false): Pr
  }
  }

- // Step 6: Render HTML with lynx
- if (!hasCommand("lynx")) {
- notes.push("lynx not installed");
- const output = finalizeOutput(rawContent);
- return {
- url,
- finalUrl,
- contentType: mime,
- method: "raw-html",
- content: output.content,
- fetchedAt,
- truncated: output.truncated,
- notes,
- };
- }
-
- const lynxResult = await renderWithLynx(rawContent, timeout);
- if (!lynxResult.ok) {
- notes.push("lynx failed");
+ // Step 6: Render HTML with lynx or html2text
+ const htmlResult = await renderHtmlToText(rawContent, timeout);
+ if (!htmlResult.ok) {
+ notes.push("html rendering failed (lynx/html2text unavailable)");
  const output = finalizeOutput(rawContent);
  return {
  url,
@@ -2194,7 +1022,7 @@ async function renderUrl(url: string, timeout: number, raw: boolean = false): Pr
  }

  // Step 7: If lynx output is low quality, try extracting document links
- if (isLowQualityOutput(lynxResult.content)) {
+ if (isLowQualityOutput(htmlResult.content)) {
  const docLinks = extractDocumentLinks(rawContent, finalUrl);
  if (docLinks.length > 0) {
  const docUrl = docLinks[0];
@@ -2202,7 +1030,7 @@ async function renderUrl(url: string, timeout: number, raw: boolean = false): Pr
  if (binary.ok) {
  const ext = getExtensionHint(docUrl, binary.contentDisposition);
  const converted = await convertWithMarkitdown(binary.buffer, ext, timeout);
- if (converted.ok && converted.content.trim().length > lynxResult.content.length) {
+ if (converted.ok && converted.content.trim().length > htmlResult.content.length) {
  notes.push(`Extracted and converted document: ${docUrl}`);
  const output = finalizeOutput(converted.content);
  return {
@@ -2221,12 +1049,12 @@ async function renderUrl(url: string, timeout: number, raw: boolean = false): Pr
  notes.push("Page appears to require JavaScript or is mostly navigation");
  }

- const output = finalizeOutput(lynxResult.content);
+ const output = finalizeOutput(htmlResult.content);
  return {
  url,
  finalUrl,
  contentType: mime,
- method: "lynx",
+ method: htmlResult.method,
  content: output.content,
  fetchedAt,
  truncated: output.truncated,