@dpopsuev/web-spider 0.10.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105) hide show
  1. package/dist/batch.d.ts +24 -0
  2. package/dist/batch.d.ts.map +1 -0
  3. package/dist/batch.js +68 -0
  4. package/dist/cache.d.ts +40 -0
  5. package/dist/cache.d.ts.map +1 -0
  6. package/dist/cache.js +78 -0
  7. package/dist/convert.d.ts +29 -0
  8. package/dist/convert.d.ts.map +1 -0
  9. package/dist/convert.js +131 -0
  10. package/dist/crawl.d.ts +56 -0
  11. package/dist/crawl.d.ts.map +1 -0
  12. package/dist/crawl.js +126 -0
  13. package/dist/disk-cache.d.ts +75 -0
  14. package/dist/disk-cache.d.ts.map +1 -0
  15. package/dist/disk-cache.js +185 -0
  16. package/dist/graph.d.ts +76 -0
  17. package/dist/graph.d.ts.map +1 -0
  18. package/dist/graph.js +156 -0
  19. package/dist/index.d.ts +45 -0
  20. package/dist/index.d.ts.map +1 -0
  21. package/dist/index.js +44 -0
  22. package/dist/parse.d.ts +27 -0
  23. package/dist/parse.d.ts.map +1 -0
  24. package/dist/parse.js +131 -0
  25. package/dist/playwright.d.ts +75 -0
  26. package/dist/playwright.d.ts.map +1 -0
  27. package/dist/playwright.js +141 -0
  28. package/dist/ports.d.ts +104 -0
  29. package/dist/ports.d.ts.map +1 -0
  30. package/dist/ports.js +10 -0
  31. package/dist/robots.d.ts +24 -0
  32. package/dist/robots.d.ts.map +1 -0
  33. package/dist/robots.js +104 -0
  34. package/dist/search.d.ts +47 -0
  35. package/dist/search.d.ts.map +1 -0
  36. package/dist/search.js +112 -0
  37. package/dist/sitemap.d.ts +15 -0
  38. package/dist/sitemap.d.ts.map +1 -0
  39. package/dist/sitemap.js +65 -0
  40. package/dist/spider.d.ts +74 -0
  41. package/dist/spider.d.ts.map +1 -0
  42. package/dist/spider.js +349 -0
  43. package/dist/throttle.d.ts +49 -0
  44. package/dist/throttle.d.ts.map +1 -0
  45. package/dist/throttle.js +85 -0
  46. package/dist/tree.d.ts +34 -0
  47. package/dist/tree.d.ts.map +1 -0
  48. package/dist/tree.js +354 -0
  49. package/dist/types.d.ts +189 -0
  50. package/dist/types.d.ts.map +1 -0
  51. package/dist/types.js +2 -0
  52. package/dist/views.d.ts +17 -0
  53. package/dist/views.d.ts.map +1 -0
  54. package/dist/views.js +39 -0
  55. package/dist/web-search.d.ts +184 -0
  56. package/dist/web-search.d.ts.map +1 -0
  57. package/dist/web-search.js +399 -0
  58. package/fixtures/article-with-images.html +94 -0
  59. package/fixtures/gh-shell.html +32 -0
  60. package/fixtures/guide-ai-agents-web-scraping.json +552 -0
  61. package/fixtures/images/large.jpg +0 -0
  62. package/fixtures/images/small.jpg +0 -0
  63. package/fixtures/images/tiny.png +0 -0
  64. package/fixtures/quotes-index.json +40 -0
  65. package/package.json +47 -0
  66. package/scripts/fetch-guide.mjs +25 -0
  67. package/src/cache.ts +99 -0
  68. package/src/convert.ts +161 -0
  69. package/src/crawl.ts +186 -0
  70. package/src/disk-cache.ts +228 -0
  71. package/src/graph.ts +189 -0
  72. package/src/index.ts +74 -0
  73. package/src/parse.ts +154 -0
  74. package/src/playwright.ts +193 -0
  75. package/src/ports.ts +131 -0
  76. package/src/robots.ts +121 -0
  77. package/src/search.ts +173 -0
  78. package/src/sitemap.ts +67 -0
  79. package/src/spider.ts +475 -0
  80. package/src/throttle.ts +118 -0
  81. package/src/tree.ts +379 -0
  82. package/src/types.ts +225 -0
  83. package/src/views.ts +42 -0
  84. package/src/web-search.ts +548 -0
  85. package/test/convert-images.test.ts +69 -0
  86. package/test/disk-cache-images.test.ts +193 -0
  87. package/test/engine-registry.test.ts +114 -0
  88. package/test/exports.test.ts +124 -0
  89. package/test/get-chunk.test.ts +115 -0
  90. package/test/images-integration.test.ts +359 -0
  91. package/test/improvements.test.ts +279 -0
  92. package/test/inbound-count.test.ts +111 -0
  93. package/test/lean.test.ts +105 -0
  94. package/test/playwright.test.ts +128 -0
  95. package/test/ports.test.ts +161 -0
  96. package/test/search.test.ts +219 -0
  97. package/test/spider-images.test.ts +180 -0
  98. package/test/spider-unit.test.ts +610 -0
  99. package/test/tree.test.ts +272 -0
  100. package/test/types.test.ts +169 -0
  101. package/test/web-search-integration.test.ts +180 -0
  102. package/test/web-search.test.ts +305 -0
  103. package/tsconfig.json +9 -0
  104. package/tsconfig.test.json +7 -0
  105. package/vitest.config.ts +8 -0
package/dist/search.js ADDED
@@ -0,0 +1,112 @@
1
+ import MiniSearch from "minisearch";
2
+ // ---------------------------------------------------------------------------
3
+ // Snippet builder — kept from v1, MiniSearch doesn't generate snippets.
4
+ // ---------------------------------------------------------------------------
5
/**
 * Cut a short excerpt out of `text`, centred on the most relevant match.
 *
 * The whole (lower-cased) query is tried first, then each query token in
 * order; the first needle that occurs in the text anchors the excerpt.
 * When nothing matches, the excerpt starts at the beginning of the text.
 * Whitespace runs are collapsed and an ellipsis marks each truncated side.
 *
 * @param text        Text to excerpt.
 * @param fullQuery   Trimmed, lower-cased query string.
 * @param queryTokens Lower-cased query tokens, used as fallback needles.
 * @param radius      Number of context characters kept on each side.
 * @returns The excerpt, with "…" on every side that was cut.
 */
function buildSnippet(text, fullQuery, queryTokens, radius) {
    const haystack = text.toLowerCase();
    // Needles in priority order: the full phrase, then the individual tokens.
    const firstHit = [fullQuery, ...queryTokens]
        .map((needle) => haystack.indexOf(needle))
        .find((index) => index !== -1);
    const anchor = firstHit ?? 0;
    // The window always spans at least the longer of the phrase / first token.
    const matchSpan = Math.max(fullQuery.length, queryTokens[0]?.length ?? 1);
    const from = Math.max(0, anchor - radius);
    const to = Math.min(text.length, anchor + matchSpan + radius);
    const excerpt = text.slice(from, to).replace(/\s+/g, " ").trim();
    const leading = from > 0 ? "…" : "";
    const trailing = to < text.length ? "…" : "";
    return leading + excerpt + trailing;
}
28
/**
 * Lower-case a string and split it into tokens on whitespace and common
 * punctuation. Tokens of a single character (and empty pieces) are dropped.
 * Used only for snippet generation.
 */
function tokenise(s) {
    const separators = /[\s\-_.,;:!?()[\]{}"'`/\\]+/;
    const tokens = [];
    for (const piece of s.toLowerCase().split(separators)) {
        if (piece.length > 1) {
            tokens.push(piece);
        }
    }
    return tokens;
}
35
+ // ---------------------------------------------------------------------------
36
+ // Public API
37
+ // ---------------------------------------------------------------------------
38
/**
 * Full-text search across a set of already-spidered pages using MiniSearch
 * (fuzzy edit-distance, prefix search, heading field boost ×2).
 *
 * One search document is indexed per body chunk and per metadata field
 * (title, description, each heading). Results are ranked by score
 * descending, with scores normalised to 0–1 relative to the best hit.
 *
 * Document ids must be unique for MiniSearch; if the same page URL or chunk
 * id occurs more than once in `pages`, only the first occurrence is indexed
 * instead of letting `addAll` throw a "duplicate ID" error.
 *
 * @param pages Pages to search; each needs `url`, `title`, `headings`,
 *              `chunks` and optionally `description`.
 * @param query Free-text query. A blank query yields no results.
 * @param opts  `topN` (default 10) caps the number of hits;
 *              `snippetRadius` (default 100) is the context kept on each
 *              side of the match in the snippet.
 * @returns Hits as `{ url, chunkId, heading, score, snippet }`; `chunkId`
 *          is "" for metadata hits.
 *
 * @example
 * const hits = searchPages(pages, "cost optimization selectors", { topN: 5 })
 * // hits[0].snippet → "…LLM extraction vs Selectors…"
 */
export function searchPages(pages, query, opts = {}) {
    const { topN = 10, snippetRadius = 100 } = opts;
    if (!query.trim())
        return [];
    // Build a flat document list — one entry per metadata field, one per chunk.
    const docs = [];
    const seenIds = new Set();
    const addDoc = (doc) => {
        // MiniSearch rejects duplicate ids, so keep only the first occurrence.
        if (seenIds.has(doc.id))
            return;
        seenIds.add(doc.id);
        docs.push(doc);
    };
    for (const page of pages) {
        // Metadata documents
        addDoc({ id: `${page.url}#meta-title`, url: page.url, heading: "title", text: page.title, chunkId: "" });
        if (page.description) {
            addDoc({
                id: `${page.url}#meta-description`,
                url: page.url,
                heading: "description",
                text: page.description,
                chunkId: "",
            });
        }
        page.headings.forEach((h, i) => {
            addDoc({ id: `${page.url}#meta-h${i}`, url: page.url, heading: `h${h.level}`, text: h.text, chunkId: "" });
        });
        // Chunk documents
        for (const c of page.chunks) {
            addDoc({ id: c.id, url: page.url, heading: c.heading, text: c.text, chunkId: c.id });
        }
    }
    if (docs.length === 0)
        return [];
    const ms = new MiniSearch({
        fields: ["text", "heading"],
        storeFields: ["url", "heading", "chunkId", "text"],
        searchOptions: {
            // Headings are 2× more important than body text.
            boost: { heading: 2 },
            // Edit-distance fuzzy — 0.2 × term length, rounded (e.g. ≤1 for 5-char terms).
            fuzzy: 0.2,
            // Prefix match: "automat" finds "automation", "automated".
            prefix: true,
        },
    });
    ms.addAll(docs);
    const results = ms.search(query);
    if (results.length === 0)
        return [];
    // Normalise raw scores to 0–1 by dividing by the top score (results are
    // already sorted best-first). This preserves relative ranking while
    // keeping values agent-friendly.
    const maxRaw = results[0].score;
    const fullQuery = query.trim().toLowerCase();
    const queryTokens = tokenise(query);
    return results.slice(0, topN).map((r) => ({
        url: String(r["url"]),
        chunkId: String(r["chunkId"]),
        heading: String(r["heading"]),
        score: Math.round(Math.min(r.score / maxRaw, 1) * 100) / 100,
        snippet: buildSnippet(String(r["text"]), fullQuery, queryTokens, snippetRadius),
    }));
}
/** @deprecated Use {@link searchPages} — renamed in v0.4.0 to reflect BM25F ranking. */
export const fuzzySearch = searchPages;
112
+ //# sourceMappingURL=search.js.map
@@ -0,0 +1,15 @@
1
+ /**
2
+ * Sitemap fetcher and parser.
3
+ *
4
+ * Attempts /sitemap.xml and /sitemap_index.xml. Extracts <loc> URLs.
5
+ * Fails open — any error returns an empty array so callers fall back
6
+ * to normal BFS without noise.
7
+ */
8
+ import type { IHttpClient } from "./ports.js";
9
+ /**
10
+ * Fetch and parse sitemap URLs for the given origin.
11
+ * Supports both standard sitemaps and sitemap index files.
12
+ * Returns deduplicated absolute URLs, empty array on any failure.
13
+ */
14
+ export declare function fetchSitemapUrls(origin: string, httpClient: IHttpClient): Promise<string[]>;
15
+ //# sourceMappingURL=sitemap.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"sitemap.d.ts","sourceRoot":"","sources":["../src/sitemap.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,YAAY,CAAC;AAE9C;;;;GAIG;AACH,wBAAsB,gBAAgB,CACrC,MAAM,EAAE,MAAM,EACd,UAAU,EAAE,WAAW,GACrB,OAAO,CAAC,MAAM,EAAE,CAAC,CA4BnB"}
@@ -0,0 +1,65 @@
1
+ /**
2
+ * Sitemap fetcher and parser.
3
+ *
4
+ * Attempts /sitemap.xml and /sitemap_index.xml. Extracts <loc> URLs.
5
+ * Fails open — any error returns an empty array so callers fall back
6
+ * to normal BFS without noise.
7
+ */
8
/**
 * Fetch and parse sitemap URLs for the given origin.
 *
 * Tries `<origin>/sitemap.xml`, then `<origin>/sitemap_index.xml`, and stops
 * at the first candidate that yields at least one URL. `<loc>` entries that
 * point at another sitemap file (path ending in `.xml`, with or without a
 * query string or fragment) are fetched one level deep and their URLs are
 * merged in; every other entry is treated as a page URL.
 *
 * Fails open: any fetch or parse error for a candidate is swallowed and the
 * next candidate is tried, so callers can fall back to normal crawling.
 *
 * @param origin     Site origin, e.g. "https://example.com". Trailing
 *                   slashes are ignored.
 * @param httpClient Client used for all requests.
 * @returns Deduplicated absolute URLs; an empty array when nothing was found.
 */
export async function fetchSitemapUrls(origin, httpClient) {
    // Strip trailing slashes so "https://a.com/" does not become "https://a.com//sitemap.xml".
    const base = String(origin).replace(/\/+$/, "");
    const candidates = [`${base}/sitemap.xml`, `${base}/sitemap_index.xml`];
    // Child sitemaps are frequently paginated via query string
    // ("sitemap_products_1.xml?from=1&to=99"), so look at the path only.
    const isSitemapFile = (loc) => /\.xml(?:$|[?#])/i.test(loc);
    const urls = new Set();
    for (const sitemapUrl of candidates) {
        try {
            const res = await httpClient.fetch({
                url: sitemapUrl,
                headers: { Accept: "application/xml, text/xml, */*" },
            });
            if (!res.ok)
                continue;
            const xml = await res.text();
            for (const loc of extractLocs(xml)) {
                if (isSitemapFile(loc)) {
                    // Sitemap index entries point to other sitemaps — fetch those too.
                    for (const nested of await fetchSitemapXml(loc, httpClient)) {
                        urls.add(nested);
                    }
                }
                else {
                    urls.add(loc);
                }
            }
            if (urls.size > 0)
                break; // found a working sitemap
        }
        catch {
            // Fail open: ignore this candidate and try the next one.
            continue;
        }
    }
    return [...urls];
}
45
/**
 * Download one sitemap document and return the URLs found in its `<loc>`
 * elements. Never throws: a non-OK response or any error while fetching,
 * reading or parsing yields an empty array.
 */
async function fetchSitemapXml(url, httpClient) {
    let locs = [];
    try {
        const response = await httpClient.fetch({ url });
        if (response.ok) {
            const body = await response.text();
            locs = extractLocs(body);
        }
    }
    catch {
        locs = [];
    }
    return locs;
}
56
/**
 * Extract the http(s) URLs contained in the `<loc>` elements of a sitemap.
 *
 * Handles both plain text values and CDATA-wrapped values. Plain values have
 * the five predefined XML entities decoded (sitemaps must escape `&` as
 * `&amp;`, so the raw text is not a usable URL); CDATA content is taken
 * literally. Values that are not http/https URLs are ignored.
 *
 * @param xml Raw sitemap (or sitemap index) document.
 * @returns URLs in document order; duplicates are not removed.
 */
function extractLocs(xml) {
    const entities = new Map([
        ["amp", "&"],
        ["lt", "<"],
        ["gt", ">"],
        ["quot", '"'],
        ["apos", "'"],
    ]);
    // Single pass so that "&amp;lt;" decodes to "&lt;" and not to "<".
    const decode = (value) => value.replace(/&(amp|lt|gt|quot|apos);/g, (whole, name) => entities.get(name) ?? whole);
    // Group 1: CDATA-wrapped URL. Group 2: plain (entity-escaped) URL.
    const re = /<loc>\s*(?:<!\[CDATA\[\s*(https?:\/\/[^\s<]+?)\s*\]\]>|(https?:\/\/[^<\s]+))\s*<\/loc>/gi;
    const urls = [];
    for (const match of xml.matchAll(re)) {
        const [, cdataUrl, plainUrl] = match;
        urls.push(cdataUrl !== undefined ? cdataUrl.trim() : decode(plainUrl.trim()));
    }
    return urls;
}
65
+ //# sourceMappingURL=sitemap.js.map
@@ -0,0 +1,74 @@
1
+ import type { IHttpClient, IRobotsChecker, IThrottle } from "./ports.js";
2
+ import type { DOMNode, LeanPage, SpideredPage } from "./types.js";
3
+ export interface SpiderOptions {
4
+ /**
5
+ * ms before aborting the fetch (default 30 000).
6
+ */
7
+ timeoutMs?: number;
8
+ /**
9
+ * Value sent as User-Agent.
10
+ * Default identifies the tool; override for sites that block generic crawlers.
11
+ */
12
+ userAgent?: string;
13
+ /**
14
+ * CSS selector that scopes content extraction to a specific element.
15
+ * Everything outside the matched element is discarded before Readability runs.
16
+ * Example: "article", ".main-content", "#post-body"
17
+ */
18
+ rootSelector?: string;
19
+ /**
20
+ * Comma-separated CSS selectors whose matched elements are removed before
21
+ * extraction. Applied before Readability, so excluded content never reaches
22
+ * the chunks or markdown.
23
+ * Example: "nav, footer, .sidebar, #ads"
24
+ */
25
+ excludeSelectors?: string;
26
+ /**
27
+ * Approximate maximum token budget for the returned content.
28
+ * Markdown is truncated to fit. Rough estimate: 1 token ≈ 4 characters.
29
+ * Does not affect lean view (headings/links are always small).
30
+ * Default: unlimited.
31
+ */
32
+ tokenBudget?: number;
33
+ /**
34
+ * Per-domain throttle — shared across spider() calls to enforce rate limits
35
+ * and exponential backoff on 429/503 responses.
36
+ */
37
+ throttle?: IThrottle;
38
+ /**
39
+ * robots.txt checker — when provided, spider() checks robots.txt before
40
+ * fetching and respects Crawl-delay directives.
41
+ */
42
+ robotsCache?: IRobotsChecker;
43
+ /**
44
+ * HTTP client — defaults to a global fetch() adapter.
45
+ * Inject a stub for testing without real network access.
46
+ */
47
+ httpClient?: IHttpClient;
48
+ /**
49
+ * When true, fetch <img> src URLs found in the article content and attach
50
+ * them as base64-encoded ImageRef objects to SpideredPage.images.
51
+ * Default: false — preserves current behaviour exactly.
52
+ */
53
+ captureImages?: boolean;
54
+ /**
55
+ * Maximum number of images to fetch per page.
56
+ * Default: 10.
57
+ */
58
+ maxImages?: number;
59
+ }
60
+ /** A page with its full DOM tree attached. */
61
+ export interface TreePage extends SpideredPage {
62
+ readonly view: "tree";
63
+ tree: DOMNode;
64
+ }
65
+ export declare function spider(url: string, opts: SpiderOptions & {
66
+ view: "lean";
67
+ }): Promise<LeanPage>;
68
+ export declare function spider(url: string, opts: SpiderOptions & {
69
+ view: "tree";
70
+ }): Promise<TreePage>;
71
+ export declare function spider(url: string, opts?: SpiderOptions & {
72
+ view?: "full";
73
+ }): Promise<SpideredPage>;
74
+ //# sourceMappingURL=spider.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"spider.d.ts","sourceRoot":"","sources":["../src/spider.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAE,WAAW,EAAE,cAAc,EAAE,SAAS,EAAE,MAAM,YAAY,CAAC;AAEzE,OAAO,KAAK,EAAE,OAAO,EAAE,QAAQ,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC;AAoClE,MAAM,WAAW,aAAa;IAC7B;;OAEG;IACH,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB;;;OAGG;IACH,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB;;;;OAIG;IACH,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB;;;;;OAKG;IACH,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B;;;;;OAKG;IACH,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB;;;OAGG;IACH,QAAQ,CAAC,EAAE,SAAS,CAAC;IACrB;;;OAGG;IACH,WAAW,CAAC,EAAE,cAAc,CAAC;IAC7B;;;OAGG;IACH,UAAU,CAAC,EAAE,WAAW,CAAC;IACzB;;;;OAIG;IACH,aAAa,CAAC,EAAE,OAAO,CAAC;IACxB;;;OAGG;IACH,SAAS,CAAC,EAAE,MAAM,CAAC;CACnB;AA6GD,8CAA8C;AAC9C,MAAM,WAAW,QAAS,SAAQ,YAAY;IAC7C,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,IAAI,EAAE,OAAO,CAAC;CACd;AAED,wBAAsB,MAAM,CAAC,GAAG,EAAE,MAAM,EAAE,IAAI,EAAE,aAAa,GAAG;IAAE,IAAI,EAAE,MAAM,CAAA;CAAE,GAAG,OAAO,CAAC,QAAQ,CAAC,CAAC;AACrG,wBAAsB,MAAM,CAAC,GAAG,EAAE,MAAM,EAAE,IAAI,EAAE,aAAa,GAAG;IAAE,IAAI,EAAE,MAAM,CAAA;CAAE,GAAG,OAAO,CAAC,QAAQ,CAAC,CAAC;AACrG,wBAAsB,MAAM,CAAC,GAAG,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,aAAa,GAAG;IAAE,IAAI,CAAC,EAAE,MAAM,CAAA;CAAE,GAAG,OAAO,CAAC,YAAY,CAAC,CAAC"}
package/dist/spider.js ADDED
@@ -0,0 +1,349 @@
1
+ import { Readability } from "@mozilla/readability";
2
+ import { chunk, toMarkdown } from "./convert.js";
3
+ import { extractCanonicalUrl, extractHeadings, extractLinks, extractTags, parseDom } from "./parse.js";
4
+ import { buildTree } from "./tree.js";
5
+ import { toLean } from "./views.js";
6
+ // ---------------------------------------------------------------------------
7
+ // Constants
8
+ // ---------------------------------------------------------------------------
9
+ const WORDS_PER_MINUTE = 200;
10
+ // ---------------------------------------------------------------------------
11
+ // Default HTTP client adapter
12
+ // ---------------------------------------------------------------------------
13
+ const defaultHttpClient = {
14
+ async fetch(req) {
15
+ const res = await globalThis.fetch(req.url, {
16
+ signal: req.signal,
17
+ headers: req.headers,
18
+ });
19
+ return {
20
+ ok: res.ok,
21
+ status: res.status,
22
+ statusText: res.statusText,
23
+ headers: { get: (name) => res.headers.get(name) },
24
+ text: () => res.text(),
25
+ arrayBuffer: () => res.arrayBuffer(),
26
+ };
27
+ },
28
+ };
29
+ /**
30
+ * Spider a single URL and return a fully structured SpideredPage.
31
+ *
32
+ * Pass `view: "lean"` to skip chunking and markdown conversion — returns a
33
+ * LeanPage with only identity, metadata, and the heading/link outline.
34
+ * Significantly faster (~3×) and uses far fewer tokens in agent context.
35
+ *
36
+ * Errors are returned as thrown exceptions with a descriptive message rather
37
+ * than crashing silently. Common cases:
38
+ * - Non-HTTP URLs throw immediately with a clear message.
39
+ * - HTTP errors include the status code.
40
+ * - JS-rendered pages (wordCount === 0) include a hint.
41
+ * - Timeouts include the configured limit.
42
+ *
43
+ * @example
44
+ * // Full page — chunks, markdown, all metadata
45
+ * const page = await spider("https://example.com")
46
+ *
47
+ * @example
48
+ * // Lean overview — no body text, ideal for navigation decisions
49
+ * const lean = await spider("https://example.com", { view: "lean" })
50
+ */
51
+ // ---------------------------------------------------------------------------
52
+ // Image fetching
53
+ // ---------------------------------------------------------------------------
54
/** Known image file extensions (lower-case, without dot) and their MIME types. */
const IMAGE_MIME_BY_EXTENSION = new Map([
    ["jpg", "image/jpeg"],
    ["jpeg", "image/jpeg"],
    ["png", "image/png"],
    ["webp", "image/webp"],
    ["gif", "image/gif"],
    ["svg", "image/svg+xml"],
    ["avif", "image/avif"],
]);
/**
 * Guess an image MIME type from the file extension in a URL.
 *
 * The query string and the fragment are ignored, and only the last path
 * segment is inspected. The lookup goes through a Map so that extensions
 * such as "constructor" cannot resolve to inherited object members.
 *
 * @param src Absolute or relative image URL.
 * @returns The MIME type for a known extension, otherwise "image/jpeg".
 */
function mimeFromUrl(src) {
    // Everything from the first "?" or "#" onwards is not part of the path.
    const path = src.split(/[?#]/, 1)[0];
    const fileName = path.slice(path.lastIndexOf("/") + 1);
    const dot = fileName.lastIndexOf(".");
    const ext = dot === -1 ? "" : fileName.slice(dot + 1).toLowerCase();
    return IMAGE_MIME_BY_EXTENSION.get(ext) ?? "image/jpeg";
}
68
/**
 * Collect the images referenced by `<img>` elements in the article HTML and
 * return each as a base64-encoded ImageRef (`{ src, mimeType, alt, base64 }`).
 *
 * - Elements without a `src` are ignored and do not count towards `maxImages`.
 * - `data:` URLs with a base64 payload are decoded in place, without a fetch;
 *   other `data:` URLs are skipped.
 * - Relative URLs are resolved against `pageUrl`.
 * - A failed or non-OK fetch skips that image; it never fails the page.
 *
 * @param articleHtml HTML fragment to scan.
 * @param pageUrl     URL of the page, used as the base for relative `src`.
 * @param httpClient  Client used to download the images.
 * @param maxImages   Maximum number of images (with a usable `src`) to process.
 * @param throttle    Optional throttle, awaited before each download.
 * @param userAgent   User-Agent header for image requests.
 * @returns Successfully captured images, in document order.
 */
async function fetchImages(articleHtml, pageUrl, httpClient, maxImages, throttle, userAgent = "web-spider/0.1") {
    // parseDom is already imported statically by this module.
    const doc = parseDom(articleHtml, pageUrl);
    // Drop src-less elements first so they do not eat into the image budget.
    const imgEls = [...doc.querySelectorAll("img")]
        .filter((el) => Boolean(el.getAttribute("src")))
        .slice(0, maxImages);
    const results = [];
    for (const el of imgEls) {
        const rawSrc = el.getAttribute("src") ?? "";
        const alt = el.getAttribute("alt") ?? "";
        // data: URLs — include without fetching.
        if (rawSrc.startsWith("data:")) {
            const match = /^data:([^;]+);base64,(.+)$/.exec(rawSrc);
            if (match) {
                results.push({ src: rawSrc, mimeType: match[1], alt, base64: match[2] });
            }
            continue;
        }
        // Resolve relative URLs; an unparsable src is skipped.
        let absoluteSrc;
        try {
            absoluteSrc = new URL(rawSrc, pageUrl).toString();
        }
        catch {
            continue;
        }
        try {
            if (throttle)
                await throttle.wait(absoluteSrc);
            const res = await httpClient.fetch({
                url: absoluteSrc,
                headers: { "User-Agent": userAgent, Accept: "image/*" },
            });
            if (!res.ok)
                continue;
            throttle?.success(absoluteSrc);
            const buf = await res.arrayBuffer();
            const base64 = Buffer.from(buf).toString("base64");
            // Prefer the server-declared type; fall back to the file extension.
            const contentType = res.headers.get("content-type");
            const mimeType = contentType?.split(";")[0].trim() || mimeFromUrl(absoluteSrc);
            results.push({ src: absoluteSrc, mimeType, alt, base64 });
        }
        catch {
            // Skip failed image fetches silently — a missing image should never
            // cause the whole page scrape to fail.
        }
    }
    return results;
}
123
/**
 * Fetch a single URL and return it as a structured page.
 *
 * Pipeline: validate URL → optional robots.txt check → fetch (with optional
 * throttle and retry on 429/503) → parse DOM → apply `excludeSelectors` /
 * `rootSelector` → Readability extraction → build the requested view.
 *
 * Views (`opts.view`):
 * - "full" (default): metadata, headings, links, markdown and chunks.
 *   `tokenBudget` keeps whole chunks up to roughly `tokenBudget * 4`
 *   characters (always at least the first chunk).
 * - "tree": everything in "full" (without token budgeting) plus a DOM tree.
 * - "lean": metadata and the heading/link outline only; skips markdown
 *   conversion and chunking and reports an estimated `chunkCount`.
 *
 * When Readability extracts nothing, a partial page is returned with
 * `jsRendered: true` (in every view) instead of throwing.
 *
 * @param url  Fully-qualified http/https URL.
 * @param opts See SpiderOptions; `timeoutMs` defaults to 30 000 and covers
 *             both the request and reading the response body.
 * @throws Error for an invalid or non-http(s) URL, a robots.txt block, a
 *         timeout, or a non-OK HTTP status (429/503 only after retries are
 *         exhausted). Other network errors are rethrown unchanged.
 */
export async function spider(url, opts) {
    const {
        timeoutMs = 30_000,
        userAgent = "web-spider/0.1 (AI agent research tool; +https://github.com/dpopsuev)",
        view = "full",
        rootSelector,
        excludeSelectors,
        tokenBudget,
        throttle,
        robotsCache,
        httpClient = defaultHttpClient,
        captureImages = false,
        maxImages = 10,
    } = opts ?? {};
    // Poka-yoke: reject non-HTTP URLs immediately with a clear message.
    let parsedUrl;
    try {
        parsedUrl = new URL(url);
    }
    catch {
        throw new Error(`Invalid URL: "${url}" — must be a fully-qualified http/https URL`);
    }
    if (!["http:", "https:"].includes(parsedUrl.protocol)) {
        throw new Error(`Unsupported protocol "${parsedUrl.protocol}" — only http and https are supported`);
    }
    // Check robots.txt before fetching.
    if (robotsCache) {
        const { allowed, crawlDelayMs } = await robotsCache.check(url);
        if (!allowed)
            throw new Error(`Blocked by robots.txt: ${url}`);
        if (crawlDelayMs && throttle) {
            throttle.setDomainDelay(parsedUrl.hostname, crawlDelayMs);
        }
    }
    // Fetch with optional throttle + retry on 429/503. Without a throttle
    // there are no retries.
    const maxRetries = throttle?.maxRetries ?? 0;
    let html = "";
    for (let attempt = 0; attempt <= maxRetries; attempt++) {
        if (throttle)
            await throttle.wait(url);
        const controller = new AbortController();
        const timer = setTimeout(() => controller.abort(), timeoutMs);
        try {
            const res = await httpClient.fetch({
                url,
                signal: controller.signal,
                headers: { "User-Agent": userAgent, Accept: "text/html" },
            });
            if (res.status === 429 || res.status === 503) {
                if (throttle && attempt < maxRetries) {
                    // Let the throttle schedule the backoff, then try again.
                    throttle.rateLimit(url, res.headers.get("Retry-After"));
                    continue;
                }
                throw new Error(`HTTP ${res.status} ${res.statusText} — ${url}`);
            }
            if (!res.ok)
                throw new Error(`HTTP ${res.status} ${res.statusText} — ${url}`);
            throttle?.success(url);
            // The timer is still armed here, so a stalled body is aborted too.
            html = await res.text();
            break;
        }
        catch (err) {
            if (err instanceof Error && err.name === "AbortError") {
                throw new Error(`Timeout after ${timeoutMs}ms — ${url}`);
            }
            throw err;
        }
        finally {
            clearTimeout(timer);
        }
    }
    // Parse DOM via parse.ts — keeps the JSDOM dependency in one module.
    const doc = parseDom(html, url);
    // Apply excludeSelectors before Readability strips the DOM.
    if (excludeSelectors) {
        const selectors = excludeSelectors
            .split(",")
            .map((s) => s.trim())
            .filter(Boolean);
        for (const sel of selectors) {
            // Snapshot the node list: removing while iterating a live list is unsafe.
            for (const el of [...doc.querySelectorAll(sel)])
                el.remove();
        }
    }
    // Scope to rootSelector: replace body content with the matched element.
    if (rootSelector) {
        const root = doc.querySelector(rootSelector);
        if (root) {
            doc.body.innerHTML = root.outerHTML;
        }
    }
    // Links and canonical URL are read before Readability runs on the document.
    const links = extractLinks(doc, url);
    const canonicalUrl = extractCanonicalUrl(doc, url);
    // Readability content extraction (Firefox Reader View engine).
    const readabilityResult = new Readability(doc).parse();
    const jsRendered = !readabilityResult;
    // Graceful degradation: if Readability finds nothing, return a partial page
    // with jsRendered:true rather than throwing. The agent can decide what to do.
    const article = readabilityResult ?? {
        title: (doc.querySelector("title")?.textContent ?? "").trim(),
        content: "",
        textContent: "",
        length: 0,
        excerpt: "",
        byline: "",
        dir: "",
        site_name: "",
        lang: "",
        publishedTime: null,
        readingTimeMinutes: 0,
    };
    const domain = parsedUrl.hostname.replace(/^www\./, "");
    const fetchedAt = new Date().toISOString();
    // Looks up <meta name=…>, then og:<name>, then <meta property=…>.
    // Always returns a string ("" when absent).
    const meta = (name) => {
        const el = doc.querySelector(`meta[name="${name}"]`) ??
            doc.querySelector(`meta[property="og:${name}"]`) ??
            doc.querySelector(`meta[property="${name}"]`);
        return (el?.getAttribute("content") ?? "").trim();
    };
    const headings = extractHeadings(article.content ?? "");
    const tags = extractTags(doc);
    // Metadata shared by every view. `||` (not `??`) is required for the
    // fallbacks: meta() and documentElement.lang yield "" rather than
    // null/undefined when a value is missing, and the title/byline may be "".
    const identity = {
        url,
        domain,
        fetchedAt,
        ...(canonicalUrl !== undefined ? { canonicalUrl } : {}),
        title: article.title || meta("title"),
        description: meta("description"),
        author: article.byline || meta("author"),
        publishedAt: meta("article:published_time") || meta("date"),
        lang: doc.documentElement.lang || "en",
        tags,
    };
    const jsRenderedFlag = jsRendered ? { jsRendered: true } : {};
    // ---------------------------------------------------------------------------
    // Lean fast-path — skip turndown + chunking entirely
    // ---------------------------------------------------------------------------
    if (view === "lean") {
        const textContent = (article.textContent ?? "").trim();
        const wordCount = textContent.split(/\s+/).filter(Boolean).length;
        // Estimate only: one chunk per 150 words, since no chunking is done here.
        const chunkCount = Math.max(0, Math.floor(wordCount / 150));
        const lean = toLean({
            ...identity,
            wordCount,
            readingTimeMinutes: Math.ceil(wordCount / WORDS_PER_MINUTE),
            chunks: [], // placeholder — the real count is supplied below
            headings,
            links,
            markdown: "",
        });
        return { ...lean, chunkCount, ...jsRenderedFlag };
    }
    // ---------------------------------------------------------------------------
    // Shared by the tree and full views — markdown conversion and images
    // ---------------------------------------------------------------------------
    const articleHtml = article.content ?? "";
    const markdown = toMarkdown(articleHtml, { keepImages: captureImages });
    const wordCount = markdown.split(/\s+/).filter(Boolean).length;
    const readingTimeMinutes = Math.ceil(wordCount / WORDS_PER_MINUTE);
    const loadImages = async () => captureImages
        ? fetchImages(articleHtml, url, httpClient, maxImages, throttle, userAgent)
        : undefined;
    // ---------------------------------------------------------------------------
    // Tree path — semantic DOM tree plus the full markdown and chunks
    // ---------------------------------------------------------------------------
    if (view === "tree") {
        const tree = buildTree(articleHtml, url);
        const chunks = chunk(markdown, url);
        const images = await loadImages();
        return {
            view: "tree",
            ...identity,
            wordCount,
            readingTimeMinutes,
            headings,
            chunks,
            links,
            markdown,
            tree,
            ...(images ? { images } : {}),
            ...jsRenderedFlag,
        };
    }
    // ---------------------------------------------------------------------------
    // Full path — turndown + chunk
    // ---------------------------------------------------------------------------
    // Chunk-aware tokenBudget: select whole chunks up to the budget rather
    // than slicing markdown mid-sentence. Preserves chunk boundaries and
    // returns the richest complete content that fits.
    let allChunks = chunk(markdown, url);
    if (tokenBudget !== undefined) {
        // Rough estimate: 1 token ≈ 4 characters.
        let remaining = tokenBudget * 4;
        let first = true;
        allChunks = allChunks.filter((c) => {
            // Always include at least the first chunk — agents need something
            // even if it exceeds the budget.
            if (!first && remaining <= 0)
                return false;
            first = false;
            remaining -= c.text.length;
            return true;
        });
    }
    // Reconstruct markdown from selected chunks for full-page consumers.
    const finalMarkdown = tokenBudget !== undefined
        ? allChunks.map((c) => c.text).join("\n\n")
        : markdown;
    const images = await loadImages();
    return {
        ...identity,
        wordCount,
        readingTimeMinutes,
        headings,
        chunks: allChunks,
        links,
        markdown: finalMarkdown,
        ...(images ? { images } : {}),
        ...jsRenderedFlag,
    };
}
349
+ //# sourceMappingURL=spider.js.map
@@ -0,0 +1,49 @@
1
+ /**
2
+ * Per-domain request throttle with exponential backoff and jitter.
3
+ *
4
+ * Enforces a minimum gap between requests to the same hostname.
5
+ * On 429/503, backs off exponentially and respects Retry-After headers.
6
+ * Shared instances should be passed into spider() and crawl() so that
7
+ * all requests to a domain coordinate through one rate limiter.
8
+ */
9
+ import type { IThrottle } from "./ports.js";
10
+ export interface ThrottleOptions {
11
+ /** Minimum gap between requests to the same domain (ms). Default 500. */
12
+ minDelayMs?: number;
13
+ /** Base for exponential backoff (ms). Default 1000. */
14
+ backoffBaseMs?: number;
15
+ /** Maximum backoff delay (ms). Default 30 000. */
16
+ backoffCapMs?: number;
17
+ /** Maximum retry attempts on 429/503 before giving up. Default 3. */
18
+ maxRetries?: number;
19
+ }
20
+ export declare class DomainThrottle implements IThrottle {
21
+ private readonly states;
22
+ readonly minDelayMs: number;
23
+ readonly backoffBaseMs: number;
24
+ readonly backoffCapMs: number;
25
+ readonly maxRetries: number;
26
+ constructor(opts?: ThrottleOptions);
27
+ private state;
28
+ /** Wait until the domain's rate limit and backoff have cleared. */
29
+ wait(url: string): Promise<void>;
30
+ /** Record a successful request — resets backoff for the domain. */
31
+ success(url: string): void;
32
+ /**
33
+ * Record a rate-limit hit. Applies exponential backoff with jitter,
34
+ * using Retry-After header when present. Returns the wait duration in ms.
35
+ */
36
+ rateLimit(url: string, retryAfterHeader: string | null): number;
37
+ /**
38
+ * Override the minimum delay for a specific domain.
39
+ * Used to honour robots.txt Crawl-delay directives.
40
+ */
41
+ setDomainDelay(host: string, ms: number): void;
42
+ }
43
+ /**
44
+ * Factory — avoids jiti/Bun CJS re-export interop where class constructors
45
+ * accessed through a re-export chain can appear undefined at call site.
46
+ * Use this in extension code instead of `new DomainThrottle()`.
47
+ */
48
+ export declare function createThrottle(opts?: ThrottleOptions): DomainThrottle;
49
+ //# sourceMappingURL=throttle.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"throttle.d.ts","sourceRoot":"","sources":["../src/throttle.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,YAAY,CAAC;AAE5C,MAAM,WAAW,eAAe;IAC/B,yEAAyE;IACzE,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,uDAAuD;IACvD,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,kDAAkD;IAClD,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,qEAAqE;IACrE,UAAU,CAAC,EAAE,MAAM,CAAC;CACpB;AAuBD,qBAAa,cAAe,YAAW,SAAS;IAC/C,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAkC;IACzD,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAC;IAC5B,QAAQ,CAAC,aAAa,EAAE,MAAM,CAAC;IAC/B,QAAQ,CAAC,YAAY,EAAE,MAAM,CAAC;IAC9B,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAC;gBAEhB,IAAI,GAAE,eAAoB;IAOtC,OAAO,CAAC,KAAK;IASb,mEAAmE;IAC7D,IAAI,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAYtC,mEAAmE;IACnE,OAAO,CAAC,GAAG,EAAE,MAAM,GAAG,IAAI;IAM1B;;;OAGG;IACH,SAAS,CAAC,GAAG,EAAE,MAAM,EAAE,gBAAgB,EAAE,MAAM,GAAG,IAAI,GAAG,MAAM;IAW/D;;;OAGG;IACH,cAAc,CAAC,IAAI,EAAE,MAAM,EAAE,EAAE,EAAE,MAAM,GAAG,IAAI;CAG9C;AAED;;;;GAIG;AACH,wBAAgB,cAAc,CAAC,IAAI,CAAC,EAAE,eAAe,GAAG,cAAc,CAErE"}