@dpopsuev/web-spider 0.10.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105) hide show
  1. package/dist/batch.d.ts +24 -0
  2. package/dist/batch.d.ts.map +1 -0
  3. package/dist/batch.js +68 -0
  4. package/dist/cache.d.ts +40 -0
  5. package/dist/cache.d.ts.map +1 -0
  6. package/dist/cache.js +78 -0
  7. package/dist/convert.d.ts +29 -0
  8. package/dist/convert.d.ts.map +1 -0
  9. package/dist/convert.js +131 -0
  10. package/dist/crawl.d.ts +56 -0
  11. package/dist/crawl.d.ts.map +1 -0
  12. package/dist/crawl.js +126 -0
  13. package/dist/disk-cache.d.ts +75 -0
  14. package/dist/disk-cache.d.ts.map +1 -0
  15. package/dist/disk-cache.js +185 -0
  16. package/dist/graph.d.ts +76 -0
  17. package/dist/graph.d.ts.map +1 -0
  18. package/dist/graph.js +156 -0
  19. package/dist/index.d.ts +45 -0
  20. package/dist/index.d.ts.map +1 -0
  21. package/dist/index.js +44 -0
  22. package/dist/parse.d.ts +27 -0
  23. package/dist/parse.d.ts.map +1 -0
  24. package/dist/parse.js +131 -0
  25. package/dist/playwright.d.ts +75 -0
  26. package/dist/playwright.d.ts.map +1 -0
  27. package/dist/playwright.js +141 -0
  28. package/dist/ports.d.ts +104 -0
  29. package/dist/ports.d.ts.map +1 -0
  30. package/dist/ports.js +10 -0
  31. package/dist/robots.d.ts +24 -0
  32. package/dist/robots.d.ts.map +1 -0
  33. package/dist/robots.js +104 -0
  34. package/dist/search.d.ts +47 -0
  35. package/dist/search.d.ts.map +1 -0
  36. package/dist/search.js +112 -0
  37. package/dist/sitemap.d.ts +15 -0
  38. package/dist/sitemap.d.ts.map +1 -0
  39. package/dist/sitemap.js +65 -0
  40. package/dist/spider.d.ts +74 -0
  41. package/dist/spider.d.ts.map +1 -0
  42. package/dist/spider.js +349 -0
  43. package/dist/throttle.d.ts +49 -0
  44. package/dist/throttle.d.ts.map +1 -0
  45. package/dist/throttle.js +85 -0
  46. package/dist/tree.d.ts +34 -0
  47. package/dist/tree.d.ts.map +1 -0
  48. package/dist/tree.js +354 -0
  49. package/dist/types.d.ts +189 -0
  50. package/dist/types.d.ts.map +1 -0
  51. package/dist/types.js +2 -0
  52. package/dist/views.d.ts +17 -0
  53. package/dist/views.d.ts.map +1 -0
  54. package/dist/views.js +39 -0
  55. package/dist/web-search.d.ts +184 -0
  56. package/dist/web-search.d.ts.map +1 -0
  57. package/dist/web-search.js +399 -0
  58. package/fixtures/article-with-images.html +94 -0
  59. package/fixtures/gh-shell.html +32 -0
  60. package/fixtures/guide-ai-agents-web-scraping.json +552 -0
  61. package/fixtures/images/large.jpg +0 -0
  62. package/fixtures/images/small.jpg +0 -0
  63. package/fixtures/images/tiny.png +0 -0
  64. package/fixtures/quotes-index.json +40 -0
  65. package/package.json +47 -0
  66. package/scripts/fetch-guide.mjs +25 -0
  67. package/src/cache.ts +99 -0
  68. package/src/convert.ts +161 -0
  69. package/src/crawl.ts +186 -0
  70. package/src/disk-cache.ts +228 -0
  71. package/src/graph.ts +189 -0
  72. package/src/index.ts +74 -0
  73. package/src/parse.ts +154 -0
  74. package/src/playwright.ts +193 -0
  75. package/src/ports.ts +131 -0
  76. package/src/robots.ts +121 -0
  77. package/src/search.ts +173 -0
  78. package/src/sitemap.ts +67 -0
  79. package/src/spider.ts +475 -0
  80. package/src/throttle.ts +118 -0
  81. package/src/tree.ts +379 -0
  82. package/src/types.ts +225 -0
  83. package/src/views.ts +42 -0
  84. package/src/web-search.ts +548 -0
  85. package/test/convert-images.test.ts +69 -0
  86. package/test/disk-cache-images.test.ts +193 -0
  87. package/test/engine-registry.test.ts +114 -0
  88. package/test/exports.test.ts +124 -0
  89. package/test/get-chunk.test.ts +115 -0
  90. package/test/images-integration.test.ts +359 -0
  91. package/test/improvements.test.ts +279 -0
  92. package/test/inbound-count.test.ts +111 -0
  93. package/test/lean.test.ts +105 -0
  94. package/test/playwright.test.ts +128 -0
  95. package/test/ports.test.ts +161 -0
  96. package/test/search.test.ts +219 -0
  97. package/test/spider-images.test.ts +180 -0
  98. package/test/spider-unit.test.ts +610 -0
  99. package/test/tree.test.ts +272 -0
  100. package/test/types.test.ts +169 -0
  101. package/test/web-search-integration.test.ts +180 -0
  102. package/test/web-search.test.ts +305 -0
  103. package/tsconfig.json +9 -0
  104. package/tsconfig.test.json +7 -0
  105. package/vitest.config.ts +8 -0
package/src/spider.ts ADDED
@@ -0,0 +1,475 @@
1
+ import { Readability } from "@mozilla/readability";
2
+ import { chunk, toMarkdown } from "./convert.js";
3
+ import type { ImageRef } from "./types.js";
4
+ import { extractCanonicalUrl, extractHeadings, extractLinks, extractTags, parseDom } from "./parse.js";
5
+ import type { IHttpClient, IRobotsChecker, IThrottle } from "./ports.js";
6
+ import { buildTree } from "./tree.js";
7
+ import type { DOMNode, LeanPage, SpideredPage } from "./types.js";
8
+ import { toLean } from "./views.js";
9
+
10
+ // ---------------------------------------------------------------------------
11
+ // Constants
12
+ // ---------------------------------------------------------------------------
13
+
14
+ const WORDS_PER_MINUTE = 200;
15
+
16
+ // ---------------------------------------------------------------------------
17
+ // Default HTTP client adapter
18
+ // ---------------------------------------------------------------------------
19
+
/**
 * Default IHttpClient: forwards the request to the runtime's global fetch()
 * and exposes only the response members the spider reads.
 */
const defaultHttpClient: IHttpClient = {
  fetch: async ({ url, signal, headers }) => {
    const response = await globalThis.fetch(url, { signal, headers });
    const { ok, status, statusText } = response;
    return {
      ok,
      status,
      statusText,
      // Header lookup is delegated to the underlying Headers object.
      headers: { get: (name: string) => response.headers.get(name) },
      // Body readers stay lazy: nothing is consumed until the caller asks.
      text: () => response.text(),
      arrayBuffer: () => response.arrayBuffer(),
    };
  },
};
36
+
37
+
38
+
39
+ // ---------------------------------------------------------------------------
40
+ // Public API
41
+ // ---------------------------------------------------------------------------
42
+
43
+ export interface SpiderOptions {
44
+ /**
45
+ * Milliseconds before the page fetch is aborted (default 30 000).
46
+ */
47
+ timeoutMs?: number;
48
+ /**
49
+ * Value sent as User-Agent.
50
+ * Default identifies the tool; override for sites that block generic crawlers.
51
+ */
52
+ userAgent?: string;
53
+ /**
54
+ * CSS selector that scopes content extraction to a specific element.
55
+ * Everything outside the matched element is discarded before Readability runs.
56
+ * Example: "article", ".main-content", "#post-body"
57
+ */
58
+ rootSelector?: string;
59
+ /**
60
+ * Comma-separated CSS selectors whose matched elements are removed before
61
+ * extraction. Applied before Readability, so excluded content never reaches
62
+ * the chunks or markdown.
63
+ * Example: "nav, footer, .sidebar, #ads"
64
+ */
65
+ excludeSelectors?: string;
66
+ /**
67
+ * Approximate maximum token budget for the returned content.
68
+ * Whole chunks are kept until the budget is spent (the first chunk is always kept). Rough estimate: 1 token ≈ 4 characters.
69
+ * Only applied to the full view; the lean and tree views ignore it.
70
+ * Default: unlimited.
71
+ */
72
+ tokenBudget?: number;
73
+ /**
74
+ * Per-domain throttle — shared across spider() calls to enforce rate limits
75
+ * and exponential backoff on 429/503 responses.
76
+ */
77
+ throttle?: IThrottle;
78
+ /**
79
+ * robots.txt checker — when provided, spider() checks robots.txt before
80
+ * fetching and respects Crawl-delay directives.
81
+ */
82
+ robotsCache?: IRobotsChecker;
83
+ /**
84
+ * HTTP client — defaults to a global fetch() adapter.
85
+ * Inject a stub for testing without real network access.
86
+ */
87
+ httpClient?: IHttpClient;
88
+ /**
89
+ * When true, fetch <img> src URLs found in the article content and attach
90
+ * them as base64-encoded ImageRef objects to SpideredPage.images.
91
+ * Default: false — preserves current behaviour exactly.
92
+ */
93
+ captureImages?: boolean;
94
+ /**
95
+ * Maximum number of images to fetch per page.
96
+ * Default: 10.
97
+ */
98
+ maxImages?: number;
99
+ }
100
+
101
+ /**
102
+ * Spider a single URL and return a fully structured SpideredPage.
103
+ *
104
+ * Pass `view: "lean"` to skip chunking and markdown conversion — returns a
105
+ * LeanPage with only identity, metadata, and the heading/link outline.
106
+ * Significantly faster (~3×) and uses far fewer tokens in agent context.
107
+ *
108
+ * Errors are returned as thrown exceptions with a descriptive message rather
109
+ * than crashing silently. Common cases:
110
+ * - Non-HTTP URLs throw immediately with a clear message.
111
+ * - HTTP errors include the status code.
112
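+ * - JS-rendered pages do not throw: when Readability extracts nothing, a partial page is returned (the lean and full views flag it with `jsRendered: true`).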
113
+ * - Timeouts include the configured limit.
114
+ *
115
+ * @example
116
+ * // Full page — chunks, markdown, all metadata
117
+ * const page = await spider("https://example.com")
118
+ *
119
+ * @example
120
+ * // Lean overview — no body text, ideal for navigation decisions
121
+ * const lean = await spider("https://example.com", { view: "lean" })
122
+ */
123
+ // ---------------------------------------------------------------------------
124
+ // Image fetching
125
+ // ---------------------------------------------------------------------------
126
+
/** Known image file extensions (lower-case, without the dot) and their MIME types. */
const IMAGE_MIME_BY_EXTENSION: ReadonlyMap<string, string> = new Map([
  ["jpg", "image/jpeg"],
  ["jpeg", "image/jpeg"],
  ["png", "image/png"],
  ["webp", "image/webp"],
  ["gif", "image/gif"],
  ["svg", "image/svg+xml"],
  ["avif", "image/avif"],
]);

/**
 * Detect a MIME type from the file extension of a URL, defaulting to image/jpeg.
 *
 * The query string and the fragment are ignored, and only the last path
 * segment is inspected, so `logo.PNG?v=2` and `pic.webp#top` resolve to their
 * real types.
 *
 * @param src - Absolute or relative image URL.
 * @returns The MIME type for a recognised extension, otherwise "image/jpeg".
 */
function mimeFromUrl(src: string): string {
  // Drop "?query" and "#fragment": neither is part of the file name.
  const path = src.split(/[?#]/, 1)[0] ?? "";
  const fileName = path.slice(path.lastIndexOf("/") + 1);
  const dot = fileName.lastIndexOf(".");
  const ext = dot === -1 ? "" : fileName.slice(dot + 1).toLowerCase();
  // A Map lookup cannot return inherited Object members such as "constructor".
  return IMAGE_MIME_BY_EXTENSION.get(ext) ?? "image/jpeg";
}
141
+
/**
 * Extract <img> elements from article HTML, resolve their src URLs against the
 * page URL, and return each image as a base64-encoded ImageRef.
 *
 * - At most `maxImages` images that actually carry a `src` are considered;
 *   a zero or negative limit yields no images.
 * - `data:` URLs with a base64 payload are included without fetching.
 * - Only http/https image URLs are requested, mirroring spider()'s own
 *   protocol restriction.
 * - A 429/503 response is reported to the throttle so later requests to that
 *   host back off; the image itself is skipped rather than retried.
 * - Any failed fetch is skipped: a missing image never fails the page scrape.
 *
 * @param articleHtml - HTML of the extracted article content.
 * @param pageUrl - URL of the page, used as the base for relative src values.
 * @param httpClient - Client used to download the images.
 * @param maxImages - Upper bound on the number of images considered.
 * @param throttle - Optional per-domain throttle consulted before each fetch.
 * @returns The images that could be resolved, in document order.
 */
async function fetchImages(
  articleHtml: string,
  pageUrl: string,
  httpClient: IHttpClient,
  maxImages: number,
  throttle?: IThrottle,
): Promise<ImageRef[]> {
  // Parse the article HTML to extract img elements.
  const { parseDom } = await import("./parse.js");
  const doc = parseDom(articleHtml, pageUrl);

  // Filter before slicing so src-less <img> tags do not use up the budget,
  // and clamp the limit: slice(0, -1) would return all images but the last.
  const imgEls = [...doc.querySelectorAll("img")]
    .filter((el) => Boolean(el.getAttribute("src")))
    .slice(0, Math.max(0, maxImages));

  const results: ImageRef[] = [];

  for (const el of imgEls) {
    const rawSrc = el.getAttribute("src") ?? "";
    const alt = el.getAttribute("alt") ?? "";

    // data: URLs — include without fetching.
    if (rawSrc.startsWith("data:")) {
      const match = /^data:([^;]+);base64,(.+)$/.exec(rawSrc);
      if (match) {
        results.push({ src: rawSrc, mimeType: match[1], alt, base64: match[2] });
      }
      continue;
    }

    // Resolve relative URLs; skip anything that is not http/https.
    let absoluteSrc: string;
    try {
      const resolved = new URL(rawSrc, pageUrl);
      if (resolved.protocol !== "http:" && resolved.protocol !== "https:") continue;
      absoluteSrc = resolved.toString();
    } catch {
      continue;
    }

    try {
      if (throttle) await throttle.wait(absoluteSrc);
      const res = await httpClient.fetch({
        url: absoluteSrc,
        headers: { "User-Agent": "web-spider/0.1", Accept: "image/*" },
      });
      if (!res.ok) {
        // Let the shared throttle know the host is rate limiting us.
        if (res.status === 429 || res.status === 503) {
          throttle?.rateLimit(absoluteSrc, res.headers.get("Retry-After"));
        }
        continue;
      }
      throttle?.success(absoluteSrc);

      const buf = await res.arrayBuffer();
      const base64 = Buffer.from(buf).toString("base64");
      const contentType = res.headers.get("content-type");
      // Prefer the server's Content-Type (without parameters); fall back to the extension.
      const mimeType = contentType?.split(";")[0].trim() || mimeFromUrl(absoluteSrc);

      results.push({ src: absoluteSrc, mimeType, alt, base64 });
    } catch {
      // Skip failed image fetches silently — a missing image should never
      // cause the whole page scrape to fail.
    }
  }

  return results;
}
207
+
208
+ /** A SpideredPage returned by spider() when called with `view: "tree"`; it carries the DOM tree alongside the regular page fields. */
209
+ export interface TreePage extends SpideredPage {
210
+ readonly view: "tree"; // discriminant: always the literal "tree"
211
+ tree: DOMNode; // tree produced by buildTree() from the extracted article HTML
212
+ }
213
+
/**
 * Spider a single URL and return it as a structured page.
 *
 * Pipeline: validate the URL → optional robots.txt check → fetch (retrying on
 * 429/503 when a throttle is supplied) → parse the DOM → apply
 * `excludeSelectors` and `rootSelector` → Readability extraction → build the
 * requested view.
 *
 * Views:
 * - `"full"` (default): markdown, chunks and all metadata. `tokenBudget`
 *   keeps whole chunks until the budget is spent; the first chunk is always kept.
 * - `"lean"`: skips markdown conversion and chunking; `chunkCount` is an
 *   estimate of one chunk per 150 words.
 * - `"tree"`: the full page plus `tree`, built from the article HTML.
 *
 * When Readability extracts nothing (typically a JS-rendered page) a partial
 * page flagged `jsRendered: true` is returned instead of throwing.
 *
 * @param url - Fully-qualified http/https URL to fetch.
 * @param opts - Fetch, extraction and view options; see SpiderOptions.
 * @returns The page in the shape selected by `opts.view`.
 * @throws Error when the URL is invalid or not http/https, when robots.txt
 *   disallows it, on a non-2xx response (message includes the status), or
 *   when the fetch exceeds `timeoutMs` (message includes the limit).
 */
export async function spider(url: string, opts: SpiderOptions & { view: "lean" }): Promise<LeanPage>;
export async function spider(url: string, opts: SpiderOptions & { view: "tree" }): Promise<TreePage>;
export async function spider(url: string, opts?: SpiderOptions & { view?: "full" }): Promise<SpideredPage>;
export async function spider(
  url: string,
  opts?: SpiderOptions & { view?: "lean" | "full" | "tree" },
): Promise<SpideredPage | LeanPage | TreePage> {
  const {
    timeoutMs = 30_000,
    userAgent = "web-spider/0.1 (AI agent research tool; +https://github.com/dpopsuev)",
    view = "full",
    rootSelector,
    excludeSelectors,
    tokenBudget,
    throttle,
    robotsCache,
    httpClient = defaultHttpClient,
    captureImages = false,
    maxImages = 10,
  } = opts ?? {};

  // Poka-yoke: reject non-HTTP URLs immediately with a clear message.
  let parsedUrl: URL;
  try {
    parsedUrl = new URL(url);
  } catch {
    throw new Error(`Invalid URL: "${url}" — must be a fully-qualified http/https URL`);
  }
  if (!["http:", "https:"].includes(parsedUrl.protocol)) {
    throw new Error(`Unsupported protocol "${parsedUrl.protocol}" — only http and https are supported`);
  }

  // Check robots.txt before fetching.
  if (robotsCache) {
    const { allowed, crawlDelayMs } = await robotsCache.check(url);
    if (!allowed) throw new Error(`Blocked by robots.txt: ${url}`);
    if (crawlDelayMs && throttle) {
      throttle.setDomainDelay(parsedUrl.hostname, crawlDelayMs);
    }
  }

  // Fetch with optional throttle + retry on 429/503. Without a throttle there
  // is exactly one attempt.
  const maxRetries = Math.max(0, throttle?.maxRetries ?? 0);
  let html = "";

  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    if (throttle) await throttle.wait(url);

    const controller = new AbortController();
    const timer = setTimeout(() => controller.abort(), timeoutMs);
    try {
      const res = await httpClient.fetch({
        url,
        signal: controller.signal,
        headers: { "User-Agent": userAgent, Accept: "text/html" },
      });

      if (res.status === 429 || res.status === 503) {
        if (throttle && attempt < maxRetries) {
          // Record the hit; the next throttle.wait() sleeps out the backoff.
          throttle.rateLimit(url, res.headers.get("Retry-After"));
          continue;
        }
        throw new Error(`HTTP ${res.status} ${res.statusText} — ${url}`);
      }

      if (!res.ok) throw new Error(`HTTP ${res.status} ${res.statusText} — ${url}`);

      throttle?.success(url);
      // The timer is still armed here, so a stalled body is aborted too.
      html = await res.text();
      break;
    } catch (err) {
      if (err instanceof Error && err.name === "AbortError") {
        throw new Error(`Timeout after ${timeoutMs}ms — ${url}`);
      }
      throw err;
    } finally {
      clearTimeout(timer);
    }
  }

  // Parse DOM via parse.ts — keeps the JSDOM dependency in one module.
  const doc = parseDom(html, url);

  // Apply excludeSelectors before Readability strips the DOM.
  if (excludeSelectors) {
    const selectors = excludeSelectors
      .split(",")
      .map((s) => s.trim())
      .filter(Boolean);
    for (const sel of selectors) {
      for (const el of [...doc.querySelectorAll(sel)]) el.remove();
    }
  }

  // Scope to rootSelector: replace body content with the matched element.
  if (rootSelector) {
    const root = doc.querySelector(rootSelector);
    if (root) {
      doc.body.innerHTML = root.outerHTML;
    }
  }

  // Links and canonical URL are read before Readability mutates the document.
  const links = extractLinks(doc, url);
  const canonicalUrl = extractCanonicalUrl(doc, url);

  // Readability content extraction (Firefox Reader View engine).
  const readabilityResult = new Readability(doc).parse();
  const jsRendered = !readabilityResult;
  // Graceful degradation: if Readability finds nothing, return a partial page
  // with jsRendered:true rather than throwing. The agent can decide what to do.
  const article = readabilityResult ?? {
    title: (doc.querySelector("title")?.textContent ?? "").trim(),
    content: "",
    textContent: "",
    length: 0,
    excerpt: "",
    byline: "",
    dir: "",
    site_name: "",
    lang: "",
    publishedTime: null,
    readingTimeMinutes: 0,
  };

  const domain = parsedUrl.hostname.replace(/^www\./, "");
  const fetchedAt = new Date().toISOString();

  // Look a value up in <meta name>, then <meta property="og:…">, then <meta property>.
  const meta = (name: string): string => {
    const el =
      doc.querySelector(`meta[name="${name}"]`) ??
      doc.querySelector(`meta[property="og:${name}"]`) ??
      doc.querySelector(`meta[property="${name}"]`);
    return (el?.getAttribute("content") ?? "").trim();
  };

  const articleHtml = article.content ?? "";
  const headings = extractHeadings(articleHtml);
  const tags = extractTags(doc);

  // Fields shared by every view. `||` is deliberate: meta() and
  // documentElement.lang yield "" (never null/undefined) when a value is
  // missing, so `??` would never reach the fallback.
  const identity = {
    url,
    domain,
    fetchedAt,
    ...(canonicalUrl !== undefined ? { canonicalUrl } : {}),
    title: article.title || meta("title"),
    description: meta("description"),
    author: article.byline || meta("author"),
    publishedAt: meta("article:published_time") || meta("date"),
    lang: doc.documentElement.lang || "en",
    tags,
  };

  // ---------------------------------------------------------------------------
  // Lean fast-path — skip turndown + chunking entirely
  // ---------------------------------------------------------------------------
  if (view === "lean") {
    const textContent = (article.textContent ?? "").trim();
    const wordCount = textContent.split(/\s+/).filter(Boolean).length;
    const chunkCount = Math.max(0, Math.floor(wordCount / 150));

    const full = {
      ...identity,
      wordCount,
      readingTimeMinutes: Math.ceil(wordCount / WORDS_PER_MINUTE),
      chunks: [], // placeholder — toLean reads chunks.length
      headings,
      links,
      markdown: "",
    } satisfies SpideredPage;
    const lean = toLean(full);
    return { ...lean, chunkCount, ...(jsRendered ? { jsRendered: true } : {}) };
  }

  // ---------------------------------------------------------------------------
  // Tree path — build semantic DOM tree, then also produce full markdown
  // ---------------------------------------------------------------------------
  if (view === "tree") {
    const tree = buildTree(articleHtml, url);
    const markdown = toMarkdown(articleHtml, { keepImages: captureImages });
    const wordCount = markdown.split(/\s+/).filter(Boolean).length;
    const chunks = chunk(markdown, url);
    const images = captureImages
      ? await fetchImages(articleHtml, url, httpClient, maxImages, throttle)
      : undefined;
    return {
      view: "tree",
      ...identity,
      wordCount,
      readingTimeMinutes: Math.ceil(wordCount / WORDS_PER_MINUTE),
      headings,
      chunks,
      links,
      markdown,
      tree,
      ...(images ? { images } : {}),
      // Flag empty extractions here as well, matching the lean and full views.
      ...(jsRendered ? { jsRendered: true } : {}),
    };
  }

  // ---------------------------------------------------------------------------
  // Full path — turndown + chunk
  // ---------------------------------------------------------------------------
  const markdown = toMarkdown(articleHtml, { keepImages: captureImages });
  const wordCount = markdown.split(/\s+/).filter(Boolean).length;

  // Chunk-aware tokenBudget: select whole chunks up to the budget rather
  // than slicing markdown mid-sentence. Preserves chunk boundaries and
  // returns the richest complete content that fits.
  let allChunks = chunk(markdown, url);
  if (tokenBudget !== undefined) {
    let remaining = tokenBudget * 4; // rough estimate: 1 token ≈ 4 characters
    let first = true;
    allChunks = allChunks.filter((c) => {
      // Always include at least the first chunk — agents need something
      // even if it exceeds the budget.
      if (!first && remaining <= 0) return false;
      first = false;
      remaining -= c.text.length;
      return true;
    });
  }

  // Reconstruct markdown from selected chunks for full-page consumers.
  const finalMarkdown = tokenBudget !== undefined
    ? allChunks.map((c) => c.text).join("\n\n")
    : markdown;

  const images = captureImages
    ? await fetchImages(articleHtml, url, httpClient, maxImages, throttle)
    : undefined;

  return {
    ...identity,
    wordCount,
    readingTimeMinutes: Math.ceil(wordCount / WORDS_PER_MINUTE),
    headings,
    chunks: allChunks,
    links,
    markdown: finalMarkdown,
    ...(images ? { images } : {}),
    ...(jsRendered ? { jsRendered: true } : {}),
  };
}
@@ -0,0 +1,118 @@
1
+ /**
2
+ * Per-domain request throttle with exponential backoff and jitter.
3
+ *
4
+ * Enforces a minimum gap between requests to the same hostname.
5
+ * On 429/503, backs off exponentially and respects Retry-After headers.
6
+ * Shared instances should be passed into spider() and crawl() so that
7
+ * all requests to a domain coordinate through one rate limiter.
8
+ */
9
+
10
+ import type { IThrottle } from "./ports.js";
11
+
12
+ export interface ThrottleOptions {
13
+ /** Minimum gap between requests to the same domain (ms). Default 500. Can be overridden per domain via setDomainDelay(). */
14
+ minDelayMs?: number;
15
+ /** Base delay for exponential backoff (ms); also the upper bound of the random jitter added to each backoff. Default 1000. */
16
+ backoffBaseMs?: number;
17
+ /** Maximum computed backoff delay (ms). Default 30 000. A larger Retry-After value still takes precedence. */
18
+ backoffCapMs?: number;
19
+ /** Maximum retry attempts on 429/503 before giving up. Default 3. The throttle only stores this value; callers such as spider() read it to bound their retry loop. */
20
+ maxRetries?: number;
21
+ }
22
+
23
+ interface DomainState { // per-host bookkeeping kept by DomainThrottle
24
+ lastAt: number; // epoch ms recorded by the most recent wait() for this host (0 = never)
25
+ backoffUntil: number; // epoch ms until which wait() delays requests; 0 when no backoff is active
26
+ errors: number; // rate-limit hits since the last success(); drives the backoff exponent
27
+ /** Per-domain minimum delay override (e.g. from robots.txt Crawl-delay). */
28
+ minDelayMs?: number;
29
+ }
30
+
/** Resolve after roughly `ms` milliseconds, using a one-shot timer. */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((resolve) => {
    setTimeout(resolve, ms);
  });
}
34
+
/**
 * Convert a Retry-After header value into a wait duration in milliseconds.
 *
 * The header is either a non-negative number of seconds or a date. The numeric
 * form is accepted only when the whole value is a number, so a date-like
 * string that merely starts with digits is parsed as a date rather than
 * misread as a number of seconds.
 *
 * @param header - Raw header value, or null when the header is absent.
 * @returns Milliseconds to wait; 0 when the header is missing, unparseable,
 *   or refers to a moment in the past. Never negative.
 */
function parseRetryAfter(header: string | null): number {
  if (!header) return 0;
  const value = header.trim();

  // delay-seconds form.
  if (/^\d+(?:\.\d+)?$/.test(value)) return Number(value) * 1_000;

  // Date form: the remaining time until that moment, clamped at zero.
  const at = Date.parse(value);
  return Number.isNaN(at) ? 0 : Math.max(0, at - Date.now());
}
43
+
/**
 * Per-domain request throttle.
 *
 * Keeps one DomainState per hostname and uses it to space requests at least
 * `minDelayMs` apart and to hold them back while a backoff is active.
 */
export class DomainThrottle implements IThrottle {
  private readonly states = new Map<string, DomainState>();
  readonly minDelayMs: number;
  readonly backoffBaseMs: number;
  readonly backoffCapMs: number;
  readonly maxRetries: number;

  /** @param opts - Optional overrides; defaults are 500 / 1000 / 30 000 ms and 3 retries. */
  constructor(opts: ThrottleOptions = {}) {
    this.minDelayMs = opts.minDelayMs ?? 500;
    this.backoffBaseMs = opts.backoffBaseMs ?? 1_000;
    this.backoffCapMs = opts.backoffCapMs ?? 30_000;
    this.maxRetries = opts.maxRetries ?? 3;
  }

  /** Return the state for `host`, creating a fresh one on first use. */
  private state(host: string): DomainState {
    let s = this.states.get(host);
    if (!s) {
      s = { lastAt: 0, backoffUntil: 0, errors: 0 };
      this.states.set(host, s);
    }
    return s;
  }

  /**
   * Wait until the domain's rate limit and backoff have cleared.
   *
   * The request slot is reserved before sleeping, so callers that invoke
   * wait() concurrently for the same host are spaced `minDelay` apart instead
   * of all waking up at the same moment.
   *
   * @param url - URL about to be requested; only its hostname is used.
   */
  async wait(url: string): Promise<void> {
    const s = this.state(new URL(url).hostname);
    const minDelay = s.minDelayMs ?? this.minDelayMs;
    const now = Date.now();
    const readyAt = Math.max(now, s.backoffUntil, s.lastAt + minDelay);
    // Reserve the slot synchronously: the next caller queues behind this one.
    s.lastAt = readyAt;
    if (readyAt > now) await sleep(readyAt - now);
  }

  /** Record a successful request — resets backoff for the domain. */
  success(url: string): void {
    const s = this.state(new URL(url).hostname);
    s.errors = 0;
    s.backoffUntil = 0;
  }

  /**
   * Record a rate-limit hit. Applies exponential backoff with jitter, capped
   * at `backoffCapMs`, or the Retry-After delay when that is longer.
   *
   * @param url - URL that was rate limited; only its hostname is used.
   * @param retryAfterHeader - Raw Retry-After header value, or null.
   * @returns The wait duration in ms that was scheduled for the domain.
   */
  rateLimit(url: string, retryAfterHeader: string | null): number {
    const s = this.state(new URL(url).hostname);
    s.errors++;
    const retryAfterMs = parseRetryAfter(retryAfterHeader);
    const jitter = Math.random() * this.backoffBaseMs;
    const backoffMs = Math.min(this.backoffCapMs, this.backoffBaseMs * 2 ** (s.errors - 1) + jitter);
    const waitMs = Math.max(retryAfterMs, backoffMs);
    s.backoffUntil = Date.now() + waitMs;
    return waitMs;
  }

  /**
   * Override the minimum delay for a specific domain.
   * Used to honour robots.txt Crawl-delay directives.
   *
   * @param host - Hostname the override applies to.
   * @param ms - Minimum gap between requests to that host, in milliseconds.
   */
  setDomainDelay(host: string, ms: number): void {
    this.state(host).minDelayMs = ms;
  }
}
110
+
/**
 * Build a DomainThrottle without calling the constructor at the use site.
 *
 * Exists because, under jiti/Bun CJS re-export interop, a class constructor
 * reached through a re-export chain can appear undefined where it is called.
 * Extension code should call this instead of `new DomainThrottle()`.
 *
 * @param opts - Optional throttle settings, passed to the constructor unchanged.
 * @returns A new DomainThrottle instance.
 */
export function createThrottle(opts?: ThrottleOptions): DomainThrottle {
  const throttle = new DomainThrottle(opts);
  return throttle;
}