@dpopsuev/web-spider 0.10.4 → 0.10.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. package/dist/batch.js.map +1 -0
  2. package/dist/cache.js.map +1 -0
  3. package/dist/convert.js.map +1 -0
  4. package/dist/crawl.js.map +1 -0
  5. package/dist/disk-cache.js.map +1 -0
  6. package/dist/graph.js.map +1 -0
  7. package/dist/index.js.map +1 -0
  8. package/dist/parse.js.map +1 -0
  9. package/dist/playwright.js.map +1 -0
  10. package/dist/ports.js.map +1 -0
  11. package/dist/robots.js.map +1 -0
  12. package/dist/search.js.map +1 -0
  13. package/dist/sitemap.js.map +1 -0
  14. package/dist/spider.js.map +1 -0
  15. package/dist/throttle.js.map +1 -0
  16. package/dist/tree.js.map +1 -0
  17. package/dist/types.js.map +1 -0
  18. package/dist/views.js.map +1 -0
  19. package/dist/web-search.js.map +1 -0
  20. package/package.json +2 -1
  21. package/fixtures/article-with-images.html +0 -94
  22. package/fixtures/gh-shell.html +0 -32
  23. package/fixtures/guide-ai-agents-web-scraping.json +0 -552
  24. package/fixtures/images/large.jpg +0 -0
  25. package/fixtures/images/small.jpg +0 -0
  26. package/fixtures/images/tiny.png +0 -0
  27. package/fixtures/quotes-index.json +0 -40
  28. package/scripts/fetch-guide.mjs +0 -25
  29. package/src/cache.ts +0 -99
  30. package/src/convert.ts +0 -161
  31. package/src/crawl.ts +0 -186
  32. package/src/disk-cache.ts +0 -228
  33. package/src/graph.ts +0 -189
  34. package/src/index.ts +0 -74
  35. package/src/parse.ts +0 -154
  36. package/src/playwright.ts +0 -193
  37. package/src/ports.ts +0 -131
  38. package/src/robots.ts +0 -121
  39. package/src/search.ts +0 -173
  40. package/src/sitemap.ts +0 -67
  41. package/src/spider.ts +0 -475
  42. package/src/throttle.ts +0 -118
  43. package/src/tree.ts +0 -379
  44. package/src/types.ts +0 -225
  45. package/src/views.ts +0 -42
  46. package/src/web-search.ts +0 -548
  47. package/test/convert-images.test.ts +0 -69
  48. package/test/disk-cache-images.test.ts +0 -193
  49. package/test/engine-registry.test.ts +0 -114
  50. package/test/exports.test.ts +0 -124
  51. package/test/get-chunk.test.ts +0 -115
  52. package/test/images-integration.test.ts +0 -359
  53. package/test/improvements.test.ts +0 -279
  54. package/test/inbound-count.test.ts +0 -111
  55. package/test/lean.test.ts +0 -105
  56. package/test/playwright.test.ts +0 -128
  57. package/test/ports.test.ts +0 -161
  58. package/test/search.test.ts +0 -219
  59. package/test/spider-images.test.ts +0 -180
  60. package/test/spider-unit.test.ts +0 -610
  61. package/test/tree.test.ts +0 -272
  62. package/test/types.test.ts +0 -169
  63. package/test/web-search-integration.test.ts +0 -180
  64. package/test/web-search.test.ts +0 -305
  65. package/tsconfig.json +0 -9
  66. package/tsconfig.test.json +0 -7
  67. package/vitest.config.ts +0 -8
@@ -1,40 +0,0 @@
1
- {
2
- "url": "https://quotes.toscrape.com/",
3
- "domain": "quotes.toscrape.com",
4
- "fetchedAt": "2026-05-14T00:00:00.000Z",
5
- "title": "Quotes to Scrape",
6
- "description": "",
7
- "author": "Albert Einstein",
8
- "publishedAt": "",
9
- "lang": "en",
10
- "wordCount": 210,
11
- "readingTimeMinutes": 2,
12
- "headings": [],
13
- "chunks": [
14
- {
15
- "id": "https://quotes.toscrape.com/#chunk-0",
16
- "index": 0,
17
- "heading": "",
18
- "text": "\u201cThe world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.\u201d by [Albert Einstein](https://quotes.toscrape.com/author/Albert-Einstein) [(about)](https://quotes.toscrape.com/author/Albert-Einstein)\n\n\u201cIt is our choices, Harry, that show what we truly are, far more than our abilities.\u201d by J.K. Rowling [(about)](https://quotes.toscrape.com/author/J-K-Rowling)\n\n\u201cThere are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.\u201d by Albert Einstein [(about)](https://quotes.toscrape.com/author/Albert-Einstein)\n\n\u201cThe person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.\u201d by Jane Austen [(about)](https://quotes.toscrape.com/author/Jane-Austen)",
19
- "wordCount": 114
20
- },
21
- {
22
- "id": "https://quotes.toscrape.com/#chunk-1",
23
- "index": 1,
24
- "heading": "",
25
- "text": "\u201cImperfection is beauty, madness is genius and it\u2019s better to be absolutely ridiculous than absolutely boring.\u201d by Marilyn Monroe [(about)](https://quotes.toscrape.com/author/Marilyn-Monroe)\n\n\u201cTry not to become a man of success. Rather become a man of value.\u201d by Albert Einstein [(about)](https://quotes.toscrape.com/author/Albert-Einstein)\n\n\u201cIt is better to be hated for what you are than to be loved for what you are not.\u201d by Andr\u00e9 Gide [(about)](https://quotes.toscrape.com/author/Andre-Gide)\n\n\u201cI have not failed. I\u2019ve just found 10,000 ways that won\u2019t work.\u201d by Thomas A. Edison [(about)](https://quotes.toscrape.com/author/Thomas-A-Edison)\n\n\u201cA woman is like a tea bag; you never know how strong it is until it\u2019s in hot water.\u201d by Eleanor Roosevelt [(about)](https://quotes.toscrape.com/author/Eleanor-Roosevelt)\n\n\u201cA day without sunshine is like, you know, night.\u201d by Steve Martin [(about)](https://quotes.toscrape.com/author/Steve-Martin)",
26
- "wordCount": 127
27
- }
28
- ],
29
- "links": [
30
- { "href": "https://quotes.toscrape.com/login", "text": "Login", "isExternal": false },
31
- { "href": "https://quotes.toscrape.com/author/Albert-Einstein", "text": "Albert Einstein", "isExternal": false },
32
- { "href": "https://quotes.toscrape.com/tag/change/", "text": "change", "isExternal": false },
33
- { "href": "https://quotes.toscrape.com/tag/deep-thoughts/", "text": "deep-thoughts", "isExternal": false },
34
- { "href": "https://quotes.toscrape.com/tag/thinking/", "text": "thinking", "isExternal": false },
35
- { "href": "https://quotes.toscrape.com/tag/world/", "text": "world", "isExternal": false },
36
- { "href": "https://quotes.toscrape.com/author/J-K-Rowling", "text": "J.K. Rowling", "isExternal": false },
37
- { "href": "https://quotes.toscrape.com/page/2/", "text": "Next", "isExternal": false }
38
- ],
39
- "markdown": "\u201cThe world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.\u201d by [Albert Einstein](https://quotes.toscrape.com/author/Albert-Einstein)\n\n\u201cIt is our choices, Harry, that show what we truly are, far more than our abilities.\u201d by J.K. Rowling\n\n\u201cThere are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.\u201d by Albert Einstein\n\n\u201cThe person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.\u201d by Jane Austen\n\n\u201cImperfection is beauty, madness is genius and it\u2019s better to be absolutely ridiculous than absolutely boring.\u201d by Marilyn Monroe\n\n\u201cTry not to become a man of success. Rather become a man of value.\u201d by Albert Einstein\n\n\u201cIt is better to be hated for what you are than to be loved for what you are not.\u201d by Andr\u00e9 Gide\n\n\u201cI have not failed. I\u2019ve just found 10,000 ways that won\u2019t work.\u201d by Thomas A. Edison\n\n\u201cA woman is like a tea bag; you never know how strong it is until it\u2019s in hot water.\u201d by Eleanor Roosevelt\n\n\u201cA day without sunshine is like, you know, night.\u201d by Steve Martin"
40
- }
@@ -1,25 +0,0 @@
1
- import { spider } from "../dist/spider.js"
2
- import { writeFileSync, mkdirSync } from "fs"
3
-
4
// Spiders one live article, prints a short summary, and writes the complete
// result to the JSON fixture below. Paths are relative to the current working
// directory.
const url = "https://easyparser.com/blog/ai-agents-web-scraping-guide"
const fixtureDir = "fixtures"
const fixturePath = `${fixtureDir}/guide-ai-agents-web-scraping.json`
console.log(`Spidering ${url} ...`)

try {
  const page = await spider(url)
  console.log("title :", page.title)
  console.log("domain :", page.domain)
  console.log("wordCount :", page.wordCount)
  console.log("chunks :", page.chunks.length)
  console.log("links :", page.links.length)
  console.log("headings :", page.headings.map((h) => `H${h.level} ${h.text}`).join(" | "))
  console.log("\n--- First 3 chunks ---")
  for (const c of page.chunks.slice(0, 3)) {
    console.log(`\n[${c.index}] heading="${c.heading}" words=${c.wordCount}`)
    console.log(c.text.slice(0, 300) + "...")
  }
  mkdirSync(fixtureDir, { recursive: true })
  writeFileSync(fixturePath, JSON.stringify(page, null, 2))
  console.log(`\nFixture written to ${fixturePath}`)
} catch (e) {
  // Any value can be thrown, so only read .message from real Error objects.
  console.error("Failed:", e instanceof Error ? e.message : String(e))
  // Report the failure through the exit status so callers (npm scripts, CI)
  // can detect it; exitCode (rather than exit()) lets pending output flush.
  process.exitCode = 1
}
package/src/cache.ts DELETED
@@ -1,99 +0,0 @@
1
- import type { ICache } from "./ports.js";
2
- import type { SpideredPage } from "./types.js";
3
-
4
/** A stored page plus the epoch-millisecond instant after which it is stale. */
interface CacheEntry {
  page: SpideredPage;
  expiresAt: number;
}

export interface SpiderCacheOptions {
  /** Maximum number of pages to hold (default 500) */
  maxSize?: number;
  /** Time-to-live in milliseconds (default 30 min) */
  ttlMs?: number;
}

/**
 * LRU cache for spidered pages with a per-entry time-to-live.
 *
 * Implements the Identity Map pattern from Local Materialized View:
 * exactly one entry per normalised URL, so a URL and its fragment or
 * trailing-slash variants share a single slot.
 *
 * Storage is a prototype-less plain object (Object.create(null)) rather than
 * a Map. Plain objects carry no realm-specific internal slots, so they are
 * safe across V8 context (realm) boundaries — e.g. when the cache is
 * constructed in an ESM module realm but called from a jiti VM-sandbox realm.
 *
 * Recency is tracked through property order: string keys are enumerated in
 * insertion order (ES2015+), so delete-then-reinsert moves a key to the
 * most-recently-used end and the first enumerated key is the eviction victim.
 * Keys that look like array indices (e.g. "42") are enumerated first
 * regardless of insertion order; normalised URLs never have that form.
 */
export class SpiderCache implements ICache<string, SpideredPage> {
  private readonly store: Record<string, CacheEntry | undefined> = Object.create(null);
  private readonly maxSize: number;
  private readonly ttlMs: number;

  constructor(opts: SpiderCacheOptions = {}) {
    this.maxSize = opts.maxSize ?? 500;
    this.ttlMs = opts.ttlMs ?? 30 * 60 * 1000;
  }

  /**
   * Normalise a URL into its cache key: the fragment is dropped and, when the
   * URL has no query string, one trailing slash is removed. Input that does
   * not parse as a URL is used verbatim.
   */
  private key(url: string): string {
    try {
      const u = new URL(url);
      u.hash = "";
      const serialised = u.toString();
      // With a query present, a trailing "/" belongs to the query value;
      // stripping it would make "?next=/" collide with "?next=".
      return u.search === "" ? serialised.replace(/\/$/, "") : serialised;
    } catch {
      return url;
    }
  }

  /** Return the live page for `url` and mark it most-recently-used; expired entries are removed. */
  get(url: string): SpideredPage | undefined {
    const k = this.key(url);
    const entry = this.store[k];
    if (!entry) return undefined;
    if (Date.now() > entry.expiresAt) {
      delete this.store[k];
      return undefined;
    }
    // Promote to tail (most-recently-used) by delete + reinsert.
    delete this.store[k];
    this.store[k] = entry;
    return entry.page;
  }

  /**
   * Store `page` under `url` with a fresh TTL and mark it most-recently-used.
   * Inserting a new key into a full cache first evicts the least-recently-used
   * entry.
   */
  set(url: string, page: SpideredPage): void {
    const k = this.key(url);
    if (k in this.store) {
      // Plain assignment would keep the key's old position in the enumeration
      // order, leaving a just-updated entry first in line for eviction.
      delete this.store[k];
    } else {
      const keys = Object.keys(this.store);
      if (keys.length >= this.maxSize) {
        const lruKey = keys[0];
        if (lruKey !== undefined) delete this.store[lruKey];
      }
    }
    this.store[k] = { page, expiresAt: Date.now() + this.ttlMs };
  }

  /** True when a live entry exists. Delegates to get(), so it also refreshes recency. */
  has(url: string): boolean {
    return this.get(url) !== undefined;
  }

  /** Remove the entry for `url`, if any. */
  delete(url: string): void {
    delete this.store[this.key(url)];
  }

  /** Remove every entry. */
  clear(): void {
    for (const k of Object.keys(this.store)) delete this.store[k];
  }

  /** Number of stored entries, including expired ones that have not been read since expiring. */
  get size(): number {
    return Object.keys(this.store).length;
  }

  /** All currently valid pages (does not update LRU order). */
  values(): SpideredPage[] {
    const now = Date.now();
    return Object.values(this.store)
      .filter((e): e is CacheEntry => e !== undefined && e.expiresAt > now)
      .map((e) => e.page);
  }
}
package/src/convert.ts DELETED
@@ -1,161 +0,0 @@
1
- /**
2
- * Markdown conversion and chunk splitting.
3
- *
4
- * Owns the Turndown dependency. spider.ts calls toMarkdown() and chunk();
5
- * it never imports Turndown directly.
6
- */
7
-
8
- import TurndownService from "turndown";
9
- import type { Chunk, ChunkType } from "./types.js";
10
-
11
- // ---------------------------------------------------------------------------
12
- // Turndown setup
13
- // ---------------------------------------------------------------------------
14
-
15
// Local shape used to assign `.escape` at runtime; per the original note,
// the property is not declared in @types/turndown.
interface PatchableTurndown { escape: (s: string) => string }

/**
 * Build a Turndown converter configured with ATX headings and fenced code
 * blocks, with character escaping switched off.
 */
function createTurndown(): TurndownService {
  const service = new TurndownService({ headingStyle: "atx", codeBlockStyle: "fenced" });
  // Turndown escapes markdown-special characters by default; the resulting
  // backslash noise is unnatural for agent consumption, so escape becomes
  // the identity function.
  (service as unknown as PatchableTurndown).escape = (s) => s;
  return service;
}

// Default converter: every <img> is replaced by an empty string — agents
// cannot see images and alt-text is noise.
const turndown = createTurndown();
turndown.addRule("strip-images", {
  filter: "img",
  replacement: () => "",
});

// Converter selected by toMarkdown() when keepImages is true. No extra rule:
// default Turndown behaviour already renders <img> as ![alt](src).
const turndownWithImages = createTurndown();
34
-
35
- // ---------------------------------------------------------------------------
36
- // Markdown conversion
37
- // ---------------------------------------------------------------------------
38
-
39
export interface ToMarkdownOptions {
  /**
   * Set to true to keep <img> tags, rendered as ![alt](src), rather than
   * stripping them — e.g. when captured image references should show up in
   * the markdown. Omitted or false strips images.
   */
  keepImages?: boolean;
}

/**
 * Convert an HTML string to markdown.
 *
 * @param html HTML to convert.
 * @param opts Conversion switches; see {@link ToMarkdownOptions}.
 * @returns The markdown produced by the selected Turndown converter.
 */
export function toMarkdown(html: string, opts?: ToMarkdownOptions): string {
  const converter = opts?.keepImages ? turndownWithImages : turndown;
  return converter.turndown(html);
}
53
-
54
- // ---------------------------------------------------------------------------
55
- // Content type detection
56
- // ---------------------------------------------------------------------------
57
-
58
const CHUNK_TARGET_WORDS = 150;

/**
 * Classify a markdown buffer by its first content line.
 *
 * Blank lines and ATX heading lines are skipped; the first remaining line
 * decides the result:
 * - starts with a code fence (three backticks) → "code"
 * - starts with "|" → "table"
 * - starts with a "-", "*", "+" or "N." list marker → "list"
 * - starts with ">" → "blockquote"
 * - anything else → "text"
 *
 * @param lines Markdown lines, in order.
 * @returns The detected type, or "text" when no content line exists.
 */
export function detectContentType(lines: string[]): ChunkType {
  for (const line of lines) {
    const t = line.trim();
    if (!t) continue;
    // chunk() keeps the section heading at the top of its buffer; without
    // this skip a table, list or code block directly under a heading would
    // always be reported as "text".
    if (/^#{1,6} /.test(t)) continue;
    if (t.startsWith("```")) return "code";
    if (t.startsWith("|")) return "table";
    if (/^[-*+] /.test(t) || /^\d+\. /.test(t)) return "list";
    if (t.startsWith(">")) return "blockquote";
    return "text";
  }
  return "text";
}
73
-
74
- // ---------------------------------------------------------------------------
75
- // Chunking
76
- // ---------------------------------------------------------------------------
77
-
78
/**
 * Split markdown into chunks of roughly CHUNK_TARGET_WORDS words.
 *
 * Lines accumulate in a buffer. The buffer is emitted as a chunk as soon as a
 * prose line brings it to the word target, or when an H1–H3 heading arrives
 * while it is already at the target. Each chunk records the text of the most
 * recent H1–H3 heading seen before it was emitted, and gets the id
 * `${baseUrl}#chunk-${index}`.
 *
 * Atomicity:
 * - Lines from an opening ``` fence to its closing fence never trigger an
 *   emit, so a fenced code block is not split.
 * - The first row of a table (line starting with |) emits whatever was
 *   buffered before it, and the first non-fence line after the table emits
 *   the table rows.
 *
 * A buffer holding fewer than 10 words is never emitted on its own: its lines
 * stay buffered and join the following chunk, or are dropped if the input
 * ends first.
 *
 * @param markdown Markdown source to split.
 * @param baseUrl  Prefix used to build chunk ids.
 * @returns Chunks in document order, indexed from 0.
 */
export function chunk(markdown: string, baseUrl: string): Chunk[] {
  const result: Chunk[] = [];
  const countWords = (s: string): number => s.split(/\s+/).filter(Boolean).length;

  let heading = "";
  let pending: string[] = [];
  let nextIndex = 0;
  let insideFence = false;
  let insideTable = false;

  const pendingWords = (): number => countWords(pending.join(" "));

  // Emit the buffered lines as one chunk; too-small buffers are left in place.
  const emit = (): void => {
    const text = pending.join("\n").trim();
    if (text === "") return;
    const wordCount = countWords(text);
    if (wordCount < 10) return;
    const contentType = detectContentType(pending);
    result.push({ id: `${baseUrl}#chunk-${nextIndex}`, index: nextIndex, heading, text, wordCount, contentType });
    nextIndex += 1;
    pending = [];
  };

  for (const rawLine of markdown.split("\n")) {
    const stripped = rawLine.trim();

    // Fence lines toggle code mode; fence and in-code lines are buffered as-is.
    const isFence = stripped.startsWith("```");
    if (isFence) insideFence = !insideFence;
    if (isFence || insideFence) {
      pending.push(rawLine);
      continue;
    }

    // Table rows: emit preceding content once, then collect the rows.
    if (stripped.startsWith("|")) {
      if (!insideTable) {
        emit();
        insideTable = true;
      }
      pending.push(rawLine);
      continue;
    }

    // First line after a table: emit the table before handling this line.
    if (insideTable) {
      emit();
      insideTable = false;
    }

    // Blank lines are kept for paragraph spacing and never trigger an emit.
    if (stripped === "") {
      pending.push(rawLine);
      continue;
    }

    const headingMatch = /^#{1,3} (.+)/.exec(stripped);
    if (headingMatch === null) {
      // Prose: add the line first, then emit if the target has been reached.
      pending.push(rawLine);
      if (pendingWords() >= CHUNK_TARGET_WORDS) emit();
      continue;
    }

    // Heading: emit a full buffer first so the heading opens the next chunk.
    if (pendingWords() >= CHUNK_TARGET_WORDS) emit();
    heading = headingMatch[1];
    pending.push(rawLine);
  }

  emit();
  return result;
}
package/src/crawl.ts DELETED
@@ -1,186 +0,0 @@
1
- import { SpiderCache } from "./cache.js";
2
- import { PageGraph } from "./graph.js";
3
- import type { ICache } from "./ports.js";
4
- import { RobotsCache } from "./robots.js";
5
- import { fetchSitemapUrls } from "./sitemap.js";
6
- import type { SpiderOptions } from "./spider.js";
7
- import { spider } from "./spider.js";
8
- import { DomainThrottle } from "./throttle.js";
9
- import type { SpideredPage } from "./types.js";
10
-
11
/** Options for crawl(). Everything not listed here is forwarded to spider() for each page. */
export interface CrawlOptions extends SpiderOptions {
  /** How many link hops from the start URL (default 2) */
  maxDepth?: number;
  /** Hard cap on total pages attempted, successes and failures combined (default 50) */
  maxPages?: number;
  /** Only follow links on the same hostname as the start URL (default true) */
  sameDomainOnly?: boolean;
  /** Max concurrent fetches (default 3) */
  concurrency?: number;
  /**
   * Minimum delay between requests to the same domain (ms), used as the
   * minDelayMs of the DomainThrottle that crawl() creates.
   * Ignored when a `throttle` is supplied — that instance is used unchanged.
   * Default 500.
   */
  delayMs?: number;
  /** Bring your own cache — URLs already present are served from it instead of being fetched */
  cache?: ICache<string, SpideredPage>;
  /** Bring your own graph — every spidered page is added to it */
  graph?: PageGraph;
  /** Called with each successfully spidered page */
  onPage?: (page: SpideredPage, depth: number) => void;
  /** Return false to skip a URL before fetching it */
  urlFilter?: (url: string) => boolean;
  /**
   * Whether crawl() should create a RobotsCache when none is provided via
   * SpiderOptions (default true). A `robotsCache` that is provided is passed
   * to spider() regardless of this flag.
   */
  respectRobots?: boolean;
  /**
   * Look up the start origin's sitemap before BFS and seed the frontier with
   * the URLs it lists (default true). Sitemap URLs go through the same
   * filters as discovered links and are fetched at depth 0.
   */
  useSitemap?: boolean;
}

/** Outcome of a crawl. */
export interface CrawlResult {
  /** Successfully spidered pages, keyed by the URL that was requested. */
  pages: Map<string, SpideredPage>;
  /** The graph that received every spidered page (the caller's own, if one was passed in). */
  graph: PageGraph;
  /** Failures, keyed by the URL that was requested. */
  errors: Map<string, Error>;
}
51
-
52
/**
 * Breadth-first crawler.
 *
 * Starts at `startUrl` (plus any sitemap URLs when `useSitemap` is on),
 * spiders one depth level at a time, collects the links of every fetched
 * page, filters them, and repeats for up to `maxDepth` hops. Respects
 * `maxPages`, `sameDomainOnly` and `urlFilter`, and populates the provided
 * (or freshly created) cache and graph as it goes.
 *
 * Each level is finished completely before the next one starts, with at most
 * `concurrency` fetches in flight (values below 1 or NaN are treated as 1).
 *
 * URLs that differ only by their #fragment are visited once; the first
 * spelling encountered is the one that is fetched and used as the result key.
 *
 * Per-URL failures — including an exception thrown by `onPage` or the graph
 * while handling a page — are collected in `errors` and do not stop the
 * crawl. A failure to read the sitemap is ignored and the crawl proceeds from
 * `startUrl` alone.
 *
 * @param startUrl Absolute URL to start from.
 * @param opts     Crawl limits and hooks; remaining fields are forwarded to spider().
 * @returns Spidered pages, the page graph, and per-URL errors.
 * @throws {TypeError} (as a rejection) when `startUrl` is not a valid URL.
 */
export async function crawl(startUrl: string, opts: CrawlOptions = {}): Promise<CrawlResult> {
  const {
    maxDepth = 2,
    maxPages = 50,
    sameDomainOnly = true,
    concurrency = 3,
    delayMs = 500,
    cache = new SpiderCache() as ICache<string, SpideredPage>,
    graph = new PageGraph(),
    onPage,
    urlFilter,
    respectRobots = true,
    useSitemap = true,
    ...spiderOpts
  } = opts;

  // Throttle and robots.txt are enforced inside spider() via these shared instances.
  const throttle = spiderOpts.throttle ?? new DomainThrottle({ minDelayMs: delayMs });
  const robotsCache = spiderOpts.robotsCache ?? (respectRobots ? new RobotsCache(spiderOpts.userAgent) : undefined);
  const httpClient = spiderOpts.httpClient;

  // With zero workers nothing would ever be fetched, so clamp to at least one.
  // The comparison is false for NaN, which therefore also falls back to 1.
  const workerLimit = concurrency >= 1 ? Math.floor(concurrency) : 1;

  const start = new URL(startUrl);
  const startDomain = start.hostname;
  const pages = new Map<string, SpideredPage>();
  const errors = new Map<string, Error>();
  const seen = new Set<string>();

  // Identity used for de-duplication: the serialised URL without its fragment.
  const visitKey = (u: URL): string => {
    const bare = new URL(u.href);
    bare.hash = "";
    return bare.href;
  };

  // Append `url` to `queue` when it is new, in scope and within budget.
  const enqueue = (url: string, queue: string[]): void => {
    if (pages.size + errors.size >= maxPages) return;
    let key: string;
    try {
      const parsed = new URL(url);
      if (parsed.protocol !== "http:" && parsed.protocol !== "https:") return;
      if (sameDomainOnly && parsed.hostname !== startDomain) return;
      key = visitKey(parsed);
    } catch {
      return; // not a parseable absolute URL
    }
    if (seen.has(key)) return;
    if (urlFilter && !urlFilter(url)) return;
    seen.add(key);
    queue.push(url);
  };

  // Fetch every URL of one level using a small pool of workers that pull
  // from a shared cursor; resolves once all URLs have settled.
  const fetchBatch = async (urls: string[], depth: number): Promise<void> => {
    let cursor = 0;
    const worker = async (): Promise<void> => {
      while (cursor < urls.length) {
        const url = urls[cursor++];
        try {
          const page = cache.get(url) ?? (await spider(url, { ...spiderOpts, throttle, robotsCache }));
          pages.set(url, page);
          cache.set(url, page);
          graph.addPage(page);
          onPage?.(page, depth);
        } catch (err: unknown) {
          errors.set(url, err instanceof Error ? err : new Error(String(err)));
        }
      }
    };
    const workerCount = Math.min(workerLimit, urls.length);
    await Promise.all(Array.from({ length: workerCount }, () => worker()));
  };

  let frontier: string[] = [startUrl];
  seen.add(visitKey(start));

  if (useSitemap) {
    // Use a minimal default httpClient if none was injected
    const client = httpClient ?? {
      async fetch(req: { url: string; headers?: Record<string, string> }) {
        return globalThis.fetch(req.url, { headers: req.headers });
      },
    };
    let sitemapUrls: Iterable<string> = [];
    try {
      sitemapUrls = await fetchSitemapUrls(start.origin, client);
    } catch {
      // The sitemap is only an optional seed. It is not recorded in `errors`
      // because that map counts against the maxPages budget.
    }
    for (const u of sitemapUrls) enqueue(u, frontier);
  }

  for (let depth = 0; depth <= maxDepth && frontier.length > 0; depth++) {
    const remaining = maxPages - pages.size - errors.size;
    if (remaining <= 0) break;

    const batch = frontier.slice(0, remaining);
    await fetchBatch(batch, depth);

    if (depth === maxDepth) break;

    const nextFrontier: string[] = [];
    for (const url of batch) {
      const page = pages.get(url);
      if (!page) continue; // failed fetch: nothing to expand
      for (const link of page.links) enqueue(link.href, nextFrontier);
    }
    frontier = nextFrontier;
  }

  return { pages, graph, errors };
}