npm - @dpopsuev/web-spider - Versions diffs - 0.10.4 - Mend

@dpopsuev/web-spider 0.10.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (105) hide show

package/dist/batch.d.ts +24 -0
package/dist/batch.d.ts.map +1 -0
package/dist/batch.js +68 -0
package/dist/cache.d.ts +40 -0
package/dist/cache.d.ts.map +1 -0
package/dist/cache.js +78 -0
package/dist/convert.d.ts +29 -0
package/dist/convert.d.ts.map +1 -0
package/dist/convert.js +131 -0
package/dist/crawl.d.ts +56 -0
package/dist/crawl.d.ts.map +1 -0
package/dist/crawl.js +126 -0
package/dist/disk-cache.d.ts +75 -0
package/dist/disk-cache.d.ts.map +1 -0
package/dist/disk-cache.js +185 -0
package/dist/graph.d.ts +76 -0
package/dist/graph.d.ts.map +1 -0
package/dist/graph.js +156 -0
package/dist/index.d.ts +45 -0
package/dist/index.d.ts.map +1 -0
package/dist/index.js +44 -0
package/dist/parse.d.ts +27 -0
package/dist/parse.d.ts.map +1 -0
package/dist/parse.js +131 -0
package/dist/playwright.d.ts +75 -0
package/dist/playwright.d.ts.map +1 -0
package/dist/playwright.js +141 -0
package/dist/ports.d.ts +104 -0
package/dist/ports.d.ts.map +1 -0
package/dist/ports.js +10 -0
package/dist/robots.d.ts +24 -0
package/dist/robots.d.ts.map +1 -0
package/dist/robots.js +104 -0
package/dist/search.d.ts +47 -0
package/dist/search.d.ts.map +1 -0
package/dist/search.js +112 -0
package/dist/sitemap.d.ts +15 -0
package/dist/sitemap.d.ts.map +1 -0
package/dist/sitemap.js +65 -0
package/dist/spider.d.ts +74 -0
package/dist/spider.d.ts.map +1 -0
package/dist/spider.js +349 -0
package/dist/throttle.d.ts +49 -0
package/dist/throttle.d.ts.map +1 -0
package/dist/throttle.js +85 -0
package/dist/tree.d.ts +34 -0
package/dist/tree.d.ts.map +1 -0
package/dist/tree.js +354 -0
package/dist/types.d.ts +189 -0
package/dist/types.d.ts.map +1 -0
package/dist/types.js +2 -0
package/dist/views.d.ts +17 -0
package/dist/views.d.ts.map +1 -0
package/dist/views.js +39 -0
package/dist/web-search.d.ts +184 -0
package/dist/web-search.d.ts.map +1 -0
package/dist/web-search.js +399 -0
package/fixtures/article-with-images.html +94 -0
package/fixtures/gh-shell.html +32 -0
package/fixtures/guide-ai-agents-web-scraping.json +552 -0
package/fixtures/images/large.jpg +0 -0
package/fixtures/images/small.jpg +0 -0
package/fixtures/images/tiny.png +0 -0
package/fixtures/quotes-index.json +40 -0
package/package.json +47 -0
package/scripts/fetch-guide.mjs +25 -0
package/src/cache.ts +99 -0
package/src/convert.ts +161 -0
package/src/crawl.ts +186 -0
package/src/disk-cache.ts +228 -0
package/src/graph.ts +189 -0
package/src/index.ts +74 -0
package/src/parse.ts +154 -0
package/src/playwright.ts +193 -0
package/src/ports.ts +131 -0
package/src/robots.ts +121 -0
package/src/search.ts +173 -0
package/src/sitemap.ts +67 -0
package/src/spider.ts +475 -0
package/src/throttle.ts +118 -0
package/src/tree.ts +379 -0
package/src/types.ts +225 -0
package/src/views.ts +42 -0
package/src/web-search.ts +548 -0
package/test/convert-images.test.ts +69 -0
package/test/disk-cache-images.test.ts +193 -0
package/test/engine-registry.test.ts +114 -0
package/test/exports.test.ts +124 -0
package/test/get-chunk.test.ts +115 -0
package/test/images-integration.test.ts +359 -0
package/test/improvements.test.ts +279 -0
package/test/inbound-count.test.ts +111 -0
package/test/lean.test.ts +105 -0
package/test/playwright.test.ts +128 -0
package/test/ports.test.ts +161 -0
package/test/search.test.ts +219 -0
package/test/spider-images.test.ts +180 -0
package/test/spider-unit.test.ts +610 -0
package/test/tree.test.ts +272 -0
package/test/types.test.ts +169 -0
package/test/web-search-integration.test.ts +180 -0
package/test/web-search.test.ts +305 -0
package/tsconfig.json +9 -0
package/tsconfig.test.json +7 -0
package/vitest.config.ts +8 -0

package/dist/batch.d.ts ADDED Viewed

@@ -0,0 +1,24 @@
+import type { SpiderCache } from "./cache.js";
+import type { SpiderOptions } from "./spider.js";
+import type { SpideredPage } from "./types.js";
+/**
+ * Options accepted by batchSpider(): SpiderOptions extended with batch-level
+ * concurrency, pacing, caching and progress reporting.
+ */
+export interface BatchOptions extends SpiderOptions {
+    /** Max concurrent fetches (default 3 — be polite) */
+    concurrency?: number;
+    /** Delay in ms used to space out fetch starts (default 300); values <= 0 disable it */
+    delayMs?: number;
+    /** Optional cache — URLs already present in it are returned without being fetched */
+    cache?: SpiderCache;
+    /**
+     * Called after each URL completes (success or failure).
+     * `done` is the number of URLs finished so far, `total` the number of
+     * unique URLs in the batch; `error` is only passed when that URL failed.
+     */
+    onProgress?: (done: number, total: number, url: string, error?: Error) => void;
+}
+/**
+ * Spider multiple URLs concurrently with a bounded semaphore.
+ *
+ * Returns a Map keyed by URL. Value is either a SpideredPage (success)
+ * or an Error (failure). Errors do not poison the batch.
+ *
+ * Duplicate URLs in the input are collapsed, so each URL is processed once.
+ *
+ * Cache integration: if `opts.cache` is provided, cached pages are
+ * returned immediately and do not count toward concurrency.
+ *
+ * @param urls URLs to spider.
+ * @param opts Batch and per-page options; defaults to an empty object.
+ * @returns A promise for the URL → page-or-error map.
+ */
+export declare function batchSpider(urls: string[], opts?: BatchOptions): Promise<Map<string, SpideredPage | Error>>;
+//# sourceMappingURL=batch.d.ts.map

package/dist/batch.d.ts.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"batch.d.ts","sourceRoot":"","sources":["../src/batch.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,YAAY,CAAC;AAC9C,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,aAAa,CAAC;AAEjD,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC;AAE/C,MAAM,WAAW,YAAa,SAAQ,aAAa;IAClD,qDAAqD;IACrD,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,+DAA+D;IAC/D,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,uDAAuD;IACvD,KAAK,CAAC,EAAE,WAAW,CAAC;IACpB,2DAA2D;IAC3D,UAAU,CAAC,EAAE,CAAC,IAAI,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,EAAE,KAAK,CAAC,EAAE,KAAK,KAAK,IAAI,CAAC;CAC/E;AAED;;;;;;;;GAQG;AACH,wBAAsB,WAAW,CAAC,IAAI,EAAE,MAAM,EAAE,EAAE,IAAI,GAAE,YAAiB,GAAG,OAAO,CAAC,GAAG,CAAC,MAAM,EAAE,YAAY,GAAG,KAAK,CAAC,CAAC,CAmErH"}

package/dist/batch.js ADDED Viewed

@@ -0,0 +1,68 @@
+import { spider } from "./spider.js";
+/**
+ * Spider multiple URLs concurrently with a bounded semaphore.
+ *
+ * Returns a Map keyed by URL. Value is either a SpideredPage (success)
+ * or an Error (failure). Errors do not poison the batch.
+ *
+ * Cache integration: if `opts.cache` is provided, cached pages are
+ * returned immediately and do not count toward concurrency.
+ */
+export async function batchSpider(urls, opts = {}) {
+    // Strip crawl-only options that batchSpider doesn't use so they don't
+    // confuse callers and don't get forwarded to spider() where they'd be
+    // applied per-call rather than shared (use crawl() for that).
+    const { concurrency = 3, delayMs = 300, cache, onProgress, throttle: _throttle, robotsCache: _robotsCache, // consumed here, not forwarded
+    ...spiderOpts } = opts;
+    const results = new Map();
+    const unique = [...new Set(urls)];
+    let done = 0;
+    // Satisfy cache hits synchronously before touching the network
+    const toFetch = [];
+    for (const url of unique) {
+        const cached = cache?.get(url);
+        if (cached) {
+            results.set(url, cached);
+            done++;
+            onProgress?.(done, unique.length, url);
+        }
+        else {
+            toFetch.push(url);
+        }
+    }
+    if (toFetch.length === 0)
+        return results;
+    // Semaphore: at most `concurrency` in-flight at once
+    let inFlight = 0;
+    let index = 0;
+    await new Promise((resolve) => {
+        const tryNext = () => {
+            while (inFlight < concurrency && index < toFetch.length) {
+                const url = toFetch[index++];
+                inFlight++;
+                const delay = delayMs > 0 ? new Promise((r) => setTimeout(r, delayMs * (index - 1))) : Promise.resolve();
+                delay
+                    .then(() => spider(url, spiderOpts))
+                    .then((page) => {
+                    results.set(url, page);
+                    cache?.set(url, page);
+                })
+                    .catch((err) => {
+                    results.set(url, err instanceof Error ? err : new Error(String(err)));
+                })
+                    .finally(() => {
+                    done++;
+                    onProgress?.(done, unique.length, url, results.get(url) instanceof Error ? results.get(url) : undefined);
+                    inFlight--;
+                    if (done === unique.length)
+                        resolve();
+                    else
+                        tryNext();
+                });
+            }
+        };
+        tryNext();
+    });
+    return results;
+}
+//# sourceMappingURL=batch.js.map

package/dist/cache.d.ts ADDED Viewed

@@ -0,0 +1,40 @@
+import type { ICache } from "./ports.js";
+import type { SpideredPage } from "./types.js";
+/** Construction options for SpiderCache; every field is optional. */
+export interface SpiderCacheOptions {
+    /** Maximum number of pages to hold (default 500) */
+    maxSize?: number;
+    /** Time-to-live of each entry in milliseconds (default 30 min) */
+    ttlMs?: number;
+}
+/**
+ * LRU cache for spidered pages.
+ *
+ * Implements the Identity Map pattern from Local Materialized View:
+ * exactly one entry per normalised URL — duplicate fetches never happen.
+ *
+ * Uses a plain object (Object.create(null)) for storage rather than a Map.
+ * Plain objects carry no realm-specific internal slots, so they are safe
+ * across V8 context (realm) boundaries — e.g. when the cache is constructed
+ * in an ESM module realm but called from a jiti VM-sandbox realm.
+ *
+ * JavaScript objects maintain insertion order for string keys (ES2015+),
+ * so delete-then-reinsert gives the same LRU-tail promotion semantics as a
+ * Map without any cross-realm risk.
+ */
+export declare class SpiderCache implements ICache<string, SpideredPage> {
+    private readonly store;
+    private readonly maxSize;
+    private readonly ttlMs;
+    constructor(opts?: SpiderCacheOptions);
+    /**
+     * Normalise a URL into a storage key: the hash fragment is dropped and a
+     * trailing slash is removed. The scheme is NOT unified, so http:// and
+     * https:// variants of a URL are distinct entries.
+     */
+    private key;
+    /** Return the cached page, or undefined when it is absent or expired. */
+    get(url: string): SpideredPage | undefined;
+    /** Store a page under the normalised URL with a fresh expiry time. */
+    set(url: string, page: SpideredPage): void;
+    /** True when get(url) would return a page; like get(), it updates LRU order. */
+    has(url: string): boolean;
+    /** Remove the entry for the URL, if any. */
+    delete(url: string): void;
+    /** Remove every entry. */
+    clear(): void;
+    /** Number of stored entries, including expired ones not yet purged. */
+    get size(): number;
+    /** All currently valid pages (does not update LRU order). */
+    values(): SpideredPage[];
+}
+//# sourceMappingURL=cache.d.ts.map

package/dist/cache.d.ts.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"cache.d.ts","sourceRoot":"","sources":["../src/cache.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,YAAY,CAAC;AACzC,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC;AAO/C,MAAM,WAAW,kBAAkB;IAClC,oDAAoD;IACpD,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,oDAAoD;IACpD,KAAK,CAAC,EAAE,MAAM,CAAC;CACf;AAED;;;;;;;;;;;;;;GAcG;AACH,qBAAa,WAAY,YAAW,MAAM,CAAC,MAAM,EAAE,YAAY,CAAC;IAC/D,OAAO,CAAC,QAAQ,CAAC,KAAK,CAA+D;IACrF,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAS;IACjC,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAS;gBAEnB,IAAI,GAAE,kBAAuB;IAKzC,6EAA6E;IAC7E,OAAO,CAAC,GAAG;IAUX,GAAG,CAAC,GAAG,EAAE,MAAM,GAAG,YAAY,GAAG,SAAS;IAe1C,GAAG,CAAC,GAAG,EAAE,MAAM,EAAE,IAAI,EAAE,YAAY,GAAG,IAAI;IAS1C,GAAG,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO;IAIzB,MAAM,CAAC,GAAG,EAAE,MAAM,GAAG,IAAI;IAIzB,KAAK,IAAI,IAAI;IAIb,IAAI,IAAI,IAAI,MAAM,CAEjB;IAED,6DAA6D;IAC7D,MAAM,IAAI,YAAY,EAAE;CAMxB"}

package/dist/cache.js ADDED Viewed

@@ -0,0 +1,78 @@
+/**
+ * LRU cache for spidered pages.
+ *
+ * Implements the Identity Map pattern from Local Materialized View:
+ * exactly one entry per normalised URL — duplicate fetches never happen.
+ *
+ * Uses a plain object (Object.create(null)) for storage rather than a Map.
+ * Plain objects carry no realm-specific internal slots, so they are safe
+ * across V8 context (realm) boundaries — e.g. when the cache is constructed
+ * in an ESM module realm but called from a jiti VM-sandbox realm.
+ *
+ * JavaScript objects maintain insertion order for string keys (ES2015+),
+ * so delete-then-reinsert gives the same LRU-tail promotion semantics as a
+ * Map without any cross-realm risk.
+ */
+export class SpiderCache {
+    constructor(opts = {}) {
+        this.store = Object.create(null);
+        this.maxSize = opts.maxSize ?? 500;
+        this.ttlMs = opts.ttlMs ?? 30 * 60 * 1000;
+    }
+    /** Normalise a URL so http/https and trailing slashes don't cause misses. */
+    key(url) {
+        try {
+            const u = new URL(url);
+            u.hash = "";
+            return u.toString().replace(/\/$/, "");
+        }
+        catch {
+            return url;
+        }
+    }
+    get(url) {
+        const k = this.key(url);
+        const entry = this.store[k];
+        if (!entry)
+            return undefined;
+        if (Date.now() > entry.expiresAt) {
+            delete this.store[k];
+            return undefined;
+        }
+        // Promote to tail (most-recently-used) by delete + reinsert.
+        // Object insertion order is preserved for string keys in ES2015+.
+        delete this.store[k];
+        this.store[k] = entry;
+        return entry.page;
+    }
+    set(url, page) {
+        const k = this.key(url);
+        if (Object.keys(this.store).length >= this.maxSize && !(k in this.store)) {
+            const lruKey = Object.keys(this.store)[0];
+            if (lruKey !== undefined)
+                delete this.store[lruKey];
+        }
+        this.store[k] = { page, expiresAt: Date.now() + this.ttlMs };
+    }
+    has(url) {
+        return this.get(url) !== undefined;
+    }
+    delete(url) {
+        delete this.store[this.key(url)];
+    }
+    clear() {
+        for (const k of Object.keys(this.store))
+            delete this.store[k];
+    }
+    get size() {
+        return Object.keys(this.store).length;
+    }
+    /** All currently valid pages (does not update LRU order). */
+    values() {
+        const now = Date.now();
+        return Object.values(this.store)
+            .filter((e) => e !== undefined && e.expiresAt > now)
+            .map((e) => e.page);
+    }
+}
+//# sourceMappingURL=cache.js.map

package/dist/convert.d.ts ADDED Viewed

@@ -0,0 +1,29 @@
+/**
+ * Markdown conversion and chunk splitting.
+ *
+ * Owns the Turndown dependency. spider.ts calls toMarkdown() and chunk();
+ * it never imports Turndown directly.
+ */
+import type { Chunk, ChunkType } from "./types.js";
+/** Options for toMarkdown(). */
+export interface ToMarkdownOptions {
+    /**
+     * When true, <img> tags are rendered as ![alt](src) instead of being stripped.
+     * Use when captureImages is enabled so image references appear in the markdown.
+     * Default: false.
+     */
+    keepImages?: boolean;
+}
+/**
+ * Convert Readability article HTML to clean markdown.
+ *
+ * @param html Article HTML to convert.
+ * @param opts Optional conversion switches; see ToMarkdownOptions.
+ * @returns The markdown text.
+ */
+export declare function toMarkdown(html: string, opts?: ToMarkdownOptions): string;
+/**
+ * Classify a markdown buffer by its first non-blank line (code fence, table
+ * row, list item, blockquote, otherwise text). Later lines are not inspected.
+ *
+ * @param lines Lines of the markdown buffer.
+ * @returns The detected ChunkType; "text" when every line is blank.
+ */
+export declare function detectContentType(lines: string[]): ChunkType;
+/**
+ * Split markdown into RAG-ready chunks at heading boundaries.
+ *
+ * Atomicity guarantees:
+ *   - Fenced code blocks (``` ... ```) are never split.
+ *   - Markdown tables (lines starting with |) are always flushed as a single
+ *     chunk. Prose before the table is flushed first so the table is isolated.
+ *
+ * @param markdown Markdown text to split.
+ * @param baseUrl  Prefix used to build each chunk id (`<baseUrl>#chunk-<index>`).
+ * @returns The chunks in document order.
+ */
+export declare function chunk(markdown: string, baseUrl: string): Chunk[];
+//# sourceMappingURL=convert.d.ts.map

package/dist/convert.d.ts.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"convert.d.ts","sourceRoot":"","sources":["../src/convert.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAGH,OAAO,KAAK,EAAE,KAAK,EAAE,SAAS,EAAE,MAAM,YAAY,CAAC;AA8BnD,MAAM,WAAW,iBAAiB;IACjC;;;;OAIG;IACH,UAAU,CAAC,EAAE,OAAO,CAAC;CACrB;AAED,0DAA0D;AAC1D,wBAAgB,UAAU,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,iBAAiB,GAAG,MAAM,CAGzE;AAQD,+DAA+D;AAC/D,wBAAgB,iBAAiB,CAAC,KAAK,EAAE,MAAM,EAAE,GAAG,SAAS,CAW5D;AAMD;;;;;;;GAOG;AACH,wBAAgB,KAAK,CAAC,QAAQ,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,GAAG,KAAK,EAAE,CA2EhE"}

package/dist/convert.js ADDED Viewed

@@ -0,0 +1,131 @@
+/**
+ * Markdown conversion and chunk splitting.
+ *
+ * Owns the Turndown dependency. spider.ts calls toMarkdown() and chunk();
+ * it never imports Turndown directly.
+ */
+import TurndownService from "turndown";
+const turndown = new TurndownService({ headingStyle: "atx", codeBlockStyle: "fenced" });
+// Disable escape — Turndown escapes markdown-special chars by default,
+// producing backslash noise that is unnatural for agent consumption.
+turndown.escape = (s) => s;
+// Strip images by default — agents cannot see them and alt-text is noise.
+// Disabled when keepImages: true is passed to toMarkdown().
+turndown.addRule("strip-images", {
+    filter: "img",
+    replacement: () => "",
+});
+const turndownWithImages = new TurndownService({ headingStyle: "atx", codeBlockStyle: "fenced" });
+turndownWithImages.escape = (s) => s;
+/** Convert Readability article HTML to clean markdown. */
+export function toMarkdown(html, opts) {
+    if (opts?.keepImages)
+        return turndownWithImages.turndown(html);
+    return turndown.turndown(html);
+}
+// ---------------------------------------------------------------------------
+// Content type detection
+// ---------------------------------------------------------------------------
+const CHUNK_TARGET_WORDS = 150;
+/** Detect the dominant content type from a markdown buffer. */
+export function detectContentType(lines) {
+    for (const line of lines) {
+        const t = line.trim();
+        if (!t)
+            continue;
+        if (t.startsWith("```"))
+            return "code";
+        if (t.startsWith("|"))
+            return "table";
+        if (/^[-*+] /.test(t) || /^\d+\. /.test(t))
+            return "list";
+        if (t.startsWith(">"))
+            return "blockquote";
+        return "text";
+    }
+    return "text";
+}
+// ---------------------------------------------------------------------------
+// Chunking
+// ---------------------------------------------------------------------------
+/**
+ * Split markdown into RAG-ready chunks at heading boundaries.
+ *
+ * Atomicity guarantees:
+ *   - Fenced code blocks (``` ... ```) are never split.
+ *   - Markdown tables (lines starting with |) are always flushed as a single
+ *     chunk. Prose before the table is flushed first so the table is isolated.
+ */
+export function chunk(markdown, baseUrl) {
+    const chunks = [];
+    const lines = markdown.split("\n");
+    let heading = "";
+    let buffer = [];
+    let index = 0;
+    let inCode = false;
+    let inTable = false;
+    const flush = () => {
+        const text = buffer.join("\n").trim();
+        if (!text)
+            return;
+        const wordCount = text.split(/\s+/).filter(Boolean).length;
+        if (wordCount < 10)
+            return;
+        const contentType = detectContentType(buffer);
+        chunks.push({ id: `${baseUrl}#chunk-${index}`, index, heading, text, wordCount, contentType });
+        index++;
+        buffer = [];
+    };
+    for (const line of lines) {
+        const trimmed = line.trim();
+        // ── Fenced code block toggle ──────────────────────────────────────────
+        if (trimmed.startsWith("```")) {
+            inCode = !inCode;
+            buffer.push(line);
+            continue;
+        }
+        if (inCode) {
+            buffer.push(line);
+            continue;
+        }
+        // ── Table rows ────────────────────────────────────────────────────────
+        const isTableRow = trimmed.startsWith("|");
+        if (isTableRow) {
+            if (!inTable) {
+                // Table is starting — flush any preceding prose so the table
+                // gets its own isolated chunk.
+                flush();
+                inTable = true;
+            }
+            buffer.push(line);
+            continue;
+        }
+        if (inTable) {
+            // Table just ended — flush it before processing the next line.
+            flush();
+            inTable = false;
+        }
+        // ── Normal prose / headings ───────────────────────────────────────────
+        if (!trimmed) {
+            buffer.push(line);
+            continue;
+        }
+        const headingMatch = /^#{1,3} (.+)/.exec(trimmed);
+        if (headingMatch) {
+            const currentWords = buffer.join(" ").split(/\s+/).filter(Boolean).length;
+            if (currentWords >= CHUNK_TARGET_WORDS)
+                flush();
+            heading = headingMatch[1];
+            buffer.push(line);
+        }
+        else {
+            buffer.push(line);
+            const currentWords = buffer.join(" ").split(/\s+/).filter(Boolean).length;
+            if (currentWords >= CHUNK_TARGET_WORDS)
+                flush();
+        }
+    }
+    flush();
+    return chunks;
+}
+//# sourceMappingURL=convert.js.map

package/dist/crawl.d.ts ADDED Viewed

@@ -0,0 +1,56 @@
+import { PageGraph } from "./graph.js";
+import type { ICache } from "./ports.js";
+import type { SpiderOptions } from "./spider.js";
+import type { SpideredPage } from "./types.js";
+/** Options for crawl(): SpiderOptions extended with traversal limits, pacing and hooks. */
+export interface CrawlOptions extends SpiderOptions {
+    /** How many link hops from the start URL (default 2) */
+    maxDepth?: number;
+    /** Hard cap on total pages spidered (default 50) */
+    maxPages?: number;
+    /** Only follow links on the same domain as the start URL (default true) */
+    sameDomainOnly?: boolean;
+    /** Max concurrent fetches (default 3) */
+    concurrency?: number;
+    /**
+     * Minimum delay between requests to the same domain (ms).
+     * Used as minDelayMs for the throttle that crawl() creates when no
+     * throttle is supplied via SpiderOptions; a supplied throttle is used as-is.
+     * Default 500.
+     */
+    delayMs?: number;
+    /** Bring your own cache — already-spidered URLs are skipped */
+    cache?: ICache<string, SpideredPage>;
+    /** Bring your own graph — nodes/edges added as pages are spidered */
+    graph?: PageGraph;
+    /** Called with each successfully spidered page and the depth it was found at */
+    onPage?: (page: SpideredPage, depth: number) => void;
+    /** Return false to skip a URL before fetching it */
+    urlFilter?: (url: string) => boolean;
+    /**
+     * Whether to check and respect robots.txt for each domain (default true).
+     * Automatically creates a RobotsCache if not provided via SpiderOptions.
+     */
+    respectRobots?: boolean;
+    /**
+     * Attempt to fetch /sitemap.xml before BFS to seed the frontier with
+     * all known URLs. Falls back to normal BFS on any error (default true).
+     */
+    useSitemap?: boolean;
+}
+/** Outcome of a crawl() run. */
+export interface CrawlResult {
+    /** Successfully spidered pages, keyed by the URL that was requested. */
+    pages: Map<string, SpideredPage>;
+    /** The graph that was populated during the crawl (the supplied one, or a fresh one). */
+    graph: PageGraph;
+    /** Failures, keyed by the URL that was requested. */
+    errors: Map<string, Error>;
+}
+/**
+ * Recursive BFS crawler.
+ *
+ * Starts at `startUrl`, spiders it, extracts links, filters them, then
+ * recurses up to `maxDepth` hops. Respects `maxPages`, `sameDomainOnly`,
+ * and `urlFilter`. Populates the provided (or freshly created) cache and
+ * graph as it goes.
+ *
+ * Concurrency is bounded per depth level — we fully finish each level
+ * before proceeding, giving BFS ordering and predictable memory use.
+ *
+ * @param startUrl Absolute URL the crawl starts from.
+ * @param opts     Traversal limits, hooks and per-page spider options.
+ * @returns The spidered pages, the page graph and any per-URL errors.
+ */
+export declare function crawl(startUrl: string, opts?: CrawlOptions): Promise<CrawlResult>;
+//# sourceMappingURL=crawl.d.ts.map

package/dist/crawl.d.ts.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"crawl.d.ts","sourceRoot":"","sources":["../src/crawl.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,SAAS,EAAE,MAAM,YAAY,CAAC;AACvC,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,YAAY,CAAC;AAGzC,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,aAAa,CAAC;AAGjD,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC;AAE/C,MAAM,WAAW,YAAa,SAAQ,aAAa;IAClD,wDAAwD;IACxD,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,oDAAoD;IACpD,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,2EAA2E;IAC3E,cAAc,CAAC,EAAE,OAAO,CAAC;IACzB,yCAAyC;IACzC,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB;;;;OAIG;IACH,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,+DAA+D;IAC/D,KAAK,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,YAAY,CAAC,CAAC;IACrC,qEAAqE;IACrE,KAAK,CAAC,EAAE,SAAS,CAAC;IAClB,kDAAkD;IAClD,MAAM,CAAC,EAAE,CAAC,IAAI,EAAE,YAAY,EAAE,KAAK,EAAE,MAAM,KAAK,IAAI,CAAC;IACrD,oDAAoD;IACpD,SAAS,CAAC,EAAE,CAAC,GAAG,EAAE,MAAM,KAAK,OAAO,CAAC;IACrC;;;OAGG;IACH,aAAa,CAAC,EAAE,OAAO,CAAC;IACxB;;;OAGG;IACH,UAAU,CAAC,EAAE,OAAO,CAAC;CACrB;AAED,MAAM,WAAW,WAAW;IAC3B,KAAK,EAAE,GAAG,CAAC,MAAM,EAAE,YAAY,CAAC,CAAC;IACjC,KAAK,EAAE,SAAS,CAAC;IACjB,MAAM,EAAE,GAAG,CAAC,MAAM,EAAE,KAAK,CAAC,CAAC;CAC3B;AAED;;;;;;;;;;GAUG;AACH,wBAAsB,KAAK,CAAC,QAAQ,EAAE,MAAM,EAAE,IAAI,GAAE,YAAiB,GAAG,OAAO,CAAC,WAAW,CAAC,CA2H3F"}

package/dist/crawl.js ADDED Viewed

@@ -0,0 +1,126 @@
+import { SpiderCache } from "./cache.js";
+import { PageGraph } from "./graph.js";
+import { RobotsCache } from "./robots.js";
+import { fetchSitemapUrls } from "./sitemap.js";
+import { spider } from "./spider.js";
+import { DomainThrottle } from "./throttle.js";
+/**
+ * Recursive BFS crawler.
+ *
+ * Starts at `startUrl`, spiders it, extracts links, filters them, then
+ * recurses up to `maxDepth` hops. Respects `maxPages`, `sameDomainOnly`,
+ * and `urlFilter`. Populates the provided (or freshly created) cache and
+ * graph as it goes.
+ *
+ * Concurrency is bounded per depth level — we fully finish each level
+ * before proceeding, giving BFS ordering and predictable memory use.
+ */
+export async function crawl(startUrl, opts = {}) {
+    const { maxDepth = 2, maxPages = 50, sameDomainOnly = true, concurrency = 3, delayMs = 500, cache = new SpiderCache(), graph = new PageGraph(), onPage, urlFilter, respectRobots = true, useSitemap = true, ...spiderOpts } = opts;
+    const throttle = spiderOpts.throttle ?? new DomainThrottle({ minDelayMs: delayMs });
+    const robotsCache = spiderOpts.robotsCache ?? (respectRobots ? new RobotsCache(spiderOpts.userAgent) : undefined);
+    const httpClient = spiderOpts.httpClient;
+    const startDomain = new URL(startUrl).hostname;
+    const pages = new Map();
+    const errors = new Map();
+    const seen = new Set();
+    const shouldVisit = (url) => {
+        if (seen.has(url))
+            return false;
+        if (pages.size + errors.size >= maxPages)
+            return false;
+        try {
+            const u = new URL(url);
+            if (!["http:", "https:"].includes(u.protocol))
+                return false;
+            if (sameDomainOnly && u.hostname !== startDomain)
+                return false;
+        }
+        catch {
+            return false;
+        }
+        if (urlFilter && !urlFilter(url))
+            return false;
+        return true;
+    };
+    // Throttle and robots.txt are handled inside spider() via shared instances.
+    const fetchBatch = async (urls, depth) => {
+        let index = 0;
+        let inFlight = 0;
+        let completed = 0;
+        await new Promise((resolve) => {
+            const tryNext = () => {
+                while (inFlight < concurrency && index < urls.length) {
+                    const url = urls[index++];
+                    inFlight++;
+                    const fetch_ = cache.has(url)
+                        ? Promise.resolve(cache.get(url))
+                        : spider(url, { ...spiderOpts, throttle, robotsCache });
+                    fetch_
+                        .then((page) => {
+                        pages.set(url, page);
+                        cache.set(url, page);
+                        graph.addPage(page);
+                        onPage?.(page, depth);
+                    })
+                        .catch((err) => {
+                        errors.set(url, err instanceof Error ? err : new Error(String(err)));
+                    })
+                        .finally(() => {
+                        completed++;
+                        inFlight--;
+                        if (completed === urls.length)
+                            resolve();
+                        else
+                            tryNext();
+                    });
+                }
+            };
+            tryNext();
+        });
+    };
+    let frontier = [startUrl];
+    seen.add(startUrl);
+    if (useSitemap) {
+        const origin = new URL(startUrl).origin;
+        // Use a minimal default httpClient if none was injected
+        const client = httpClient ?? {
+            async fetch(req) {
+                return globalThis.fetch(req.url, { headers: req.headers });
+            },
+        };
+        const sitemapUrls = await fetchSitemapUrls(origin, client);
+        for (const u of sitemapUrls) {
+            if (shouldVisit(u)) {
+                seen.add(u);
+                frontier.push(u);
+            }
+        }
+    }
+    for (let depth = 0; depth <= maxDepth; depth++) {
+        if (frontier.length === 0)
+            break;
+        if (pages.size + errors.size >= maxPages)
+            break;
+        const remaining = maxPages - pages.size - errors.size;
+        const batch = frontier.slice(0, remaining);
+        await fetchBatch(batch, depth);
+        if (depth === maxDepth)
+            break;
+        const nextFrontier = [];
+        for (const url of batch) {
+            const page = pages.get(url);
+            if (!page)
+                continue;
+            for (const link of page.links) {
+                if (shouldVisit(link.href)) {
+                    seen.add(link.href);
+                    nextFrontier.push(link.href);
+                }
+            }
+        }
+        frontier = nextFrontier;
+    }
+    return { pages, graph, errors };
+}
+//# sourceMappingURL=crawl.js.map

package/dist/disk-cache.d.ts ADDED Viewed

@@ -0,0 +1,75 @@
+/**
+ * Disk-backed cache implementing ICache<string, SpideredPage>.
+ *
+ * Persists to a JSON file so the cache survives extension reloads and
+ * pi restarts. Call flush() to write — set() auto-flushes by default.
+ *
+ * The images directory is derived automatically from `dirname(path)/images`.
+ * Callers do not need to create it — DiskCache creates it on first large-image
+ * flush. Pre-creating it at startup (e.g. in the extension boot path) is
+ * harmless and avoids a first-write delay.
+ *
+ * Internal storage uses a plain object (Object.create(null)) rather than a
+ * Map. Plain objects carry no realm-specific internal slots, making them safe
+ * across V8 context (realm) boundaries — e.g. when DiskCache is constructed
+ * in an ESM module realm but called from a jiti VM-sandbox realm (Bun binary
+ * mode). The Map-backed version threw "Map operation called on non-Map object"
+ * in that scenario.
+ *
+ * A schema version field in the persisted JSON guards against stale cache
+ * files from previous major versions being silently loaded with wrong shapes.
+ */
+import type { ICache } from "./ports.js";
+import type { SpideredPage } from "./types.js";
+/** Construction options for DiskCache; every field is optional. */
+export interface DiskCacheOptions {
+    /** Time-to-live in ms. Default 30 min. */
+    ttlMs?: number;
+    /** Max entries. Default 500. */
+    maxSize?: number;
+    /** Auto-flush to disk on every set(). Default true. */
+    autoFlush?: boolean;
+    /**
+     * Base64 byte threshold for inline vs. file storage of images.
+     * Images whose base64 string length exceeds this are written as binary
+     * files to <cache-dir>/images/ instead of being stored inline in the JSON.
+     * Default: 32 * 1024 (32 KB of base64 ≈ 24 KB binary).
+     */
+    inlineImageThreshold?: number;
+}
+/**
+ * ICache implementation that persists spidered pages to a JSON file at
+ * `path`, with large image binaries kept in a sibling images directory.
+ */
+export declare class DiskCache implements ICache<string, SpideredPage> {
+    private readonly store;
+    private readonly path;
+    private readonly ttlMs;
+    private readonly maxSize;
+    private readonly autoFlush;
+    private readonly inlineImageThreshold;
+    /** Directory where large image binaries are stored. */
+    private readonly imagesDir;
+    /**
+     * @param path Location of the JSON cache file.
+     * @param opts Optional TTL, size, flush and image-threshold settings.
+     */
+    constructor(path: string, opts?: DiskCacheOptions);
+    private key;
+    /** Store a page for the URL; writes to disk as well unless autoFlush is disabled. */
+    set(url: string, page: SpideredPage): void;
+    has(url: string): boolean;
+    delete(url: string): void;
+    /** Derive a stable filename for an image binary from its src URL. */
+    private imageFilename;
+    /**
+     * Prepare images for serialisation:
+     * - Images whose base64 length ≤ threshold are kept inline.
+     * - Larger images are written to imagesDir as binary files; base64 is
+     *   replaced by filePath in the serialised entry.
+     */
+    private spill;
+    /**
+     * Hydrate images on read: if an image has filePath but no base64,
+     * load the binary from disk and re-encode.
+     */
+    private hydrate;
+    /** Write current contents to disk. Large images are spilled to imagesDir. */
+    flush(): void;
+    private load;
+    /** All currently valid (non-expired) pages, sorted newest-first. */
+    values(): SpideredPage[];
+    /** Retrieve a page, hydrating any file-backed images from disk. */
+    get(url: string): SpideredPage | undefined;
+}
+//# sourceMappingURL=disk-cache.d.ts.map

package/dist/disk-cache.d.ts.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"disk-cache.d.ts","sourceRoot":"","sources":["../src/disk-cache.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;GAoBG;AAKH,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,YAAY,CAAC;AACzC,OAAO,KAAK,EAAY,YAAY,EAAE,MAAM,YAAY,CAAC;AAKzD,MAAM,WAAW,gBAAgB;IAChC,0CAA0C;IAC1C,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,gCAAgC;IAChC,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,uDAAuD;IACvD,SAAS,CAAC,EAAE,OAAO,CAAC;IACpB;;;;;OAKG;IACH,oBAAoB,CAAC,EAAE,MAAM,CAAC;CAC9B;AAaD,qBAAa,SAAU,YAAW,MAAM,CAAC,MAAM,EAAE,YAAY,CAAC;IAC7D,OAAO,CAAC,QAAQ,CAAC,KAAK,CAA0D;IAChF,OAAO,CAAC,QAAQ,CAAC,IAAI,CAAS;IAC9B,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAS;IAC/B,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAS;IACjC,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAU;IACpC,OAAO,CAAC,QAAQ,CAAC,oBAAoB,CAAS;IAC9C,uDAAuD;IACvD,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAS;gBAEvB,IAAI,EAAE,MAAM,EAAE,IAAI,GAAE,gBAAqB;IAUrD,OAAO,CAAC,GAAG;IAUX,GAAG,CAAC,GAAG,EAAE,MAAM,EAAE,IAAI,EAAE,YAAY,GAAG,IAAI;IAU1C,GAAG,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO;IAIzB,MAAM,CAAC,GAAG,EAAE,MAAM,GAAG,IAAI;IASzB,qEAAqE;IACrE,OAAO,CAAC,aAAa;IAMrB;;;;;OAKG;IACH,OAAO,CAAC,KAAK;IAgBb;;;OAGG;IACH,OAAO,CAAC,OAAO;IAiBf,6EAA6E;IAC7E,KAAK,IAAI,IAAI;IAeb,OAAO,CAAC,IAAI;IAyBZ,oEAAoE;IACpE,MAAM,IAAI,YAAY,EAAE;IAWxB,mEAAmE;IACnE,GAAG,CAAC,GAAG,EAAE,MAAM,GAAG,YAAY,GAAG,SAAS;CAY1C"}