npm - @dpopsuev/web-spider - Versions diffs - 0.10.4 → 0.10.5 - Mend

@dpopsuev/web-spider 0.10.4 → 0.10.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (67) hide show

package/dist/batch.js.map +1 -0
package/dist/cache.js.map +1 -0
package/dist/convert.js.map +1 -0
package/dist/crawl.js.map +1 -0
package/dist/disk-cache.js.map +1 -0
package/dist/graph.js.map +1 -0
package/dist/index.js.map +1 -0
package/dist/parse.js.map +1 -0
package/dist/playwright.js.map +1 -0
package/dist/ports.js.map +1 -0
package/dist/robots.js.map +1 -0
package/dist/search.js.map +1 -0
package/dist/sitemap.js.map +1 -0
package/dist/spider.js.map +1 -0
package/dist/throttle.js.map +1 -0
package/dist/tree.js.map +1 -0
package/dist/types.js.map +1 -0
package/dist/views.js.map +1 -0
package/dist/web-search.js.map +1 -0
package/package.json +2 -1
package/fixtures/article-with-images.html +0 -94
package/fixtures/gh-shell.html +0 -32
package/fixtures/guide-ai-agents-web-scraping.json +0 -552
package/fixtures/images/large.jpg +0 -0
package/fixtures/images/small.jpg +0 -0
package/fixtures/images/tiny.png +0 -0
package/fixtures/quotes-index.json +0 -40
package/scripts/fetch-guide.mjs +0 -25
package/src/cache.ts +0 -99
package/src/convert.ts +0 -161
package/src/crawl.ts +0 -186
package/src/disk-cache.ts +0 -228
package/src/graph.ts +0 -189
package/src/index.ts +0 -74
package/src/parse.ts +0 -154
package/src/playwright.ts +0 -193
package/src/ports.ts +0 -131
package/src/robots.ts +0 -121
package/src/search.ts +0 -173
package/src/sitemap.ts +0 -67
package/src/spider.ts +0 -475
package/src/throttle.ts +0 -118
package/src/tree.ts +0 -379
package/src/types.ts +0 -225
package/src/views.ts +0 -42
package/src/web-search.ts +0 -548
package/test/convert-images.test.ts +0 -69
package/test/disk-cache-images.test.ts +0 -193
package/test/engine-registry.test.ts +0 -114
package/test/exports.test.ts +0 -124
package/test/get-chunk.test.ts +0 -115
package/test/images-integration.test.ts +0 -359
package/test/improvements.test.ts +0 -279
package/test/inbound-count.test.ts +0 -111
package/test/lean.test.ts +0 -105
package/test/playwright.test.ts +0 -128
package/test/ports.test.ts +0 -161
package/test/search.test.ts +0 -219
package/test/spider-images.test.ts +0 -180
package/test/spider-unit.test.ts +0 -610
package/test/tree.test.ts +0 -272
package/test/types.test.ts +0 -169
package/test/web-search-integration.test.ts +0 -180
package/test/web-search.test.ts +0 -305
package/tsconfig.json +0 -9
package/tsconfig.test.json +0 -7
package/vitest.config.ts +0 -8

package/fixtures/quotes-index.json DELETED Viewed

@@ -1,40 +0,0 @@
-{
-  "url": "https://quotes.toscrape.com/",
-  "domain": "quotes.toscrape.com",
-  "fetchedAt": "2026-05-14T00:00:00.000Z",
-  "title": "Quotes to Scrape",
-  "description": "",
-  "author": "Albert Einstein",
-  "publishedAt": "",
-  "lang": "en",
-  "wordCount": 210,
-  "readingTimeMinutes": 2,
-  "headings": [],
-  "chunks": [
-    {
-      "id": "https://quotes.toscrape.com/#chunk-0",
-      "index": 0,
-      "heading": "",
-      "text": "\u201cThe world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.\u201d by [Albert Einstein](https://quotes.toscrape.com/author/Albert-Einstein) [(about)](https://quotes.toscrape.com/author/Albert-Einstein)\n\n\u201cIt is our choices, Harry, that show what we truly are, far more than our abilities.\u201d by J.K. Rowling [(about)](https://quotes.toscrape.com/author/J-K-Rowling)\n\n\u201cThere are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.\u201d by Albert Einstein [(about)](https://quotes.toscrape.com/author/Albert-Einstein)\n\n\u201cThe person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.\u201d by Jane Austen [(about)](https://quotes.toscrape.com/author/Jane-Austen)",
-      "wordCount": 114
-    },
-    {
-      "id": "https://quotes.toscrape.com/#chunk-1",
-      "index": 1,
-      "heading": "",
-      "text": "\u201cImperfection is beauty, madness is genius and it\u2019s better to be absolutely ridiculous than absolutely boring.\u201d by Marilyn Monroe [(about)](https://quotes.toscrape.com/author/Marilyn-Monroe)\n\n\u201cTry not to become a man of success. Rather become a man of value.\u201d by Albert Einstein [(about)](https://quotes.toscrape.com/author/Albert-Einstein)\n\n\u201cIt is better to be hated for what you are than to be loved for what you are not.\u201d by Andr\u00e9 Gide [(about)](https://quotes.toscrape.com/author/Andre-Gide)\n\n\u201cI have not failed. I\u2019ve just found 10,000 ways that won\u2019t work.\u201d by Thomas A. Edison [(about)](https://quotes.toscrape.com/author/Thomas-A-Edison)\n\n\u201cA woman is like a tea bag; you never know how strong it is until it\u2019s in hot water.\u201d by Eleanor Roosevelt [(about)](https://quotes.toscrape.com/author/Eleanor-Roosevelt)\n\n\u201cA day without sunshine is like, you know, night.\u201d by Steve Martin [(about)](https://quotes.toscrape.com/author/Steve-Martin)",
-      "wordCount": 127
-    }
-  ],
-  "links": [
-    { "href": "https://quotes.toscrape.com/login", "text": "Login", "isExternal": false },
-    { "href": "https://quotes.toscrape.com/author/Albert-Einstein", "text": "Albert Einstein", "isExternal": false },
-    { "href": "https://quotes.toscrape.com/tag/change/", "text": "change", "isExternal": false },
-    { "href": "https://quotes.toscrape.com/tag/deep-thoughts/", "text": "deep-thoughts", "isExternal": false },
-    { "href": "https://quotes.toscrape.com/tag/thinking/", "text": "thinking", "isExternal": false },
-    { "href": "https://quotes.toscrape.com/tag/world/", "text": "world", "isExternal": false },
-    { "href": "https://quotes.toscrape.com/author/J-K-Rowling", "text": "J.K. Rowling", "isExternal": false },
-    { "href": "https://quotes.toscrape.com/page/2/", "text": "Next", "isExternal": false }
-  ],
-  "markdown": "\u201cThe world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.\u201d by [Albert Einstein](https://quotes.toscrape.com/author/Albert-Einstein)\n\n\u201cIt is our choices, Harry, that show what we truly are, far more than our abilities.\u201d by J.K. Rowling\n\n\u201cThere are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.\u201d by Albert Einstein\n\n\u201cThe person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.\u201d by Jane Austen\n\n\u201cImperfection is beauty, madness is genius and it\u2019s better to be absolutely ridiculous than absolutely boring.\u201d by Marilyn Monroe\n\n\u201cTry not to become a man of success. Rather become a man of value.\u201d by Albert Einstein\n\n\u201cIt is better to be hated for what you are than to be loved for what you are not.\u201d by Andr\u00e9 Gide\n\n\u201cI have not failed. I\u2019ve just found 10,000 ways that won\u2019t work.\u201d by Thomas A. Edison\n\n\u201cA woman is like a tea bag; you never know how strong it is until it\u2019s in hot water.\u201d by Eleanor Roosevelt\n\n\u201cA day without sunshine is like, you know, night.\u201d by Steve Martin"
-}

package/scripts/fetch-guide.mjs DELETED Viewed

@@ -1,25 +0,0 @@
-import { spider } from "../dist/spider.js"
-import { writeFileSync, mkdirSync } from "fs"
-const url = "https://easyparser.com/blog/ai-agents-web-scraping-guide"
-console.log(`Spidering ${url} ...`)
-try {
-  const page = await spider(url)
-  console.log("title      :", page.title)
-  console.log("domain     :", page.domain)
-  console.log("wordCount  :", page.wordCount)
-  console.log("chunks     :", page.chunks.length)
-  console.log("links      :", page.links.length)
-  console.log("headings   :", page.headings.map((h) => `H${h.level} ${h.text}`).join(" | "))
-  console.log("\n--- First 3 chunks ---")
-  for (const c of page.chunks.slice(0, 3)) {
-    console.log(`\n[${c.index}] heading="${c.heading}" words=${c.wordCount}`)
-    console.log(c.text.slice(0, 300) + "...")
-  }
-  mkdirSync("fixtures", { recursive: true })
-  writeFileSync("fixtures/guide-ai-agents-web-scraping.json", JSON.stringify(page, null, 2))
-  console.log("\nFixture written to fixtures/guide-ai-agents-web-scraping.json")
-} catch (e) {
-  console.error("Failed:", e.message)
-}

package/src/cache.ts DELETED Viewed

@@ -1,99 +0,0 @@
-import type { ICache } from "./ports.js";
-import type { SpideredPage } from "./types.js";
-/** Internal storage record: a cached page together with its expiry deadline. */
-interface CacheEntry {
-	/** The spidered page stored for this key. */
-	page: SpideredPage;
-	/** Epoch-millisecond deadline (Date.now() + ttlMs at insert time); once passed, the entry is treated as missing. */
-	expiresAt: number;
-}
-/** Construction options for SpiderCache; omitted fields fall back to the defaults noted below. */
-export interface SpiderCacheOptions {
-	/** Maximum number of pages to hold (default 500) */
-	maxSize?: number;
-	/** Time-to-live in milliseconds (default 30 min) */
-	ttlMs?: number;
-}
-/**
- * LRU cache for spidered pages with a per-entry time-to-live.
- *
- * Implements the Identity Map pattern from Local Materialized View:
- * exactly one entry per normalised URL.
- *
- * Uses a plain object (Object.create(null)) for storage rather than a Map.
- * Plain objects carry no realm-specific internal slots, so they are safe
- * across V8 context (realm) boundaries — e.g. when the cache is constructed
- * in an ESM module realm but called from a jiti VM-sandbox realm.
- *
- * JavaScript objects maintain insertion order for string keys (ES2015+),
- * so delete-then-reinsert gives the same LRU-tail promotion semantics as a
- * Map without any cross-realm risk.
- *
- * NOTE(review): integer-like keys (e.g. "42") are enumerated before all other
- * keys regardless of insertion order. Normalised URLs never look like that,
- * but an unparseable input that is a bare integer string would — confirm such
- * keys cannot reach the cache.
- */
-export class SpiderCache implements ICache<string, SpideredPage> {
-	private readonly store: Record<string, CacheEntry | undefined> = Object.create(null);
-	private readonly maxSize: number;
-	private readonly ttlMs: number;
-	/** @param opts - Capacity and TTL overrides; defaults are 500 entries and 30 minutes. */
-	constructor(opts: SpiderCacheOptions = {}) {
-		this.maxSize = opts.maxSize ?? 500;
-		this.ttlMs = opts.ttlMs ?? 30 * 60 * 1000;
-	}
-	/**
-	 * Normalise a URL into its cache key: the fragment is dropped and one
-	 * trailing slash is removed. The scheme is left untouched, so the http://
-	 * and https:// forms of a URL are distinct keys. Input that cannot be
-	 * parsed as a URL is used verbatim.
-	 */
-	private key(url: string): string {
-		try {
-			const u = new URL(url);
-			u.hash = "";
-			return u.toString().replace(/\/$/, "");
-		} catch {
-			return url;
-		}
-	}
-	/**
-	 * Look up a page and mark it most-recently-used.
-	 *
-	 * @returns The cached page, or undefined when absent or expired (an expired
-	 * entry is removed as a side effect).
-	 */
-	get(url: string): SpideredPage | undefined {
-		const k = this.key(url);
-		const entry = this.store[k];
-		if (!entry) return undefined;
-		if (Date.now() > entry.expiresAt) {
-			delete this.store[k];
-			return undefined;
-		}
-		// Promote to tail (most-recently-used) by delete + reinsert.
-		// Object insertion order is preserved for string keys in ES2015+.
-		delete this.store[k];
-		this.store[k] = entry;
-		return entry.page;
-	}
-	/**
-	 * Store a page under its normalised URL with a fresh TTL and mark it
-	 * most-recently-used. When the cache is full and the key is new, the
-	 * least-recently-used entry is evicted first.
-	 */
-	set(url: string, page: SpideredPage): void {
-		const k = this.key(url);
-		if (k in this.store) {
-			// Overwrite: assigning to an existing key keeps its old position, so
-			// drop it first. Otherwise a just-refreshed entry stays at the head
-			// and becomes the next eviction victim.
-			delete this.store[k];
-		} else {
-			const keys = Object.keys(this.store);
-			if (keys.length >= this.maxSize) {
-				// Head of the insertion order is the least-recently-used key.
-				const lruKey = keys[0];
-				if (lruKey !== undefined) delete this.store[lruKey];
-			}
-		}
-		this.store[k] = { page, expiresAt: Date.now() + this.ttlMs };
-	}
-	/** True when a live entry exists. Delegates to get(), so it also promotes the entry and purges it if expired. */
-	has(url: string): boolean {
-		return this.get(url) !== undefined;
-	}
-	/** Remove the entry for this URL, if any. */
-	delete(url: string): void {
-		delete this.store[this.key(url)];
-	}
-	/** Remove every entry. */
-	clear(): void {
-		for (const k of Object.keys(this.store)) delete this.store[k];
-	}
-	/** Number of stored entries, including expired ones that have not been purged yet. */
-	get size(): number {
-		return Object.keys(this.store).length;
-	}
-	/** All currently valid pages (does not update LRU order). */
-	values(): SpideredPage[] {
-		const now = Date.now();
-		return Object.values(this.store)
-			.filter((e): e is CacheEntry => e !== undefined && e.expiresAt > now)
-			.map((e) => e.page);
-	}
-}

package/src/convert.ts DELETED Viewed

@@ -1,161 +0,0 @@
-/**
- * Markdown conversion and chunk splitting.
- *
- * Owns the Turndown dependency. spider.ts calls toMarkdown() and chunk();
- * it never imports Turndown directly.
- */
-import TurndownService from "turndown";
-import type { Chunk, ChunkType } from "./types.js";
-// ---------------------------------------------------------------------------
-// Turndown setup
-// ---------------------------------------------------------------------------
-// .escape is a mutable internal of TurndownService that @types/turndown does
-// not declare; this shape lets us overwrite it.
-interface PatchableTurndown { escape: (s: string) => string }
-/**
- * Build a converter with ATX headings and fenced code blocks whose escaping is
- * switched off. Turndown escapes markdown-special characters by default, and
- * the resulting backslash noise is unnatural for agent consumption.
- */
-function createConverter(): TurndownService {
-	const service = new TurndownService({ headingStyle: "atx", codeBlockStyle: "fenced" });
-	const patchable = service as unknown as PatchableTurndown;
-	patchable.escape = (raw) => raw;
-	return service;
-}
-// Default converter: images are stripped — agents cannot see them and alt-text
-// is noise. toMarkdown() bypasses this instance when keepImages is true.
-const turndown = createConverter();
-turndown.addRule("strip-images", {
-	filter: "img",
-	replacement: () => "",
-});
-// Image-preserving converter: Turndown's stock behaviour already renders <img>
-// as ![alt](src), so no extra rule is registered.
-const turndownWithImages = createConverter();
-// Markdown conversion
-// ---------------------------------------------------------------------------
-/** Options accepted by toMarkdown(). */
-export interface ToMarkdownOptions {
-	/**
-	 * When true, <img> tags are rendered as ![alt](src) instead of being stripped.
-	 * Use when captureImages is enabled so image references appear in the markdown.
-	 * Default: false.
-	 */
-	keepImages?: boolean;
-}
-/**
- * Convert article HTML to markdown.
- *
- * @param html - HTML to convert.
- * @param opts - Set `keepImages` to route through the image-preserving
- *   converter; otherwise the image-stripping converter is used.
- * @returns The markdown produced by the selected Turndown instance.
- */
-export function toMarkdown(html: string, opts?: ToMarkdownOptions): string {
-	const converter = opts?.keepImages ? turndownWithImages : turndown;
-	return converter.turndown(html);
-}
-// ---------------------------------------------------------------------------
-// Content type detection
-// ---------------------------------------------------------------------------
-/** Buffered word count at which chunk() flushes the current buffer. */
-const CHUNK_TARGET_WORDS = 150;
-/**
- * Classify a markdown buffer by its first non-blank line.
- *
- * Only that one line is inspected: a fence opener yields "code", a leading
- * pipe "table", a bullet or numbered item "list", a leading ">" "blockquote",
- * and anything else "text". A buffer with no non-blank line is "text".
- */
-export function detectContentType(lines: string[]): ChunkType {
-	const first = lines.find((raw) => raw.trim() !== "")?.trim();
-	if (first === undefined) return "text";
-	if (first.startsWith("```")) return "code";
-	if (first.startsWith("|")) return "table";
-	const isBullet = /^[-*+] /.test(first);
-	const isNumbered = /^\d+\. /.test(first);
-	if (isBullet || isNumbered) return "list";
-	return first.startsWith(">") ? "blockquote" : "text";
-}
-// ---------------------------------------------------------------------------
-// Chunking
-// ---------------------------------------------------------------------------
-/**
- * Split markdown into RAG-ready chunks of roughly CHUNK_TARGET_WORDS words.
- *
- * A chunk is emitted as soon as the buffered text reaches the target after a
- * prose line, or when a level 1–3 heading arrives while the buffer is already
- * at or above the target. Each chunk is labelled with the most recent level
- * 1–3 heading seen before it was emitted.
- *
- * Atomicity guarantees:
- *   - Fenced code blocks (``` ... ```) are never split.
- *   - Markdown tables (lines starting with |) are always flushed as a single
- *     chunk. Prose before the table is flushed first so the table is isolated.
- *
- * NOTE(review): buffers under 10 words are not emitted and are not cleared
- * either, so short prose before a table (or a very short table) is carried
- * into the neighbouring chunk, and a remainder under 10 words at the end of
- * the document is dropped — confirm this is intended.
- *
- * @param markdown - Markdown text, split on "\n".
- * @param baseUrl - Prefix for chunk ids, which take the form `${baseUrl}#chunk-${index}`.
- * @returns Chunks in document order with consecutive zero-based indices.
- */
-export function chunk(markdown: string, baseUrl: string): Chunk[] {
-	const chunks: Chunk[] = [];
-	const lines = markdown.split("\n");
-	let heading = "";
-	let buffer: string[] = [];
-	let index = 0;
-	let inCode = false;
-	let inTable = false;
-	// Emit the buffer as one chunk, unless it is empty or shorter than 10 words;
-	// in that case the buffer is left as-is and keeps accumulating.
-	const flush = (): void => {
-		const text = buffer.join("\n").trim();
-		if (!text) return;
-		const wordCount = text.split(/\s+/).filter(Boolean).length;
-		if (wordCount < 10) return;
-		const contentType = detectContentType(buffer);
-		chunks.push({ id: `${baseUrl}#chunk-${index}`, index, heading, text, wordCount, contentType });
-		index++;
-		buffer = [];
-	};
-	for (const line of lines) {
-		const trimmed = line.trim();
-		// ── Fenced code block toggle ──────────────────────────────────────────
-		// Fence lines and everything between them are buffered without any
-		// flush check, which is what keeps a code block in one piece.
-		if (trimmed.startsWith("```")) {
-			inCode = !inCode;
-			buffer.push(line);
-			continue;
-		}
-		if (inCode) {
-			buffer.push(line);
-			continue;
-		}
-		// ── Table rows ────────────────────────────────────────────────────────
-		const isTableRow = trimmed.startsWith("|");
-		if (isTableRow) {
-			if (!inTable) {
-				// Table is starting — flush any preceding prose so the table
-				// gets its own isolated chunk.
-				flush();
-				inTable = true;
-			}
-			buffer.push(line);
-			continue;
-		}
-		if (inTable) {
-			// Table just ended — flush it before processing the next line.
-			flush();
-			inTable = false;
-		}
-		// ── Normal prose / headings ───────────────────────────────────────────
-		// Blank lines are kept so paragraph breaks survive inside the chunk text.
-		if (!trimmed) {
-			buffer.push(line);
-			continue;
-		}
-		// Only heading levels 1–3 count; deeper headings are handled as prose.
-		const headingMatch = /^#{1,3} (.+)/.exec(trimmed);
-		if (headingMatch) {
-			const currentWords = buffer.join(" ").split(/\s+/).filter(Boolean).length;
-			if (currentWords >= CHUNK_TARGET_WORDS) flush();
-			// The label changes even when nothing was flushed, so text already
-			// buffered under the previous heading is emitted under this one.
-			heading = headingMatch[1];
-			buffer.push(line);
-		} else {
-			buffer.push(line);
-			const currentWords = buffer.join(" ").split(/\s+/).filter(Boolean).length;
-			if (currentWords >= CHUNK_TARGET_WORDS) flush();
-		}
-	}
-	// Emit whatever is left; a remainder under 10 words is discarded here.
-	flush();
-	return chunks;
-}

package/src/crawl.ts DELETED Viewed

@@ -1,186 +0,0 @@
-import { SpiderCache } from "./cache.js";
-import { PageGraph } from "./graph.js";
-import type { ICache } from "./ports.js";
-import { RobotsCache } from "./robots.js";
-import { fetchSitemapUrls } from "./sitemap.js";
-import type { SpiderOptions } from "./spider.js";
-import { spider } from "./spider.js";
-import { DomainThrottle } from "./throttle.js";
-import type { SpideredPage } from "./types.js";
-/** Options for crawl(). Fields not listed here are forwarded to spider() for every page. */
-export interface CrawlOptions extends SpiderOptions {
-	/** How many link hops from the start URL (default 2); 0 crawls only the initial frontier */
-	maxDepth?: number;
-	/** Hard cap on total URLs attempted, counting failures as well as successes (default 50) */
-	maxPages?: number;
-	/** Only follow links whose hostname equals the start URL's hostname (default true) */
-	sameDomainOnly?: boolean;
-	/** Max concurrent fetches (default 3) */
-	concurrency?: number;
-	/**
-	 * Minimum delay between requests to the same domain (ms).
-	 * Used as minDelayMs for the DomainThrottle that crawl() creates when no
-	 * `throttle` is supplied; it has no effect when a throttle is passed in.
-	 * Default 500.
-	 */
-	delayMs?: number;
-	/** Bring your own cache — URLs found in it are served from it instead of being fetched again */
-	cache?: ICache<string, SpideredPage>;
-	/** Bring your own graph — each successfully spidered page is added to it */
-	graph?: PageGraph;
-	/** Called with each successfully spidered page; an exception thrown here is recorded as that URL's error */
-	onPage?: (page: SpideredPage, depth: number) => void;
-	/** Return false to skip a URL before fetching it (the start URL itself is not filtered) */
-	urlFilter?: (url: string) => boolean;
-	/**
-	 * Whether to check and respect robots.txt for each domain (default true).
-	 * Automatically creates a RobotsCache if not provided via SpiderOptions.
-	 */
-	respectRobots?: boolean;
-	/**
-	 * Attempt to fetch /sitemap.xml before BFS to seed the frontier with
-	 * all known URLs. Falls back to normal BFS on any error (default true).
-	 * Seeded URLs are crawled at depth 0 together with the start URL.
-	 */
-	useSitemap?: boolean;
-}
-/** Outcome of a crawl() run. */
-export interface CrawlResult {
-	/** Successfully spidered pages, keyed by the URL string that was requested. */
-	pages: Map<string, SpideredPage>;
-	/** The graph that received every successfully spidered page (the caller's own graph when one was supplied). */
-	graph: PageGraph;
-	/** Failures keyed by URL; non-Error rejections are wrapped in an Error. */
-	errors: Map<string, Error>;
-}
-/**
- * Breadth-first crawler.
- *
- * Starts at `startUrl`, spiders it, extracts links, filters them, then
- * proceeds level by level up to `maxDepth` hops. Respects `maxPages`,
- * `sameDomainOnly`, and `urlFilter`. Populates the provided (or freshly
- * created) cache and graph as it goes.
- *
- * Concurrency is bounded per depth level — we fully finish each level
- * before proceeding, giving BFS ordering and predictable memory use.
- *
- * @param startUrl - Absolute URL to start from; it is always attempted.
- * @param opts - Crawl limits and hooks; remaining fields are passed to spider().
- * @returns Pages and errors keyed by URL, plus the populated graph. Per-page
- *   failures are collected in `errors` rather than thrown.
- * @throws TypeError when `startUrl` cannot be parsed by `new URL`.
- */
-export async function crawl(startUrl: string, opts: CrawlOptions = {}): Promise<CrawlResult> {
-	const {
-		maxDepth = 2,
-		maxPages = 50,
-		sameDomainOnly = true,
-		concurrency = 3,
-		delayMs = 500,
-		cache = new SpiderCache() as ICache<string, SpideredPage>,
-		graph = new PageGraph(),
-		onPage,
-		urlFilter,
-		respectRobots = true,
-		useSitemap = true,
-		...spiderOpts
-	} = opts;
-	// One throttle and one robots cache are shared by every spider() call.
-	const throttle = spiderOpts.throttle ?? new DomainThrottle({ minDelayMs: delayMs });
-	const robotsCache = spiderOpts.robotsCache ?? (respectRobots ? new RobotsCache(spiderOpts.userAgent) : undefined);
-	const httpClient = spiderOpts.httpClient;
-	const startDomain = new URL(startUrl).hostname;
-	const pages = new Map<string, SpideredPage>();
-	const errors = new Map<string, Error>();
-	const seen = new Set<string>();
-	// Decide whether a discovered URL may join the frontier.
-	const shouldVisit = (url: string): boolean => {
-		if (seen.has(url)) return false;
-		if (pages.size + errors.size >= maxPages) return false;
-		try {
-			const u = new URL(url);
-			if (!["http:", "https:"].includes(u.protocol)) return false;
-			if (sameDomainOnly && u.hostname !== startDomain) return false;
-		} catch {
-			return false;
-		}
-		if (urlFilter && !urlFilter(url)) return false;
-		return true;
-	};
-	// Throttle and robots.txt are handled inside spider() via shared instances.
-	// Resolve one URL (from cache or network) and record the outcome; never rejects.
-	const visit = async (url: string, depth: number): Promise<void> => {
-		try {
-			// Single lookup: a has()/get() pair could disagree if the entry
-			// expires between the two calls.
-			const page = cache.get(url) ?? (await spider(url, { ...spiderOpts, throttle, robotsCache }));
-			pages.set(url, page);
-			cache.set(url, page);
-			graph.addPage(page);
-			onPage?.(page, depth);
-		} catch (err: unknown) {
-			errors.set(url, err instanceof Error ? err : new Error(String(err)));
-		}
-	};
-	// Process one BFS level with a bounded pool of workers pulling from a shared cursor.
-	const fetchBatch = async (urls: string[], depth: number): Promise<void> => {
-		// Always run at least one worker: a concurrency of 0, a negative value
-		// or NaN would otherwise leave the level waiting forever. Math.ceil keeps
-		// the effective limit of fractional values (2.5 allows three in flight).
-		const workerCount = Math.min(urls.length, concurrency >= 1 ? Math.ceil(concurrency) : 1);
-		let next = 0;
-		const worker = async (): Promise<void> => {
-			while (next < urls.length) {
-				await visit(urls[next++], depth);
-			}
-		};
-		await Promise.all(Array.from({ length: workerCount }, () => worker()));
-	};
-	let frontier = [startUrl];
-	seen.add(startUrl);
-	if (useSitemap) {
-		const origin = new URL(startUrl).origin;
-		// Use a minimal default httpClient if none was injected
-		const client = httpClient ?? {
-			async fetch(req: { url: string; headers?: Record<string, string> }) {
-				return globalThis.fetch(req.url, { headers: req.headers });
-			},
-		};
-		let sitemapUrls: Awaited<ReturnType<typeof fetchSitemapUrls>> | undefined;
-		try {
-			sitemapUrls = await fetchSitemapUrls(origin, client);
-		} catch {
-			// Sitemap seeding is best-effort: on failure, continue with plain BFS.
-			sitemapUrls = undefined;
-		}
-		for (const u of sitemapUrls ?? []) {
-			if (shouldVisit(u)) {
-				seen.add(u);
-				frontier.push(u);
-			}
-		}
-	}
-	for (let depth = 0; depth <= maxDepth; depth++) {
-		if (frontier.length === 0) break;
-		if (pages.size + errors.size >= maxPages) break;
-		// Trim the level to the remaining page budget; the overflow is dropped.
-		const remaining = maxPages - pages.size - errors.size;
-		const batch = frontier.slice(0, remaining);
-		await fetchBatch(batch, depth);
-		if (depth === maxDepth) break;
-		// Collect the next level from the links of pages fetched in this one.
-		const nextFrontier: string[] = [];
-		for (const url of batch) {
-			const page = pages.get(url);
-			if (!page) continue;
-			for (const link of page.links) {
-				if (shouldVisit(link.href)) {
-					seen.add(link.href);
-					nextFrontier.push(link.href);
-				}
-			}
-		}
-		frontier = nextFrontier;
-	}
-	return { pages, graph, errors };
-}