npm - @dpopsuev/web-spider - Versions diffs - 0.10.4 → 0.10.5 - Mend

@dpopsuev/web-spider 0.10.4 → 0.10.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (67) hide show

package/dist/batch.js.map +1 -0
package/dist/cache.js.map +1 -0
package/dist/convert.js.map +1 -0
package/dist/crawl.js.map +1 -0
package/dist/disk-cache.js.map +1 -0
package/dist/graph.js.map +1 -0
package/dist/index.js.map +1 -0
package/dist/parse.js.map +1 -0
package/dist/playwright.js.map +1 -0
package/dist/ports.js.map +1 -0
package/dist/robots.js.map +1 -0
package/dist/search.js.map +1 -0
package/dist/sitemap.js.map +1 -0
package/dist/spider.js.map +1 -0
package/dist/throttle.js.map +1 -0
package/dist/tree.js.map +1 -0
package/dist/types.js.map +1 -0
package/dist/views.js.map +1 -0
package/dist/web-search.js.map +1 -0
package/package.json +2 -1
package/fixtures/article-with-images.html +0 -94
package/fixtures/gh-shell.html +0 -32
package/fixtures/guide-ai-agents-web-scraping.json +0 -552
package/fixtures/images/large.jpg +0 -0
package/fixtures/images/small.jpg +0 -0
package/fixtures/images/tiny.png +0 -0
package/fixtures/quotes-index.json +0 -40
package/scripts/fetch-guide.mjs +0 -25
package/src/cache.ts +0 -99
package/src/convert.ts +0 -161
package/src/crawl.ts +0 -186
package/src/disk-cache.ts +0 -228
package/src/graph.ts +0 -189
package/src/index.ts +0 -74
package/src/parse.ts +0 -154
package/src/playwright.ts +0 -193
package/src/ports.ts +0 -131
package/src/robots.ts +0 -121
package/src/search.ts +0 -173
package/src/sitemap.ts +0 -67
package/src/spider.ts +0 -475
package/src/throttle.ts +0 -118
package/src/tree.ts +0 -379
package/src/types.ts +0 -225
package/src/views.ts +0 -42
package/src/web-search.ts +0 -548
package/test/convert-images.test.ts +0 -69
package/test/disk-cache-images.test.ts +0 -193
package/test/engine-registry.test.ts +0 -114
package/test/exports.test.ts +0 -124
package/test/get-chunk.test.ts +0 -115
package/test/images-integration.test.ts +0 -359
package/test/improvements.test.ts +0 -279
package/test/inbound-count.test.ts +0 -111
package/test/lean.test.ts +0 -105
package/test/playwright.test.ts +0 -128
package/test/ports.test.ts +0 -161
package/test/search.test.ts +0 -219
package/test/spider-images.test.ts +0 -180
package/test/spider-unit.test.ts +0 -610
package/test/tree.test.ts +0 -272
package/test/types.test.ts +0 -169
package/test/web-search-integration.test.ts +0 -180
package/test/web-search.test.ts +0 -305
package/tsconfig.json +0 -9
package/tsconfig.test.json +0 -7
package/vitest.config.ts +0 -8

package/src/spider.ts DELETED Viewed

@@ -1,475 +0,0 @@
-import { Readability } from "@mozilla/readability";
-import { chunk, toMarkdown } from "./convert.js";
-import type { ImageRef } from "./types.js";
-import { extractCanonicalUrl, extractHeadings, extractLinks, extractTags, parseDom } from "./parse.js";
-import type { IHttpClient, IRobotsChecker, IThrottle } from "./ports.js";
-import { buildTree } from "./tree.js";
-import type { DOMNode, LeanPage, SpideredPage } from "./types.js";
-import { toLean } from "./views.js";
-// ---------------------------------------------------------------------------
-// Constants
-// ---------------------------------------------------------------------------
-const WORDS_PER_MINUTE = 200; // reading-speed divisor: readingTimeMinutes = ceil(wordCount / WORDS_PER_MINUTE)
-// ---------------------------------------------------------------------------
-// Default HTTP client adapter
-// ---------------------------------------------------------------------------
-/**
- * Default IHttpClient: forwards the request to the global fetch() and exposes
- * only the response members the port needs.
- */
-const defaultHttpClient: IHttpClient = {
-	async fetch(req) {
-		const init = { signal: req.signal, headers: req.headers };
-		const response = await globalThis.fetch(req.url, init);
-		const { ok, status, statusText } = response;
-		return {
-			ok,
-			status,
-			statusText,
-			// Header lookup is delegated to the underlying Headers object.
-			headers: { get: (name: string) => response.headers.get(name) },
-			// Body readers stay lazy: nothing is consumed until the caller asks.
-			text: () => response.text(),
-			arrayBuffer: () => response.arrayBuffer(),
-		};
-	},
-};
-// ---------------------------------------------------------------------------
-// Public API
-// ---------------------------------------------------------------------------
-export interface SpiderOptions {
-	/**
-	 * Milliseconds before the page request is aborted (default 30 000).
-	 * On expiry spider() throws an error whose message includes this limit.
-	 */
-	timeoutMs?: number;
-	/**
-	 * Value sent as the User-Agent header of the page request.
-	 * Default identifies the tool; override for sites that block generic crawlers.
-	 */
-	userAgent?: string;
-	/**
-	 * CSS selector that scopes content extraction to a specific element.
-	 * The body is replaced with the first matching element before Readability
-	 * runs; when nothing matches, the document is left as it is.
-	 * Example: "article", ".main-content", "#post-body"
-	 */
-	rootSelector?: string;
-	/**
-	 * Comma-separated CSS selectors whose matched elements are removed before
-	 * extraction. Applied before rootSelector and Readability, so excluded
-	 * content never reaches the chunks or markdown.
-	 * Example: "nav, footer, .sidebar, #ads"
-	 */
-	excludeSelectors?: string;
-	/**
-	 * Approximate maximum token budget for the returned content.
-	 * Whole chunks are kept, in order, until the budget is used up, and the
-	 * markdown is rebuilt from the kept chunks. The first chunk is always
-	 * kept, even when it alone exceeds the budget.
-	 * Rough estimate: 1 token ≈ 4 characters.
-	 * Only the full view applies it; the lean and tree views ignore it.
-	 * Default: unlimited.
-	 */
-	tokenBudget?: number;
-	/**
-	 * Per-domain throttle — shared across spider() calls to enforce rate limits
-	 * and exponential backoff on 429/503 responses. Retries on 429/503 happen
-	 * only when a throttle is supplied, up to its maxRetries.
-	 */
-	throttle?: IThrottle;
-	/**
-	 * robots.txt checker — when provided, spider() checks robots.txt before
-	 * fetching and throws if the URL is disallowed. A Crawl-delay directive is
-	 * applied only when a throttle is supplied as well.
-	 */
-	robotsCache?: IRobotsChecker;
-	/**
-	 * HTTP client — defaults to a global fetch() adapter. Used for the page
-	 * request and for image requests.
-	 * Inject a stub for testing without real network access.
-	 */
-	httpClient?: IHttpClient;
-	/**
-	 * When true, fetch <img> src URLs found in the article content and attach
-	 * them as base64-encoded ImageRef objects to SpideredPage.images (full and
-	 * tree views). The same flag is passed to toMarkdown() as keepImages.
-	 * Default: false.
-	 */
-	captureImages?: boolean;
-	/**
-	 * Upper bound on the number of images captured per page.
-	 * Default: 10.
-	 */
-	maxImages?: number;
-}
-/**
- * Spider a single URL and return a fully structured SpideredPage.
- *
- * Pass `view: "lean"` to skip chunking and markdown conversion — returns a
- * LeanPage with only identity, metadata, and the heading/link outline.
- * Significantly faster (~3×) and uses far fewer tokens in agent context.
- *
- * Failures surface as thrown errors with a descriptive message rather than
- * being swallowed. Common cases:
- * - Invalid or non-HTTP(S) URLs throw immediately with a clear message.
- * - HTTP errors include the status code.
- * - Timeouts include the configured limit.
- * JS-rendered pages (nothing extractable) do not throw: the returned page
- * carries `jsRendered: true` instead.
- *
- * @example
- * // Full page — chunks, markdown, all metadata
- * const page = await spider("https://example.com")
- *
- * @example
- * // Lean overview — no body text, ideal for navigation decisions
- * const lean = await spider("https://example.com", { view: "lean" })
- */
-// ---------------------------------------------------------------------------
-// Image fetching
-// ---------------------------------------------------------------------------
-/**
- * Guess an image MIME type from the file extension of a URL or path.
- *
- * Only the last path segment is inspected, after dropping any query string
- * or fragment, and the comparison is case-insensitive. Unknown or missing
- * extensions fall back to "image/jpeg".
- *
- * @param src Absolute or relative image URL.
- * @returns The MIME type mapped to the extension, or "image/jpeg".
- */
-function mimeFromUrl(src: string): string {
-	// Cut at the first "?" or "#" so "a.png?w=100" and "a.png#top" both yield "png".
-	const path = src.split(/[?#]/, 1)[0] ?? "";
-	const fileName = path.slice(path.lastIndexOf("/") + 1);
-	const dot = fileName.lastIndexOf(".");
-	const ext = dot === -1 ? "" : fileName.slice(dot + 1).toLowerCase();
-	// A switch (rather than an object lookup) cannot hit inherited keys such
-	// as "constructor" or "toString".
-	switch (ext) {
-		case "jpg":
-		case "jpeg":
-			return "image/jpeg";
-		case "png":
-			return "image/png";
-		case "webp":
-			return "image/webp";
-		case "gif":
-			return "image/gif";
-		case "svg":
-			return "image/svg+xml";
-		case "avif":
-			return "image/avif";
-		default:
-			return "image/jpeg";
-	}
-}
-/**
- * Collect the images referenced by <img> elements in the article HTML.
- *
- * Elements without a src are ignored; at most `maxImages` of the remaining
- * elements are processed, in document order. Base64 data: URLs are taken
- * from the attribute without a request. Every other src is resolved against
- * pageUrl and fetched one at a time through httpClient, waiting on the
- * throttle first when one is given. Unresolvable URLs, non-ok responses and
- * failed requests are skipped, so a single bad image never fails the call.
- *
- * @param articleHtml HTML to scan for <img> elements.
- * @param pageUrl URL of the page; base for resolving relative srcs.
- * @param httpClient Client used for the image requests.
- * @param maxImages Upper bound on images processed; values <= 0 yield [].
- * @param throttle Optional throttle consulted before each request.
- * @returns One ImageRef per image that could be captured.
- */
-async function fetchImages(
-	articleHtml: string,
-	pageUrl: string,
-	httpClient: IHttpClient,
-	maxImages: number,
-	throttle?: IThrottle,
-): Promise<ImageRef[]> {
-	// A negative cap would otherwise turn slice(0, -n) into "almost everything".
-	if (maxImages <= 0) return [];
-	const doc = parseDom(articleHtml, pageUrl);
-	// Drop src-less elements before applying the cap so placeholders do not
-	// use up the budget.
-	const candidates = [...doc.querySelectorAll("img")]
-		.map((el) => ({ rawSrc: el.getAttribute("src") ?? "", alt: el.getAttribute("alt") ?? "" }))
-		.filter((img) => img.rawSrc !== "")
-		.slice(0, maxImages);
-	const results: ImageRef[] = [];
-	for (const { rawSrc, alt } of candidates) {
-		// data: URLs — include without fetching (base64 payloads only).
-		if (rawSrc.startsWith("data:")) {
-			const match = /^data:([^;]+);base64,(.+)$/.exec(rawSrc);
-			if (match) {
-				results.push({ src: rawSrc, mimeType: match[1], alt, base64: match[2] });
-			}
-			continue;
-		}
-		// Resolve relative URLs.
-		let absoluteSrc: string;
-		try {
-			absoluteSrc = new URL(rawSrc, pageUrl).toString();
-		} catch {
-			continue;
-		}
-		try {
-			if (throttle) await throttle.wait(absoluteSrc);
-			const res = await httpClient.fetch({
-				url: absoluteSrc,
-				headers: { "User-Agent": "web-spider/0.1", Accept: "image/*" },
-			});
-			if (!res.ok) continue;
-			throttle?.success(absoluteSrc);
-			const buf = await res.arrayBuffer();
-			const base64 = Buffer.from(buf).toString("base64");
-			// Prefer the server-declared type; fall back to the file extension.
-			const contentType = res.headers.get("content-type");
-			const mimeType = contentType?.split(";")[0].trim() || mimeFromUrl(absoluteSrc);
-			results.push({ src: absoluteSrc, mimeType, alt, base64 });
-		} catch {
-			// Skip failed image fetches silently — a missing image should never
-			// cause the whole page scrape to fail.
-		}
-	}
-	return results;
-}
-/**
- * Page shape returned by spider() for `view: "tree"`: every SpideredPage
- * field plus the DOM tree built from the extracted article HTML.
- */
-export interface TreePage extends SpideredPage {
-	readonly view: "tree"; // literal tag set on the tree-view result
-	tree: DOMNode; // root node returned by buildTree() for the article content
-}
-/**
- * Spider a single URL and return a structured page.
- *
- * Views (`opts.view`):
- * - "full" (default): markdown, chunks, headings, links and metadata.
- * - "lean": skips markdown conversion and chunking; returns the toLean()
- *   projection plus an estimated chunkCount.
- * - "tree": the full payload (without tokenBudget trimming) plus a DOM tree
- *   built from the article HTML.
- *
- * Pages where Readability extracts nothing do not throw; the result carries
- * `jsRendered: true` and empty content instead.
- *
- * @param url Fully-qualified http/https URL.
- * @param opts Extraction, politeness and transport options.
- * @throws Error when the URL is invalid or not http/https, when robots.txt
- *   disallows it, when the response is not ok (message includes the status),
- *   or when the request exceeds `timeoutMs`.
- */
-export async function spider(url: string, opts: SpiderOptions & { view: "lean" }): Promise<LeanPage>;
-export async function spider(url: string, opts: SpiderOptions & { view: "tree" }): Promise<TreePage>;
-export async function spider(url: string, opts?: SpiderOptions & { view?: "full" }): Promise<SpideredPage>;
-export async function spider(
-	url: string,
-	opts?: SpiderOptions & { view?: "lean" | "full" | "tree" },
-): Promise<SpideredPage | LeanPage | TreePage> {
-	const {
-		timeoutMs = 30_000,
-		userAgent = "web-spider/0.1 (AI agent research tool; +https://github.com/dpopsuev)",
-		view = "full",
-		rootSelector,
-		excludeSelectors,
-		tokenBudget,
-		throttle,
-		robotsCache,
-		httpClient = defaultHttpClient,
-		captureImages = false,
-		maxImages = 10,
-	} = opts ?? {};
-	// Poka-yoke: reject non-HTTP URLs immediately with a clear message.
-	let parsedUrl: URL;
-	try {
-		parsedUrl = new URL(url);
-	} catch {
-		throw new Error(`Invalid URL: "${url}" — must be a fully-qualified http/https URL`);
-	}
-	if (!["http:", "https:"].includes(parsedUrl.protocol)) {
-		throw new Error(`Unsupported protocol "${parsedUrl.protocol}" — only http and https are supported`);
-	}
-	// Check robots.txt before fetching.
-	if (robotsCache) {
-		const { allowed, crawlDelayMs } = await robotsCache.check(url);
-		if (!allowed) throw new Error(`Blocked by robots.txt: ${url}`);
-		if (crawlDelayMs && throttle) {
-			throttle.setDomainDelay(parsedUrl.hostname, crawlDelayMs);
-		}
-	}
-	// Fetch with optional throttle + retry on 429/503. Without a throttle there
-	// is exactly one attempt; a negative retry count still allows one attempt.
-	const maxRetries = Math.max(0, throttle?.maxRetries ?? 0);
-	let html = "";
-	for (let attempt = 0; attempt <= maxRetries; attempt++) {
-		if (throttle) await throttle.wait(url);
-		const controller = new AbortController();
-		const timer = setTimeout(() => controller.abort(), timeoutMs);
-		try {
-			const res = await httpClient.fetch({
-				url,
-				signal: controller.signal,
-				headers: { "User-Agent": userAgent, Accept: "text/html" },
-			});
-			if (res.status === 429 || res.status === 503) {
-				if (throttle && attempt < maxRetries) {
-					// Record the backoff; the next wait() call honours it.
-					throttle.rateLimit(url, res.headers.get("Retry-After"));
-					continue;
-				}
-				throw new Error(`HTTP ${res.status} ${res.statusText} — ${url}`);
-			}
-			if (!res.ok) throw new Error(`HTTP ${res.status} ${res.statusText} — ${url}`);
-			throttle?.success(url);
-			// Read the body while the timer is still armed so a stalled
-			// download is bounded by timeoutMs as well.
-			html = await res.text();
-			break;
-		} catch (err) {
-			if (err instanceof Error && err.name === "AbortError") {
-				throw new Error(`Timeout after ${timeoutMs}ms — ${url}`);
-			}
-			throw err;
-		} finally {
-			// Runs on success, retry (continue) and failure alike.
-			clearTimeout(timer);
-		}
-	}
-	// Parse DOM via parse.ts — keeps the JSDOM dependency in one module.
-	const doc = parseDom(html, url);
-	// Apply excludeSelectors before Readability strips the DOM.
-	if (excludeSelectors) {
-		for (const sel of excludeSelectors
-			.split(",")
-			.map((s) => s.trim())
-			.filter(Boolean)) {
-			for (const el of [...doc.querySelectorAll(sel)]) el.remove();
-		}
-	}
-	// Scope to rootSelector: replace body content with the matched element.
-	if (rootSelector) {
-		const root = doc.querySelector(rootSelector);
-		if (root) {
-			doc.body.innerHTML = root.outerHTML;
-		}
-	}
-	const links = extractLinks(doc, url);
-	const canonicalUrl = extractCanonicalUrl(doc, url);
-	// Readability content extraction (Firefox Reader View engine).
-	const readabilityResult = new Readability(doc).parse();
-	const jsRendered = !readabilityResult;
-	// Graceful degradation: if Readability finds nothing, return a partial page
-	// with jsRendered:true rather than throwing. The agent can decide what to do.
-	const article = readabilityResult ?? {
-		title: (doc.querySelector("title")?.textContent ?? "").trim(),
-		content: "",
-		textContent: "",
-		length: 0,
-		excerpt: "",
-		byline: "",
-		dir: "",
-		site_name: "",
-		lang: "",
-		publishedTime: null,
-		readingTimeMinutes: 0,
-	};
-	const domain = parsedUrl.hostname.replace(/^www\./, "");
-	const fetchedAt = new Date().toISOString();
-	// Looks up <meta name>, then og:<name>, then <meta property>; "" when absent.
-	const meta = (name: string): string => {
-		const el =
-			doc.querySelector(`meta[name="${name}"]`) ??
-			doc.querySelector(`meta[property="og:${name}"]`) ??
-			doc.querySelector(`meta[property="${name}"]`);
-		return (el?.getAttribute("content") ?? "").trim();
-	};
-	const articleHtml = article.content ?? "";
-	const headings = extractHeadings(articleHtml);
-	const tags = extractTags(doc);
-	// Fields shared by every view. The fallbacks use || rather than ?? because
-	// the primary sources yield "" (not null) when a value is missing.
-	const identity = {
-		url,
-		domain,
-		fetchedAt,
-		...(canonicalUrl !== undefined ? { canonicalUrl } : {}),
-		title: article.title || meta("title"),
-		description: meta("description"),
-		author: article.byline || meta("author"),
-		publishedAt: meta("article:published_time") || meta("date"),
-		lang: doc.documentElement.lang || "en",
-		tags,
-	};
-	const jsRenderedFlag = jsRendered ? { jsRendered: true as const } : {};
-	// ---------------------------------------------------------------------------
-	// Lean fast-path — skip turndown + chunking entirely
-	// ---------------------------------------------------------------------------
-	if (view === "lean") {
-		const textContent = (article.textContent ?? "").trim();
-		const wordCount = textContent.split(/\s+/).filter(Boolean).length;
-		// Estimate only: no chunking is performed in this view.
-		const chunkCount = Math.max(0, Math.floor(wordCount / 150));
-		const full = {
-			...identity,
-			wordCount,
-			readingTimeMinutes: Math.ceil(wordCount / WORDS_PER_MINUTE),
-			chunks: [], // placeholder — toLean reads chunks.length
-			headings,
-			links,
-			markdown: "",
-		} satisfies SpideredPage;
-		return { ...toLean(full), chunkCount, ...jsRenderedFlag };
-	}
-	// Tree and full views both need the markdown and its word count.
-	const markdown = toMarkdown(articleHtml, { keepImages: captureImages });
-	const wordCount = markdown.split(/\s+/).filter(Boolean).length;
-	const readingTimeMinutes = Math.ceil(wordCount / WORDS_PER_MINUTE);
-	// ---------------------------------------------------------------------------
-	// Tree path — build semantic DOM tree, then also produce full markdown
-	// ---------------------------------------------------------------------------
-	if (view === "tree") {
-		const tree = buildTree(articleHtml, url);
-		const chunks = chunk(markdown, url);
-		const images = captureImages
-			? await fetchImages(articleHtml, url, httpClient, maxImages, throttle)
-			: undefined;
-		return {
-			view: "tree",
-			...identity,
-			wordCount,
-			readingTimeMinutes,
-			headings,
-			chunks,
-			links,
-			markdown,
-			tree,
-			...(images ? { images } : {}),
-			...jsRenderedFlag,
-		};
-	}
-	// ---------------------------------------------------------------------------
-	// Full path — turndown + chunk
-	// ---------------------------------------------------------------------------
-	// Chunk-aware tokenBudget: select whole chunks up to the budget rather
-	// than slicing markdown mid-sentence. Preserves chunk boundaries and
-	// returns the richest complete content that fits.
-	let allChunks = chunk(markdown, url);
-	if (tokenBudget !== undefined) {
-		const charBudget = tokenBudget * 4;
-		let remaining = charBudget;
-		let first = true;
-		allChunks = allChunks.filter((c) => {
-			// Always include at least the first chunk — agents need something
-			// even if it exceeds the budget.
-			if (!first && remaining <= 0) return false;
-			first = false;
-			remaining -= c.text.length;
-			return true;
-		});
-	}
-	// Reconstruct markdown from selected chunks for full-page consumers.
-	const finalMarkdown = tokenBudget !== undefined
-		? allChunks.map((c) => c.text).join("\n\n")
-		: markdown;
-	const images = captureImages
-		? await fetchImages(articleHtml, url, httpClient, maxImages, throttle)
-		: undefined;
-	return {
-		...identity,
-		wordCount,
-		readingTimeMinutes,
-		headings,
-		chunks: allChunks,
-		links,
-		markdown: finalMarkdown,
-		...(images ? { images } : {}),
-		...jsRenderedFlag,
-	};
-}

package/src/throttle.ts DELETED Viewed

@@ -1,118 +0,0 @@
-/**
- * Per-domain request throttle with exponential backoff and jitter.
- *
- * Enforces a minimum gap between requests to the same hostname.
- * On 429/503, backs off exponentially and respects Retry-After headers.
- * Shared instances should be passed into spider() and crawl() so that
- * all requests to a domain coordinate through one rate limiter.
- */
-import type { IThrottle } from "./ports.js";
-export interface ThrottleOptions {
-	/**
-	 * Minimum gap between requests to the same domain (ms). Default 500.
-	 * A per-domain value set through setDomainDelay() takes precedence.
-	 */
-	minDelayMs?: number;
-	/**
-	 * Base for exponential backoff (ms). Default 1000.
-	 * Also the upper bound of the random jitter added to each backoff.
-	 */
-	backoffBaseMs?: number;
-	/**
-	 * Maximum computed backoff delay (ms). Default 30 000.
-	 * A longer Retry-After value is still honoured beyond this cap.
-	 */
-	backoffCapMs?: number;
-	/** Maximum retry attempts on 429/503 before giving up. Default 3. */
-	maxRetries?: number;
-}
-interface DomainState { // per-hostname bookkeeping kept by DomainThrottle
-	lastAt: number; // epoch ms recorded by wait(); 0 until the first request
-	backoffUntil: number; // epoch ms before which wait() holds requests; 0 = no backoff
-	errors: number; // rate-limit hits since the last success(); drives the backoff exponent
-	/** Per-domain minimum delay override (e.g. from robots.txt Crawl-delay). */
-	minDelayMs?: number;
-}
-/** Resolve after roughly `ms` milliseconds (timer-based; never rejects). */
-function sleep(ms: number): Promise<void> {
-	return new Promise<void>((resolve) => {
-		setTimeout(resolve, ms);
-	});
-}
-/**
- * Convert a Retry-After header value into a wait in milliseconds.
- *
- * Accepts the two forms a Retry-After value can take: a non-negative integer
- * number of seconds, or a date. Anything else — including a missing header,
- * a negative number or a date in the past — yields 0.
- *
- * @param header Raw header value, or null when the header is absent.
- * @returns Milliseconds to wait; never negative.
- */
-function parseRetryAfter(header: string | null): number {
-	const value = header?.trim();
-	if (!value) return 0;
-	// Require digits only. parseInt() would also accept "120abc" and would
-	// read a date such as "2999-01-01T00:00:00Z" as 2999 seconds.
-	if (/^\d+$/.test(value)) return Number(value) * 1_000;
-	const date = new Date(value).getTime();
-	if (!Number.isNaN(date)) return Math.max(0, date - Date.now());
-	return 0;
-}
-/**
- * Per-hostname rate limiter with exponential backoff.
- *
- * State is keyed by URL hostname and created lazily. wait() spaces requests
- * to one host by the minimum delay and holds them during a backoff window;
- * rateLimit() opens or extends that window; success() clears it.
- */
-export class DomainThrottle implements IThrottle {
-	private readonly states = new Map<string, DomainState>();
-	readonly minDelayMs: number;
-	readonly backoffBaseMs: number;
-	readonly backoffCapMs: number;
-	readonly maxRetries: number;
-	constructor(opts: ThrottleOptions = {}) {
-		this.minDelayMs = opts.minDelayMs ?? 500;
-		this.backoffBaseMs = opts.backoffBaseMs ?? 1_000;
-		this.backoffCapMs = opts.backoffCapMs ?? 30_000;
-		this.maxRetries = opts.maxRetries ?? 3;
-	}
-	/** Fetch the state for a host, creating a zeroed entry on first use. */
-	private state(host: string): DomainState {
-		let s = this.states.get(host);
-		if (!s) {
-			s = { lastAt: 0, backoffUntil: 0, errors: 0 };
-			this.states.set(host, s);
-		}
-		return s;
-	}
-	/**
-	 * Wait until the domain's rate limit and backoff have cleared.
-	 *
-	 * The caller's slot is recorded before sleeping, so several callers
-	 * waiting on the same host at once are spaced one minimum delay apart
-	 * instead of all waking at the same instant.
-	 */
-	async wait(url: string): Promise<void> {
-		const s = this.state(new URL(url).hostname);
-		const minDelay = s.minDelayMs ?? this.minDelayMs;
-		const now = Date.now();
-		const startAt = Math.max(now, s.backoffUntil, s.lastAt + minDelay);
-		// Reserve the slot synchronously, before the first await; the next
-		// caller then queues behind it rather than computing the same delay.
-		s.lastAt = startAt;
-		if (startAt > now) await sleep(startAt - now);
-	}
-	/** Record a successful request — resets backoff for the domain. */
-	success(url: string): void {
-		const s = this.state(new URL(url).hostname);
-		s.errors = 0;
-		s.backoffUntil = 0;
-	}
-	/**
-	 * Record a rate-limit hit. Applies exponential backoff with jitter,
-	 * using Retry-After header when present. Returns the wait duration in ms.
-	 *
-	 * The backoff doubles with each consecutive hit and is capped at
-	 * backoffCapMs; a longer Retry-After value overrides the cap.
-	 */
-	rateLimit(url: string, retryAfterHeader: string | null): number {
-		const s = this.state(new URL(url).hostname);
-		s.errors++;
-		const retryAfterMs = parseRetryAfter(retryAfterHeader);
-		const jitter = Math.random() * this.backoffBaseMs;
-		const backoffMs = Math.min(this.backoffCapMs, this.backoffBaseMs * 2 ** (s.errors - 1) + jitter);
-		const waitMs = Math.max(retryAfterMs, backoffMs);
-		s.backoffUntil = Date.now() + waitMs;
-		return waitMs;
-	}
-	/**
-	 * Override the minimum delay for a specific domain.
-	 * Used to honour robots.txt Crawl-delay directives.
-	 *
-	 * `host` must be the bare hostname, since wait() keys state by
-	 * `new URL(url).hostname`.
-	 */
-	setDomainDelay(host: string, ms: number): void {
-		this.state(host).minDelayMs = ms;
-	}
-}
-/**
- * Build a DomainThrottle without calling its constructor at the call site.
- *
- * Exists because, under jiti/Bun CJS re-export interop, a class reached
- * through a re-export chain can appear undefined where it is used. Extension
- * code should call this instead of `new DomainThrottle()`.
- */
-export function createThrottle(opts?: ThrottleOptions): DomainThrottle {
-	const throttle = new DomainThrottle(opts);
-	return throttle;
-}