npm - @dpopsuev/web-spider - Versions diffs - 0.10.4 → 0.10.5 - Mend

@dpopsuev/web-spider 0.10.4 → 0.10.5

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.

Files changed (67)

package/dist/batch.js.map +1 -0
package/dist/cache.js.map +1 -0
package/dist/convert.js.map +1 -0
package/dist/crawl.js.map +1 -0
package/dist/disk-cache.js.map +1 -0
package/dist/graph.js.map +1 -0
package/dist/index.js.map +1 -0
package/dist/parse.js.map +1 -0
package/dist/playwright.js.map +1 -0
package/dist/ports.js.map +1 -0
package/dist/robots.js.map +1 -0
package/dist/search.js.map +1 -0
package/dist/sitemap.js.map +1 -0
package/dist/spider.js.map +1 -0
package/dist/throttle.js.map +1 -0
package/dist/tree.js.map +1 -0
package/dist/types.js.map +1 -0
package/dist/views.js.map +1 -0
package/dist/web-search.js.map +1 -0
package/package.json +2 -1
package/fixtures/article-with-images.html +0 -94
package/fixtures/gh-shell.html +0 -32
package/fixtures/guide-ai-agents-web-scraping.json +0 -552
package/fixtures/images/large.jpg +0 -0
package/fixtures/images/small.jpg +0 -0
package/fixtures/images/tiny.png +0 -0
package/fixtures/quotes-index.json +0 -40
package/scripts/fetch-guide.mjs +0 -25
package/src/cache.ts +0 -99
package/src/convert.ts +0 -161
package/src/crawl.ts +0 -186
package/src/disk-cache.ts +0 -228
package/src/graph.ts +0 -189
package/src/index.ts +0 -74
package/src/parse.ts +0 -154
package/src/playwright.ts +0 -193
package/src/ports.ts +0 -131
package/src/robots.ts +0 -121
package/src/search.ts +0 -173
package/src/sitemap.ts +0 -67
package/src/spider.ts +0 -475
package/src/throttle.ts +0 -118
package/src/tree.ts +0 -379
package/src/types.ts +0 -225
package/src/views.ts +0 -42
package/src/web-search.ts +0 -548
package/test/convert-images.test.ts +0 -69
package/test/disk-cache-images.test.ts +0 -193
package/test/engine-registry.test.ts +0 -114
package/test/exports.test.ts +0 -124
package/test/get-chunk.test.ts +0 -115
package/test/images-integration.test.ts +0 -359
package/test/improvements.test.ts +0 -279
package/test/inbound-count.test.ts +0 -111
package/test/lean.test.ts +0 -105
package/test/playwright.test.ts +0 -128
package/test/ports.test.ts +0 -161
package/test/search.test.ts +0 -219
package/test/spider-images.test.ts +0 -180
package/test/spider-unit.test.ts +0 -610
package/test/tree.test.ts +0 -272
package/test/types.test.ts +0 -169
package/test/web-search-integration.test.ts +0 -180
package/test/web-search.test.ts +0 -305
package/tsconfig.json +0 -9
package/tsconfig.test.json +0 -7
package/vitest.config.ts +0 -8

package/test/search.test.ts (deleted)

@@ -1,219 +0,0 @@
-import { readFileSync } from "fs";
-import { dirname, join } from "path";
-import { fileURLToPath } from "url";
-import { describe, expect, it } from "vitest";
-import { fuzzySearch } from "../src/search.js";
-import type { SpideredPage } from "../src/types.js";
-const __dirname = dirname(fileURLToPath(import.meta.url));
-function loadFixture(name: string): SpideredPage {
-	const raw = readFileSync(join(__dirname, "../fixtures", name), "utf8");
-	return JSON.parse(raw) as SpideredPage;
-}
-const guide = loadFixture("guide-ai-agents-web-scraping.json");
-// ---------------------------------------------------------------------------
-// Basic contract
-// ---------------------------------------------------------------------------
-describe("fuzzySearch — contract", () => {
-	it("returns an empty array for a blank query", () => {
-		expect(fuzzySearch([guide], "")).toEqual([]);
-		expect(fuzzySearch([guide], "   ")).toEqual([]);
-	});
-	it("returns an empty array when no pages are given", () => {
-		expect(fuzzySearch([], "openai")).toEqual([]);
-	});
-	it("returns at most topN results", () => {
-		const hits = fuzzySearch([guide], "the", { topN: 3 });
-		expect(hits.length).toBeLessThanOrEqual(3);
-	});
-	it("every hit has required fields", () => {
-		const hits = fuzzySearch([guide], "openai");
-		expect(hits.length).toBeGreaterThan(0);
-		for (const h of hits) {
-			expect(typeof h.url).toBe("string");
-			expect(typeof h.chunkId).toBe("string");
-			expect(typeof h.heading).toBe("string");
-			expect(typeof h.score).toBe("number");
-			expect(typeof h.snippet).toBe("string");
-		}
-	});
-	it("scores are in 0–1 range", () => {
-		const hits = fuzzySearch([guide], "agent scraping pipeline");
-		for (const h of hits) {
-			expect(h.score).toBeGreaterThan(0);
-			expect(h.score).toBeLessThanOrEqual(1);
-		}
-	});
-	it("results are sorted by score descending", () => {
-		const hits = fuzzySearch([guide], "LLM extraction cost");
-		for (let i = 1; i < hits.length; i++) {
-			expect(hits[i].score).toBeLessThanOrEqual(hits[i - 1].score);
-		}
-	});
-});
-// ---------------------------------------------------------------------------
-// Exact match quality
-// ---------------------------------------------------------------------------
-describe("fuzzySearch — exact match", () => {
-	it("finds an exact phrase from the fixture title", () => {
-		const hits = fuzzySearch([guide], "AI Agents & Web Scraping");
-		expect(hits.length).toBeGreaterThan(0);
-		const titleHit = hits.find((h) => h.heading === "title");
-		// Title hit must be found and have a positive score.
-		// The absolute threshold is not asserted — it depends on the scorer's
-		// normalisation strategy and corpus size.
-		expect(titleHit).toBeDefined();
-		expect(titleHit!.score).toBeGreaterThan(0);
-	});
-	it("exact match scores higher than partial match for the same chunk", () => {
-		// "cost optimization" appears verbatim in a heading
-		const exact = fuzzySearch([guide], "Cost Optimization");
-		const partial = fuzzySearch([guide], "cost");
-		// The heading hit for exact phrase should outrank a generic token hit
-		const exactTop = exact[0];
-		expect(exactTop.score).toBeGreaterThanOrEqual(partial[0].score);
-	});
-	it("includes a non-empty snippet for every hit", () => {
-		const hits = fuzzySearch([guide], "OpenAI API");
-		for (const h of hits) {
-			expect(h.snippet.trim().length).toBeGreaterThan(0);
-		}
-	});
-	it("snippet contains the matched term", () => {
-		const hits = fuzzySearch([guide], "OpenAI API", { topN: 5 });
-		// At least one snippet should contain the matched term (case-insensitive)
-		const found = hits.some((h) => h.snippet.toLowerCase().includes("openai"));
-		expect(found).toBe(true);
-	});
-});
-// ---------------------------------------------------------------------------
-// Fuzzy / partial matching
-// ---------------------------------------------------------------------------
-describe("fuzzySearch — fuzzy matching", () => {
-	it("matches partial tokens (prefix)", () => {
-		// "automat" should match "automation", "automated", "automatically"
-		const hits = fuzzySearch([guide], "automat");
-		expect(hits.length).toBeGreaterThan(0);
-	});
-	it("is case-insensitive", () => {
-		const lower = fuzzySearch([guide], "amazon");
-		const upper = fuzzySearch([guide], "AMAZON");
-		expect(lower.length).toBe(upper.length);
-		expect(lower.map((h) => h.chunkId)).toEqual(upper.map((h) => h.chunkId));
-	});
-	it("handles multi-word queries across chunk boundaries", () => {
-		// Words that appear spread across the document, not necessarily adjacent
-		const hits = fuzzySearch([guide], "proxy captcha reliable");
-		expect(hits.length).toBeGreaterThan(0);
-	});
-	it("returns no hits for a query that is clearly absent", () => {
-		const hits = fuzzySearch([guide], "xyzzy quux frumious bandersnatch");
-		expect(hits.length).toBe(0);
-	});
-});
-// ---------------------------------------------------------------------------
-// Metadata vs chunk hits
-// ---------------------------------------------------------------------------
-describe("fuzzySearch — metadata hits", () => {
-	it("matches headings and returns chunkId as empty string", () => {
-		const hits = fuzzySearch([guide], "Frequently Asked Questions");
-		const metaHit = hits.find((h) => h.chunkId === "");
-		expect(metaHit).toBeDefined();
-	});
-	it("chunk hits carry a valid chunk ID", () => {
-		const hits = fuzzySearch([guide], "intelligent data pipeline");
-		const chunkHit = hits.find((h) => h.chunkId !== "");
-		expect(chunkHit).toBeDefined();
-		expect(chunkHit!.chunkId).toMatch(/^https?:\/\/.+#chunk-\d+$/);
-	});
-	it("matches the page description field", () => {
-		// Guide description: "Combine AI agents with web scraping APIs..."
-		const hits = fuzzySearch([guide], "automated reports");
-		expect(hits.some((h) => h.heading === "description")).toBe(true);
-	});
-});
-// ---------------------------------------------------------------------------
-// Multi-page corpus
-// ---------------------------------------------------------------------------
-describe("fuzzySearch — multi-page corpus", () => {
-	// Build a second synthetic page from a subset of the guide's chunks
-	const page2: SpideredPage = {
-		...guide,
-		url: "https://example.com/other",
-		domain: "example.com",
-		title: "A Different Article About Proxies",
-		description: "Proxy rotation and CAPTCHA handling for scrapers.",
-		chunks: guide.chunks.slice(0, 2).map((c, i) => ({
-			...c,
-			id: `https://example.com/other#chunk-${i}`,
-		})),
-	};
-	it("returns hits from multiple pages when both match", () => {
-		// topN must exceed the number of matching guide chunks to let page2 surface
-		const hits = fuzzySearch([guide, page2], "scraping", { topN: 100 });
-		const urls = new Set(hits.map((h) => h.url));
-		expect(urls.size).toBeGreaterThan(1);
-	});
-	it("respects topN across the whole corpus", () => {
-		const hits = fuzzySearch([guide, page2], "agent", { topN: 4 });
-		expect(hits.length).toBeLessThanOrEqual(4);
-	});
-	it("higher-scoring page ranks first regardless of input order", () => {
-		// page2 title is explicitly about proxies; guide is not
-		const hitsProxies = fuzzySearch([guide, page2], "proxy rotation CAPTCHA", { topN: 1 });
-		expect(hitsProxies[0].url).toBe(page2.url);
-	});
-});
-// ---------------------------------------------------------------------------
-// Snippet shape
-// ---------------------------------------------------------------------------
-describe("fuzzySearch — snippet", () => {
-	it("snippet is bounded by snippetRadius", () => {
-		const radius = 30;
-		const hits = fuzzySearch([guide], "OpenAI", { snippetRadius: radius });
-		for (const h of hits) {
-			// Strip leading/trailing ellipsis markers before measuring
-			const bare = h.snippet.replace(/^…|…$/g, "");
-			// The bare snippet should be at most 2×radius + matched term length
-			// Give a generous upper bound to account for word boundaries
-			expect(bare.length).toBeLessThan(radius * 2 + 60);
-		}
-	});
-	it("snippet adds leading ellipsis when match is not at start", () => {
-		// Search for something known to appear mid-text
-		const hits = fuzzySearch([guide], "cost optimization");
-		const mid = hits.find((h) => h.snippet.startsWith("…"));
-		expect(mid).toBeDefined();
-	});
-});

package/test/spider-images.test.ts (deleted)

@@ -1,180 +0,0 @@
-/**
- * TDD tests for spider() captureImages option.
- * No real network — uses stub IHttpClient.
- */
-import { readFileSync } from "node:fs";
-import { join } from "node:path";
-import { describe, expect, it } from "vitest";
-import type { IHttpClient } from "../src/ports.js";
-import { spider } from "../src/spider.js";
-// ---------------------------------------------------------------------------
-// Fixture HTML (loaded from disk)
-// ---------------------------------------------------------------------------
-const FIXTURE_HTML = readFileSync(
-	join(import.meta.dirname, "../fixtures/article-with-images.html"),
-	"utf8",
-);
-const TINY_PNG = readFileSync(join(import.meta.dirname, "../fixtures/images/tiny.png"));
-const SMALL_JPG = readFileSync(join(import.meta.dirname, "../fixtures/images/small.jpg"));
-// ---------------------------------------------------------------------------
-// Stub HTTP client factory
-// ---------------------------------------------------------------------------
-/**
- * Returns a stub IHttpClient that serves the fixture HTML for page requests
- * and fixture image bytes for image requests.
- * `failOnSecond`: if true, throws on the second image fetch.
- */
-function makeStubClient(opts: { failOnSecond?: boolean } = {}): IHttpClient {
-	let imageFetchCount = 0;
-	return {
-		async fetch(req) {
-			// Page request
-			if (req.url.startsWith("https://example.com") && !req.url.match(/\.(jpg|jpeg|png|webp|gif)(\?|$)/i)) {
-				return {
-					ok: true,
-					status: 200,
-					statusText: "OK",
-					headers: { get: (name) => (name === "content-type" ? "text/html" : null) },
-					text: async () => FIXTURE_HTML,
-					arrayBuffer: async () => new ArrayBuffer(0),
-				};
-			}
-			// Image requests
-			imageFetchCount++;
-			if (opts.failOnSecond && imageFetchCount === 2) {
-				throw new Error("Simulated network failure on second image");
-			}
-			// Serve fixture bytes based on extension
-			const isJpeg = req.url.match(/\.(jpg|jpeg|webp)(\?|$)/i);
-			const bytes = isJpeg ? SMALL_JPG : TINY_PNG;
-			const mimeType = isJpeg ? "image/jpeg" : "image/png";
-			const buf = bytes.buffer.slice(bytes.byteOffset, bytes.byteOffset + bytes.byteLength) as ArrayBuffer;
-			return {
-				ok: true,
-				status: 200,
-				statusText: "OK",
-				headers: { get: (name) => (name === "content-type" ? mimeType : null) },
-				text: async () => "",
-				arrayBuffer: async () => buf,
-			};
-		},
-	};
-}
-// ---------------------------------------------------------------------------
-// Tests
-// ---------------------------------------------------------------------------
-describe("spider() captureImages option", () => {
-	it("1. captureImages: false (default) — images field is undefined", async () => {
-		const page = await spider("https://example.com", {
-			httpClient: makeStubClient(),
-			// captureImages not set → defaults to false
-		});
-		expect(page.images).toBeUndefined();
-	});
-	it("2. captureImages: true — images array is populated", async () => {
-		const page = await spider("https://example.com", {
-			httpClient: makeStubClient(),
-			captureImages: true,
-		});
-		expect(page.images).toBeDefined();
-		expect(page.images!.length).toBeGreaterThan(0);
-	});
-	it("3. ImageRef fields are populated correctly", async () => {
-		const page = await spider("https://example.com", {
-			httpClient: makeStubClient(),
-			captureImages: true,
-		});
-		for (const img of page.images!) {
-			expect(img.src).toBeTruthy();
-			expect(img.mimeType).toMatch(/^image\//);
-			expect(typeof img.alt).toBe("string");
-			// Either base64 or filePath must be set
-			expect(img.base64 || img.filePath).toBeTruthy();
-		}
-	});
-	it("4. maxImages cap is respected", async () => {
-		const page = await spider("https://example.com", {
-			httpClient: makeStubClient(),
-			captureImages: true,
-			maxImages: 2,
-		});
-		expect(page.images!.length).toBeLessThanOrEqual(2);
-	});
-	it("5. relative src URLs are resolved to absolute", async () => {
-		const page = await spider("https://example.com", {
-			httpClient: makeStubClient(),
-			captureImages: true,
-		});
-		for (const img of page.images!) {
-			// data: URLs are allowed as-is; all others must be absolute http(s)
-			if (!img.src.startsWith("data:")) {
-				expect(img.src).toMatch(/^https?:\/\//);
-			}
-		}
-		// Specifically, the relative /images/chart.png should resolve to https://example.com/images/chart.png
-		const resolved = page.images!.find((i) => i.src === "https://example.com/images/chart.png");
-		expect(resolved).toBeDefined();
-	});
-	it("6. failed image fetch is skipped gracefully — no exception propagates", async () => {
-		const page = await spider("https://example.com", {
-			httpClient: makeStubClient({ failOnSecond: true }),
-			captureImages: true,
-		});
-		// Should still return a page — just with fewer images
-		expect(page.images).toBeDefined();
-		expect(page.url).toBe("https://example.com");
-	});
-	it("7. data: URL images are included without fetching", async () => {
-		const page = await spider("https://example.com", {
-			httpClient: makeStubClient(),
-			captureImages: true,
-		});
-		const dataImg = page.images!.find((i) => i.src.startsWith("data:"));
-		expect(dataImg).toBeDefined();
-		expect(dataImg!.mimeType).toBe("image/png");
-		expect(dataImg!.base64).toBeTruthy();
-	});
-	it("base64 strings are valid (decodable)", async () => {
-		const page = await spider("https://example.com", {
-			httpClient: makeStubClient(),
-			captureImages: true,
-		});
-		for (const img of page.images!) {
-			if (img.base64) {
-				expect(() => Buffer.from(img.base64!, "base64")).not.toThrow();
-				expect(Buffer.from(img.base64!, "base64").byteLength).toBeGreaterThan(0);
-			}
-		}
-	});
-	it("produces valid LLM data URLs from captured images", async () => {
-		const page = await spider("https://example.com", {
-			httpClient: makeStubClient(),
-			captureImages: true,
-		});
-		for (const img of page.images!) {
-			if (img.base64) {
-				const dataUrl = `data:${img.mimeType};base64,${img.base64}`;
-				expect(dataUrl).toMatch(/^data:image\//);
-			}
-		}
-	});
-});