@dpopsuev/web-spider 0.10.4 → 0.10.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. package/dist/batch.js.map +1 -0
  2. package/dist/cache.js.map +1 -0
  3. package/dist/convert.js.map +1 -0
  4. package/dist/crawl.js.map +1 -0
  5. package/dist/disk-cache.js.map +1 -0
  6. package/dist/graph.js.map +1 -0
  7. package/dist/index.js.map +1 -0
  8. package/dist/parse.js.map +1 -0
  9. package/dist/playwright.js.map +1 -0
  10. package/dist/ports.js.map +1 -0
  11. package/dist/robots.js.map +1 -0
  12. package/dist/search.js.map +1 -0
  13. package/dist/sitemap.js.map +1 -0
  14. package/dist/spider.js.map +1 -0
  15. package/dist/throttle.js.map +1 -0
  16. package/dist/tree.js.map +1 -0
  17. package/dist/types.js.map +1 -0
  18. package/dist/views.js.map +1 -0
  19. package/dist/web-search.js.map +1 -0
  20. package/package.json +2 -1
  21. package/fixtures/article-with-images.html +0 -94
  22. package/fixtures/gh-shell.html +0 -32
  23. package/fixtures/guide-ai-agents-web-scraping.json +0 -552
  24. package/fixtures/images/large.jpg +0 -0
  25. package/fixtures/images/small.jpg +0 -0
  26. package/fixtures/images/tiny.png +0 -0
  27. package/fixtures/quotes-index.json +0 -40
  28. package/scripts/fetch-guide.mjs +0 -25
  29. package/src/cache.ts +0 -99
  30. package/src/convert.ts +0 -161
  31. package/src/crawl.ts +0 -186
  32. package/src/disk-cache.ts +0 -228
  33. package/src/graph.ts +0 -189
  34. package/src/index.ts +0 -74
  35. package/src/parse.ts +0 -154
  36. package/src/playwright.ts +0 -193
  37. package/src/ports.ts +0 -131
  38. package/src/robots.ts +0 -121
  39. package/src/search.ts +0 -173
  40. package/src/sitemap.ts +0 -67
  41. package/src/spider.ts +0 -475
  42. package/src/throttle.ts +0 -118
  43. package/src/tree.ts +0 -379
  44. package/src/types.ts +0 -225
  45. package/src/views.ts +0 -42
  46. package/src/web-search.ts +0 -548
  47. package/test/convert-images.test.ts +0 -69
  48. package/test/disk-cache-images.test.ts +0 -193
  49. package/test/engine-registry.test.ts +0 -114
  50. package/test/exports.test.ts +0 -124
  51. package/test/get-chunk.test.ts +0 -115
  52. package/test/images-integration.test.ts +0 -359
  53. package/test/improvements.test.ts +0 -279
  54. package/test/inbound-count.test.ts +0 -111
  55. package/test/lean.test.ts +0 -105
  56. package/test/playwright.test.ts +0 -128
  57. package/test/ports.test.ts +0 -161
  58. package/test/search.test.ts +0 -219
  59. package/test/spider-images.test.ts +0 -180
  60. package/test/spider-unit.test.ts +0 -610
  61. package/test/tree.test.ts +0 -272
  62. package/test/types.test.ts +0 -169
  63. package/test/web-search-integration.test.ts +0 -180
  64. package/test/web-search.test.ts +0 -305
  65. package/tsconfig.json +0 -9
  66. package/tsconfig.test.json +0 -7
  67. package/vitest.config.ts +0 -8
@@ -1,359 +0,0 @@
1
- /**
2
- * End-to-end captureImages integration tests.
3
- *
4
- * Covers the full pipeline:
5
- * spider() → SpideredPage.images → DiskCache.flush() → DiskCache.get()
6
- * → LLM data URL → PlaywrightHttpClient-shaped stub
7
- *
8
- * No real network, no real browser.
9
- */
10
-
11
- import { mkdirSync, rmSync } from "node:fs";
12
- import { tmpdir } from "node:os";
13
- import { join } from "node:path";
14
- import { readFileSync } from "node:fs";
15
- import { afterEach, beforeEach, describe, expect, it } from "vitest";
16
- import { DiskCache } from "../src/disk-cache.js";
17
- import { PlaywrightHttpClient } from "../src/playwright.js";
18
- import type { IHttpClient } from "../src/ports.js";
19
- import { spider } from "../src/spider.js";
20
-
21
- // ---------------------------------------------------------------------------
22
- // Fixtures
23
- // ---------------------------------------------------------------------------
24
-
25
/** Directory holding the HTML and image fixtures, resolved relative to this test file. */
const FIXTURES_DIR = join(import.meta.dirname, "../fixtures");

/** HTML document read from `article-with-images.html`, decoded as UTF-8. */
const FIXTURE_HTML = readFileSync(join(FIXTURES_DIR, "article-with-images.html"), "utf8");

// Raw image bytes (no encoding argument, so these are Buffers).
const SMALL_JPG = readFileSync(join(FIXTURES_DIR, "images/small.jpg"));
const TINY_PNG = readFileSync(join(FIXTURES_DIR, "images/tiny.png"));
const LARGE_JPG = readFileSync(join(FIXTURES_DIR, "images/large.jpg"));
32
-
33
- // ---------------------------------------------------------------------------
34
- // Helpers
35
- // ---------------------------------------------------------------------------
36
-
37
/** Scratch directory for the current test; assigned in `beforeEach`, removed in `afterEach`. */
let testDir: string;

// Each test gets its own directory under the OS temp dir; the timestamp plus a
// random base-36 suffix keeps the names distinct between tests.
beforeEach(() => {
  const uniqueSuffix = `${Date.now()}-${Math.random().toString(36).slice(2)}`;
  testDir = join(tmpdir(), `wbs-integration-${uniqueSuffix}`);
  mkdirSync(testDir, { recursive: true });
});

// `force: true` means cleanup does not throw if the directory is already gone.
afterEach(() => {
  rmSync(testDir, { force: true, recursive: true });
});
47
-
48
/** Returns the path of the `pages.json` cache file inside the current test's scratch directory. */
function makeCachePath(): string {
  const cacheFileName = "pages.json";
  return join(testDir, cacheFileName);
}
51
-
52
/**
 * Builds an in-memory `IHttpClient` stub so the tests never touch the network.
 *
 * Requests are classified by their `Accept` header:
 * - anything that does not start with `image/` is a page fetch and is answered
 *   with `FIXTURE_HTML` as `text/html`;
 * - image requests are answered with fixture bytes: `LARGE_JPG` when
 *   `opts.useLargeImages` is set, `TINY_PNG` when the URL ends in `.png`
 *   (optionally followed by a query string), `SMALL_JPG` otherwise.
 *
 * Header names are matched case-insensitively, both when reading the request's
 * `Accept` header and in the stubbed response `headers.get`, because HTTP
 * header names are case-insensitive.
 *
 * @param opts.useLargeImages serve large.jpg (>32KB) for every image request
 *   to exercise disk-spill.
 * @returns a client whose `fetch` always resolves with a 200 response.
 */
function makeStubClient(opts: { useLargeImages?: boolean } = {}): IHttpClient {
  const PNG_URL = /\.png(\?|$)/i;

  /** Response headers exposing only `content-type`, looked up case-insensitively. */
  const headersFor = (contentType: string) => ({
    get: (name: string) => (name.toLowerCase() === "content-type" ? contentType : null),
  });

  return {
    async fetch(req) {
      const acceptEntry = Object.entries(req.headers ?? {}).find(
        ([name]) => name.toLowerCase() === "accept",
      );
      const accept = String(acceptEntry?.[1] ?? "");

      if (!accept.startsWith("image/")) {
        // Page fetch
        return {
          ok: true,
          status: 200,
          statusText: "OK",
          headers: headersFor("text/html"),
          text: async () => FIXTURE_HTML,
          arrayBuffer: async () => new ArrayBuffer(0),
        };
      }

      // Image fetch — pick fixture based on src extension / useLargeImages flag
      let bytes: Buffer;
      let mime: string;
      if (opts.useLargeImages) {
        bytes = LARGE_JPG;
        mime = "image/jpeg";
      } else if (PNG_URL.test(req.url)) {
        bytes = TINY_PNG;
        mime = "image/png";
      } else {
        bytes = SMALL_JPG;
        mime = "image/jpeg";
      }

      // A Buffer may be a view onto a larger ArrayBuffer, so copy out exactly
      // its own byte range instead of handing over `bytes.buffer` directly.
      const buf = bytes.buffer.slice(
        bytes.byteOffset,
        bytes.byteOffset + bytes.byteLength,
      ) as ArrayBuffer;

      return {
        ok: true,
        status: 200,
        statusText: "OK",
        headers: headersFor(mime),
        text: async () => "",
        arrayBuffer: async () => buf,
      };
    },
  };
}
102
-
103
- // ---------------------------------------------------------------------------
104
- // 1. Full spider() → DiskCache → reload roundtrip (small images, inline)
105
- // ---------------------------------------------------------------------------
106
-
107
describe("spider() → DiskCache roundtrip — small images (inline)", () => {
  const PAGE_URL = "https://example.com";

  /** Spiders the stubbed page with image capture switched on. */
  const spiderWithImages = () =>
    spider(PAGE_URL, { httpClient: makeStubClient(), captureImages: true });

  /** Opens a DiskCache on `cachePath` with a one-hour TTL and auto-flush disabled. */
  const openCache = (cachePath: string) =>
    new DiskCache(cachePath, { ttlMs: 60 * 60 * 1000, autoFlush: false });

  /**
   * Stores `page` through one DiskCache instance, flushes it, then reads the
   * entry back through a second instance opened on the same file.
   */
  const reloadFromDisk = (page: Parameters<DiskCache["set"]>[1]) => {
    const cachePath = makeCachePath();
    const writer = openCache(cachePath);
    writer.set(PAGE_URL, page);
    writer.flush();
    return openCache(cachePath).get(PAGE_URL);
  };

  it("images survive flush + reload with correct base64", async () => {
    const page = await spiderWithImages();

    expect(page.images).toBeDefined();
    expect(page.images!.length).toBeGreaterThan(0);

    const reloaded = reloadFromDisk(page);

    expect(reloaded).toBeDefined();
    expect(reloaded!.images).toBeDefined();
    expect(reloaded!.images!.length).toBe(page.images!.length);

    // Compare image by image; base64 is only compared where the original has one.
    page.images!.forEach((orig, index) => {
      const loaded = reloaded!.images![index];
      expect(loaded.src).toBe(orig.src);
      expect(loaded.mimeType).toBe(orig.mimeType);
      expect(loaded.alt).toBe(orig.alt);
      if (orig.base64) expect(loaded.base64).toBe(orig.base64);
    });
  });

  it("page text (markdown, chunks, title) also survives the roundtrip", async () => {
    const page = await spiderWithImages();
    const reloaded = reloadFromDisk(page);

    expect(reloaded!.title).toBe(page.title);
    expect(reloaded!.markdown).toBe(page.markdown);
    expect(reloaded!.chunks.length).toBe(page.chunks.length);
  });
});
159
-
160
- // ---------------------------------------------------------------------------
161
- // 2. Full spider() → DiskCache → reload roundtrip (large images, disk-spill)
162
- // ---------------------------------------------------------------------------
163
-
164
describe("spider() → DiskCache roundtrip — large images (disk-spill)", () => {
  it("large images are spilled to disk and hydrated on reload", async () => {
    const pageUrl = "https://example.com";
    const page = await spider(pageUrl, {
      httpClient: makeStubClient({ useLargeImages: true }),
      captureImages: true,
      maxImages: 3,
    });

    expect(page.images).toBeDefined();
    const cachePath = makeCachePath();

    // The stub serves LARGE_JPG for every fetched image here. The inline
    // threshold is additionally lowered to 100 bytes (decoded) so that the
    // images are expected to spill to disk rather than be stored inline.
    const openSpillingCache = () =>
      new DiskCache(cachePath, {
        ttlMs: 60 * 60 * 1000,
        autoFlush: false,
        inlineImageThreshold: 100,
      });

    const cache1 = openSpillingCache();
    cache1.set(pageUrl, page);
    cache1.flush();

    const cache2 = openSpillingCache();
    const reloaded = cache2.get(pageUrl);

    // Assert the entry exists first, so a cache miss is reported as a failed
    // expectation instead of a TypeError on the dereference below.
    expect(reloaded).toBeDefined();
    expect(reloaded!.images).toBeDefined();

    // Every image that was spilled (has a filePath) must have base64 after hydration.
    // NOTE(review): this loop passes vacuously if no image carries `filePath` —
    // confirm whether the test should also assert that at least one image spilled.
    for (const img of reloaded!.images!) {
      if (img.filePath) {
        expect(img.base64).toBeDefined();
        expect(img.base64!.length).toBeGreaterThan(0);
      }
    }
  });
});
201
-
202
- // ---------------------------------------------------------------------------
203
- // 3. LLM wire format — every image produces a valid data URL
204
- // ---------------------------------------------------------------------------
205
-
206
describe("LLM wire format", () => {
  /** Accepted `data:` URL prefixes for captured images. */
  const IMAGE_DATA_URL = /^data:image\/(jpeg|png|webp|gif|svg\+xml|avif);base64,/;

  /** Spiders the stubbed page with image capture switched on. */
  const spiderWithImages = () =>
    spider("https://example.com", { httpClient: makeStubClient(), captureImages: true });

  /** Formats an image's MIME type and base64 payload as a `data:` URL. */
  const toDataUrl = (img: { mimeType?: unknown; base64?: unknown }) =>
    `data:${img.mimeType};base64,${img.base64}`;

  it("every captured image yields a valid data: URL", async () => {
    const page = await spiderWithImages();

    expect(page.images!.length).toBeGreaterThan(0);

    // Images without a base64 payload are skipped rather than failed.
    for (const img of page.images!) {
      if (img.base64) expect(toDataUrl(img)).toMatch(IMAGE_DATA_URL);
    }
  });

  it("base64 in data URL decodes to non-empty binary", async () => {
    const page = await spiderWithImages();

    for (const img of page.images!) {
      if (img.base64) {
        const decoded = Buffer.from(img.base64, "base64");
        expect(decoded.byteLength).toBeGreaterThan(0);
      }
    }
  });

  it("data: URL images from fixture have correct inline base64", async () => {
    const page = await spiderWithImages();

    // The fixture contains one data: URL (1x1 PNG)
    const inlineImg = page.images!.find((candidate) => candidate.src.startsWith("data:"));
    expect(inlineImg).toBeDefined();
    expect(inlineImg!.mimeType).toBe("image/png");

    // NOTE(review): only the prefix is matched, so this would still pass if
    // `base64` were undefined — confirm whether the payload should be asserted.
    expect(toDataUrl(inlineImg!)).toMatch(/^data:image\/png;base64,/);
  });
});
250
-
251
- // ---------------------------------------------------------------------------
252
- // 4. captureImages: false — no images attached, cache roundtrip clean
253
- // ---------------------------------------------------------------------------
254
-
255
describe("captureImages: false — clean roundtrip", () => {
  const PAGE_URL = "https://example.com";

  /** Spiders the stubbed page without passing the `captureImages` option. */
  const spiderWithoutImages = () => spider(PAGE_URL, { httpClient: makeStubClient() });

  it("images field is absent on spider() result", async () => {
    const { images } = await spiderWithoutImages();
    expect(images).toBeUndefined();
  });

  it("cache roundtrip without images is clean", async () => {
    const page = await spiderWithoutImages();

    const cachePath = makeCachePath();
    const oneHourMs = 60 * 60 * 1000;

    // Write through one instance, read back through a fresh one on the same file.
    const writer = new DiskCache(cachePath, { ttlMs: oneHourMs, autoFlush: false });
    writer.set(PAGE_URL, page);
    writer.flush();

    const reader = new DiskCache(cachePath, { ttlMs: oneHourMs, autoFlush: false });
    const reloaded = reader.get(PAGE_URL);
    expect(reloaded!.images).toBeUndefined();
  });
});
278
-
279
- // ---------------------------------------------------------------------------
280
- // 5. PlaywrightHttpClient-shaped stub with captureImages
281
- // ---------------------------------------------------------------------------
282
-
283
describe("PlaywrightHttpClient captureImages integration", () => {
  it("PlaywrightHttpClient constructs with captureImages:true and satisfies IHttpClient", () => {
    // The annotation is the compile-time half of the check; the expectation is the runtime half.
    const client: IHttpClient = new PlaywrightHttpClient({ captureImages: true });
    expect(typeof client.fetch).toBe("function");
  });

  it("spider() with a Playwright-shaped stub and captureImages:true returns images", async () => {
    /** Builds a 200 response whose only header is `content-type`. */
    const okResponse = (contentType: string, body: string, bytes: () => ArrayBuffer) => ({
      ok: true,
      status: 200,
      statusText: "OK",
      headers: { get: (n: string) => (n === "content-type" ? contentType : null) },
      text: async () => body,
      arrayBuffer: async () => bytes(),
    });

    // Stand-in for what a Playwright client would return: HTML for page
    // fetches, SMALL_JPG bytes for anything requested with an image Accept header.
    const playwrightShapedStub: IHttpClient = {
      async fetch(req) {
        const accept = req.headers?.["Accept"] ?? "";
        if (!accept.startsWith("image/")) {
          return okResponse("text/html", FIXTURE_HTML, () => new ArrayBuffer(0));
        }

        const jpegBytes = SMALL_JPG.buffer.slice(
          SMALL_JPG.byteOffset,
          SMALL_JPG.byteOffset + SMALL_JPG.byteLength,
        ) as ArrayBuffer;
        return okResponse("image/jpeg", "", () => jpegBytes);
      },
    };

    const page = await spider("https://example.com", {
      httpClient: playwrightShapedStub,
      captureImages: true,
    });

    expect(page.images).toBeDefined();
    expect(page.images!.length).toBeGreaterThan(0);

    page.images!.forEach((img) => {
      if (img.base64) {
        expect(`data:${img.mimeType};base64,${img.base64}`).toMatch(/^data:image\//);
      }
    });
  });
});
335
-
336
- // ---------------------------------------------------------------------------
337
- // 6. maxImages respected end-to-end through cache
338
- // ---------------------------------------------------------------------------
339
-
340
describe("maxImages end-to-end", () => {
  it("maxImages:2 — only 2 images in cache after roundtrip", async () => {
    const pageUrl = "https://example.com";
    const imageCap = 2;

    const page = await spider(pageUrl, {
      httpClient: makeStubClient(),
      captureImages: true,
      maxImages: imageCap,
    });

    expect(page.images!.length).toBeLessThanOrEqual(imageCap);

    // Persist through one cache instance and reload through another; the cap
    // must still hold for the reloaded entry.
    const cachePath = makeCachePath();
    const oneHourMs = 60 * 60 * 1000;
    const writer = new DiskCache(cachePath, { ttlMs: oneHourMs, autoFlush: false });
    writer.set(pageUrl, page);
    writer.flush();

    const reader = new DiskCache(cachePath, { ttlMs: oneHourMs, autoFlush: false });
    const reloaded = reader.get(pageUrl);
    expect(reloaded!.images!.length).toBeLessThanOrEqual(imageCap);
  });
});
@@ -1,279 +0,0 @@
1
- /**
2
- * Tests for improvement tasks: JS degradation, chunk tokenBudget, sitemap discovery, disk cache.
3
- * Written before implementation — all should fail until code is in place.
4
- */
5
-
6
- import { describe, expect, it } from "vitest";
7
- import { spider } from "../src/spider.js";
8
- import { crawl } from "../src/crawl.js";
9
- import type { IHttpClient } from "../src/ports.js";
10
-
11
- // ---------------------------------------------------------------------------
12
- // Shared helpers
13
- // ---------------------------------------------------------------------------
14
-
15
/**
 * Builds an `IHttpClient` test double backed by a table of canned responses.
 *
 * A request is answered with the entry stored under its exact URL, falling
 * back to the `"*"` entry when there is none.
 *
 * @param responses map from URL (or `"*"` as a catch-all) to the status and
 *   body to serve; `status` defaults to 200.
 * @returns a client whose `fetch` resolves with the canned response. `ok` is
 *   true for 2xx statuses and `statusText` is `"OK"` exactly when `ok` is true,
 *   `"Error"` otherwise. Headers are always empty and `arrayBuffer()` is empty.
 * @throws Error from `fetch` (as a rejected promise) when neither the URL nor
 *   `"*"` has an entry.
 */
function mockClient(responses: Record<string, { status?: number; body: string }>): IHttpClient {
  return {
    fetch: async (req) => {
      const entry = responses[req.url] ?? responses["*"];
      if (!entry) throw new Error(`Unexpected fetch: ${req.url}`);

      const status = entry.status ?? 200;
      const ok = status >= 200 && status < 300;
      return {
        ok,
        status,
        // Derived from `ok` so that every 2xx status (not only 200) reports
        // "OK", keeping `ok` and `statusText` consistent with each other.
        statusText: ok ? "OK" : "Error",
        headers: { get: () => null },
        text: async () => entry.body,
        arrayBuffer: async () => new ArrayBuffer(0),
      };
    },
  };
}
32
-
33
/**
 * Wraps `body` in a minimal English HTML document: `title` is used for both
 * the `<title>` element and the article's `<h1>`, and `body` is inserted
 * verbatim (unescaped) after the heading inside `<article>`.
 */
const articleHtml = (title: string, body: string) =>
  [
    "<!DOCTYPE html>",
    '<html lang="en">',
    `<head><title>${title}</title><meta name="description" content="test"></head>`,
    `<body><article><h1>${title}</h1>${body}</article></body>`,
    "</html>",
  ].join("\n");

/** Article body with two long paragraphs separated by an `<h2>` heading. */
const LONG_BODY = [
  "<p>",
  "Word ".repeat(300),
  "</p><h2>Section</h2><p>",
  "More words. ".repeat(300),
  "</p>",
].join("");
40
-
41
- // ---------------------------------------------------------------------------
42
- // Graceful degradation on JS-rendered pages
43
- // ---------------------------------------------------------------------------
44
-
45
describe("JS-rendered pages degrade gracefully", () => {
  const PAGE_URL = "https://example.com";

  // Document whose body holds only an empty mount point and a script.
  const jsHtml = `<!DOCTYPE html><html><head><title>App</title></head>
<body><div id="root"></div><script>/* SPA */</script></body></html>`;

  /** Client that answers every URL with `body`. */
  const serving = (body: string) => mockClient({ "*": { body } });

  /** Reads the `jsRendered` flag through a cast, as the page type is not narrowed here. */
  const jsRenderedFlag = (page: unknown) => (page as { jsRendered?: boolean }).jsRendered;

  it("returns a page with jsRendered:true instead of throwing", async () => {
    const page = await spider(PAGE_URL, { httpClient: serving(jsHtml) });
    expect(jsRenderedFlag(page)).toBe(true);
  });

  it("still returns title and links from JS page", async () => {
    const html = `<!DOCTYPE html><html><head><title>My SPA</title></head>
<body><div id="root"></div><a href="/about">About</a></body></html>`;
    const { title, links } = await spider(PAGE_URL, { httpClient: serving(html) });
    expect(title).toContain("My SPA");
    expect(links.length).toBeGreaterThan(0);
  });

  it("returns empty chunks and markdown for JS page", async () => {
    const { chunks, markdown } = await spider(PAGE_URL, { httpClient: serving(jsHtml) });
    expect(chunks).toHaveLength(0);
    expect(markdown).toBe("");
  });

  it("lean view also degrades instead of throwing", async () => {
    const page = await spider(PAGE_URL, {
      httpClient: serving(jsHtml),
      view: "lean",
    });
    expect(jsRenderedFlag(page)).toBe(true);
  });
});
82
-
83
- // ---------------------------------------------------------------------------
84
- // Chunk-aware tokenBudget
85
- // ---------------------------------------------------------------------------
86
-
87
describe("chunk-aware tokenBudget", () => {
  /**
   * Spiders the long fixture article. `tokenBudget` is only passed to
   * `spider()` when one is given, so the no-budget call omits the key entirely.
   */
  const spiderLongArticle = (tokenBudget?: number) =>
    spider("https://example.com", {
      httpClient: mockClient({ "*": { body: articleHtml("Test", LONG_BODY) } }),
      ...(tokenBudget === undefined ? {} : { tokenBudget }),
    });

  it("returns whole chunks up to budget, not truncated mid-sentence", async () => {
    const page = await spiderLongArticle(100);

    // Should have at least one complete chunk
    expect(page.chunks.length).toBeGreaterThan(0);
    // Markdown should not end with truncation notice
    expect(page.markdown).not.toContain("truncated to ~");
    // No chunk may end in a word character followed by a hyphen.
    page.chunks.forEach((chunk) => {
      expect(chunk.text.trim()).not.toMatch(/\w-$/);
    });
  });

  it("total chunk text fits within budget (first chunk may overflow)", async () => {
    const budget = 100;
    const page = await spiderLongArticle(budget);

    let totalChars = 0;
    for (const chunk of page.chunks) totalChars += chunk.text.length;

    // The first chunk is always included even if it exceeds the budget, so it
    // is excluded; the remaining text is held to `budget * 4` characters.
    const firstChunkLen = page.chunks[0]?.text.length ?? 0;
    expect(totalChars - firstChunkLen).toBeLessThanOrEqual(budget * 4);
  });

  it("without budget, returns all chunks", async () => {
    const withBudget = await spiderLongArticle(50);
    const withoutBudget = await spiderLongArticle();
    expect(withBudget.chunks.length).toBeLessThan(withoutBudget.chunks.length);
  });
});
128
-
129
- // ---------------------------------------------------------------------------
130
- // Sitemap discovery
131
- // ---------------------------------------------------------------------------
132
-
133
describe("sitemap.xml seeds crawl frontier", () => {
  const ROOT_URL = "https://example.com";
  const SITEMAP_URL = "https://example.com/sitemap.xml";

  // Sitemap listing three pages under the root URL.
  const sitemapXml = `<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url><loc>https://example.com/page-a</loc></url>
<url><loc>https://example.com/page-b</loc></url>
<url><loc>https://example.com/page-c</loc></url>
</urlset>`;

  // NOTE(review): this repeats the opening "<p>" 20 times but closes it once —
  // confirm whether a single paragraph was intended.
  const pageHtml = articleHtml("Page", "<p>Content here. ".repeat(20) + "</p>");

  it("fetches sitemap.xml and includes those URLs in crawl", async () => {
    // NOTE(review): `visited` is filled by `onPage` but never asserted on —
    // confirm whether a check was intended.
    const visited: string[] = [];
    const client = mockClient({
      [ROOT_URL]: { body: pageHtml },
      [SITEMAP_URL]: { body: sitemapXml },
      "https://example.com/page-a": { body: pageHtml },
      "https://example.com/page-b": { body: pageHtml },
      "https://example.com/page-c": { body: pageHtml },
    });

    const result = await crawl(ROOT_URL, {
      httpClient: client,
      maxDepth: 0,
      maxPages: 10,
      useSitemap: true,
      onPage: (p) => visited.push(p.url),
    });

    for (const slug of ["page-a", "page-b", "page-c"]) {
      expect(result.pages.has(`https://example.com/${slug}`)).toBe(true);
    }
  });

  it("falls back to normal BFS when sitemap is missing (404)", async () => {
    const homeHtml = articleHtml("Home", '<p>Text. <a href="/about">About</a></p>');
    const client = mockClient({
      [ROOT_URL]: { body: homeHtml },
      [SITEMAP_URL]: { status: 404, body: "" },
      "https://example.com/about": { body: pageHtml },
    });

    // Reaching the expectation at all shows the 404 sitemap did not make crawl() throw.
    const result = await crawl(ROOT_URL, {
      httpClient: client,
      maxDepth: 1,
      useSitemap: true,
    });

    expect(result.pages.size).toBeGreaterThan(0);
  });

  it("sitemap disabled when useSitemap:false", async () => {
    // Set by the client as soon as any requested URL contains "sitemap".
    let sitemapRequested = false;
    const client: IHttpClient = {
      fetch: async (req) => {
        if (req.url.includes("sitemap")) sitemapRequested = true;
        return {
          ok: true,
          status: 200,
          statusText: "OK",
          headers: { get: () => null },
          text: async () => pageHtml,
          arrayBuffer: async () => new ArrayBuffer(0),
        };
      },
    };

    await crawl(ROOT_URL, {
      httpClient: client,
      maxDepth: 0,
      useSitemap: false,
    });

    expect(sitemapRequested).toBe(false);
  });
});
206
-
207
- // ---------------------------------------------------------------------------
208
- // Disk cache (tested via ICache contract)
209
- // ---------------------------------------------------------------------------
210
-
211
- import { DiskCache } from "../src/disk-cache.js";
212
- import { mkdtempSync, rmSync } from "node:fs";
213
- import { join } from "node:path";
214
- import { tmpdir } from "node:os";
215
- import type { SpideredPage } from "../src/types.js";
216
-
217
describe("DiskCache persists across instances", () => {
  /** Builds a minimal `SpideredPage` for `url` with fixed placeholder content. */
  function makePage(url: string): SpideredPage {
    return {
      url,
      domain: "example.com",
      fetchedAt: new Date().toISOString(),
      title: "Test",
      description: "desc",
      author: "",
      publishedAt: "",
      lang: "en",
      tags: [],
      wordCount: 10,
      readingTimeMinutes: 1,
      headings: [],
      chunks: [],
      links: [],
      markdown: "hello",
    };
  }

  /**
   * Runs `body` with the path of a `cache.json` file inside a fresh temp
   * directory, and removes that directory afterwards even if `body` throws
   * or rejects.
   */
  async function withTempCachePath(body: (path: string) => void | Promise<void>): Promise<void> {
    const dir = mkdtempSync(join(tmpdir(), "spider-cache-"));
    try {
      await body(join(dir, "cache.json"));
    } finally {
      rmSync(dir, { recursive: true });
    }
  }

  it("persists a page and retrieves it in a new instance", () =>
    withTempCachePath((path) => {
      const cache1 = new DiskCache(path);
      const page = makePage("https://example.com/a");
      cache1.set("https://example.com/a", page);
      cache1.flush();

      const cache2 = new DiskCache(path);
      const retrieved = cache2.get("https://example.com/a");
      expect(retrieved?.url).toBe("https://example.com/a");
      expect(retrieved?.markdown).toBe("hello");
    }));

  it("returns undefined for expired entries", () =>
    withTempCachePath(async (path) => {
      const cache = new DiskCache(path, { ttlMs: 1 }); // 1ms TTL
      cache.set("https://example.com/b", makePage("https://example.com/b"));
      cache.flush();

      // Let the 1ms TTL lapse with a timer instead of a busy-wait loop, so the
      // event loop is not blocked while waiting.
      await new Promise<void>((resolve) => setTimeout(resolve, 5));

      const cache2 = new DiskCache(path, { ttlMs: 1 });
      expect(cache2.get("https://example.com/b")).toBeUndefined();
    }));

  it("implements ICache interface", () =>
    withTempCachePath((path) => {
      const cache = new DiskCache(path);
      const page = makePage("https://example.com/c");
      expect(cache.has("https://example.com/c")).toBe(false);
      cache.set("https://example.com/c", page);
      expect(cache.has("https://example.com/c")).toBe(true);
      cache.delete("https://example.com/c");
      expect(cache.has("https://example.com/c")).toBe(false);
    }));
});