@dpopsuev/web-spider 0.10.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/batch.d.ts +24 -0
- package/dist/batch.d.ts.map +1 -0
- package/dist/batch.js +68 -0
- package/dist/cache.d.ts +40 -0
- package/dist/cache.d.ts.map +1 -0
- package/dist/cache.js +78 -0
- package/dist/convert.d.ts +29 -0
- package/dist/convert.d.ts.map +1 -0
- package/dist/convert.js +131 -0
- package/dist/crawl.d.ts +56 -0
- package/dist/crawl.d.ts.map +1 -0
- package/dist/crawl.js +126 -0
- package/dist/disk-cache.d.ts +75 -0
- package/dist/disk-cache.d.ts.map +1 -0
- package/dist/disk-cache.js +185 -0
- package/dist/graph.d.ts +76 -0
- package/dist/graph.d.ts.map +1 -0
- package/dist/graph.js +156 -0
- package/dist/index.d.ts +45 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +44 -0
- package/dist/parse.d.ts +27 -0
- package/dist/parse.d.ts.map +1 -0
- package/dist/parse.js +131 -0
- package/dist/playwright.d.ts +75 -0
- package/dist/playwright.d.ts.map +1 -0
- package/dist/playwright.js +141 -0
- package/dist/ports.d.ts +104 -0
- package/dist/ports.d.ts.map +1 -0
- package/dist/ports.js +10 -0
- package/dist/robots.d.ts +24 -0
- package/dist/robots.d.ts.map +1 -0
- package/dist/robots.js +104 -0
- package/dist/search.d.ts +47 -0
- package/dist/search.d.ts.map +1 -0
- package/dist/search.js +112 -0
- package/dist/sitemap.d.ts +15 -0
- package/dist/sitemap.d.ts.map +1 -0
- package/dist/sitemap.js +65 -0
- package/dist/spider.d.ts +74 -0
- package/dist/spider.d.ts.map +1 -0
- package/dist/spider.js +349 -0
- package/dist/throttle.d.ts +49 -0
- package/dist/throttle.d.ts.map +1 -0
- package/dist/throttle.js +85 -0
- package/dist/tree.d.ts +34 -0
- package/dist/tree.d.ts.map +1 -0
- package/dist/tree.js +354 -0
- package/dist/types.d.ts +189 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +2 -0
- package/dist/views.d.ts +17 -0
- package/dist/views.d.ts.map +1 -0
- package/dist/views.js +39 -0
- package/dist/web-search.d.ts +184 -0
- package/dist/web-search.d.ts.map +1 -0
- package/dist/web-search.js +399 -0
- package/fixtures/article-with-images.html +94 -0
- package/fixtures/gh-shell.html +32 -0
- package/fixtures/guide-ai-agents-web-scraping.json +552 -0
- package/fixtures/images/large.jpg +0 -0
- package/fixtures/images/small.jpg +0 -0
- package/fixtures/images/tiny.png +0 -0
- package/fixtures/quotes-index.json +40 -0
- package/package.json +47 -0
- package/scripts/fetch-guide.mjs +25 -0
- package/src/cache.ts +99 -0
- package/src/convert.ts +161 -0
- package/src/crawl.ts +186 -0
- package/src/disk-cache.ts +228 -0
- package/src/graph.ts +189 -0
- package/src/index.ts +74 -0
- package/src/parse.ts +154 -0
- package/src/playwright.ts +193 -0
- package/src/ports.ts +131 -0
- package/src/robots.ts +121 -0
- package/src/search.ts +173 -0
- package/src/sitemap.ts +67 -0
- package/src/spider.ts +475 -0
- package/src/throttle.ts +118 -0
- package/src/tree.ts +379 -0
- package/src/types.ts +225 -0
- package/src/views.ts +42 -0
- package/src/web-search.ts +548 -0
- package/test/convert-images.test.ts +69 -0
- package/test/disk-cache-images.test.ts +193 -0
- package/test/engine-registry.test.ts +114 -0
- package/test/exports.test.ts +124 -0
- package/test/get-chunk.test.ts +115 -0
- package/test/images-integration.test.ts +359 -0
- package/test/improvements.test.ts +279 -0
- package/test/inbound-count.test.ts +111 -0
- package/test/lean.test.ts +105 -0
- package/test/playwright.test.ts +128 -0
- package/test/ports.test.ts +161 -0
- package/test/search.test.ts +219 -0
- package/test/spider-images.test.ts +180 -0
- package/test/spider-unit.test.ts +610 -0
- package/test/tree.test.ts +272 -0
- package/test/types.test.ts +169 -0
- package/test/web-search-integration.test.ts +180 -0
- package/test/web-search.test.ts +305 -0
- package/tsconfig.json +9 -0
- package/tsconfig.test.json +7 -0
- package/vitest.config.ts +8 -0
|
@@ -0,0 +1,359 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* End-to-end captureImages integration tests.
|
|
3
|
+
*
|
|
4
|
+
* Covers the full pipeline:
|
|
5
|
+
* spider() → SpideredPage.images → DiskCache.flush() → DiskCache.get()
|
|
6
|
+
* → LLM data URL → PlaywrightHttpClient-shaped stub
|
|
7
|
+
*
|
|
8
|
+
* No real network, no real browser.
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import { mkdirSync, rmSync } from "node:fs";
|
|
12
|
+
import { tmpdir } from "node:os";
|
|
13
|
+
import { join } from "node:path";
|
|
14
|
+
import { readFileSync } from "node:fs";
|
|
15
|
+
import { afterEach, beforeEach, describe, expect, it } from "vitest";
|
|
16
|
+
import { DiskCache } from "../src/disk-cache.js";
|
|
17
|
+
import { PlaywrightHttpClient } from "../src/playwright.js";
|
|
18
|
+
import type { IHttpClient } from "../src/ports.js";
|
|
19
|
+
import { spider } from "../src/spider.js";
|
|
20
|
+
|
|
21
|
+
// ---------------------------------------------------------------------------
|
|
22
|
+
// Fixtures
|
|
23
|
+
// ---------------------------------------------------------------------------
|
|
24
|
+
|
|
25
|
+
/** Resolves a path below the package's `fixtures/` directory, relative to this test file. */
const fixturePath = (relative: string): string =>
  join(import.meta.dirname, "../fixtures", relative);

// Fixture assets are read synchronously, once, when the module is loaded.
/** HTML document returned by the stub clients for page requests. */
const FIXTURE_HTML = readFileSync(fixturePath("article-with-images.html"), "utf8");
/** Raw image bytes returned by the stub clients for image requests. */
const SMALL_JPG = readFileSync(fixturePath("images/small.jpg"));
const TINY_PNG = readFileSync(fixturePath("images/tiny.png"));
const LARGE_JPG = readFileSync(fixturePath("images/large.jpg"));
|
|
32
|
+
|
|
33
|
+
// ---------------------------------------------------------------------------
|
|
34
|
+
// Helpers
|
|
35
|
+
// ---------------------------------------------------------------------------
|
|
36
|
+
|
|
37
|
+
/** Scratch directory for the test currently running; reassigned before every test. */
let testDir: string;

// Give each test its own directory under the OS temp dir so cache files never collide.
beforeEach(() => {
  const uniqueSuffix = `${Date.now()}-${Math.random().toString(36).slice(2)}`;
  testDir = join(tmpdir(), `wbs-integration-${uniqueSuffix}`);
  mkdirSync(testDir, { recursive: true });
});

// Remove the scratch directory and everything in it; `force` tolerates a missing directory.
afterEach(() => {
  rmSync(testDir, { recursive: true, force: true });
});
|
|
47
|
+
|
|
48
|
+
/**
 * Returns the cache-file path inside the current test's scratch directory.
 * Must be called from within a test, after `beforeEach` has assigned `testDir`.
 */
function makeCachePath(): string {
  const cacheFile = join(testDir, "pages.json");
  return cacheFile;
}
|
|
51
|
+
|
|
52
|
+
/**
 * Builds an in-memory `IHttpClient` so these tests never touch the network.
 *
 * Requests whose `Accept` header starts with `image/` are answered with fixture
 * image bytes; every other request is answered with the fixture HTML page.
 * Header names are matched case-insensitively on both the request side
 * (`Accept`) and the response side (`content-type`), as HTTP header names are.
 *
 * @param opts.useLargeImages When true, every image request is answered with
 *   large.jpg (>32KB) to exercise disk-spill, regardless of the requested URL.
 * @returns A stub client whose `fetch` always resolves with a 200 response.
 */
function makeStubClient(opts: { useLargeImages?: boolean } = {}): IHttpClient {
  // Shared shape of every response the stub produces; only content differs.
  const okResponse = (contentType: string, text: string, body: ArrayBuffer) => ({
    ok: true,
    status: 200,
    statusText: "OK",
    headers: {
      get: (name: string) => (name.toLowerCase() === "content-type" ? contentType : null),
    },
    text: async () => text,
    arrayBuffer: async () => body,
  });

  return {
    async fetch(req) {
      // Find the Accept header regardless of how the caller cased its name.
      const acceptEntry = Object.entries(req.headers ?? {}).find(
        ([name]) => name.toLowerCase() === "accept",
      );
      const isImageReq = (acceptEntry?.[1] ?? "").startsWith("image/");

      if (!isImageReq) {
        // Page fetch
        return okResponse("text/html", FIXTURE_HTML, new ArrayBuffer(0));
      }

      // Image fetch — pick fixture based on src extension / useLargeImages flag
      const src = req.url;
      let bytes: Buffer;
      let mime: string;

      if (opts.useLargeImages) {
        bytes = LARGE_JPG;
        mime = "image/jpeg";
      } else if (src.match(/\.png(\?|$)/i)) {
        bytes = TINY_PNG;
        mime = "image/png";
      } else {
        bytes = SMALL_JPG;
        mime = "image/jpeg";
      }

      // Copy exactly this Buffer's bytes out of its backing ArrayBuffer, which
      // may be larger than the Buffer itself.
      const buf = bytes.buffer.slice(bytes.byteOffset, bytes.byteOffset + bytes.byteLength) as ArrayBuffer;
      return okResponse(mime, "", buf);
    },
  };
}
|
|
102
|
+
|
|
103
|
+
// ---------------------------------------------------------------------------
|
|
104
|
+
// 1. Full spider() → DiskCache → reload roundtrip (small images, inline)
|
|
105
|
+
// ---------------------------------------------------------------------------
|
|
106
|
+
|
|
107
|
+
describe("spider() → DiskCache roundtrip — small images (inline)", () => {
  /**
   * Stores `page` through one DiskCache instance, flushes it, then reads it
   * back through a second instance pointed at the same file.
   */
  function reloadThroughDisk(page: Parameters<DiskCache["set"]>[1]) {
    const cachePath = makeCachePath();

    const writer = new DiskCache(cachePath, { ttlMs: 60 * 60 * 1000, autoFlush: false });
    writer.set("https://example.com", page);
    writer.flush();

    const reader = new DiskCache(cachePath, { ttlMs: 60 * 60 * 1000, autoFlush: false });
    return reader.get("https://example.com");
  }

  it("images survive flush + reload with correct base64", async () => {
    const page = await spider("https://example.com", {
      httpClient: makeStubClient(),
      captureImages: true,
    });

    expect(page.images).toBeDefined();
    expect(page.images!.length).toBeGreaterThan(0);

    const reloaded = reloadThroughDisk(page);

    expect(reloaded).toBeDefined();
    expect(reloaded!.images).toBeDefined();
    expect(reloaded!.images!.length).toBe(page.images!.length);

    // Compare image by image; base64 is only compared when the original carried one.
    page.images!.forEach((orig, index) => {
      const loaded = reloaded!.images![index];
      expect(loaded.src).toBe(orig.src);
      expect(loaded.mimeType).toBe(orig.mimeType);
      expect(loaded.alt).toBe(orig.alt);
      if (orig.base64) {
        expect(loaded.base64).toBe(orig.base64);
      }
    });
  });

  it("page text (markdown, chunks, title) also survives the roundtrip", async () => {
    const page = await spider("https://example.com", {
      httpClient: makeStubClient(),
      captureImages: true,
    });

    const reloaded = reloadThroughDisk(page);

    expect(reloaded!.title).toBe(page.title);
    expect(reloaded!.markdown).toBe(page.markdown);
    // Only the chunk count is compared, not chunk contents.
    expect(reloaded!.chunks.length).toBe(page.chunks.length);
  });
});
|
|
159
|
+
|
|
160
|
+
// ---------------------------------------------------------------------------
|
|
161
|
+
// 2. Full spider() → DiskCache → reload roundtrip (large images, disk-spill)
|
|
162
|
+
// ---------------------------------------------------------------------------
|
|
163
|
+
|
|
164
|
+
describe("spider() → DiskCache roundtrip — large images (disk-spill)", () => {
  it("large images are spilled to disk and hydrated on reload", async () => {
    // The stub answers every image request with large.jpg; at most 3 images are requested.
    const page = await spider("https://example.com", {
      httpClient: makeStubClient({ useLargeImages: true }),
      captureImages: true,
      maxImages: 3,
    });

    expect(page.images).toBeDefined();
    const cachePath = makeCachePath();

    // Use a very low inline threshold (100 bytes decoded) so the large.jpg
    // images served by the stub are well past it.
    const cache1 = new DiskCache(cachePath, {
      ttlMs: 60 * 60 * 1000,
      autoFlush: false,
      inlineImageThreshold: 100,
    });
    cache1.set("https://example.com", page);
    cache1.flush();

    const cache2 = new DiskCache(cachePath, {
      ttlMs: 60 * 60 * 1000,
      autoFlush: false,
      inlineImageThreshold: 100,
    });
    const reloaded = cache2.get("https://example.com");

    // Fail with a clear message instead of a TypeError if the entry is missing.
    expect(reloaded).toBeDefined();
    expect(reloaded!.images).toBeDefined();

    // Every image that reports a file path must have base64 again after hydration.
    // NOTE(review): if no reloaded image carries `filePath`, this loop asserts
    // nothing — consider also asserting that at least one image was spilled,
    // once the DiskCache reload contract for `filePath` is confirmed.
    for (const img of reloaded!.images!) {
      if (img.filePath) {
        expect(img.base64).toBeDefined();
        expect(img.base64!.length).toBeGreaterThan(0);
      }
    }
  });
});
|
|
201
|
+
|
|
202
|
+
// ---------------------------------------------------------------------------
|
|
203
|
+
// 3. LLM wire format — every image produces a valid data URL
|
|
204
|
+
// ---------------------------------------------------------------------------
|
|
205
|
+
|
|
206
|
+
describe("LLM wire format", () => {
  it("every captured image yields a valid data: URL", async () => {
    const page = await spider("https://example.com", {
      httpClient: makeStubClient(),
      captureImages: true,
    });

    expect(page.images!.length).toBeGreaterThan(0);

    for (const img of page.images!) {
      // Images without base64 cannot be turned into a data URL; skip them.
      if (!img.base64) continue;
      const dataUrl = `data:${img.mimeType};base64,${img.base64}`;
      expect(dataUrl).toMatch(/^data:image\/(jpeg|png|webp|gif|svg\+xml|avif);base64,/);
    }
  });

  it("base64 in data URL decodes to non-empty binary", async () => {
    const page = await spider("https://example.com", {
      httpClient: makeStubClient(),
      captureImages: true,
    });

    for (const img of page.images!) {
      if (!img.base64) continue;
      const decoded = Buffer.from(img.base64, "base64");
      expect(decoded.byteLength).toBeGreaterThan(0);
    }
  });

  it("data: URL images from fixture have correct inline base64", async () => {
    const page = await spider("https://example.com", {
      httpClient: makeStubClient(),
      captureImages: true,
    });

    // The fixture contains one data: URL (1x1 PNG)
    const inlineImg = page.images!.find((i) => i.src.startsWith("data:"));
    expect(inlineImg).toBeDefined();
    expect(inlineImg!.mimeType).toBe("image/png");

    // Without this check the data-URL match below would also pass for a missing
    // payload, because the template would render "...;base64,undefined".
    expect(inlineImg!.base64).toBeTruthy();

    const dataUrl = `data:${inlineImg!.mimeType};base64,${inlineImg!.base64}`;
    expect(dataUrl).toMatch(/^data:image\/png;base64,/);
  });
});
|
|
250
|
+
|
|
251
|
+
// ---------------------------------------------------------------------------
|
|
252
|
+
// 4. captureImages: false — no images attached, cache roundtrip clean
|
|
253
|
+
// ---------------------------------------------------------------------------
|
|
254
|
+
|
|
255
|
+
describe("captureImages: false — clean roundtrip", () => {
  it("images field is absent on spider() result", async () => {
    // `captureImages` is deliberately left out of the options.
    const options = { httpClient: makeStubClient() };
    const page = await spider("https://example.com", options);

    expect(page.images).toBeUndefined();
  });

  it("cache roundtrip without images is clean", async () => {
    const page = await spider("https://example.com", { httpClient: makeStubClient() });
    const cachePath = makeCachePath();

    // Write through one instance ...
    const writer = new DiskCache(cachePath, { ttlMs: 60 * 60 * 1000, autoFlush: false });
    writer.set("https://example.com", page);
    writer.flush();

    // ... and read back through a second one bound to the same file.
    const reader = new DiskCache(cachePath, { ttlMs: 60 * 60 * 1000, autoFlush: false });
    const reloaded = reader.get("https://example.com");

    expect(reloaded!.images).toBeUndefined();
  });
});
|
|
278
|
+
|
|
279
|
+
// ---------------------------------------------------------------------------
|
|
280
|
+
// 5. PlaywrightHttpClient-shaped stub with captureImages
|
|
281
|
+
// ---------------------------------------------------------------------------
|
|
282
|
+
|
|
283
|
+
describe("PlaywrightHttpClient captureImages integration", () => {
  it("PlaywrightHttpClient constructs with captureImages:true and satisfies IHttpClient", () => {
    // Construction only — `fetch` is never invoked in this test.
    const client: IHttpClient = new PlaywrightHttpClient({ captureImages: true });

    expect(typeof client.fetch).toBe("function");
  });

  it("spider() with a Playwright-shaped stub and captureImages:true returns images", async () => {
    // Stand-in for PlaywrightHttpClient: fixture HTML for page requests,
    // small.jpg bytes for anything requested with an image/* Accept header.
    const playwrightShapedStub: IHttpClient = {
      async fetch(req) {
        const accept = req.headers?.["Accept"] ?? "";

        if (accept.startsWith("image/")) {
          // Copy exactly the Buffer's own bytes out of its backing ArrayBuffer.
          const start = SMALL_JPG.byteOffset;
          const jpegBytes = SMALL_JPG.buffer.slice(start, start + SMALL_JPG.byteLength) as ArrayBuffer;
          return {
            ok: true,
            status: 200,
            statusText: "OK",
            headers: { get: (n) => (n === "content-type" ? "image/jpeg" : null) },
            text: async () => "",
            arrayBuffer: async () => jpegBytes,
          };
        }

        return {
          ok: true,
          status: 200,
          statusText: "OK",
          headers: { get: (n) => (n === "content-type" ? "text/html" : null) },
          text: async () => FIXTURE_HTML,
          arrayBuffer: async () => new ArrayBuffer(0),
        };
      },
    };

    const page = await spider("https://example.com", {
      httpClient: playwrightShapedStub,
      captureImages: true,
    });

    expect(page.images).toBeDefined();
    expect(page.images!.length).toBeGreaterThan(0);

    // Only images that carry base64 can be rendered as a data URL.
    page.images!
      .filter((img) => img.base64)
      .forEach((img) => {
        expect(`data:${img.mimeType};base64,${img.base64}`).toMatch(/^data:image\//);
      });
  });
});
|
|
335
|
+
|
|
336
|
+
// ---------------------------------------------------------------------------
|
|
337
|
+
// 6. maxImages respected end-to-end through cache
|
|
338
|
+
// ---------------------------------------------------------------------------
|
|
339
|
+
|
|
340
|
+
describe("maxImages end-to-end", () => {
  it("maxImages:2 — only 2 images in cache after roundtrip", async () => {
    const imageLimit = 2;
    const page = await spider("https://example.com", {
      httpClient: makeStubClient(),
      captureImages: true,
      maxImages: imageLimit,
    });

    // The limit is an upper bound: fewer images than the limit also passes.
    expect(page.images!.length).toBeLessThanOrEqual(2);

    const cachePath = makeCachePath();
    const writer = new DiskCache(cachePath, { ttlMs: 60 * 60 * 1000, autoFlush: false });
    writer.set("https://example.com", page);
    writer.flush();

    // A second instance on the same file must not report more images either.
    const reader = new DiskCache(cachePath, { ttlMs: 60 * 60 * 1000, autoFlush: false });
    const reloaded = reader.get("https://example.com");

    expect(reloaded!.images!.length).toBeLessThanOrEqual(2);
  });
});
|
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tests for improvement tasks: JS degradation, chunk tokenBudget, sitemap discovery, disk cache.
|
|
3
|
+
 * Originally written test-first: each test fails until the corresponding implementation is in place.
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import { describe, expect, it } from "vitest";
|
|
7
|
+
import { spider } from "../src/spider.js";
|
|
8
|
+
import { crawl } from "../src/crawl.js";
|
|
9
|
+
import type { IHttpClient } from "../src/ports.js";
|
|
10
|
+
|
|
11
|
+
// ---------------------------------------------------------------------------
|
|
12
|
+
// Shared helpers
|
|
13
|
+
// ---------------------------------------------------------------------------
|
|
14
|
+
|
|
15
|
+
/**
 * Builds an `IHttpClient` that answers from a fixed URL → response table.
 *
 * @param responses Map from exact request URL to the response to serve. The
 *   special key `"*"` is the fallback for URLs without their own entry.
 *   `status` defaults to 200 when omitted.
 * @returns A client whose `fetch` resolves with the matching canned response;
 *   `ok` and `statusText` are both derived from the same 2xx check, so any
 *   2xx status reports `"OK"` and everything else `"Error"`.
 * @throws Error (as a rejected promise from `fetch`) when a URL has neither
 *   its own entry nor a `"*"` fallback.
 */
function mockClient(responses: Record<string, { status?: number; body: string }>): IHttpClient {
  return {
    fetch: async (req) => {
      const entry = responses[req.url] ?? responses["*"];
      if (!entry) throw new Error(`Unexpected fetch: ${req.url}`);

      const status = entry.status ?? 200;
      const ok = status >= 200 && status < 300;
      return {
        ok,
        status,
        // Keep statusText consistent with `ok` for every 2xx status, not just 200.
        statusText: ok ? "OK" : "Error",
        // This mock exposes no response headers at all.
        headers: { get: () => null },
        text: async () => entry.body,
        arrayBuffer: async () => new ArrayBuffer(0),
      };
    },
  };
}
|
|
32
|
+
|
|
33
|
+
/**
 * Renders a minimal English HTML document with an `<article>` wrapper.
 * `title` is used for both `<title>` and the `<h1>`; `body` is inserted as raw
 * HTML after the heading. Neither value is escaped.
 */
const articleHtml = (title: string, body: string) => {
  const head = `<head><title>${title}</title><meta name="description" content="test"></head>`;
  const content = `<body><article><h1>${title}</h1>${body}</article></body>`;
  return `<!DOCTYPE html>\n<html lang="en">\n${head}\n${content}\n</html>`;
};
|
|
38
|
+
|
|
39
|
+
// Article body used by the tokenBudget tests: a 300-word paragraph, a section
// heading, and a second paragraph of 300 repeated two-word sentences.
const LONG_BODY = [
  `<p>${"Word ".repeat(300)}</p>`,
  "<h2>Section</h2>",
  `<p>${"More words. ".repeat(300)}</p>`,
].join("");
|
|
40
|
+
|
|
41
|
+
// ---------------------------------------------------------------------------
|
|
42
|
+
// Graceful degradation on JS-rendered pages
|
|
43
|
+
// ---------------------------------------------------------------------------
|
|
44
|
+
|
|
45
|
+
describe("JS-rendered pages degrade gracefully", () => {
  // A page whose body is only an empty mount point plus a script tag.
  const jsHtml =
    "<!DOCTYPE html><html><head><title>App</title></head>\n" +
    '<body><div id="root"></div><script>/* SPA */</script></body></html>';

  /** Client that serves `body` for every URL. */
  const serveEverywhere = (body: string) => mockClient({ "*": { body } });

  it("returns a page with jsRendered:true instead of throwing", async () => {
    const page = await spider("https://example.com", {
      httpClient: serveEverywhere(jsHtml),
    });

    // `jsRendered` is read through a structural cast rather than the page type.
    const { jsRendered } = page as { jsRendered?: boolean };
    expect(jsRendered).toBe(true);
  });

  it("still returns title and links from JS page", async () => {
    const html =
      "<!DOCTYPE html><html><head><title>My SPA</title></head>\n" +
      '<body><div id="root"></div><a href="/about">About</a></body></html>';

    const page = await spider("https://example.com", {
      httpClient: serveEverywhere(html),
    });

    expect(page.title).toContain("My SPA");
    expect(page.links.length).toBeGreaterThan(0);
  });

  it("returns empty chunks and markdown for JS page", async () => {
    const page = await spider("https://example.com", {
      httpClient: serveEverywhere(jsHtml),
    });

    expect(page.chunks).toHaveLength(0);
    expect(page.markdown).toBe("");
  });

  it("lean view also degrades instead of throwing", async () => {
    const page = await spider("https://example.com", {
      httpClient: serveEverywhere(jsHtml),
      view: "lean",
    });

    const { jsRendered } = page as { jsRendered?: boolean };
    expect(jsRendered).toBe(true);
  });
});
|
|
82
|
+
|
|
83
|
+
// ---------------------------------------------------------------------------
|
|
84
|
+
// Chunk-aware tokenBudget
|
|
85
|
+
// ---------------------------------------------------------------------------
|
|
86
|
+
|
|
87
|
+
describe("chunk-aware tokenBudget", () => {
  /** Fresh client per call, serving the same long article for every URL. */
  const longArticleClient = () => mockClient({ "*": { body: articleHtml("Test", LONG_BODY) } });

  it("returns whole chunks up to budget, not truncated mid-sentence", async () => {
    const page = await spider("https://example.com", {
      httpClient: longArticleClient(),
      tokenBudget: 100,
    });

    // At least one complete chunk must come back.
    expect(page.chunks.length).toBeGreaterThan(0);
    // The markdown must not carry a truncation notice.
    expect(page.markdown).not.toContain("truncated to ~");
    // No chunk may end in a word character followed by a hyphen (a mid-word cut).
    page.chunks.forEach((chunk) => {
      expect(chunk.text.trim()).not.toMatch(/\w-$/);
    });
  });

  it("total chunk text fits within budget (first chunk may overflow)", async () => {
    const budget = 100;
    const page = await spider("https://example.com", {
      httpClient: longArticleClient(),
      tokenBudget: budget,
    });

    // The first chunk is always included even if it exceeds the budget, so
    // only the chunks after it are measured. The bound allows 4 characters
    // per budget unit.
    const charsAfterFirst = page.chunks
      .slice(1)
      .reduce((sum, chunk) => sum + chunk.text.length, 0);

    expect(charsAfterFirst).toBeLessThanOrEqual(budget * 4);
  });

  it("without budget, returns all chunks", async () => {
    const withBudget = await spider("https://example.com", {
      httpClient: longArticleClient(),
      tokenBudget: 50,
    });
    const withoutBudget = await spider("https://example.com", {
      httpClient: longArticleClient(),
    });

    // The budgeted result must contain strictly fewer chunks than the unbudgeted one.
    expect(withBudget.chunks.length).toBeLessThan(withoutBudget.chunks.length);
  });
});
|
|
128
|
+
|
|
129
|
+
// ---------------------------------------------------------------------------
|
|
130
|
+
// Sitemap discovery
|
|
131
|
+
// ---------------------------------------------------------------------------
|
|
132
|
+
|
|
133
|
+
describe("sitemap.xml seeds crawl frontier", () => {
  // Sitemap listing three pages that are not linked from the home page fixture.
  const sitemapXml = `<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url><loc>https://example.com/page-a</loc></url>
<url><loc>https://example.com/page-b</loc></url>
<url><loc>https://example.com/page-c</loc></url>
</urlset>`;

  // One well-formed paragraph of repeated text. The opening tag stays outside
  // `repeat` so it is emitted once, matching the single closing tag.
  const pageHtml = articleHtml("Page", "<p>" + "Content here. ".repeat(20) + "</p>");

  it("fetches sitemap.xml and includes those URLs in crawl", async () => {
    // NOTE(review): `visited` is filled by `onPage` but never asserted on;
    // confirm whether an assertion on the visited URLs was intended.
    const visited: string[] = [];
    const client = mockClient({
      "https://example.com": { body: pageHtml },
      "https://example.com/sitemap.xml": { body: sitemapXml },
      "https://example.com/page-a": { body: pageHtml },
      "https://example.com/page-b": { body: pageHtml },
      "https://example.com/page-c": { body: pageHtml },
    });

    // maxDepth 0 with a home page that links nowhere: the three pages can
    // only be reached through the sitemap.
    const result = await crawl("https://example.com", {
      httpClient: client,
      maxDepth: 0,
      maxPages: 10,
      useSitemap: true,
      onPage: (p) => visited.push(p.url),
    });

    expect(result.pages.has("https://example.com/page-a")).toBe(true);
    expect(result.pages.has("https://example.com/page-b")).toBe(true);
    expect(result.pages.has("https://example.com/page-c")).toBe(true);
  });

  it("falls back to normal BFS when sitemap is missing (404)", async () => {
    const client = mockClient({
      "https://example.com": { body: articleHtml("Home", '<p>Text. <a href="/about">About</a></p>') },
      "https://example.com/sitemap.xml": { status: 404, body: "" },
      "https://example.com/about": { body: pageHtml },
    });

    // Must resolve rather than reject even though the sitemap request 404s.
    const result = await crawl("https://example.com", {
      httpClient: client,
      maxDepth: 1,
      useSitemap: true,
    });

    expect(result.pages.size).toBeGreaterThan(0);
  });

  it("sitemap disabled when useSitemap:false", async () => {
    // Flipped by the client as soon as any URL containing "sitemap" is requested.
    const sitemapFetched = { value: false };
    const client: IHttpClient = {
      fetch: async (req) => {
        if (req.url.includes("sitemap")) sitemapFetched.value = true;
        return {
          ok: true,
          status: 200,
          statusText: "OK",
          headers: { get: () => null },
          text: async () => pageHtml,
          arrayBuffer: async () => new ArrayBuffer(0),
        };
      },
    };

    await crawl("https://example.com", {
      httpClient: client,
      maxDepth: 0,
      useSitemap: false,
    });

    expect(sitemapFetched.value).toBe(false);
  });
});
|
|
206
|
+
|
|
207
|
+
// ---------------------------------------------------------------------------
|
|
208
|
+
// Disk cache (tested via ICache contract)
|
|
209
|
+
// ---------------------------------------------------------------------------
|
|
210
|
+
|
|
211
|
+
import { DiskCache } from "../src/disk-cache.js";
|
|
212
|
+
import { mkdtempSync, rmSync } from "node:fs";
|
|
213
|
+
import { join } from "node:path";
|
|
214
|
+
import { tmpdir } from "node:os";
|
|
215
|
+
import type { SpideredPage } from "../src/types.js";
|
|
216
|
+
|
|
217
|
+
describe("DiskCache persists across instances", () => {
  /** Builds a minimal page for `url`; every other field is a fixed placeholder. */
  function makePage(url: string): SpideredPage {
    const page: SpideredPage = {
      url,
      domain: "example.com",
      fetchedAt: new Date().toISOString(),
      title: "Test",
      description: "desc",
      author: "",
      publishedAt: "",
      lang: "en",
      tags: [],
      wordCount: 10,
      readingTimeMinutes: 1,
      headings: [],
      chunks: [],
      links: [],
      markdown: "hello",
    };
    return page;
  }

  /**
   * Runs `body` with a cache-file path inside a freshly created temp directory
   * and removes that directory afterwards, whether or not `body` throws.
   */
  function withCacheFile(body: (path: string) => void): void {
    const dir = mkdtempSync(join(tmpdir(), "spider-cache-"));
    const path = join(dir, "cache.json");
    try {
      body(path);
    } finally {
      rmSync(dir, { recursive: true });
    }
  }

  it("persists a page and retrieves it in a new instance", () => {
    withCacheFile((path) => {
      const writer = new DiskCache(path);
      writer.set("https://example.com/a", makePage("https://example.com/a"));
      writer.flush();

      // A second instance on the same file must see the flushed entry.
      const reader = new DiskCache(path);
      const retrieved = reader.get("https://example.com/a");
      expect(retrieved?.url).toBe("https://example.com/a");
      expect(retrieved?.markdown).toBe("hello");
    });
  });

  it("returns undefined for expired entries", () => {
    withCacheFile((path) => {
      const cache = new DiskCache(path, { ttlMs: 1 }); // 1ms TTL
      cache.set("https://example.com/b", makePage("https://example.com/b"));
      cache.flush();

      // Busy-wait 5ms (the test is synchronous) so the 1ms TTL has elapsed.
      const deadline = Date.now() + 5;
      while (Date.now() < deadline) {
        /* spin */
      }

      const reopened = new DiskCache(path, { ttlMs: 1 });
      expect(reopened.get("https://example.com/b")).toBeUndefined();
    });
  });

  it("implements ICache interface", () => {
    withCacheFile((path) => {
      const key = "https://example.com/c";
      const cache = new DiskCache(path);
      const page = makePage(key);

      // has → set → has → delete → has, all on one instance without flushing.
      expect(cache.has("https://example.com/c")).toBe(false);
      cache.set("https://example.com/c", page);
      expect(cache.has("https://example.com/c")).toBe(true);
      cache.delete("https://example.com/c");
      expect(cache.has("https://example.com/c")).toBe(false);
    });
  });
});
|