@dpopsuev/web-spider 0.10.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. package/dist/batch.d.ts +24 -0
  2. package/dist/batch.d.ts.map +1 -0
  3. package/dist/batch.js +68 -0
  4. package/dist/cache.d.ts +40 -0
  5. package/dist/cache.d.ts.map +1 -0
  6. package/dist/cache.js +78 -0
  7. package/dist/convert.d.ts +29 -0
  8. package/dist/convert.d.ts.map +1 -0
  9. package/dist/convert.js +131 -0
  10. package/dist/crawl.d.ts +56 -0
  11. package/dist/crawl.d.ts.map +1 -0
  12. package/dist/crawl.js +126 -0
  13. package/dist/disk-cache.d.ts +75 -0
  14. package/dist/disk-cache.d.ts.map +1 -0
  15. package/dist/disk-cache.js +185 -0
  16. package/dist/graph.d.ts +76 -0
  17. package/dist/graph.d.ts.map +1 -0
  18. package/dist/graph.js +156 -0
  19. package/dist/index.d.ts +45 -0
  20. package/dist/index.d.ts.map +1 -0
  21. package/dist/index.js +44 -0
  22. package/dist/parse.d.ts +27 -0
  23. package/dist/parse.d.ts.map +1 -0
  24. package/dist/parse.js +131 -0
  25. package/dist/playwright.d.ts +75 -0
  26. package/dist/playwright.d.ts.map +1 -0
  27. package/dist/playwright.js +141 -0
  28. package/dist/ports.d.ts +104 -0
  29. package/dist/ports.d.ts.map +1 -0
  30. package/dist/ports.js +10 -0
  31. package/dist/robots.d.ts +24 -0
  32. package/dist/robots.d.ts.map +1 -0
  33. package/dist/robots.js +104 -0
  34. package/dist/search.d.ts +47 -0
  35. package/dist/search.d.ts.map +1 -0
  36. package/dist/search.js +112 -0
  37. package/dist/sitemap.d.ts +15 -0
  38. package/dist/sitemap.d.ts.map +1 -0
  39. package/dist/sitemap.js +65 -0
  40. package/dist/spider.d.ts +74 -0
  41. package/dist/spider.d.ts.map +1 -0
  42. package/dist/spider.js +349 -0
  43. package/dist/throttle.d.ts +49 -0
  44. package/dist/throttle.d.ts.map +1 -0
  45. package/dist/throttle.js +85 -0
  46. package/dist/tree.d.ts +34 -0
  47. package/dist/tree.d.ts.map +1 -0
  48. package/dist/tree.js +354 -0
  49. package/dist/types.d.ts +189 -0
  50. package/dist/types.d.ts.map +1 -0
  51. package/dist/types.js +2 -0
  52. package/dist/views.d.ts +17 -0
  53. package/dist/views.d.ts.map +1 -0
  54. package/dist/views.js +39 -0
  55. package/dist/web-search.d.ts +184 -0
  56. package/dist/web-search.d.ts.map +1 -0
  57. package/dist/web-search.js +399 -0
  58. package/fixtures/article-with-images.html +94 -0
  59. package/fixtures/gh-shell.html +32 -0
  60. package/fixtures/guide-ai-agents-web-scraping.json +552 -0
  61. package/fixtures/images/large.jpg +0 -0
  62. package/fixtures/images/small.jpg +0 -0
  63. package/fixtures/images/tiny.png +0 -0
  64. package/fixtures/quotes-index.json +40 -0
  65. package/package.json +47 -0
  66. package/scripts/fetch-guide.mjs +25 -0
  67. package/src/cache.ts +99 -0
  68. package/src/convert.ts +161 -0
  69. package/src/crawl.ts +186 -0
  70. package/src/disk-cache.ts +228 -0
  71. package/src/graph.ts +189 -0
  72. package/src/index.ts +74 -0
  73. package/src/parse.ts +154 -0
  74. package/src/playwright.ts +193 -0
  75. package/src/ports.ts +131 -0
  76. package/src/robots.ts +121 -0
  77. package/src/search.ts +173 -0
  78. package/src/sitemap.ts +67 -0
  79. package/src/spider.ts +475 -0
  80. package/src/throttle.ts +118 -0
  81. package/src/tree.ts +379 -0
  82. package/src/types.ts +225 -0
  83. package/src/views.ts +42 -0
  84. package/src/web-search.ts +548 -0
  85. package/test/convert-images.test.ts +69 -0
  86. package/test/disk-cache-images.test.ts +193 -0
  87. package/test/engine-registry.test.ts +114 -0
  88. package/test/exports.test.ts +124 -0
  89. package/test/get-chunk.test.ts +115 -0
  90. package/test/images-integration.test.ts +359 -0
  91. package/test/improvements.test.ts +279 -0
  92. package/test/inbound-count.test.ts +111 -0
  93. package/test/lean.test.ts +105 -0
  94. package/test/playwright.test.ts +128 -0
  95. package/test/ports.test.ts +161 -0
  96. package/test/search.test.ts +219 -0
  97. package/test/spider-images.test.ts +180 -0
  98. package/test/spider-unit.test.ts +610 -0
  99. package/test/tree.test.ts +272 -0
  100. package/test/types.test.ts +169 -0
  101. package/test/web-search-integration.test.ts +180 -0
  102. package/test/web-search.test.ts +305 -0
  103. package/tsconfig.json +9 -0
  104. package/tsconfig.test.json +7 -0
  105. package/vitest.config.ts +8 -0
@@ -0,0 +1,610 @@
1
+ /**
2
+ * Unit tests for spider.ts internals and agent-ergonomic behaviour.
3
+ * These tests do not make real HTTP requests.
4
+ */
5
+
6
+ import { parseHTML } from "linkedom";
7
+ import TurndownService from "turndown";
8
+ import { describe, expect, it, vi } from "vitest";
9
+
10
+ // ---------------------------------------------------------------------------
11
+ // Escape suppression — no backslash noise in output
12
+ // ---------------------------------------------------------------------------
13
+
14
describe("turndown escape suppression", () => {
  /**
   * Converts `html` to markdown with a Turndown instance whose `escape`
   * hook has been replaced by the identity function.
   */
  const toMarkdown = (html: string): string => {
    const service = new TurndownService();
    Object.assign(service, { escape: (text: string) => text });
    return service.turndown(html);
  };

  it("does not escape square brackets", () => {
    const markdown = toMarkdown("<p>See [[wikilinks]] and [[another]]</p>");
    expect(markdown).not.toContain("\\[");
    expect(markdown).toContain("[[wikilinks]]");
  });

  it("does not escape asterisks", () => {
    // The asterisks here are plain text inside <p>, not a <strong> element,
    // so they must come through without a leading backslash.
    const markdown = toMarkdown("<p>The **bold** word</p>");
    expect(markdown).not.toContain("\\*");
  });

  it("does not escape backticks in plain text", () => {
    const markdown = toMarkdown("<p>Use `code` here</p>");
    expect(markdown).not.toContain("\\`");
  });
});
41
+
42
+ // ---------------------------------------------------------------------------
43
+ // Image stripping — no alt-text noise
44
+ // ---------------------------------------------------------------------------
45
+
46
describe("image stripping", () => {
  it("removes img tags from output", () => {
    // Same configuration as the escape tests, plus a rule that replaces
    // every <img> with the empty string.
    const service = new TurndownService();
    Object.assign(service, { escape: (text: string) => text });
    service.addRule("strip-images", { filter: "img", replacement: () => "" });

    const markdown = service.turndown(`<p>Before</p><img src="photo.jpg" alt="A nice photo"><p>After</p>`);

    for (const leaked of ["photo.jpg", "A nice photo"]) {
      expect(markdown).not.toContain(leaked);
    }
    for (const kept of ["Before", "After"]) {
      expect(markdown).toContain(kept);
    }
  });
});
60
+
61
+ // ---------------------------------------------------------------------------
62
+ // Content type detection
63
+ // ---------------------------------------------------------------------------
64
+
65
+ // Re-implement the pure function locally so we can test it without importing
66
+ // the whole spider module (which has side effects at module init time).
67
/**
 * Classifies a buffer of markdown lines by looking only at its first
 * non-blank line.
 *
 * @param lines - Raw markdown lines; leading blank or whitespace-only lines are skipped.
 * @returns `"code"` for a ``` fence, `"table"` for a leading `|`, `"list"` for a
 *   `-`/`*`/`+`/`N.` marker followed by a space, `"blockquote"` for a leading `>`,
 *   and `"text"` for anything else, including an empty or all-blank buffer.
 */
function detectContentType(lines: string[]): string {
  const firstMeaningful = lines.find((candidate) => candidate.trim() !== "");
  if (firstMeaningful === undefined) return "text";

  const head = firstMeaningful.trim();
  if (head.startsWith("```")) return "code";
  if (head.startsWith("|")) return "table";

  const isBulleted = /^[-*+] /.test(head);
  const isNumbered = /^\d+\. /.test(head);
  if (isBulleted || isNumbered) return "list";

  return head.startsWith(">") ? "blockquote" : "text";
}
79
+
80
+ // Re-implement the chunker locally to test code-block and table boundary logic.
81
/** Word count at which the chunker tries to close the current chunk. */
const CHUNK_TARGET = 150;

/**
 * Splits markdown into chunks of roughly `CHUNK_TARGET` words.
 *
 * Rules implemented below:
 * - Lines inside a ``` fence are only accumulated; no flush happens until the
 *   closing fence line has been appended.
 * - A line starting with `|` (a table row) never triggers a flush, so a table
 *   is closed off by the first non-table line that follows it.
 * - A `#`, `##` or `###` heading outside a fence becomes the current heading;
 *   the buffer is flushed first if it already holds `CHUNK_TARGET` words.
 * - A flush is skipped while the buffer holds fewer than 10 words, so short
 *   fragments merge into the following content (or are dropped at the end).
 *
 * The buffered word count is tracked incrementally instead of re-joining and
 * re-splitting the whole buffer for every input line.
 *
 * @param markdown - Markdown source to split.
 * @param _baseUrl - Unused; kept so the signature stays the same.
 * @returns Chunks in document order, each with its trimmed text, the content
 *   type of its first non-blank line, and the heading current at flush time.
 */
function chunkMarkdown(
  markdown: string,
  _baseUrl = "https://example.com",
): Array<{ text: string; contentType: string; heading: string }> {
  const headingPattern = /^#{1,3} (.+)/;
  const countWords = (line: string): number => line.split(/\s+/).filter(Boolean).length;

  const chunks: Array<{ text: string; contentType: string; heading: string }> = [];
  let heading = "";
  let buffer: string[] = [];
  let bufferedWords = 0;
  let inCode = false;

  const append = (line: string): void => {
    buffer.push(line);
    bufferedWords += countWords(line);
  };

  const flush = (): void => {
    // Fewer than 10 words (this also covers an empty or blank-only buffer):
    // keep the buffer so the fragment merges with whatever comes next.
    if (bufferedWords < 10) return;
    chunks.push({ text: buffer.join("\n").trim(), contentType: detectContentType(buffer), heading });
    buffer = [];
    bufferedWords = 0;
  };

  for (const line of markdown.split("\n")) {
    const trimmed = line.trim();
    if (trimmed.startsWith("```")) inCode = !inCode;

    if (inCode) {
      append(line);
      continue;
    }

    const headingMatch = headingPattern.exec(line);
    if (headingMatch) {
      // A heading line starts with '#', so it can never be a table row.
      if (bufferedWords >= CHUNK_TARGET) flush();
      heading = headingMatch[1];
      append(line);
      continue;
    }

    append(line);
    // Table rows are never a flush point; the line after the table is.
    const isTableRow = trimmed.startsWith("|");
    if (bufferedWords >= CHUNK_TARGET && !isTableRow) flush();
  }

  flush();
  return chunks;
}
127
+
128
describe("detectContentType", () => {
  it("detects fenced code blocks", () => {
    const fenced = ["```typescript", "const x = 1", "```"];
    expect(detectContentType(fenced)).toBe("code");
  });

  it("detects markdown tables", () => {
    const table = ["| Col1 | Col2 |", "| --- | --- |", "| a | b |"];
    expect(detectContentType(table)).toBe("table");
  });

  it("detects unordered lists", () => {
    // One sample per supported bullet marker: '-', '*', '+'.
    const samples = [["- item one", "- item two"], ["* item", "* item"], ["+ item"]];
    for (const sample of samples) {
      expect(detectContentType(sample)).toBe("list");
    }
  });

  it("detects ordered lists", () => {
    const numbered = ["1. first", "2. second"];
    expect(detectContentType(numbered)).toBe("list");
  });

  it("detects blockquotes", () => {
    const quoted = ["> quoted text"];
    expect(detectContentType(quoted)).toBe("blockquote");
  });

  it("defaults to text for prose", () => {
    const prose = ["This is a normal paragraph."];
    expect(detectContentType(prose)).toBe("text");
  });

  it("skips blank lines before classifying", () => {
    const padded = ["", " ", "| table row |"];
    expect(detectContentType(padded)).toBe("table");
  });

  it("returns text for empty buffer", () => {
    const emptyBuffers: string[][] = [[], ["", " "]];
    for (const empty of emptyBuffers) {
      expect(detectContentType(empty)).toBe("text");
    }
  });
});
164
+
165
+ // ---------------------------------------------------------------------------
166
+ // Tag extraction
167
+ // ---------------------------------------------------------------------------
168
+
169
/**
 * Collects page tags from `<meta name="keywords">` (split on `,` or `;`) and
 * from every `article:tag` meta element (matched by `property` or `name`).
 *
 * Each tag is trimmed and lower-cased; empty entries are dropped and
 * duplicates are removed while keeping first-seen order.
 *
 * @param doc - Parsed document to read the meta elements from.
 * @returns At most 20 tags, keywords first, then article tags.
 */
function extractTags(doc: Document): string[] {
  const seen = new Set<string>();
  const remember = (raw: string | null | undefined): void => {
    const tag = raw?.trim().toLowerCase();
    if (tag) seen.add(tag);
  };

  const keywordList = doc.querySelector('meta[name="keywords"]')?.getAttribute("content") ?? "";
  keywordList.split(/[,;]/).forEach((piece) => remember(piece));

  const articleMetas = Array.from(doc.querySelectorAll('meta[property="article:tag"], meta[name="article:tag"]'));
  articleMetas.forEach((meta) => remember(meta.getAttribute("content")));

  return Array.from(seen).slice(0, 20);
}
184
+
185
describe("extractTags", () => {
  /** Wraps the given meta markup in a minimal document and extracts its tags. */
  const tagsFor = (...headMarkup: string[]): string[] => {
    const { document } = parseHTML(`<html><head>${headMarkup.join("")}</head></html>`);
    return extractTags(document);
  };

  it("extracts comma-separated keywords", () => {
    const tags = tagsFor('<meta name="keywords" content="scraping, agents, LLM">');
    expect(tags).toEqual(["scraping", "agents", "llm"]);
  });

  it("extracts article:tag properties", () => {
    const tags = tagsFor(
      '<meta property="article:tag" content="AI">',
      '<meta property="article:tag" content="Web">',
    );
    expect(tags).toEqual(["ai", "web"]);
  });

  it("deduplicates across sources", () => {
    // "ai" is supplied by both the keywords list and an article:tag element;
    // it must appear exactly once in the result.
    const tags = tagsFor(
      '<meta name="keywords" content="ai, web">',
      '<meta property="article:tag" content="AI">',
    );
    const occurrences = tags.filter((tag) => tag === "ai");
    expect(occurrences).toHaveLength(1);
  });

  it("returns empty array when no tags present", () => {
    expect(tagsFor()).toEqual([]);
  });

  it("caps at 20 tags", () => {
    const many = Array.from({ length: 30 }, (_, i) => `tag${i}`).join(",");
    const tags = tagsFor(`<meta name="keywords" content="${many}">`);
    expect(tags.length).toBeLessThanOrEqual(20);
  });
});
224
+
225
+ // ---------------------------------------------------------------------------
226
+ // Canonical URL extraction
227
+ // ---------------------------------------------------------------------------
228
+
229
/**
 * Reads the page's declared canonical URL and reports it only when it differs
 * from the URL that was fetched.
 *
 * The `href` of `link[rel="canonical"]` is preferred; when that is `null` or
 * `undefined` the `content` of `meta[property="og:url"]` is used instead.
 * The comparison ignores a single trailing slash on either side.
 *
 * @param doc - Parsed document to inspect.
 * @param fetchedUrl - URL the document was retrieved from.
 * @returns The declared canonical URL exactly as written in the document, or
 *   `undefined` when none is declared, it is empty, or it matches `fetchedUrl`.
 */
function extractCanonicalUrl(doc: Document, fetchedUrl: string): string | undefined {
  const withoutTrailingSlash = (url: string): string => url.replace(/\/$/, "");

  const linkHref = doc.querySelector('link[rel="canonical"]')?.getAttribute("href");
  const declared = linkHref ?? doc.querySelector('meta[property="og:url"]')?.getAttribute("content");

  if (!declared) return undefined;
  if (withoutTrailingSlash(declared) === withoutTrailingSlash(fetchedUrl)) return undefined;
  return declared;
}
237
+
238
describe("extractCanonicalUrl", () => {
  /** Parses `html` and runs the extractor against it for the given fetched URL. */
  const canonicalFor = (html: string, fetchedUrl: string): string | undefined =>
    extractCanonicalUrl(parseHTML(html).document, fetchedUrl);

  const PAGE_WITH_CANONICAL = '<html><head><link rel="canonical" href="https://example.com/page"></head></html>';

  it("extracts link[rel=canonical]", () => {
    const result = canonicalFor(PAGE_WITH_CANONICAL, "https://example.com/page?ref=social");
    expect(result).toBe("https://example.com/page");
  });

  it("extracts og:url when no canonical link", () => {
    const result = canonicalFor(
      '<html><head><meta property="og:url" content="https://example.com/og"></head></html>',
      "https://example.com/other",
    );
    expect(result).toBe("https://example.com/og");
  });

  it("returns undefined when canonical matches fetched URL", () => {
    const result = canonicalFor(PAGE_WITH_CANONICAL, "https://example.com/page");
    expect(result).toBeUndefined();
  });

  it("returns undefined when no canonical", () => {
    const result = canonicalFor("<html><head></head></html>", "https://example.com");
    expect(result).toBeUndefined();
  });
});
261
+
262
+ // ---------------------------------------------------------------------------
263
+ // Link rel classification
264
+ // ---------------------------------------------------------------------------
265
+
266
/**
 * Labels a link by where it sits in the page structure.
 *
 * @param a - The anchor element to classify.
 * @returns `"nav"` when the element is, or is nested inside, a `nav`,
 *   `header`, `footer` or `aside` element; `"body"` otherwise.
 */
function classifyLinkRel(a: Element): "body" | "nav" {
  const enclosingChrome = a.closest("nav, header, footer, aside");
  if (enclosingChrome === null) return "body";
  return "nav";
}
269
+
270
describe("link rel classification", () => {
  /** Parses a document whose <body> holds `bodyMarkup` and returns its first anchor. */
  const anchorIn = (bodyMarkup: string): Element => {
    const { document } = parseHTML(`<html><body>${bodyMarkup}</body></html>`);
    return document.querySelector("a")!;
  };

  it("classifies links inside <nav> as nav", () => {
    const anchor = anchorIn("<nav><a href='/x'>link</a></nav>");
    expect(classifyLinkRel(anchor)).toBe("nav");
  });

  it("classifies links inside <footer> as nav", () => {
    const anchor = anchorIn("<footer><a href='/x'>link</a></footer>");
    expect(classifyLinkRel(anchor)).toBe("nav");
  });

  it("classifies links inside <header> as nav", () => {
    const anchor = anchorIn("<header><a href='/x'>link</a></header>");
    expect(classifyLinkRel(anchor)).toBe("nav");
  });

  it("classifies links inside <aside> as nav", () => {
    const anchor = anchorIn("<aside><a href='/x'>link</a></aside>");
    expect(classifyLinkRel(anchor)).toBe("nav");
  });

  it("classifies links inside article content as body", () => {
    const anchor = anchorIn("<article><p><a href='/x'>link</a></p></article>");
    expect(classifyLinkRel(anchor)).toBe("body");
  });

  it("classifies bare links as body", () => {
    const anchor = anchorIn("<p><a href='/x'>link</a></p>");
    expect(classifyLinkRel(anchor)).toBe("body");
  });
});
307
+
308
+ // ---------------------------------------------------------------------------
309
+ // Input validation (via thrown error messages — no HTTP needed)
310
+ // ---------------------------------------------------------------------------
311
+
312
describe("spider input validation", () => {
  /** Dynamically imports the spider module and returns its `spider` export. */
  const loadSpider = async () => {
    const spiderModule = await import("../src/spider.js");
    return spiderModule.spider;
  };

  it("rejects non-URL strings", async () => {
    const crawl = await loadSpider();
    await expect(crawl("not a url")).rejects.toThrow("Invalid URL");
  });

  it("rejects ftp:// protocol", async () => {
    const crawl = await loadSpider();
    await expect(crawl("ftp://example.com")).rejects.toThrow("Unsupported protocol");
  });

  it("rejects file:// protocol", async () => {
    const crawl = await loadSpider();
    await expect(crawl("file:///etc/passwd")).rejects.toThrow("Unsupported protocol");
  });
});
328
+
329
+ // ---------------------------------------------------------------------------
330
+ // Code block splitting — fences must never be broken across chunks
331
+ // ---------------------------------------------------------------------------
332
+
333
describe("code block splitting", () => {
  /** Total number of ``` markers found across every chunk's text. */
  const fenceTotal = (parts: Array<{ text: string }>): number => {
    let total = 0;
    for (const part of parts) {
      total += (part.text.match(/```/g) ?? []).length;
    }
    return total;
  };

  /** First chunk whose text contains a ``` marker, if any. */
  const firstFenced = <T extends { text: string }>(parts: T[]): T | undefined =>
    parts.find((part) => part.text.includes("```"));

  // Fixture: heading, short intro, a fenced block of 160 code lines (well
  // past CHUNK_TARGET words on its own), then a second heading with prose.
  const fakeCode = Array.from({ length: 160 }, (_, i) => `const var${i} = ${i} // line ${i}`);
  const bigCodeBlock = [
    "## Setup",
    "",
    "Some intro text.",
    "",
    "```typescript",
    ...fakeCode,
    "```",
    "",
    "## After the block",
    "",
    "Prose that follows the code block. It should land in its own chunk.",
  ].join("\n");

  it("never produces an odd number of fenced code markers", () => {
    const totalFences = fenceTotal(chunkMarkdown(bigCodeBlock));
    expect(totalFences % 2).toBe(0);
  });

  it("keeps the entire code block in one chunk", () => {
    // Exactly one chunk may carry fence markers.
    const fenced = chunkMarkdown(bigCodeBlock).filter((part) => part.text.includes("```"));
    expect(fenced.length).toBe(1);
  });

  it("detects contentType=code when a chunk opens directly with a fence (no heading)", () => {
    // No heading or prose precedes the fence, so the fence is the first
    // non-blank line that detectContentType sees.
    const body = Array.from({ length: 30 }, (_, i) => `const x${i} = ${i}`);
    const pureCode = [
      "```typescript",
      ...body,
      "```",
      // Trailing prose guarantees the 10-word minimum needed for a flush.
      "some extra words to reach the minimum threshold for flushing the buffer right here",
    ].join("\n");

    const codeChunk = firstFenced(chunkMarkdown(pureCode));
    expect(codeChunk).toBeDefined();
    expect(codeChunk!.contentType).toBe("code");
  });

  it("contentType is text when a heading precedes the fence in the same chunk", () => {
    // Heading and code share one chunk; the heading is the first non-blank
    // line, so the chunk is classified as 'text'. This is the expected result.
    const body = Array.from({ length: 30 }, (_, i) => `const x${i} = ${i}`);
    const headingThenCode = ["## My Section", "```typescript", ...body, "```"].join("\n");

    const mixed = firstFenced(chunkMarkdown(headingThenCode));
    expect(mixed).toBeDefined();
    expect(mixed!.contentType).toBe("text");
  });

  it("prose after the block lands in its own text chunk", () => {
    const chunks = chunkMarkdown(bigCodeBlock);
    const last = chunks[chunks.length - 1];
    expect(last.contentType).toBe("text");
    expect(last.text).toContain("Prose that follows");
  });

  it("small code blocks (under target) are also kept whole", () => {
    const intro = "Intro text. ".repeat(5);
    const outro = "More text. ".repeat(20);
    const small = [intro, "", "```ts", "const x = 1", "```", "", outro].join("\n");

    const fenceCount = fenceTotal(chunkMarkdown(small));
    expect(fenceCount % 2).toBe(0);
  });
});
412
+
413
+ // ---------------------------------------------------------------------------
414
+ // Nav classification — extended patterns
415
+ // ---------------------------------------------------------------------------
416
+
417
describe("extended nav classification", () => {
  it("classifies links inside role=navigation as nav", () => {
    // ARIA landmark roles checked by this test.
    const landmarkSelector = "[role='navigation'],[role='banner'],[role='contentinfo'],[role='complementary']";
    const { document } = parseHTML('<html><body><div role="navigation"><a href="/x">link</a></div></body></html>');

    const anchor = document.querySelector("a")!;
    expect(anchor.closest(landmarkSelector)).not.toBeNull();
  });

  it("returns empty tags when no meta tags and no fallback", () => {
    const { document } = parseHTML("<html><head></head></html>");
    expect(extractTags(document)).toEqual([]);
  });
});
430
+
431
+ // ---------------------------------------------------------------------------
432
+ // IHttpClient injection — spider() without real network
433
+ // ---------------------------------------------------------------------------
434
+
435
+ import { spider } from "../src/spider.js";
436
+ import type { IHttpClient } from "../src/ports.js";
437
+
438
/**
 * Builds an already-resolved fake HTTP response for the mock client.
 *
 * `ok` is true for every 2xx status and `statusText` is derived from `ok`,
 * so the two fields always agree (previously only status 200 produced "OK",
 * leaving e.g. 204 with `ok: true` but `statusText: "Error"`).
 *
 * @param html - Body returned by `text()`.
 * @param status - HTTP status code to report; defaults to 200.
 * @returns A promise resolving to the response stub. `headers.get` always
 *   returns `null` and `arrayBuffer()` resolves to an empty buffer.
 */
function makeHtmlResponse(html: string, status = 200): ReturnType<IHttpClient["fetch"]> {
  const ok = status >= 200 && status < 300;
  return Promise.resolve({
    ok,
    status,
    statusText: ok ? "OK" : "Error",
    headers: { get: () => null },
    text: () => Promise.resolve(html),
    arrayBuffer: () => Promise.resolve(new ArrayBuffer(0)),
  });
}
448
+
449
/**
 * Creates an HTTP client stub whose `fetch` ignores its arguments and always
 * answers with the same canned body and status.
 *
 * @param html - Body every response will carry.
 * @param status - Status code every response will report; defaults to 200.
 */
function mockClient(html: string, status = 200): IHttpClient {
  const respond: IHttpClient["fetch"] = () => makeHtmlResponse(html, status);
  return { fetch: respond };
}
452
+
453
/**
 * Minimal HTML page served by the mock client: a title, description and
 * keywords in <head>, and an <article> with two headed sections whose
 * paragraphs each repeat a short sentence 30 times.
 */
const SIMPLE_HTML = [
  "<!DOCTYPE html>",
  '<html lang="en">',
  "<head>",
  "<title>Test Page</title>",
  '<meta name="description" content="A test description">',
  '<meta name="keywords" content="testing, spider">',
  "</head>",
  "<body>",
  "<article>",
  "<h1>Hello World</h1>",
  `<p>${"Content paragraph. ".repeat(30)}</p>`,
  "<h2>Section Two</h2>",
  `<p>${"More content here. ".repeat(30)}</p>`,
  "</article>",
  "</body>",
  "</html>",
].join("\n");
469
+
470
describe("spider() with injected IHttpClient", () => {
  // Every test targets the same address; responses come from the mock client.
  const TARGET = "https://example.com";

  it("fetches and parses a page without real network", async () => {
    const httpClient = mockClient(SIMPLE_HTML);
    const page = await spider(TARGET, { httpClient });

    expect(page.url).toBe("https://example.com");
    expect(page.title).toContain("Test Page");
    expect(page.description).toBe("A test description");
    expect(page.tags).toContain("testing");
    expect(page.markdown.length).toBeGreaterThan(0);
    expect(page.chunks.length).toBeGreaterThan(0);
  });

  it("returns a lean page without network", async () => {
    const page = await spider(TARGET, { httpClient: mockClient(SIMPLE_HTML), view: "lean" });

    expect(page.view).toBe("lean");
    expect(page.title).toContain("Test Page");
    expect(page.headings.length).toBeGreaterThan(0);
  });

  it("throws FetchError on non-200 response", async () => {
    const pending = spider(TARGET, { httpClient: mockClient("", 404) });
    await expect(pending).rejects.toThrow("404");
  });

  it("throws on non-http URL without touching the client", async () => {
    const client = { fetch: vi.fn() };
    const pending = spider("ftp://example.com", { httpClient: client });

    await expect(pending).rejects.toThrow("Unsupported protocol");
    expect(client.fetch).not.toHaveBeenCalled();
  });

  it("applies tokenBudget via injected client (chunk-aware)", async () => {
    const full = await spider(TARGET, { httpClient: mockClient(SIMPLE_HTML) });
    const budgeted = await spider(TARGET, {
      httpClient: mockClient(SIMPLE_HTML),
      tokenBudget: 50, // deliberately tiny so fewer chunks should be selected
    });

    const fullCount = full.chunks.length;
    const budgetedCount = budgeted.chunks.length;

    // The budget selects whole chunks and the first chunk is always kept, so
    // a smaller byte size cannot be guaranteed; the chunk count can only stay
    // the same or shrink.
    expect(budgetedCount).toBeLessThanOrEqual(fullCount);

    // When there was more than one chunk to choose from, it must shrink.
    if (fullCount > 1) {
      expect(budgetedCount).toBeLessThan(fullCount);
    }
  });
});
519
+
520
+ // ---------------------------------------------------------------------------
521
+ // Table-aware chunking — tables must be atomic
522
+ // ---------------------------------------------------------------------------
523
+
524
+ import { chunk } from "../src/convert.js";
525
+
526
describe("table-aware chunking", () => {
  // Named PAGE_URL rather than URL so the global URL class is not shadowed.
  const PAGE_URL = "https://example.com/page";

  const isTable = (part: { contentType: string }): boolean => part.contentType === "table";
  const isText = (part: { contentType: string }): boolean => part.contentType === "text";

  it("keeps a table in a single chunk", () => {
    const md = [
      "| Col A | Col B | Col C |",
      "| ----- | ----- | ----- |",
      "| one | two | three |",
      "| four | five | six |",
      "| seven | eight | nine |",
    ].join("\n");

    const tableChunks = chunk(md, PAGE_URL).filter(isTable);

    expect(tableChunks).toHaveLength(1);
    expect(tableChunks[0].text).toContain("| Col A");
    expect(tableChunks[0].text).toContain("| seven");
  });

  it("does not split a large table across chunk boundaries", () => {
    // 30 verbose rows push the table past the 150-word target.
    const header = ["| Name | Description | Value | Extra |", "| ---- | ----------- | ----- | ----- |"];
    const rows = Array.from(
      { length: 30 },
      (_, i) => `| row${i} | description of row ${i} which is quite verbose | value-${i} | extra-${i} |`,
    );
    const md = [...header, ...rows].join("\n");

    const tableChunks = chunk(md, PAGE_URL).filter(isTable);

    expect(tableChunks).toHaveLength(1);
  });

  it("flushes prose before a table so they are in separate chunks", () => {
    const prose = Array.from({ length: 20 }, (_, i) => `Word${i} `.repeat(8).trim()).join(" ");
    const table = ["| A | B |", "| - | - |", "| 1 | 2 |", "| 3 | 4 |"].join("\n");
    const md = [prose, table].join("\n\n");

    const chunks = chunk(md, PAGE_URL);
    const tableChunks = chunks.filter(isTable);
    const textChunks = chunks.filter(isText);

    expect(chunks.length).toBeGreaterThanOrEqual(2);
    expect(tableChunks).toHaveLength(1);
    expect(textChunks.length).toBeGreaterThanOrEqual(1);
    // None of the prose may leak into the table chunk.
    expect(tableChunks[0].text).not.toContain("Word0");
  });

  it("prose after a table goes into a separate chunk", () => {
    const table = ["| A | B |", "| - | - |", "| 1 | 2 |"].join("\n");
    const after = Array.from({ length: 20 }, (_, i) => `After${i} `.repeat(8).trim()).join(" ");
    const md = [table, after].join("\n\n");

    const tableChunks = chunk(md, PAGE_URL).filter(isTable);

    expect(tableChunks).toHaveLength(1);
    expect(tableChunks[0].text).not.toContain("After0");
  });

  it("pipe characters inside code blocks are not treated as table rows", () => {
    const md = [
      "```python",
      "| fake | table | inside | code | block | row | here |",
      "| another | row | in | code |",
      "x = 1 # not a table row",
      "```",
    ].join("\n");

    const chunks = chunk(md, PAGE_URL);

    // Lines starting with '|' inside the fence must not yield a table chunk...
    expect(chunks.filter(isTable)).toHaveLength(0);
    // ...but their content must still be present somewhere in the output.
    const allText = chunks.map((part) => part.text).join("\n");
    expect(allText).toContain("| fake | table");
  });
});