@dpopsuev/web-spider 0.10.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105) hide show
  1. package/dist/batch.d.ts +24 -0
  2. package/dist/batch.d.ts.map +1 -0
  3. package/dist/batch.js +68 -0
  4. package/dist/cache.d.ts +40 -0
  5. package/dist/cache.d.ts.map +1 -0
  6. package/dist/cache.js +78 -0
  7. package/dist/convert.d.ts +29 -0
  8. package/dist/convert.d.ts.map +1 -0
  9. package/dist/convert.js +131 -0
  10. package/dist/crawl.d.ts +56 -0
  11. package/dist/crawl.d.ts.map +1 -0
  12. package/dist/crawl.js +126 -0
  13. package/dist/disk-cache.d.ts +75 -0
  14. package/dist/disk-cache.d.ts.map +1 -0
  15. package/dist/disk-cache.js +185 -0
  16. package/dist/graph.d.ts +76 -0
  17. package/dist/graph.d.ts.map +1 -0
  18. package/dist/graph.js +156 -0
  19. package/dist/index.d.ts +45 -0
  20. package/dist/index.d.ts.map +1 -0
  21. package/dist/index.js +44 -0
  22. package/dist/parse.d.ts +27 -0
  23. package/dist/parse.d.ts.map +1 -0
  24. package/dist/parse.js +131 -0
  25. package/dist/playwright.d.ts +75 -0
  26. package/dist/playwright.d.ts.map +1 -0
  27. package/dist/playwright.js +141 -0
  28. package/dist/ports.d.ts +104 -0
  29. package/dist/ports.d.ts.map +1 -0
  30. package/dist/ports.js +10 -0
  31. package/dist/robots.d.ts +24 -0
  32. package/dist/robots.d.ts.map +1 -0
  33. package/dist/robots.js +104 -0
  34. package/dist/search.d.ts +47 -0
  35. package/dist/search.d.ts.map +1 -0
  36. package/dist/search.js +112 -0
  37. package/dist/sitemap.d.ts +15 -0
  38. package/dist/sitemap.d.ts.map +1 -0
  39. package/dist/sitemap.js +65 -0
  40. package/dist/spider.d.ts +74 -0
  41. package/dist/spider.d.ts.map +1 -0
  42. package/dist/spider.js +349 -0
  43. package/dist/throttle.d.ts +49 -0
  44. package/dist/throttle.d.ts.map +1 -0
  45. package/dist/throttle.js +85 -0
  46. package/dist/tree.d.ts +34 -0
  47. package/dist/tree.d.ts.map +1 -0
  48. package/dist/tree.js +354 -0
  49. package/dist/types.d.ts +189 -0
  50. package/dist/types.d.ts.map +1 -0
  51. package/dist/types.js +2 -0
  52. package/dist/views.d.ts +17 -0
  53. package/dist/views.d.ts.map +1 -0
  54. package/dist/views.js +39 -0
  55. package/dist/web-search.d.ts +184 -0
  56. package/dist/web-search.d.ts.map +1 -0
  57. package/dist/web-search.js +399 -0
  58. package/fixtures/article-with-images.html +94 -0
  59. package/fixtures/gh-shell.html +32 -0
  60. package/fixtures/guide-ai-agents-web-scraping.json +552 -0
  61. package/fixtures/images/large.jpg +0 -0
  62. package/fixtures/images/small.jpg +0 -0
  63. package/fixtures/images/tiny.png +0 -0
  64. package/fixtures/quotes-index.json +40 -0
  65. package/package.json +47 -0
  66. package/scripts/fetch-guide.mjs +25 -0
  67. package/src/cache.ts +99 -0
  68. package/src/convert.ts +161 -0
  69. package/src/crawl.ts +186 -0
  70. package/src/disk-cache.ts +228 -0
  71. package/src/graph.ts +189 -0
  72. package/src/index.ts +74 -0
  73. package/src/parse.ts +154 -0
  74. package/src/playwright.ts +193 -0
  75. package/src/ports.ts +131 -0
  76. package/src/robots.ts +121 -0
  77. package/src/search.ts +173 -0
  78. package/src/sitemap.ts +67 -0
  79. package/src/spider.ts +475 -0
  80. package/src/throttle.ts +118 -0
  81. package/src/tree.ts +379 -0
  82. package/src/types.ts +225 -0
  83. package/src/views.ts +42 -0
  84. package/src/web-search.ts +548 -0
  85. package/test/convert-images.test.ts +69 -0
  86. package/test/disk-cache-images.test.ts +193 -0
  87. package/test/engine-registry.test.ts +114 -0
  88. package/test/exports.test.ts +124 -0
  89. package/test/get-chunk.test.ts +115 -0
  90. package/test/images-integration.test.ts +359 -0
  91. package/test/improvements.test.ts +279 -0
  92. package/test/inbound-count.test.ts +111 -0
  93. package/test/lean.test.ts +105 -0
  94. package/test/playwright.test.ts +128 -0
  95. package/test/ports.test.ts +161 -0
  96. package/test/search.test.ts +219 -0
  97. package/test/spider-images.test.ts +180 -0
  98. package/test/spider-unit.test.ts +610 -0
  99. package/test/tree.test.ts +272 -0
  100. package/test/types.test.ts +169 -0
  101. package/test/web-search-integration.test.ts +180 -0
  102. package/test/web-search.test.ts +305 -0
  103. package/tsconfig.json +9 -0
  104. package/tsconfig.test.json +7 -0
  105. package/vitest.config.ts +8 -0
@@ -0,0 +1,193 @@
1
+ /**
2
+ * TDD tests for DiskCache hybrid image persistence.
3
+ * Uses real tmp directories — no mocking of fs.
4
+ */
5
+
6
+ import { existsSync, mkdirSync, readFileSync, rmSync } from "node:fs";
7
+ import { tmpdir } from "node:os";
8
+ import { join } from "node:path";
9
+ import { afterEach, beforeEach, describe, expect, it } from "vitest";
10
+ import { DiskCache } from "../src/disk-cache.js";
11
+ import type { ImageRef, SpideredPage } from "../src/types.js";
12
+
13
+ // ---------------------------------------------------------------------------
14
+ // Helpers
15
+ // ---------------------------------------------------------------------------
16
+
17
+ let testDir: string;
18
+
19
+ beforeEach(() => {
20
+ testDir = join(tmpdir(), `wbs-disk-cache-test-${Date.now()}-${Math.random().toString(36).slice(2)}`);
21
+ mkdirSync(testDir, { recursive: true });
22
+ });
23
+
24
+ afterEach(() => {
25
+ rmSync(testDir, { recursive: true, force: true });
26
+ });
27
+
28
+ function cachePath(): string {
29
+ return join(testDir, "pages.json");
30
+ }
31
+
32
+ function makeCache(threshold = 32 * 1024): DiskCache {
33
+ return new DiskCache(cachePath(), {
34
+ ttlMs: 60 * 60 * 1000,
35
+ autoFlush: false,
36
+ inlineImageThreshold: threshold,
37
+ });
38
+ }
39
+
40
+ function makeImage(base64: string, src = "https://example.com/photo.jpg"): ImageRef {
41
+ return { src, mimeType: "image/jpeg", alt: "Test image", base64 };
42
+ }
43
+
44
+ function makePage(images: ImageRef[]): SpideredPage {
45
+ return {
46
+ url: "https://example.com",
47
+ domain: "example.com",
48
+ fetchedAt: new Date().toISOString(),
49
+ title: "Test",
50
+ description: "",
51
+ author: "",
52
+ publishedAt: "",
53
+ lang: "en",
54
+ tags: [],
55
+ wordCount: 0,
56
+ readingTimeMinutes: 0,
57
+ headings: [],
58
+ chunks: [],
59
+ links: [],
60
+ markdown: "",
61
+ images,
62
+ };
63
+ }
64
+
65
+ /** Generate a base64 string that decodes to exactly `bytes` bytes, each filled with 0xab. */
66
+ function makeBase64(bytes: number): string {
67
+ return Buffer.alloc(bytes, 0xab).toString("base64");
68
+ }
69
+
70
+ // ---------------------------------------------------------------------------
71
+ // Tests
72
+ // ---------------------------------------------------------------------------
73
+
74
+ describe("DiskCache hybrid image persistence", () => {
75
+ it("1. small image (\u2264 threshold) is stored inline in JSON", () => {
76
+ const cache = makeCache(32 * 1024);
77
+ const b64 = makeBase64(100); // 100 bytes decoded, well under the 32KB threshold
78
+ const page = makePage([makeImage(b64)]);
79
+
80
+ cache.set("https://example.com", page);
81
+ cache.flush();
82
+
83
+ const raw = JSON.parse(readFileSync(cachePath(), "utf8")) as { entries: Record<string, { page: SpideredPage }> };
84
+ const entry = Object.values(raw.entries)[0];
85
+ expect(entry.page.images![0].base64).toBe(b64);
86
+ expect(entry.page.images![0].filePath).toBeUndefined();
87
+ });
88
+
89
+ it("2. large image (> threshold) is written to disk — JSON has filePath, no base64", () => {
90
+ const cache = makeCache(32 * 1024);
91
+ const b64 = makeBase64(40 * 1024); // 40KB decoded > 32KB threshold
92
+ const page = makePage([makeImage(b64)]);
93
+
94
+ cache.set("https://example.com", page);
95
+ cache.flush();
96
+
97
+ const raw = JSON.parse(readFileSync(cachePath(), "utf8")) as { entries: Record<string, { page: SpideredPage }> };
98
+ const entry = Object.values(raw.entries)[0];
99
+ const storedImg = entry.page.images![0];
100
+
101
+ expect(storedImg.base64).toBeUndefined();
102
+ expect(storedImg.filePath).toBeDefined();
103
+ expect(existsSync(storedImg.filePath!)).toBe(true);
104
+ });
105
+
106
+ it("3. large image is hydrated on get() after reload", () => {
107
+ const b64 = makeBase64(40 * 1024);
108
+ const page = makePage([makeImage(b64)]);
109
+
110
+ const cache1 = makeCache(32 * 1024);
111
+ cache1.set("https://example.com", page);
112
+ cache1.flush();
113
+
114
+ // Fresh cache instance — simulates restart
115
+ const cache2 = makeCache(32 * 1024);
116
+ const loaded = cache2.get("https://example.com");
117
+
118
+ expect(loaded).toBeDefined();
119
+ expect(loaded!.images![0].base64).toBe(b64);
120
+ });
121
+
122
+ it("4. missing binary file degrades gracefully — no throw, filePath preserved", () => {
123
+ const b64 = makeBase64(40 * 1024);
124
+ const page = makePage([makeImage(b64)]);
125
+
126
+ const cache1 = makeCache(32 * 1024);
127
+ cache1.set("https://example.com", page);
128
+ cache1.flush();
129
+
130
+ // Read the JSON to find the file path, then delete it
131
+ const raw = JSON.parse(readFileSync(cachePath(), "utf8")) as { entries: Record<string, { page: SpideredPage }> };
132
+ const entry = Object.values(raw.entries)[0];
133
+ const filePath = entry.page.images![0].filePath!;
134
+ rmSync(filePath);
135
+
136
+ const cache2 = makeCache(32 * 1024);
137
+ let result: SpideredPage | undefined;
138
+ expect(() => { result = cache2.get("https://example.com"); }).not.toThrow();
139
+ expect(result).toBeDefined();
140
+ expect(result!.images![0].filePath).toBeDefined();
141
+ expect(result!.images![0].base64).toBeUndefined();
142
+ });
143
+
144
+ it("5. images/ directory is created automatically on first large-image flush", () => {
145
+ const imagesDir = join(testDir, "images");
146
+ expect(existsSync(imagesDir)).toBe(false);
147
+
148
+ const cache = makeCache(32 * 1024);
149
+ const b64 = makeBase64(40 * 1024);
150
+ cache.set("https://example.com", makePage([makeImage(b64)]));
151
+ cache.flush();
152
+
153
+ expect(existsSync(imagesDir)).toBe(true);
154
+ });
155
+
156
+ it("page without images round-trips cleanly", () => {
157
+ const cache1 = makeCache();
158
+ const page = makePage([]);
159
+ // Override images to be undefined (no captureImages)
160
+ const noImgPage = { ...page, images: undefined };
161
+ cache1.set("https://example.com", noImgPage);
162
+ cache1.flush();
163
+
164
+ const cache2 = makeCache();
165
+ const loaded = cache2.get("https://example.com");
166
+ expect(loaded).toBeDefined();
167
+ expect(loaded!.images).toBeUndefined();
168
+ });
169
+
170
+ it("multiple images — mix of small and large — persisted correctly", () => {
171
+ const smallB64 = makeBase64(100);
172
+ const largeB64 = makeBase64(40 * 1024);
173
+ const page = makePage([
174
+ makeImage(smallB64, "https://example.com/small.jpg"),
175
+ makeImage(largeB64, "https://example.com/large.jpg"),
176
+ ]);
177
+
178
+ const cache1 = makeCache(32 * 1024);
179
+ cache1.set("https://example.com", page);
180
+ cache1.flush();
181
+
182
+ const cache2 = makeCache(32 * 1024);
183
+ const loaded = cache2.get("https://example.com");
184
+
185
+ expect(loaded!.images).toHaveLength(2);
186
+ // Small: inline
187
+ const small = loaded!.images!.find((i) => i.src.includes("small"))!;
188
+ expect(small.base64).toBe(smallB64);
189
+ // Large: hydrated from file
190
+ const large = loaded!.images!.find((i) => i.src.includes("large"))!;
191
+ expect(large.base64).toBe(largeB64);
192
+ });
193
+ });
@@ -0,0 +1,114 @@
1
+ /**
2
+ * TDD tests for the search engine registry.
3
+ *
4
+ * Adding a new engine must not require editing existing code —
5
+ * only registering a new entry.
6
+ */
7
+
8
+ import { describe, expect, it } from "vitest";
9
+ import type { ISearchEngine, SearchQuery, WebSearchResult } from "../src/ports.js";
10
+ import {
11
+ registerSearchEngine,
12
+ resolveSearchEngine,
13
+ defaultSearchEngine,
14
+ BraveSearchEngine,
15
+ TavilySearchEngine,
16
+ ExaSearchEngine,
17
+ DdgSearchEngine,
18
+ } from "../src/web-search.js";
19
+
20
+ // ---------------------------------------------------------------------------
21
+ // Stub engine — used to verify registration without touching real APIs
22
+ // ---------------------------------------------------------------------------
23
+
24
+ class StubEngine implements ISearchEngine {
25
+ readonly calls: SearchQuery[] = [];
26
+ constructor(private readonly results: WebSearchResult[] = []) {}
27
+ async search(req: SearchQuery): Promise<WebSearchResult[]> {
28
+ this.calls.push(req);
29
+ return this.results;
30
+ }
31
+ }
32
+
33
+ // ---------------------------------------------------------------------------
34
+ // Registry — registerSearchEngine / resolveSearchEngine
35
+ // ---------------------------------------------------------------------------
36
+
37
+ describe("registerSearchEngine / resolveSearchEngine", () => {
38
+ it("resolves a built-in engine by name without editing existing code", () => {
39
+ const engine = resolveSearchEngine("ddg");
40
+ expect(engine).toBeInstanceOf(DdgSearchEngine);
41
+ });
42
+
43
+ it("resolves brave when BRAVE_SEARCH_API_KEY is set", () => {
44
+ const engine = resolveSearchEngine("brave", "test-brave-key");
45
+ expect(engine).toBeInstanceOf(BraveSearchEngine);
46
+ });
47
+
48
+ it("resolves tavily when TAVILY_API_KEY is set", () => {
49
+ const engine = resolveSearchEngine("tavily", "test-tavily-key");
50
+ expect(engine).toBeInstanceOf(TavilySearchEngine);
51
+ });
52
+
53
+ it("resolves exa when EXA_API_KEY is set", () => {
54
+ const engine = resolveSearchEngine("exa", "test-exa-key");
55
+ expect(engine).toBeInstanceOf(ExaSearchEngine);
56
+ });
57
+
58
+ it("throws a descriptive error when key is missing for a keyed engine", () => {
59
+ expect(() => resolveSearchEngine("brave", undefined)).toThrow(/BRAVE_SEARCH_API_KEY/);
60
+ expect(() => resolveSearchEngine("tavily", undefined)).toThrow(/TAVILY_API_KEY/);
61
+ expect(() => resolveSearchEngine("exa", undefined)).toThrow(/EXA_API_KEY/);
62
+ });
63
+
64
+ it("throws for an unknown engine name", () => {
65
+ expect(() => resolveSearchEngine("unknown-engine" as never, undefined)).toThrow(/unknown.*engine/i);
66
+ });
67
+
68
+ it("a third-party engine can be registered without editing existing code", () => {
69
+ const stub = new StubEngine([{ url: "https://test.com", title: "Test", snippet: "ok" }]);
70
+ registerSearchEngine("my-custom-engine", () => stub);
71
+
72
+ const resolved = resolveSearchEngine("my-custom-engine" as never, undefined);
73
+ expect(resolved).toBe(stub);
74
+ });
75
+
76
+ it("registered engine is callable and returns results", async () => {
77
+ const stub = new StubEngine([{ url: "https://custom.com", title: "Custom", snippet: "result" }]);
78
+ registerSearchEngine("test-engine-2", () => stub);
79
+
80
+ const engine = resolveSearchEngine("test-engine-2" as never, undefined);
81
+ const results = await engine.search({ query: "hello", numResults: 5 });
82
+ expect(results[0].url).toBe("https://custom.com");
83
+ expect(stub.calls).toHaveLength(1);
84
+ expect(stub.calls[0].query).toBe("hello");
85
+ });
86
+
87
+ it("registered engine overwrites a previous registration for the same name", () => {
88
+ const first = new StubEngine();
89
+ const second = new StubEngine();
90
+ registerSearchEngine("overwrite-test", () => first);
91
+ registerSearchEngine("overwrite-test", () => second);
92
+
93
+ const resolved = resolveSearchEngine("overwrite-test" as never, undefined);
94
+ expect(resolved).toBe(second);
95
+ });
96
+ });
97
+
98
+ // ---------------------------------------------------------------------------
99
+ // defaultSearchEngine — still builds the right chain from env vars
100
+ // ---------------------------------------------------------------------------
101
+
102
+ describe("defaultSearchEngine", () => {
103
+ it("returns an ISearchEngine", () => {
104
+ const engine = defaultSearchEngine();
105
+ expect(typeof engine.search).toBe("function");
106
+ });
107
+
108
+ it("includes DdgSearchEngine as last-resort fallback (always present)", () => {
109
+ // defaultSearchEngine always returns a FallbackSearchEngine that ends with DDG.
110
+ // NOTE(review): this test only asserts that an engine is returned; it does not inspect the chain or call search().
111
+ const engine = defaultSearchEngine();
112
+ expect(engine).toBeDefined();
113
+ });
114
+ });
@@ -0,0 +1,124 @@
1
+ /**
2
+ * Package export smoke tests.
3
+ *
4
+ * Imports every public symbol from the package entrypoint and asserts it is
5
+ * present and has the right shape. This catches "not a constructor" and
6
+ * "undefined is not a function" errors before they reach users.
7
+ *
8
+ * Run after every build: these tests import from the TypeScript sources in src/, not the compiled dist/,
9
+ * so they check the source entrypoint that the build is generated from.
10
+ */
11
+
12
+ import { describe, expect, it } from "vitest";
13
+ import {
14
+ DdgSearchEngine,
15
+ DomainThrottle,
16
+ FallbackSearchEngine,
17
+ PageGraph,
18
+ PlaywrightHttpClient,
19
+ RobotsCache,
20
+ SpiderCache,
21
+ braveSearch,
22
+ ddgSearch,
23
+ buildTree,
24
+ crawl,
25
+ createPlaywrightClient,
26
+ fuzzySearch,
27
+ searchPages,
28
+ navigateTree,
29
+ queryTree,
30
+ spider,
31
+ tavilySearch,
32
+ toLean,
33
+ webSearch,
34
+ } from "../src/index.js";
35
+
36
+ describe("class constructors", () => {
37
+ it("SpiderCache is constructable", () => {
38
+ expect(typeof SpiderCache).toBe("function");
39
+ expect(new SpiderCache()).toBeInstanceOf(SpiderCache);
40
+ });
41
+
42
+ it("PageGraph is constructable", () => {
43
+ expect(typeof PageGraph).toBe("function");
44
+ expect(new PageGraph()).toBeInstanceOf(PageGraph);
45
+ });
46
+
47
+ it("DomainThrottle is constructable", () => {
48
+ expect(typeof DomainThrottle).toBe("function");
49
+ const t = new DomainThrottle();
50
+ expect(t).toBeInstanceOf(DomainThrottle);
51
+ expect(typeof t.wait).toBe("function");
52
+ expect(typeof t.success).toBe("function");
53
+ expect(typeof t.rateLimit).toBe("function");
54
+ });
55
+
56
+ it("RobotsCache is constructable", () => {
57
+ expect(typeof RobotsCache).toBe("function");
58
+ const r = new RobotsCache();
59
+ expect(r).toBeInstanceOf(RobotsCache);
60
+ expect(typeof r.check).toBe("function");
61
+ });
62
+ });
63
+
64
+ describe("functions", () => {
65
+ it.each([
66
+ ["spider", spider],
67
+ ["crawl", crawl],
68
+ ["fuzzySearch", fuzzySearch],
69
+ ["searchPages", searchPages],
70
+ ["buildTree", buildTree],
71
+ ["navigateTree", navigateTree],
72
+ ["queryTree", queryTree],
73
+ ["toLean", toLean],
74
+ ["webSearch", webSearch],
75
+ ["braveSearch", braveSearch],
76
+ ["tavilySearch", tavilySearch],
77
+ ["ddgSearch", ddgSearch],
78
+ ])("%s is a function", (_name, fn) => {
79
+ expect(typeof fn).toBe("function");
80
+ });
81
+ });
82
+
83
+ describe("PlaywrightHttpClient", () => {
84
+ it("is constructable", () => {
85
+ expect(typeof PlaywrightHttpClient).toBe("function");
86
+ const client = new PlaywrightHttpClient();
87
+ expect(typeof client.fetch).toBe("function");
88
+ expect(typeof client.close).toBe("function");
89
+ });
90
+
91
+ it("createPlaywrightClient returns an instance", () => {
92
+ const client = createPlaywrightClient();
93
+ expect(client).toBeInstanceOf(PlaywrightHttpClient);
94
+ });
95
+ });
96
+
97
+ describe("FallbackSearchEngine", () => {
98
+ it("is constructable with at least one engine", () => {
99
+ const stub = { search: async () => [] };
100
+ const fb = new FallbackSearchEngine([stub]);
101
+ expect(typeof fb.search).toBe("function");
102
+ });
103
+
104
+ it("DdgSearchEngine is constructable", () => {
105
+ const ddg = new DdgSearchEngine();
106
+ expect(typeof ddg.search).toBe("function");
107
+ });
108
+ });
109
+
110
+ describe("DomainThrottle defaults", () => {
111
+ it("has expected default values", () => {
112
+ const t = new DomainThrottle();
113
+ expect(t.minDelayMs).toBe(500);
114
+ expect(t.backoffBaseMs).toBe(1_000);
115
+ expect(t.backoffCapMs).toBe(30_000);
116
+ expect(t.maxRetries).toBe(3);
117
+ });
118
+
119
+ it("accepts custom options", () => {
120
+ const t = new DomainThrottle({ minDelayMs: 100, maxRetries: 1 });
121
+ expect(t.minDelayMs).toBe(100);
122
+ expect(t.maxRetries).toBe(1);
123
+ });
124
+ });
@@ -0,0 +1,115 @@
1
+ /**
2
+ * TDD tests for getChunk(cache, url, index).
3
+ */
4
+
5
+ import { describe, expect, it } from "vitest";
6
+ import { SpiderCache } from "../src/cache.js";
7
+ import { getChunk } from "../src/index.js";
8
+ import type { Chunk, SpideredPage } from "../src/types.js";
9
+
10
+ // ---------------------------------------------------------------------------
11
+ // Helpers
12
+ // ---------------------------------------------------------------------------
13
+
14
+ function makeChunk(index: number, text = `Chunk text number ${index}. `.repeat(12)): Chunk {
15
+ return {
16
+ id: `https://example.com#chunk-${index}`,
17
+ index,
18
+ heading: `Heading ${index}`,
19
+ text,
20
+ wordCount: text.split(/\s+/).filter(Boolean).length,
21
+ contentType: "text",
22
+ };
23
+ }
24
+
25
+ function makePage(chunks: Chunk[]): SpideredPage {
26
+ return {
27
+ url: "https://example.com",
28
+ domain: "example.com",
29
+ fetchedAt: new Date().toISOString(),
30
+ title: "Test Page",
31
+ description: "",
32
+ author: "",
33
+ publishedAt: "",
34
+ lang: "en",
35
+ tags: [],
36
+ wordCount: chunks.reduce((n, c) => n + c.wordCount, 0),
37
+ readingTimeMinutes: 1,
38
+ headings: [],
39
+ chunks,
40
+ links: [],
41
+ markdown: chunks.map((c) => c.text).join("\n\n"),
42
+ };
43
+ }
44
+
45
+ function populatedCache(): SpiderCache {
46
+ const cache = new SpiderCache();
47
+ cache.set("https://example.com", makePage([makeChunk(0), makeChunk(1), makeChunk(2)]));
48
+ cache.set("https://other.com", makePage([makeChunk(0), makeChunk(1)]));
49
+ return cache;
50
+ }
51
+
52
+ // ---------------------------------------------------------------------------
53
+ // Tests
54
+ // ---------------------------------------------------------------------------
55
+
56
+ describe("getChunk(cache, url, index)", () => {
57
+ it("returns the correct chunk by index", () => {
58
+ const cache = populatedCache();
59
+ const chunk = getChunk(cache, "https://example.com", 1);
60
+ expect(chunk).toBeDefined();
61
+ expect(chunk!.index).toBe(1);
62
+ expect(chunk!.id).toBe("https://example.com#chunk-1");
63
+ });
64
+
65
+ it("returns chunk 0", () => {
66
+ const cache = populatedCache();
67
+ const chunk = getChunk(cache, "https://example.com", 0);
68
+ expect(chunk!.index).toBe(0);
69
+ });
70
+
71
+ it("returns the last chunk", () => {
72
+ const cache = populatedCache();
73
+ const chunk = getChunk(cache, "https://example.com", 2);
74
+ expect(chunk!.index).toBe(2);
75
+ });
76
+
77
+ it("returns undefined for an out-of-range index", () => {
78
+ const cache = populatedCache();
79
+ expect(getChunk(cache, "https://example.com", 99)).toBeUndefined();
80
+ });
81
+
82
+ it("returns undefined when the URL is not in the cache", () => {
83
+ const cache = populatedCache();
84
+ expect(getChunk(cache, "https://notcached.com", 0)).toBeUndefined();
85
+ });
86
+
87
+ it("returns undefined for a negative index", () => {
88
+ const cache = populatedCache();
89
+ expect(getChunk(cache, "https://example.com", -1)).toBeUndefined();
90
+ });
91
+
92
+ it("works across different cached URLs", () => {
93
+ const cache = populatedCache();
94
+ const a = getChunk(cache, "https://example.com", 2);
95
+ const b = getChunk(cache, "https://other.com", 1);
96
+ expect(a!.index).toBe(2);
97
+ expect(b!.index).toBe(1);
98
+ });
99
+
100
+ it("normalises trailing slashes in URL", () => {
101
+ const cache = populatedCache();
102
+ const chunk = getChunk(cache, "https://example.com/", 0);
103
+ expect(chunk).toBeDefined();
104
+ });
105
+
106
+ it("works with DiskCache via ICache interface", () => {
107
+ // getChunk is meant to accept any ICache<string, SpideredPage>. NOTE(review): this test still constructs a SpiderCache, so DiskCache is not actually exercised here.
108
+ const cache = new SpiderCache();
109
+ const chunks = [makeChunk(0), makeChunk(1)];
110
+ cache.set("https://example.com", makePage(chunks));
111
+ const result = getChunk(cache, "https://example.com", 1);
112
+ expect(result).toBeDefined();
113
+ expect(result!.text).toBe(chunks[1].text);
114
+ });
115
+ });