@dpopsuev/web-spider 0.10.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105) hide show
  1. package/dist/batch.d.ts +24 -0
  2. package/dist/batch.d.ts.map +1 -0
  3. package/dist/batch.js +68 -0
  4. package/dist/cache.d.ts +40 -0
  5. package/dist/cache.d.ts.map +1 -0
  6. package/dist/cache.js +78 -0
  7. package/dist/convert.d.ts +29 -0
  8. package/dist/convert.d.ts.map +1 -0
  9. package/dist/convert.js +131 -0
  10. package/dist/crawl.d.ts +56 -0
  11. package/dist/crawl.d.ts.map +1 -0
  12. package/dist/crawl.js +126 -0
  13. package/dist/disk-cache.d.ts +75 -0
  14. package/dist/disk-cache.d.ts.map +1 -0
  15. package/dist/disk-cache.js +185 -0
  16. package/dist/graph.d.ts +76 -0
  17. package/dist/graph.d.ts.map +1 -0
  18. package/dist/graph.js +156 -0
  19. package/dist/index.d.ts +45 -0
  20. package/dist/index.d.ts.map +1 -0
  21. package/dist/index.js +44 -0
  22. package/dist/parse.d.ts +27 -0
  23. package/dist/parse.d.ts.map +1 -0
  24. package/dist/parse.js +131 -0
  25. package/dist/playwright.d.ts +75 -0
  26. package/dist/playwright.d.ts.map +1 -0
  27. package/dist/playwright.js +141 -0
  28. package/dist/ports.d.ts +104 -0
  29. package/dist/ports.d.ts.map +1 -0
  30. package/dist/ports.js +10 -0
  31. package/dist/robots.d.ts +24 -0
  32. package/dist/robots.d.ts.map +1 -0
  33. package/dist/robots.js +104 -0
  34. package/dist/search.d.ts +47 -0
  35. package/dist/search.d.ts.map +1 -0
  36. package/dist/search.js +112 -0
  37. package/dist/sitemap.d.ts +15 -0
  38. package/dist/sitemap.d.ts.map +1 -0
  39. package/dist/sitemap.js +65 -0
  40. package/dist/spider.d.ts +74 -0
  41. package/dist/spider.d.ts.map +1 -0
  42. package/dist/spider.js +349 -0
  43. package/dist/throttle.d.ts +49 -0
  44. package/dist/throttle.d.ts.map +1 -0
  45. package/dist/throttle.js +85 -0
  46. package/dist/tree.d.ts +34 -0
  47. package/dist/tree.d.ts.map +1 -0
  48. package/dist/tree.js +354 -0
  49. package/dist/types.d.ts +189 -0
  50. package/dist/types.d.ts.map +1 -0
  51. package/dist/types.js +2 -0
  52. package/dist/views.d.ts +17 -0
  53. package/dist/views.d.ts.map +1 -0
  54. package/dist/views.js +39 -0
  55. package/dist/web-search.d.ts +184 -0
  56. package/dist/web-search.d.ts.map +1 -0
  57. package/dist/web-search.js +399 -0
  58. package/fixtures/article-with-images.html +94 -0
  59. package/fixtures/gh-shell.html +32 -0
  60. package/fixtures/guide-ai-agents-web-scraping.json +552 -0
  61. package/fixtures/images/large.jpg +0 -0
  62. package/fixtures/images/small.jpg +0 -0
  63. package/fixtures/images/tiny.png +0 -0
  64. package/fixtures/quotes-index.json +40 -0
  65. package/package.json +47 -0
  66. package/scripts/fetch-guide.mjs +25 -0
  67. package/src/cache.ts +99 -0
  68. package/src/convert.ts +161 -0
  69. package/src/crawl.ts +186 -0
  70. package/src/disk-cache.ts +228 -0
  71. package/src/graph.ts +189 -0
  72. package/src/index.ts +74 -0
  73. package/src/parse.ts +154 -0
  74. package/src/playwright.ts +193 -0
  75. package/src/ports.ts +131 -0
  76. package/src/robots.ts +121 -0
  77. package/src/search.ts +173 -0
  78. package/src/sitemap.ts +67 -0
  79. package/src/spider.ts +475 -0
  80. package/src/throttle.ts +118 -0
  81. package/src/tree.ts +379 -0
  82. package/src/types.ts +225 -0
  83. package/src/views.ts +42 -0
  84. package/src/web-search.ts +548 -0
  85. package/test/convert-images.test.ts +69 -0
  86. package/test/disk-cache-images.test.ts +193 -0
  87. package/test/engine-registry.test.ts +114 -0
  88. package/test/exports.test.ts +124 -0
  89. package/test/get-chunk.test.ts +115 -0
  90. package/test/images-integration.test.ts +359 -0
  91. package/test/improvements.test.ts +279 -0
  92. package/test/inbound-count.test.ts +111 -0
  93. package/test/lean.test.ts +105 -0
  94. package/test/playwright.test.ts +128 -0
  95. package/test/ports.test.ts +161 -0
  96. package/test/search.test.ts +219 -0
  97. package/test/spider-images.test.ts +180 -0
  98. package/test/spider-unit.test.ts +610 -0
  99. package/test/tree.test.ts +272 -0
  100. package/test/types.test.ts +169 -0
  101. package/test/web-search-integration.test.ts +180 -0
  102. package/test/web-search.test.ts +305 -0
  103. package/tsconfig.json +9 -0
  104. package/tsconfig.test.json +7 -0
  105. package/vitest.config.ts +8 -0
@@ -0,0 +1,272 @@
1
+ import { describe, expect, it } from "vitest";
2
+ import { buildTree, navigateTree, queryTree } from "../src/tree.js";
3
+ import type { DOMNode } from "../src/types.js";
4
+
5
+ // ---------------------------------------------------------------------------
6
+ // Helpers
7
+ // ---------------------------------------------------------------------------
8
+
9
/**
 * Counts `node` itself plus every descendant reachable through `children`.
 *
 * A missing (null/undefined) `children` list is treated as "no children".
 * The leading underscore marks the helper as intentionally unused; none of the
 * tests visible in this file call it.
 */
function _countNodes(node: DOMNode): number {
  let total = 0;
  const pending: DOMNode[] = [node];
  // Explicit stack instead of recursion; visiting order does not affect a count.
  for (let current = pending.pop(); current !== undefined; current = pending.pop()) {
    total += 1;
    if (current.children != null) {
      pending.push(...current.children);
    }
  }
  return total;
}
12
+
13
/**
 * Lists the `path` of `node` followed by the paths of all its descendants,
 * in depth-first pre-order (a parent always precedes its children).
 */
function allPaths(node: DOMNode): string[] {
  const collected: string[] = [node.path];
  for (const child of node.children ?? []) {
    collected.push(...allPaths(child));
  }
  return collected;
}
16
+
17
/**
 * Collects every node in the subtree rooted at `node` (root included) whose
 * `tag` equals `tag`, in depth-first pre-order.
 */
function findByTag(node: DOMNode, tag: string): DOMNode[] {
  const own: DOMNode[] = node.tag === tag ? [node] : [];
  const nested = (node.children ?? []).flatMap((child) => findByTag(child, tag));
  return [...own, ...nested];
}
23
+
24
+ // ---------------------------------------------------------------------------
25
+ // Fixtures
26
+ // ---------------------------------------------------------------------------
27
+
28
+ const SIMPLE_HTML = `
29
+ <div>
30
+ <h1>Getting Started</h1>
31
+ <p>This is the introduction.</p>
32
+ <h2>Installation</h2>
33
+ <p>Run <code>npm install</code> to get started.</p>
34
+ <pre><code class="language-bash">npm install my-package</code></pre>
35
+ <h2>Usage</h2>
36
+ <ul>
37
+ <li>Item one</li>
38
+ <li>Item two</li>
39
+ </ul>
40
+ </div>`;
41
+
42
+ const TABLE_HTML = `
43
+ <div>
44
+ <h2>Comparison</h2>
45
+ <table>
46
+ <thead><tr><th>Feature</th><th>A</th><th>B</th></tr></thead>
47
+ <tbody>
48
+ <tr><td>Speed</td><td>Fast</td><td>Slow</td></tr>
49
+ <tr><td>Cost</td><td>High</td><td>Low</td></tr>
50
+ </tbody>
51
+ </table>
52
+ </div>`;
53
+
54
+ const DEEP_WRAPPER_HTML = `
55
+ <div>
56
+ <div>
57
+ <div>
58
+ <div>
59
+ <p>This is deeply nested but should collapse.</p>
60
+ </div>
61
+ </div>
62
+ </div>
63
+ </div>`;
64
+
65
+ // ---------------------------------------------------------------------------
66
+ // Tree building
67
+ // ---------------------------------------------------------------------------
68
+
69
/**
 * buildTree: structural expectations for the DOMNode tree built from the HTML
 * fixtures in this file (SIMPLE_HTML, TABLE_HTML, DEEP_WRAPPER_HTML).
 */
describe("buildTree", () => {
  it("returns a root node tagged 'article'", () => {
    const tree = buildTree(SIMPLE_HTML, "https://example.com");
    expect(tree.tag).toBe("article");
    expect(tree.path).toBe("article");
  });

  it("extracts headings", () => {
    const tree = buildTree(SIMPLE_HTML, "https://example.com");
    const h1s = findByTag(tree, "h1");
    const h2s = findByTag(tree, "h2");
    expect(h1s).toHaveLength(1);
    expect(h1s[0].text).toContain("Getting Started");
    expect(h2s).toHaveLength(2);
  });

  it("extracts paragraphs with text", () => {
    const tree = buildTree(SIMPLE_HTML, "https://example.com");
    const ps = findByTag(tree, "p");
    expect(ps.length).toBeGreaterThanOrEqual(2);
    expect(ps[0].text).toContain("introduction");
  });

  it("preserves pre/code blocks as atomic leaf nodes", () => {
    const tree = buildTree(SIMPLE_HTML, "https://example.com");
    const pres = findByTag(tree, "pre");
    expect(pres).toHaveLength(1);
    expect(pres[0].text).toContain("npm install my-package");
    // pre is a leaf — no children
    expect(pres[0].children).toBeUndefined();
  });

  it("extracts lang attr from code blocks", () => {
    const tree = buildTree(SIMPLE_HTML, "https://example.com");
    const pres = findByTag(tree, "pre");
    expect(pres[0].attrs?.lang).toBe("bash");
  });

  it("extracts lists and list items", () => {
    const tree = buildTree(SIMPLE_HTML, "https://example.com");
    const uls = findByTag(tree, "ul");
    expect(uls).toHaveLength(1);
    const lis = findByTag(tree, "li");
    expect(lis).toHaveLength(2);
    expect(lis[0].text).toBe("Item one");
  });

  it("preserves tables as structured subtrees", () => {
    const tree = buildTree(TABLE_HTML, "https://example.com");
    const tables = findByTag(tree, "table");
    expect(tables).toHaveLength(1);
    const tds = findByTag(tree, "td");
    expect(tds.length).toBeGreaterThanOrEqual(4);
    expect(tds[0].text).toBe("Speed");
  });

  it("collapses deep div wrappers", () => {
    const tree = buildTree(DEEP_WRAPPER_HTML, "https://example.com");
    const ps = findByTag(tree, "p");
    expect(ps).toHaveLength(1);
    expect(ps[0].text).toContain("deeply nested");
    // should NOT have div nodes
    const divs = findByTag(tree, "div");
    expect(divs).toHaveLength(0);
  });

  it("generates unique paths for all nodes", () => {
    const tree = buildTree(SIMPLE_HTML, "https://example.com");
    const paths = allPaths(tree);
    const unique = new Set(paths);
    expect(unique.size).toBe(paths.length);
  });

  it("sibling nodes of the same tag get bracket notation", () => {
    const tree = buildTree(SIMPLE_HTML, "https://example.com");
    const h2s = findByTag(tree, "h2");
    expect(h2s[0].path).not.toContain("["); // first h2 has no bracket
    expect(h2s[1].path).toContain("[1]"); // second h2 gets [1]
  });

  // Previously titled "href attr preserved on anchor tags", but no href was
  // ever asserted (and an unused `_links` lookup was left behind). The title
  // now states what the test actually checks.
  it("keeps anchor text inside the flattened paragraph text", () => {
    const html = `<div><p>See <a href="https://example.com/docs">the docs</a>.</p></div>`;
    const tree = buildTree(html, "https://example.com");
    // Anchors inside a <p> are inline: the paragraph is flattened to text, so a
    // separate "a" node (and its href) may not survive. Only the link text is
    // required to be present in the paragraph.
    const ps = findByTag(tree, "p");
    expect(ps[0].text).toContain("the docs");
  });
});
159
+
160
+ // ---------------------------------------------------------------------------
161
+ // Tree navigation
162
+ // ---------------------------------------------------------------------------
163
+
164
/**
 * navigateTree: exact-path lookup inside a tree produced by buildTree.
 * A path that matches nothing yields `null` (see the non-existent-path test).
 */
describe("navigateTree", () => {
  it("returns the root when path is 'article'", () => {
    const tree = buildTree(SIMPLE_HTML, "https://example.com");
    const node = navigateTree(tree, "article");
    expect(node).toBe(tree);
  });

  it("returns a child node by exact path", () => {
    const tree = buildTree(SIMPLE_HTML, "https://example.com");
    const h1s = findByTag(tree, "h1");
    const h1Path = h1s[0].path;
    const node = navigateTree(tree, h1Path);
    // A miss is reported as `null`, which `toBeDefined()` would accept, so
    // assert against null explicitly.
    expect(node).not.toBeNull();
    expect(node?.tag).toBe("h1");
    expect(node?.text).toContain("Getting Started");
  });

  it("returns null for a non-existent path", () => {
    const tree = buildTree(SIMPLE_HTML, "https://example.com");
    expect(navigateTree(tree, "article.section[99].p")).toBeNull();
  });

  it("returns a pre node and its content is intact", () => {
    const tree = buildTree(SIMPLE_HTML, "https://example.com");
    const pres = findByTag(tree, "pre");
    const node = navigateTree(tree, pres[0].path);
    expect(node?.tag).toBe("pre");
    expect(node?.text).toContain("npm install");
  });
});
194
+
195
+ // ---------------------------------------------------------------------------
196
+ // Tree fuzzy search
197
+ // ---------------------------------------------------------------------------
198
+
199
/**
 * queryTree: text search over a built tree. Each hit exposes `node`, `path`,
 * `score` and `snippet`, as exercised by the assertions below.
 */
describe("queryTree", () => {
  it("returns empty array for blank query", () => {
    const tree = buildTree(SIMPLE_HTML, "https://example.com");
    expect(queryTree(tree, "")).toEqual([]);
    expect(queryTree(tree, " ")).toEqual([]);
  });

  it("finds a heading by text", () => {
    const tree = buildTree(SIMPLE_HTML, "https://example.com");
    const hits = queryTree(tree, "installation");
    expect(hits.length).toBeGreaterThan(0);
    expect(hits[0].node.tag).toBe("h2");
    expect(hits[0].node.text).toContain("Installation");
  });

  it("finds content in a paragraph", () => {
    const tree = buildTree(SIMPLE_HTML, "https://example.com");
    const hits = queryTree(tree, "introduction");
    expect(hits.length).toBeGreaterThan(0);
    expect(hits[0].snippet).toContain("introduction");
  });

  it("returns code blocks for code queries", () => {
    const tree = buildTree(SIMPLE_HTML, "https://example.com");
    const hits = queryTree(tree, "npm install my-package");
    expect(hits.length).toBeGreaterThan(0);
    const codeHit = hits.find((h) => h.node.tag === "pre");
    expect(codeHit).toBeDefined();
    // Optional chaining instead of `!`: a missing hit fails the assertion
    // rather than throwing a TypeError.
    expect(codeHit?.node.text).toContain("npm install my-package");
  });

  it("sorts hits by score descending", () => {
    const tree = buildTree(SIMPLE_HTML, "https://example.com");
    const hits = queryTree(tree, "npm install");
    for (let i = 1; i < hits.length; i++) {
      expect(hits[i].score).toBeLessThanOrEqual(hits[i - 1].score);
    }
  });

  it("respects topN option", () => {
    const tree = buildTree(SIMPLE_HTML, "https://example.com");
    const hits = queryTree(tree, "item", { topN: 1 });
    expect(hits.length).toBeLessThanOrEqual(1);
  });

  it("returns table rows for table queries", () => {
    const tree = buildTree(TABLE_HTML, "https://example.com");
    const hits = queryTree(tree, "speed fast");
    expect(hits.length).toBeGreaterThan(0);
    expect(hits[0].snippet.toLowerCase()).toContain("speed");
  });

  it("does not return duplicate ancestor/descendant pairs", () => {
    const tree = buildTree(SIMPLE_HTML, "https://example.com");
    const hits = queryTree(tree, "npm install");
    // A pair is a violation when one hit's path is a dotted prefix of another's
    // and the ancestor scores at least as high: the descendant should have been
    // deduplicated. Collect the pairs so a failure names them.
    const violations: string[] = [];
    hits.forEach((ancestor, i) => {
      hits.forEach((descendant, j) => {
        if (i === j) return;
        const isDescendant = descendant.path.startsWith(`${ancestor.path}.`);
        if (isDescendant && ancestor.score >= descendant.score) {
          violations.push(`${ancestor.path} (${ancestor.score}) shadows ${descendant.path} (${descendant.score})`);
        }
      });
    });
    expect(violations).toEqual([]);
  });

  it("snippet is non-empty for every hit", () => {
    const tree = buildTree(SIMPLE_HTML, "https://example.com");
    const hits = queryTree(tree, "install");
    for (const h of hits) {
      expect(h.snippet.trim().length).toBeGreaterThan(0);
    }
  });
});
@@ -0,0 +1,169 @@
1
+ /**
2
+ * TDD tests for ImageRef and SpideredPage.images
3
+ */
4
+
5
+ import { describe, expect, it } from "vitest";
6
+ import type { ImageRef, SpideredPage } from "../src/types.js";
7
+
8
+ // ---------------------------------------------------------------------------
9
+ // Helpers
10
+ // ---------------------------------------------------------------------------
11
+
12
/**
 * Creates a SpideredPage filled with neutral placeholder values, then applies
 * `overrides` on top so a test only spells out the fields it cares about.
 * `fetchedAt` is stamped with the current time on every call.
 */
function makeMinimalPage(overrides: Partial<SpideredPage> = {}): SpideredPage {
  const defaults: SpideredPage = {
    url: "https://example.com",
    domain: "example.com",
    fetchedAt: new Date().toISOString(),
    title: "Test",
    description: "",
    author: "",
    publishedAt: "",
    lang: "en",
    tags: [],
    wordCount: 0,
    readingTimeMinutes: 0,
    headings: [],
    chunks: [],
    links: [],
    markdown: "",
  };
  // Later spread wins, so any key present in `overrides` replaces the default.
  return { ...defaults, ...overrides };
}
33
+
34
/**
 * Runtime guard: an ImageRef must carry its payload as `base64`, `filePath`,
 * or both. The check is truthiness-based, so empty strings count as absent.
 *
 * @throws Error when neither field holds a non-empty value.
 */
function assertImageRefHasData(ref: ImageRef): void {
  const hasPayload = Boolean(ref.base64) || Boolean(ref.filePath);
  if (hasPayload) return;
  throw new Error("ImageRef must have at least one of base64 or filePath");
}
40
+
41
+ // ---------------------------------------------------------------------------
42
+ // ImageRef structural tests
43
+ // ---------------------------------------------------------------------------
44
+
45
/**
 * ImageRef shape checks: each case builds a literal annotated as ImageRef (so
 * the compiler validates the shape) and reads the fields back at runtime.
 */
describe("ImageRef type", () => {
  // Fields shared by the photo.jpg cases; each test spreads and extends them.
  const photo = { src: "https://example.com/photo.jpg", mimeType: "image/jpeg" };

  it("accepts a minimal ImageRef with only required fields", () => {
    const minimal: ImageRef = { ...photo, alt: "A photo" };
    expect(minimal.src).toBe("https://example.com/photo.jpg");
    expect(minimal.mimeType).toBe("image/jpeg");
    expect(minimal.alt).toBe("A photo");
    expect(minimal.base64).toBeUndefined();
    expect(minimal.filePath).toBeUndefined();
  });

  it("accepts an ImageRef with base64", () => {
    const inline: ImageRef = { ...photo, alt: "A photo", base64: "abc123" };
    expect(inline.base64).toBe("abc123");
  });

  it("accepts an ImageRef with filePath", () => {
    const onDisk: ImageRef = {
      src: "https://example.com/large.jpg",
      mimeType: "image/jpeg",
      alt: "Large image",
      filePath: "/home/user/.cache/web-spider/images/abc123.jpg",
    };
    expect(onDisk.filePath).toContain("abc123.jpg");
  });

  it("accepts an ImageRef with both base64 and filePath", () => {
    const both: ImageRef = { ...photo, alt: "", base64: "xyz", filePath: "/tmp/xyz.jpg" };
    expect(both.base64).toBe("xyz");
    expect(both.filePath).toBe("/tmp/xyz.jpg");
  });

  it("accepts empty string alt (no alt attribute)", () => {
    const noAlt: ImageRef = {
      src: "https://example.com/no-alt.jpg",
      mimeType: "image/jpeg",
      alt: "",
    };
    expect(noAlt.alt).toBe("");
  });

  it("accepts data: URL in src field", () => {
    const embedded: ImageRef = {
      src: "data:image/png;base64,iVBORw0KGgo=",
      mimeType: "image/png",
      alt: "Inline image",
      base64: "iVBORw0KGgo=",
    };
    expect(embedded.src).toMatch(/^data:/);
  });
});
110
+
111
+ // ---------------------------------------------------------------------------
112
+ // Runtime guard: assertImageRefHasData
113
+ // ---------------------------------------------------------------------------
114
+
115
/**
 * Exercises assertImageRefHasData: it accepts a ref carrying base64 or
 * filePath and throws when both are missing.
 */
describe("ImageRef data guard", () => {
  // Payload-free ref; each case adds (or omits) the data field under test.
  const bare = { src: "https://x.com/a.jpg", mimeType: "image/jpeg", alt: "" };

  it("passes when base64 is present", () => {
    const withBase64: ImageRef = { ...bare, base64: "abc" };
    expect(() => assertImageRefHasData(withBase64)).not.toThrow();
  });

  it("passes when filePath is present", () => {
    const withFile: ImageRef = { ...bare, filePath: "/tmp/a.jpg" };
    expect(() => assertImageRefHasData(withFile)).not.toThrow();
  });

  it("throws when neither base64 nor filePath is present", () => {
    const empty: ImageRef = { ...bare };
    expect(() => assertImageRefHasData(empty)).toThrow("at least one of base64 or filePath");
  });
});
131
+
132
+ // ---------------------------------------------------------------------------
133
+ // SpideredPage.images field
134
+ // ---------------------------------------------------------------------------
135
+
136
/**
 * SpideredPage.images: the field is optional, may be empty, and carries
 * ImageRef entries that can be turned into `data:` URLs.
 */
describe("SpideredPage.images field", () => {
  it("is optional — SpideredPage without images is valid", () => {
    const page = makeMinimalPage();
    expect(page.images).toBeUndefined();
  });

  it("accepts an empty images array", () => {
    const page = makeMinimalPage({ images: [] });
    expect(page.images).toEqual([]);
  });

  it("accepts a populated images array", () => {
    const images: ImageRef[] = [
      { src: "https://example.com/a.jpg", mimeType: "image/jpeg", alt: "A", base64: "abc" },
      { src: "https://example.com/b.png", mimeType: "image/png", alt: "B", base64: "def" },
    ];
    const page = makeMinimalPage({ images });
    expect(page.images).toHaveLength(2);
    // Optional chaining instead of `!`: a missing array or entry fails the
    // assertion with a readable diff rather than throwing a TypeError.
    expect(page.images?.[0]?.src).toBe("https://example.com/a.jpg");
    expect(page.images?.[1]?.mimeType).toBe("image/png");
  });

  it("produces a valid LLM data URL from an ImageRef", () => {
    const ref: ImageRef = {
      src: "https://example.com/photo.jpg",
      mimeType: "image/jpeg",
      alt: "Photo",
      base64: "abc123XYZ",
    };
    const dataUrl = `data:${ref.mimeType};base64,${ref.base64}`;
    expect(dataUrl).toBe("data:image/jpeg;base64,abc123XYZ");
    expect(dataUrl).toMatch(/^data:image\//);
  });
});
@@ -0,0 +1,180 @@
1
+ /**
2
+ * Integration tests — real network, real APIs.
3
+ *
4
+ * Each suite skips cleanly when its API key is absent so CI without secrets
5
+ * still passes. Run locally with keys set:
6
+ *
7
+ * TAVILY_API_KEY=tvly-... vitest run test/web-search-integration.test.ts
8
+ *
9
+ * DDG tests always run — no key required.
10
+ */
11
+
12
+ import { describe, expect, it } from "vitest";
13
+ import type { WebSearchResult } from "../src/ports.js";
14
+ import {
15
+ DdgSearchEngine,
16
+ FallbackSearchEngine,
17
+ TavilySearchEngine,
18
+ ddgSearch,
19
+ tavilySearch,
20
+ webSearch,
21
+ } from "../src/web-search.js";
22
+
23
+ // ---------------------------------------------------------------------------
24
+ // Shared contract assertions
25
+ // ---------------------------------------------------------------------------
26
+
27
/**
 * Shared contract check for search results: `results` must be an array of at
 * least `minCount` entries, each with an http(s) `url`, a non-empty string
 * `title`, and a string `snippet`.
 */
function assertResults(results: WebSearchResult[], minCount = 1) {
  expect(Array.isArray(results)).toBe(true);
  expect(results.length).toBeGreaterThanOrEqual(minCount);
  results.forEach((entry) => {
    expect(typeof entry.url).toBe("string");
    expect(entry.url).toMatch(/^https?:\/\//);
    expect(typeof entry.title).toBe("string");
    expect(entry.title.length).toBeGreaterThan(0);
    expect(typeof entry.snippet).toBe("string");
  });
}
38
+
39
+ // ---------------------------------------------------------------------------
40
+ // DuckDuckGo — no key required, always runs
41
+ // ---------------------------------------------------------------------------
42
+
43
/**
 * Live DuckDuckGo checks. No API key is involved, so this suite is never
 * skipped; most cases only require that the call resolves to an array.
 */
describe("ddgSearch() — live DDG Instant Answer API", () => {
  it("returns a parseable JSON response (catches Brotli/encoding issues)", async () => {
    // Regression guard for the Node fetch + Brotli problem: DDG answers with
    // Content-Encoding: br by default, undici does not decompress it, and the
    // resulting empty body fails JSON parsing. The remedy is to send
    // Accept-Encoding: gzip, deflate in the request headers.
    const found = await ddgSearch("OpenAI", { numResults: 5 });
    // Zero results is acceptable for some queries; the point is that nothing throws.
    expect(Array.isArray(found)).toBe(true);
  });

  it("returns well-known entity results for an unambiguous query", async () => {
    const found = await ddgSearch("Node.js", { numResults: 5 });
    assertResults(found);
  });

  it("every result has a valid http(s) URL", async () => {
    const found = await ddgSearch("TypeScript language", { numResults: 8 });
    const allowedProtocols = ["http:", "https:"];
    for (const item of found) {
      expect(() => new URL(item.url)).not.toThrow();
      expect(allowedProtocols).toContain(new URL(item.url).protocol);
    }
  });

  it("respects numResults cap", async () => {
    const found = await ddgSearch("JavaScript", { numResults: 3 });
    expect(found.length).toBeLessThanOrEqual(3);
  });

  it("returns an empty array (not a throw) for a nonsense query", async () => {
    const found = await ddgSearch("xyzzy-quux-frumious-bandersnatch-99999");
    expect(Array.isArray(found)).toBe(true);
  });

  it("DdgSearchEngine.search() delegates correctly", async () => {
    const ddg = new DdgSearchEngine();
    const found = await ddg.search({ query: "OpenAI", numResults: 3 });
    expect(Array.isArray(found)).toBe(true);
  });
});
83
+
84
+ // ---------------------------------------------------------------------------
85
+ // Tavily — skips when TAVILY_API_KEY is absent
86
+ // ---------------------------------------------------------------------------
87
+
88
// Tavily key read from the environment; when it is unset the Tavily suite is
// registered through `describe.skip`.
const TAVILY_KEY = process.env["TAVILY_API_KEY"];
const describeTavily = TAVILY_KEY ? describe : describe.skip;

/** Live Tavily checks; every case performs a real API call. */
describeTavily("tavilySearch() — live Tavily API", () => {
  it("returns results for a straightforward query", async () => {
    const found = await tavilySearch("hexagonal architecture TypeScript", { numResults: 3 });
    assertResults(found, 1);
  });

  it("every result has a valid URL and non-empty title", async () => {
    const found = await tavilySearch("web scraping AI agents", { numResults: 5 });
    assertResults(found, 1);
  });

  it("respects numResults", async () => {
    const found = await tavilySearch("JavaScript", { numResults: 2 });
    expect(found.length).toBeLessThanOrEqual(2);
  });

  it("TavilySearchEngine.search() delegates correctly", async () => {
    // The key is asserted non-null with `!`; this body runs only when the
    // suite is not skipped, i.e. when TAVILY_KEY is set.
    const tavily = new TavilySearchEngine(TAVILY_KEY!);
    const found = await tavily.search({ query: "DuckDuckGo API", numResults: 3 });
    assertResults(found, 1);
  });

  it("throws a clear error when the key is wrong", async () => {
    const attempt = tavilySearch("test", { apiKey: "tvly-invalid-key-000" });
    await expect(attempt).rejects.toThrow(/tavily/i);
  });
});
119
+
120
+ // ---------------------------------------------------------------------------
121
+ // FallbackSearchEngine — Tavily → DDG end-to-end
122
+ // ---------------------------------------------------------------------------
123
+
124
// Only the case that calls the real Tavily API needs the key. The stub-based
// cases replace Tavily entirely and fall through to DDG (which needs no key),
// so they are no longer skipped along with the rest of the suite.
const itFallbackLive = TAVILY_KEY ? it : it.skip;

/** FallbackSearchEngine: engines are tried in order until one yields results. */
describe("FallbackSearchEngine — Tavily → DDG live chain", () => {
  itFallbackLive("Tavily wins for a normal query (DDG never needed)", async () => {
    const engine = new FallbackSearchEngine([
      new TavilySearchEngine(TAVILY_KEY!),
      new DdgSearchEngine(),
    ]);
    const results = await engine.search({ query: "TypeScript strategy pattern", numResults: 3 });
    assertResults(results, 1);
  });

  it("DDG provides results when Tavily is replaced with a failing stub", async () => {
    const alwaysFails = {
      search: async () => {
        throw new Error("simulated Tavily outage");
      },
    };
    const engine = new FallbackSearchEngine([alwaysFails, new DdgSearchEngine()]);
    // DDG may return empty for some queries but must not throw
    const results = await engine.search({ query: "OpenAI", numResults: 3 });
    expect(Array.isArray(results)).toBe(true);
  });

  it("DDG provides results when Tavily returns empty", async () => {
    const alwaysEmpty = { search: async () => [] };
    const engine = new FallbackSearchEngine([alwaysEmpty, new DdgSearchEngine()]);
    const results = await engine.search({ query: "Node.js", numResults: 3 });
    expect(Array.isArray(results)).toBe(true);
  });
});
151
+
152
+ // ---------------------------------------------------------------------------
153
+ // webSearch() — auto-detect from env
154
+ // ---------------------------------------------------------------------------
155
+
156
// Only the cases that depend on Tavily are gated on the key. The 'ddg' case
// needs no key and the 'brave' case only checks the missing-key error, so both
// now run even when TAVILY_API_KEY is absent.
const itWebSearchLive = TAVILY_KEY ? it : it.skip;

/** webSearch(): engine selection, either auto-detected from env or forced via `engine`. */
describe("webSearch() — auto-detects Tavily from env", () => {
  itWebSearchLive("returns results without specifying an engine", async () => {
    const results = await webSearch("open source web crawler", { numResults: 3 });
    assertResults(results, 1);
  });

  itWebSearchLive("returns results when engine is forced to 'tavily'", async () => {
    const results = await webSearch("AI coding assistant", { engine: "tavily", numResults: 3 });
    assertResults(results, 1);
  });

  it("returns results when engine is forced to 'ddg'", async () => {
    const results = await webSearch("OpenAI", { engine: "ddg", numResults: 5 });
    expect(Array.isArray(results)).toBe(true);
  });

  it("throws a descriptive error when forced to 'brave' with no key set", async () => {
    const saved = process.env["BRAVE_SEARCH_API_KEY"];
    delete process.env["BRAVE_SEARCH_API_KEY"];
    try {
      await expect(webSearch("test", { engine: "brave" })).rejects.toThrow("BRAVE_SEARCH_API_KEY");
    } finally {
      // Restore even when the assertion fails so later tests see the original
      // environment; compare against undefined so an empty-string value is
      // put back too.
      if (saved !== undefined) process.env["BRAVE_SEARCH_API_KEY"] = saved;
    }
  });
});
180
+ });