@dpopsuev/web-spider 0.10.4 → 0.10.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. package/dist/batch.js.map +1 -0
  2. package/dist/cache.js.map +1 -0
  3. package/dist/convert.js.map +1 -0
  4. package/dist/crawl.js.map +1 -0
  5. package/dist/disk-cache.js.map +1 -0
  6. package/dist/graph.js.map +1 -0
  7. package/dist/index.js.map +1 -0
  8. package/dist/parse.js.map +1 -0
  9. package/dist/playwright.js.map +1 -0
  10. package/dist/ports.js.map +1 -0
  11. package/dist/robots.js.map +1 -0
  12. package/dist/search.js.map +1 -0
  13. package/dist/sitemap.js.map +1 -0
  14. package/dist/spider.js.map +1 -0
  15. package/dist/throttle.js.map +1 -0
  16. package/dist/tree.js.map +1 -0
  17. package/dist/types.js.map +1 -0
  18. package/dist/views.js.map +1 -0
  19. package/dist/web-search.js.map +1 -0
  20. package/package.json +2 -1
  21. package/fixtures/article-with-images.html +0 -94
  22. package/fixtures/gh-shell.html +0 -32
  23. package/fixtures/guide-ai-agents-web-scraping.json +0 -552
  24. package/fixtures/images/large.jpg +0 -0
  25. package/fixtures/images/small.jpg +0 -0
  26. package/fixtures/images/tiny.png +0 -0
  27. package/fixtures/quotes-index.json +0 -40
  28. package/scripts/fetch-guide.mjs +0 -25
  29. package/src/cache.ts +0 -99
  30. package/src/convert.ts +0 -161
  31. package/src/crawl.ts +0 -186
  32. package/src/disk-cache.ts +0 -228
  33. package/src/graph.ts +0 -189
  34. package/src/index.ts +0 -74
  35. package/src/parse.ts +0 -154
  36. package/src/playwright.ts +0 -193
  37. package/src/ports.ts +0 -131
  38. package/src/robots.ts +0 -121
  39. package/src/search.ts +0 -173
  40. package/src/sitemap.ts +0 -67
  41. package/src/spider.ts +0 -475
  42. package/src/throttle.ts +0 -118
  43. package/src/tree.ts +0 -379
  44. package/src/types.ts +0 -225
  45. package/src/views.ts +0 -42
  46. package/src/web-search.ts +0 -548
  47. package/test/convert-images.test.ts +0 -69
  48. package/test/disk-cache-images.test.ts +0 -193
  49. package/test/engine-registry.test.ts +0 -114
  50. package/test/exports.test.ts +0 -124
  51. package/test/get-chunk.test.ts +0 -115
  52. package/test/images-integration.test.ts +0 -359
  53. package/test/improvements.test.ts +0 -279
  54. package/test/inbound-count.test.ts +0 -111
  55. package/test/lean.test.ts +0 -105
  56. package/test/playwright.test.ts +0 -128
  57. package/test/ports.test.ts +0 -161
  58. package/test/search.test.ts +0 -219
  59. package/test/spider-images.test.ts +0 -180
  60. package/test/spider-unit.test.ts +0 -610
  61. package/test/tree.test.ts +0 -272
  62. package/test/types.test.ts +0 -169
  63. package/test/web-search-integration.test.ts +0 -180
  64. package/test/web-search.test.ts +0 -305
  65. package/tsconfig.json +0 -9
  66. package/tsconfig.test.json +0 -7
  67. package/vitest.config.ts +0 -8
@@ -1,305 +0,0 @@
1
- /**
2
- * Unit tests for the web-search strategy layer.
3
- *
4
- * No network calls — every ISearchEngine is stubbed so these run offline
5
- * and exercise the FallbackSearchEngine composition logic in isolation.
6
- */
7
-
8
- import { describe, expect, it, vi } from "vitest";
9
- import type { ISearchEngine, SearchQuery, WebSearchResult } from "../src/ports.js";
10
- import { DdgSearchEngine, FallbackSearchEngine, TavilySearchEngine } from "../src/web-search.js";
11
-
12
- // ---------------------------------------------------------------------------
13
- // Helpers
14
- // ---------------------------------------------------------------------------
15
-
16
- const RESULT_A: WebSearchResult = { url: "https://a.example", title: "A", snippet: "snippet a" };
17
- const RESULT_B: WebSearchResult = { url: "https://b.example", title: "B", snippet: "snippet b" };
18
-
19
- /** Stub engine that resolves with a fixed result list. */
20
- function okEngine(results: WebSearchResult[]): ISearchEngine {
21
- return { search: vi.fn().mockResolvedValue(results) };
22
- }
23
-
24
- /** Stub engine that always throws. */
25
- function failEngine(message = "engine error"): ISearchEngine {
26
- return { search: vi.fn().mockRejectedValue(new Error(message)) };
27
- }
28
-
29
- const REQ: SearchQuery = { query: "test query", numResults: 5 };
30
-
31
- // ---------------------------------------------------------------------------
32
- // FallbackSearchEngine — construction guards
33
- // ---------------------------------------------------------------------------
34
-
35
- describe("FallbackSearchEngine — construction", () => {
36
- it("throws when constructed with an empty engines array", () => {
37
- expect(() => new FallbackSearchEngine([])).toThrow("at least one engine");
38
- });
39
-
40
- it("accepts a single engine", () => {
41
- expect(() => new FallbackSearchEngine([okEngine([])])).not.toThrow();
42
- });
43
- });
44
-
45
- // ---------------------------------------------------------------------------
46
- // FallbackSearchEngine — happy path
47
- // ---------------------------------------------------------------------------
48
-
49
- describe("FallbackSearchEngine — first engine succeeds", () => {
50
- it("returns first engine's results without calling subsequent engines", async () => {
51
- const first = okEngine([RESULT_A]);
52
- const second = okEngine([RESULT_B]);
53
- const fb = new FallbackSearchEngine([first, second]);
54
-
55
- const results = await fb.search(REQ);
56
-
57
- expect(results).toEqual([RESULT_A]);
58
- expect(first.search).toHaveBeenCalledOnce();
59
- expect(second.search).not.toHaveBeenCalled();
60
- });
61
-
62
- it("forwards query and numResults to the engine", async () => {
63
- const engine = okEngine([RESULT_A]);
64
- const fb = new FallbackSearchEngine([engine]);
65
- const req: SearchQuery = { query: "hello", numResults: 3 };
66
-
67
- await fb.search(req);
68
-
69
- expect(engine.search).toHaveBeenCalledWith(req);
70
- });
71
- });
72
-
73
- // ---------------------------------------------------------------------------
74
- // FallbackSearchEngine — fallbackOnEmpty (default: true)
75
- // ---------------------------------------------------------------------------
76
-
77
- describe("FallbackSearchEngine — fallbackOnEmpty", () => {
78
- it("falls through to second engine when first returns empty (default)", async () => {
79
- const first = okEngine([]);
80
- const second = okEngine([RESULT_B]);
81
- const fb = new FallbackSearchEngine([first, second]);
82
-
83
- const results = await fb.search(REQ);
84
-
85
- expect(results).toEqual([RESULT_B]);
86
- expect(first.search).toHaveBeenCalledOnce();
87
- expect(second.search).toHaveBeenCalledOnce();
88
- });
89
-
90
- it("does NOT fall through when fallbackOnEmpty is false", async () => {
91
- const first = okEngine([]);
92
- const second = okEngine([RESULT_B]);
93
- const fb = new FallbackSearchEngine([first, second], { fallbackOnEmpty: false });
94
-
95
- const results = await fb.search(REQ);
96
-
97
- expect(results).toEqual([]);
98
- expect(second.search).not.toHaveBeenCalled();
99
- });
100
-
101
- it("returns empty when all engines return empty", async () => {
102
- const fb = new FallbackSearchEngine([okEngine([]), okEngine([])]);
103
- const results = await fb.search(REQ);
104
- expect(results).toEqual([]);
105
- });
106
- });
107
-
108
- // ---------------------------------------------------------------------------
109
- // FallbackSearchEngine — fallbackOnError (default: true)
110
- // ---------------------------------------------------------------------------
111
-
112
- describe("FallbackSearchEngine — fallbackOnError", () => {
113
- it("falls through to next engine on error (default)", async () => {
114
- const first = failEngine("network timeout");
115
- const second = okEngine([RESULT_B]);
116
- const fb = new FallbackSearchEngine([first, second]);
117
-
118
- const results = await fb.search(REQ);
119
-
120
- expect(results).toEqual([RESULT_B]);
121
- expect(second.search).toHaveBeenCalledOnce();
122
- });
123
-
124
- it("re-throws immediately when fallbackOnError is false", async () => {
125
- const first = failEngine("api key invalid");
126
- const second = okEngine([RESULT_B]);
127
- const fb = new FallbackSearchEngine([first, second], { fallbackOnError: false });
128
-
129
- await expect(fb.search(REQ)).rejects.toThrow("api key invalid");
130
- expect(second.search).not.toHaveBeenCalled();
131
- });
132
-
133
- it("re-throws last error when all engines fail", async () => {
134
- const fb = new FallbackSearchEngine([
135
- failEngine("first error"),
136
- failEngine("second error"),
137
- ]);
138
-
139
- await expect(fb.search(REQ)).rejects.toThrow("second error");
140
- });
141
-
142
- it("falls through on error then on empty before returning results", async () => {
143
- const first = failEngine("timeout");
144
- const second = okEngine([]);
145
- const third = okEngine([RESULT_A]);
146
- const fb = new FallbackSearchEngine([first, second, third]);
147
-
148
- const results = await fb.search(REQ);
149
-
150
- expect(results).toEqual([RESULT_A]);
151
- expect(third.search).toHaveBeenCalledOnce();
152
- });
153
- });
154
-
155
- // ---------------------------------------------------------------------------
156
- // FallbackSearchEngine — composability (nested)
157
- // ---------------------------------------------------------------------------
158
-
159
- describe("FallbackSearchEngine — composability", () => {
160
- it("can be nested inside another FallbackSearchEngine", async () => {
161
- // Inner chain: fails → empty
162
- const inner = new FallbackSearchEngine([failEngine(), okEngine([])]);
163
- // Outer chain: inner → RESULT_B
164
- const outer = new FallbackSearchEngine([inner, okEngine([RESULT_B])]);
165
-
166
- const results = await outer.search(REQ);
167
- expect(results).toEqual([RESULT_B]);
168
- });
169
-
170
- it("implements ISearchEngine — assignable to the port type", () => {
171
- const fb: ISearchEngine = new FallbackSearchEngine([okEngine([])]);
172
- expect(typeof fb.search).toBe("function");
173
- });
174
- });
175
-
176
- // ---------------------------------------------------------------------------
177
- // DdgSearchEngine — port conformance (no network)
178
- // ---------------------------------------------------------------------------
179
-
180
- describe("DdgSearchEngine — port conformance", () => {
181
- it("implements ISearchEngine", () => {
182
- const engine: ISearchEngine = new DdgSearchEngine();
183
- expect(typeof engine.search).toBe("function");
184
- });
185
-
186
- it("can be placed inside a FallbackSearchEngine chain", async () => {
187
- // We don't call the real DDG here — just assert structural compatibility.
188
- const ddg = new DdgSearchEngine();
189
- const fb = new FallbackSearchEngine([okEngine([RESULT_A]), ddg]);
190
-
191
- // First engine returns results — DDG never called (no network needed)
192
- const results = await fb.search(REQ);
193
- expect(results).toEqual([RESULT_A]);
194
- });
195
- });
196
-
197
- // ---------------------------------------------------------------------------
198
- // TavilySearchEngine — missing key throws (guards)
199
- // ---------------------------------------------------------------------------
200
-
201
- describe("TavilySearchEngine — key guard", () => {
202
- it("throws when no API key is provided and env var is absent", async () => {
203
- const savedKey = process.env["TAVILY_API_KEY"];
204
- delete process.env["TAVILY_API_KEY"];
205
-
206
- const engine = new TavilySearchEngine(""); // empty string = no key
207
- await expect(engine.search(REQ)).rejects.toThrow();
208
-
209
- if (savedKey !== undefined) process.env["TAVILY_API_KEY"] = savedKey;
210
- });
211
- });
212
-
213
- // ---------------------------------------------------------------------------
214
- // Recommended composition: Tavily → DDG
215
- // ---------------------------------------------------------------------------
216
-
217
- describe("Tavily + DDG fallback pattern", () => {
218
- it("returns Tavily results when Tavily succeeds", async () => {
219
- const tavily = okEngine([RESULT_A]);
220
- const ddg = okEngine([RESULT_B]);
221
- const engine = new FallbackSearchEngine([tavily, ddg]);
222
-
223
- const results = await engine.search(REQ);
224
- expect(results).toEqual([RESULT_A]);
225
- expect(ddg.search).not.toHaveBeenCalled();
226
- });
227
-
228
- it("falls back to DDG when Tavily returns empty", async () => {
229
- const tavily = okEngine([]);
230
- const ddg = okEngine([RESULT_B]);
231
- const engine = new FallbackSearchEngine([tavily, ddg]);
232
-
233
- const results = await engine.search(REQ);
234
- expect(results).toEqual([RESULT_B]);
235
- });
236
-
237
- it("falls back to DDG when Tavily throws (e.g. rate limit)", async () => {
238
- const tavily = failEngine("429 rate limit");
239
- const ddg = okEngine([RESULT_B]);
240
- const engine = new FallbackSearchEngine([tavily, ddg]);
241
-
242
- const results = await engine.search(REQ);
243
- expect(results).toEqual([RESULT_B]);
244
- });
245
-
246
- it("returns empty when both Tavily and DDG find nothing", async () => {
247
- const engine = new FallbackSearchEngine([okEngine([]), okEngine([])]);
248
- const results = await engine.search(REQ);
249
- expect(results).toEqual([]);
250
- });
251
- });
252
-
253
- // ---------------------------------------------------------------------------
254
- // SearchQuery — timeRange and topic fields
255
- // ---------------------------------------------------------------------------
256
-
257
- describe("SearchQuery — timeRange and topic", () => {
258
- it("SearchQuery accepts timeRange field", () => {
259
- const req: SearchQuery = { query: "AI agents", numResults: 5, timeRange: "month" };
260
- expect(req.timeRange).toBe("month");
261
- });
262
-
263
- it("SearchQuery accepts topic field", () => {
264
- const req: SearchQuery = { query: "latest news", topic: "news" };
265
- expect(req.topic).toBe("news");
266
- });
267
-
268
- it("FallbackSearchEngine forwards timeRange and topic to each engine", async () => {
269
- const spy = vi.fn().mockResolvedValue([RESULT_A]);
270
- const engine = new FallbackSearchEngine([{ search: spy }]);
271
-
272
- await engine.search({ query: "test", timeRange: "week", topic: "news" });
273
-
274
- expect(spy).toHaveBeenCalledWith(
275
- expect.objectContaining({ timeRange: "week", topic: "news" }),
276
- );
277
- });
278
-
279
- it("TavilySearchEngine.search() sends time_range and topic in the POST body", async () => {
280
- // Intercept global fetch to capture what body Tavily receives.
281
- const originalFetch = globalThis.fetch;
282
- let capturedBody: Record<string, unknown> | null = null;
283
-
284
- globalThis.fetch = vi.fn().mockImplementation(async (_url: string, init?: RequestInit) => {
285
- capturedBody = JSON.parse(init?.body as string ?? "{}");
286
- return {
287
- ok: true,
288
- status: 200,
289
- statusText: "OK",
290
- headers: { get: () => "application/json" },
291
- json: async () => ({
292
- results: [{ url: "https://a.com", title: "A", content: "snippet" }],
293
- }),
294
- };
295
- }) as typeof fetch;
296
-
297
- try {
298
- const engine = new TavilySearchEngine("test-key");
299
- await engine.search({ query: "ona", timeRange: "month", topic: "news" });
300
- expect(capturedBody).toMatchObject({ time_range: "month", topic: "news" });
301
- } finally {
302
- globalThis.fetch = originalFetch;
303
- }
304
- });
305
- });
package/tsconfig.json DELETED
@@ -1,9 +0,0 @@
1
- {
2
- "extends": "../../tsconfig.base.json",
3
- "compilerOptions": {
4
- "outDir": "./dist",
5
- "rootDir": "./src",
6
- "lib": ["ES2022", "DOM", "DOM.Iterable"]
7
- },
8
- "include": ["src/**/*"]
9
- }
@@ -1,7 +0,0 @@
1
- {
2
- "extends": "../../tsconfig.base.json",
3
- "compilerOptions": {
4
- "noEmit": true
5
- },
6
- "include": ["src/**/*", "test/**/*", "vitest.config.ts"]
7
- }
package/vitest.config.ts DELETED
@@ -1,8 +0,0 @@
1
- import { defineConfig } from "vitest/config"
2
-
3
- export default defineConfig({
4
- test: {
5
- globals: true,
6
- environment: "node",
7
- },
8
- })