@dpopsuev/web-spider 0.10.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105) hide show
  1. package/dist/batch.d.ts +24 -0
  2. package/dist/batch.d.ts.map +1 -0
  3. package/dist/batch.js +68 -0
  4. package/dist/cache.d.ts +40 -0
  5. package/dist/cache.d.ts.map +1 -0
  6. package/dist/cache.js +78 -0
  7. package/dist/convert.d.ts +29 -0
  8. package/dist/convert.d.ts.map +1 -0
  9. package/dist/convert.js +131 -0
  10. package/dist/crawl.d.ts +56 -0
  11. package/dist/crawl.d.ts.map +1 -0
  12. package/dist/crawl.js +126 -0
  13. package/dist/disk-cache.d.ts +75 -0
  14. package/dist/disk-cache.d.ts.map +1 -0
  15. package/dist/disk-cache.js +185 -0
  16. package/dist/graph.d.ts +76 -0
  17. package/dist/graph.d.ts.map +1 -0
  18. package/dist/graph.js +156 -0
  19. package/dist/index.d.ts +45 -0
  20. package/dist/index.d.ts.map +1 -0
  21. package/dist/index.js +44 -0
  22. package/dist/parse.d.ts +27 -0
  23. package/dist/parse.d.ts.map +1 -0
  24. package/dist/parse.js +131 -0
  25. package/dist/playwright.d.ts +75 -0
  26. package/dist/playwright.d.ts.map +1 -0
  27. package/dist/playwright.js +141 -0
  28. package/dist/ports.d.ts +104 -0
  29. package/dist/ports.d.ts.map +1 -0
  30. package/dist/ports.js +10 -0
  31. package/dist/robots.d.ts +24 -0
  32. package/dist/robots.d.ts.map +1 -0
  33. package/dist/robots.js +104 -0
  34. package/dist/search.d.ts +47 -0
  35. package/dist/search.d.ts.map +1 -0
  36. package/dist/search.js +112 -0
  37. package/dist/sitemap.d.ts +15 -0
  38. package/dist/sitemap.d.ts.map +1 -0
  39. package/dist/sitemap.js +65 -0
  40. package/dist/spider.d.ts +74 -0
  41. package/dist/spider.d.ts.map +1 -0
  42. package/dist/spider.js +349 -0
  43. package/dist/throttle.d.ts +49 -0
  44. package/dist/throttle.d.ts.map +1 -0
  45. package/dist/throttle.js +85 -0
  46. package/dist/tree.d.ts +34 -0
  47. package/dist/tree.d.ts.map +1 -0
  48. package/dist/tree.js +354 -0
  49. package/dist/types.d.ts +189 -0
  50. package/dist/types.d.ts.map +1 -0
  51. package/dist/types.js +2 -0
  52. package/dist/views.d.ts +17 -0
  53. package/dist/views.d.ts.map +1 -0
  54. package/dist/views.js +39 -0
  55. package/dist/web-search.d.ts +184 -0
  56. package/dist/web-search.d.ts.map +1 -0
  57. package/dist/web-search.js +399 -0
  58. package/fixtures/article-with-images.html +94 -0
  59. package/fixtures/gh-shell.html +32 -0
  60. package/fixtures/guide-ai-agents-web-scraping.json +552 -0
  61. package/fixtures/images/large.jpg +0 -0
  62. package/fixtures/images/small.jpg +0 -0
  63. package/fixtures/images/tiny.png +0 -0
  64. package/fixtures/quotes-index.json +40 -0
  65. package/package.json +47 -0
  66. package/scripts/fetch-guide.mjs +25 -0
  67. package/src/cache.ts +99 -0
  68. package/src/convert.ts +161 -0
  69. package/src/crawl.ts +186 -0
  70. package/src/disk-cache.ts +228 -0
  71. package/src/graph.ts +189 -0
  72. package/src/index.ts +74 -0
  73. package/src/parse.ts +154 -0
  74. package/src/playwright.ts +193 -0
  75. package/src/ports.ts +131 -0
  76. package/src/robots.ts +121 -0
  77. package/src/search.ts +173 -0
  78. package/src/sitemap.ts +67 -0
  79. package/src/spider.ts +475 -0
  80. package/src/throttle.ts +118 -0
  81. package/src/tree.ts +379 -0
  82. package/src/types.ts +225 -0
  83. package/src/views.ts +42 -0
  84. package/src/web-search.ts +548 -0
  85. package/test/convert-images.test.ts +69 -0
  86. package/test/disk-cache-images.test.ts +193 -0
  87. package/test/engine-registry.test.ts +114 -0
  88. package/test/exports.test.ts +124 -0
  89. package/test/get-chunk.test.ts +115 -0
  90. package/test/images-integration.test.ts +359 -0
  91. package/test/improvements.test.ts +279 -0
  92. package/test/inbound-count.test.ts +111 -0
  93. package/test/lean.test.ts +105 -0
  94. package/test/playwright.test.ts +128 -0
  95. package/test/ports.test.ts +161 -0
  96. package/test/search.test.ts +219 -0
  97. package/test/spider-images.test.ts +180 -0
  98. package/test/spider-unit.test.ts +610 -0
  99. package/test/tree.test.ts +272 -0
  100. package/test/types.test.ts +169 -0
  101. package/test/web-search-integration.test.ts +180 -0
  102. package/test/web-search.test.ts +305 -0
  103. package/tsconfig.json +9 -0
  104. package/tsconfig.test.json +7 -0
  105. package/vitest.config.ts +8 -0
@@ -0,0 +1,548 @@
1
+ /**
2
+ * Web search API integration — Brave Search, Tavily, Exa, and DuckDuckGo.
3
+ *
4
+ * All engines return a normalised WebSearchResult[].
5
+ * API keys are read from environment variables by default:
6
+ * BRAVE_SEARCH_API_KEY
7
+ * TAVILY_API_KEY, EXA_API_KEY (DuckDuckGo needs no key)
8
+ */
9
+
10
+ // WebSearchResult is defined in ports.ts (the abstraction layer).
11
+ // web-search.ts is an adapter — it imports from the port, not the other way.
12
+ export type { WebSearchResult } from "./ports.js";
13
+ import type { ISearchEngine, SearchQuery, WebSearchResult } from "./ports.js";
14
+
/** Tuning knobs accepted by braveSearch(). Every field is optional. */
export interface BraveSearchOptions {
  /** Brave API key. When omitted, process.env.BRAVE_SEARCH_API_KEY is used. */
  apiKey?: string;
  /** How many results to request (1–20). Defaults to 10. */
  numResults?: number;
  /** ISO 3166-1 alpha-2 country code used to localise results, e.g. "US". */
  country?: string;
  /**
   * Freshness filter, expressed in Brave's own vocabulary:
   *   "pd" = past day, "pw" = past week, "pm" = past month, "py" = past year.
   * Set it directly when calling braveSearch() yourself; the adapter derives
   * it from SearchQuery.timeRange.
   */
  freshness?: "pd" | "pw" | "pm" | "py";
}
29
+
/** Tuning knobs accepted by tavilySearch(). Every field is optional. */
export interface TavilySearchOptions {
  /** Tavily API key. When omitted, process.env.TAVILY_API_KEY is used. */
  apiKey?: string;
  /** How many results to request. Defaults to 5. */
  numResults?: number;
  /** Search depth: "basic" (1 credit) or "advanced" (2 credits). Defaults to "basic". */
  depth?: "basic" | "advanced";
  /** Only return content published within this window. */
  timeRange?: "day" | "week" | "month" | "year";
  /** Topic mode; "news" prioritises fresh news articles. */
  topic?: "news" | "general";
}
42
+
/** Names of the engines that ship pre-registered with this module. */
export type SearchEngine =
  | "brave"
  | "tavily"
  | "exa"
  | "ddg";
44
+
/** Tuning knobs accepted by exaSearch(). Every field is optional. */
export interface ExaSearchOptions {
  /** Exa API key. When omitted, process.env.EXA_API_KEY is used. */
  apiKey?: string;
  /** How many results to request. Defaults to 10. */
  numResults?: number;
  /**
   * Which retrieval mode Exa should use:
   *   "auto"    — Exa picks between keyword and neural (default).
   *   "neural"  — embedding-based semantic search.
   *   "keyword" — traditional keyword search.
   */
  type?: "auto" | "neural" | "keyword";
}
58
+
/**
 * Search the web via the Exa Search API (neural/semantic retrieval).
 * https://exa.ai/docs/reference/search
 *
 * Highlights are requested inline (up to 3 per URL, 2 sentences each) and
 * joined with " … " to form each result's snippet, so no extra round-trip
 * is needed to obtain snippet text.
 *
 * @param query - Free-text search query.
 * @param opts - Optional API key, result count and search type.
 * @returns Normalised results; an empty array when the response carries no `results`.
 * @throws Error when no API key is available or the API answers with a non-2xx
 *   status. A request that exceeds the 15 s budget rejects with fetch's abort error.
 */
export async function exaSearch(query: string, opts: ExaSearchOptions = {}): Promise<WebSearchResult[]> {
  const apiKey = opts.apiKey ?? process.env["EXA_API_KEY"];
  if (!apiKey) throw new Error("Exa API key required — set EXA_API_KEY or pass opts.apiKey");

  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), 15_000);
  try {
    const res = await fetch("https://api.exa.ai/search", {
      method: "POST",
      signal: controller.signal,
      headers: {
        "Content-Type": "application/json",
        "x-api-key": apiKey,
      },
      body: JSON.stringify({
        query,
        numResults: opts.numResults ?? 10,
        type: opts.type ?? "auto",
        contents: {
          highlights: { numSentences: 2, highlightsPerUrl: 3 },
        },
      }),
    });

    if (!res.ok) throw new Error(`Exa API error: ${res.status} ${res.statusText}`);

    // The body is read while the timer is still armed: fetch() resolves as
    // soon as headers arrive, so clearing the timer earlier would let a
    // stalled body hang this call forever.
    const data = (await res.json()) as {
      results?: Array<{
        url: string;
        title: string;
        publishedDate?: string;
        highlights?: string[];
      }>;
    };

    return (data.results ?? []).map((r) => ({
      url: r.url,
      title: r.title,
      snippet: r.highlights?.join(" … ") ?? "",
      ...(r.publishedDate ? { publishedAt: r.publishedDate } : {}),
    }));
  } finally {
    clearTimeout(timer);
  }
}
111
+
/**
 * Search the web via the Brave Search API.
 * https://api.search.brave.com/app/documentation/web-search
 *
 * @param query - Free-text search query.
 * @param opts - Optional API key, result count, country and freshness filter.
 *   `numResults` is truncated to an integer and clamped into 1–20; a
 *   non-finite value falls back to the default of 10.
 * @returns Normalised results; an empty array when the response has no web results.
 * @throws Error when no API key is available or the API answers with a non-2xx
 *   status. A request that exceeds the 10 s budget rejects with fetch's abort error.
 */
export async function braveSearch(query: string, opts: BraveSearchOptions = {}): Promise<WebSearchResult[]> {
  const apiKey = opts.apiKey ?? process.env["BRAVE_SEARCH_API_KEY"];
  if (!apiKey) throw new Error("Brave Search API key required — set BRAVE_SEARCH_API_KEY or pass opts.apiKey");

  // Keep `count` inside the documented 1–20 window on both ends.
  const requested = opts.numResults ?? 10;
  const count = Number.isFinite(requested) ? Math.min(Math.max(Math.trunc(requested), 1), 20) : 10;

  const params = new URLSearchParams({
    q: query,
    count: String(count),
  });
  if (opts.country) params.set("country", opts.country);
  if (opts.freshness) params.set("freshness", opts.freshness);

  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), 10_000);
  try {
    const res = await fetch(`https://api.search.brave.com/res/v1/web/search?${params}`, {
      signal: controller.signal,
      headers: {
        Accept: "application/json",
        "Accept-Encoding": "gzip",
        "X-Subscription-Token": apiKey,
      },
    });

    if (!res.ok) throw new Error(`Brave Search API error: ${res.status} ${res.statusText}`);

    // Parse the body before the timer is cleared so a stalled body is
    // still subject to the timeout.
    const data = (await res.json()) as {
      web?: {
        results?: Array<{
          url: string;
          title: string;
          description?: string;
          age?: string;
        }>;
      };
    };

    return (data.web?.results ?? []).map((r) => ({
      url: r.url,
      title: r.title,
      snippet: r.description ?? "",
      ...(r.age ? { publishedAt: r.age } : {}),
    }));
  } finally {
    clearTimeout(timer);
  }
}
163
+
/**
 * Search the web via the Tavily API.
 * https://docs.tavily.com/docs/rest-api/api-reference
 *
 * The API key travels in the JSON request body (`api_key`). `time_range`
 * and `topic` are only included when the corresponding option is set.
 *
 * @param query - Free-text search query.
 * @param opts - Optional API key, result count, depth, time range and topic.
 * @returns Normalised results; an empty array when the response carries no `results`.
 * @throws Error when no API key is available or the API answers with a non-2xx
 *   status. A request that exceeds the 15 s budget rejects with fetch's abort error.
 */
export async function tavilySearch(query: string, opts: TavilySearchOptions = {}): Promise<WebSearchResult[]> {
  const apiKey = opts.apiKey ?? process.env["TAVILY_API_KEY"];
  if (!apiKey) throw new Error("Tavily API key required — set TAVILY_API_KEY or pass opts.apiKey");

  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), 15_000);
  try {
    const res = await fetch("https://api.tavily.com/search", {
      method: "POST",
      signal: controller.signal,
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({
        query,
        api_key: apiKey,
        max_results: opts.numResults ?? 5,
        search_depth: opts.depth ?? "basic",
        include_raw_content: false,
        ...(opts.timeRange ? { time_range: opts.timeRange } : {}),
        ...(opts.topic ? { topic: opts.topic } : {}),
      }),
    });

    if (!res.ok) throw new Error(`Tavily API error: ${res.status} ${res.statusText}`);

    // The timer stays armed until the body is parsed; fetch() resolves on
    // headers, so clearing it earlier would leave the body read unbounded.
    const data = (await res.json()) as {
      results?: Array<{
        url: string;
        title: string;
        content?: string;
        published_date?: string;
      }>;
    };

    return (data.results ?? []).map((r) => ({
      url: r.url,
      title: r.title,
      snippet: r.content ?? "",
      ...(r.published_date ? { publishedAt: r.published_date } : {}),
    }));
  } finally {
    clearTimeout(timer);
  }
}
212
+
213
+ // ---------------------------------------------------------------------------
214
+ // DuckDuckGo Instant Answer API — no key required, zero-cost fallback
215
+ // ---------------------------------------------------------------------------
216
+
/** Tuning knobs accepted by ddgSearch(). */
export interface DdgSearchOptions {
  /**
   * Upper bound on the number of results returned. DDG offers no
   * server-side count parameter, so the cap is applied client-side.
   * Default: 10.
   */
  numResults?: number;
}
224
+
/**
 * Search via the DuckDuckGo Instant Answer API.
 * https://duckduckgo.com/api
 *
 * No API key required. The Abstract, Results and RelatedTopics sections of
 * the response (including one level of nested Topics) are mapped, in that
 * order, to WebSearchResult[].
 *
 * Limitation: not a full web index — best for well-known entities and
 * unambiguous queries. Returns an empty array when DDG has no instant
 * answer or sends back an empty body.
 *
 * @param query - Free-text search query.
 * @param opts - `numResults` caps the returned list (default 10); a value of
 *   0 or below yields an empty list.
 * @returns At most `numResults` normalised results.
 * @throws Error when the API answers with a non-2xx status. A request that
 *   exceeds the 10 s budget rejects with fetch's abort error.
 */
export async function ddgSearch(query: string, opts: DdgSearchOptions = {}): Promise<WebSearchResult[]> {
  const params = new URLSearchParams({
    q: query,
    format: "json",
    no_redirect: "1",
    no_html: "1",
    skip_disambig: "1",
  });

  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), 10_000);
  let raw: string;
  try {
    const res = await fetch(`https://api.duckduckgo.com/?${params}`, {
      signal: controller.signal,
      headers: {
        Accept: "application/json",
        // DDG silently returns an empty 200 body for browser-like or
        // missing User-Agents, so a bot-style UA is sent instead.
        "User-Agent": "web-spider/0.8",
      },
    });

    if (!res.ok) throw new Error(`DDG API error: ${res.status} ${res.statusText}`);

    // Read the body while the timeout is still armed.
    raw = await res.text();
  } finally {
    clearTimeout(timer);
  }

  // An empty body is "no answer", not a JSON syntax error.
  if (raw.trim() === "") return [];

  const data = JSON.parse(raw) as {
    Abstract?: string;
    AbstractURL?: string;
    AbstractSource?: string;
    Heading?: string;
    Results?: Array<{ FirstURL?: string; Text?: string }>;
    RelatedTopics?: Array<{
      FirstURL?: string;
      Text?: string;
      Topics?: Array<{ FirstURL?: string; Text?: string }>;
    }>;
  };

  const results: WebSearchResult[] = [];
  const limit = Math.max(0, opts.numResults ?? 10);

  // 1. Instant answer abstract (Wikipedia-style knowledge panel).
  //    `||` rather than `??`: an empty-string heading must fall through too.
  if (results.length < limit && data.Abstract && data.AbstractURL) {
    results.push({
      url: data.AbstractURL,
      title: data.Heading || data.AbstractSource || "DuckDuckGo",
      snippet: data.Abstract,
    });
  }

  // 2. Official results (e.g. official site links)
  for (const r of data.Results ?? []) {
    if (results.length >= limit) break;
    if (r.FirstURL) results.push({ url: r.FirstURL, title: r.Text ?? "", snippet: r.Text ?? "" });
  }

  // 3. Related topics — flatten one level of nesting
  for (const topic of data.RelatedTopics ?? []) {
    if (results.length >= limit) break;
    if (topic.FirstURL && topic.Text) {
      results.push({ url: topic.FirstURL, title: topic.Text, snippet: topic.Text });
    }
    for (const sub of topic.Topics ?? []) {
      if (results.length >= limit) break;
      // The payload is unvalidated JSON: skip entries lacking a URL or text.
      if (sub.FirstURL && sub.Text) {
        results.push({ url: sub.FirstURL, title: sub.Text, snippet: sub.Text });
      }
    }
  }

  return results;
}
308
+
/**
 * One-shot convenience search.
 *
 * When `opts.engine` is given, that engine is resolved from the registry
 * using the API key found in its environment variable. Otherwise the chain
 * built by {@link defaultSearchEngine} is used, which ends with the
 * key-less DDG Instant Answer API as a zero-cost last resort.
 *
 * Prefer {@link defaultSearchEngine} + {@link FallbackSearchEngine} when
 * you need composable retry / fallback behaviour.
 */
export async function webSearch(
  query: string,
  opts: {
    engine?: SearchEngine;
    numResults?: number;
    timeRange?: "day" | "week" | "month" | "year";
    topic?: "news" | "general";
  } = {},
): Promise<WebSearchResult[]> {
  let engine: ISearchEngine;
  if (opts.engine) {
    // Look the key up first, then let the registry factory validate it.
    const apiKey = process.env[envKeyForEngine(opts.engine)];
    engine = resolveSearchEngine(opts.engine, apiKey);
  } else {
    engine = defaultSearchEngine();
  }

  const { numResults, timeRange, topic } = opts;
  return engine.search({ query, numResults, timeRange, topic });
}
336
+
337
+ // ---------------------------------------------------------------------------
338
+ // Engine registry — OCP: adding a new engine = one registerSearchEngine() call
339
+ // ---------------------------------------------------------------------------
340
+
/**
 * Builds an ISearchEngine from an API key.
 * The key is undefined for engines that need none (e.g. DDG).
 */
type EngineFactory = (key: string | undefined) => ISearchEngine;

/** Module-wide name → factory table; the built-in engines are added further down. */
const ENGINE_REGISTRY: Map<string, EngineFactory> = new Map();
349
+
/**
 * Add (or replace) the factory stored under `name` in the engine registry.
 *
 * This is the extension point for new engines — no existing code has to change:
 * @example
 * registerSearchEngine("my-engine", (key) => new MyEngine(key!))
 *
 * @param name - Registry key later passed to resolveSearchEngine().
 * @param factory - Called with the API key (or undefined) to build the engine.
 */
export function registerSearchEngine(name: string, factory: EngineFactory): void {
  ENGINE_REGISTRY.set(name, factory);
}
360
+
/**
 * Look up the factory registered under `name` and invoke it with `key`.
 *
 * @param name - Name the engine was registered under.
 * @param key - API key handed to the factory; may be undefined.
 * @returns The engine produced by the factory.
 * @throws Error when no engine is registered under `name`. Errors thrown by
 *   the factory itself (e.g. a missing required key) propagate unchanged.
 */
export function resolveSearchEngine(name: string, key?: string | undefined): ISearchEngine {
  const create = ENGINE_REGISTRY.get(name);
  if (create === undefined) {
    throw new Error(`Unknown search engine: "${name}". Register it with registerSearchEngine().`);
  }
  return create(key);
}
370
+
/**
 * @internal Map an engine name to the environment variable holding its API key.
 *
 * A Map is used instead of a plain object so that names colliding with
 * Object.prototype members ("constructor", "toString", …) are treated as
 * unknown rather than returning an inherited non-string value.
 *
 * @param name - Engine name, e.g. "brave".
 * @returns The env var name, or "" for key-less or unknown engines.
 */
function envKeyForEngine(name: string): string {
  const envKeys = new Map<string, string>([
    ["brave", "BRAVE_SEARCH_API_KEY"],
    ["tavily", "TAVILY_API_KEY"],
    ["exa", "EXA_API_KEY"],
  ]);
  return envKeys.get(name) ?? "";
}
380
+
// Built-in engines are registered here at module load.
// To add another engine, call registerSearchEngine() from your own code —
// do NOT edit this block.
// Each keyed factory refuses to build an engine without its API key.
registerSearchEngine("brave", (key) => {
  if (key) return new BraveSearchEngine(key);
  throw new Error("BRAVE_SEARCH_API_KEY not set");
});
registerSearchEngine("tavily", (key) => {
  if (key) return new TavilySearchEngine(key);
  throw new Error("TAVILY_API_KEY not set");
});
registerSearchEngine("exa", (key) => {
  if (key) return new ExaSearchEngine(key);
  throw new Error("EXA_API_KEY not set");
});
// DDG needs no key, so whatever is passed in is ignored.
registerSearchEngine("ddg", (_key) => new DdgSearchEngine());
396
+
397
+ // ---------------------------------------------------------------------------
398
+ // ISearchEngine adapters — concrete implementations of the port
399
+ // ---------------------------------------------------------------------------
400
+
/** Translation table from the canonical timeRange value to Brave's freshness code. */
const BRAVE_FRESHNESS: Record<string, "pd" | "pw" | "pm" | "py"> = {
  day: "pd",
  week: "pw",
  month: "pm",
  year: "py",
};

/**
 * ISearchEngine adapter over braveSearch().
 * Holds the API key and an optional country code; `timeRange` on the query
 * is translated to Brave's freshness parameter.
 */
export class BraveSearchEngine implements ISearchEngine {
  constructor(
    private readonly apiKey: string,
    private readonly country?: string,
  ) {}

  /** Run `req` against Brave, forwarding the stored key and country. */
  search(req: SearchQuery): Promise<WebSearchResult[]> {
    const { query, numResults, timeRange } = req;
    return braveSearch(query, {
      apiKey: this.apiKey,
      numResults,
      country: this.country,
      freshness: timeRange ? BRAVE_FRESHNESS[timeRange] : undefined,
    });
  }
}
423
+
/**
 * ISearchEngine adapter over tavilySearch().
 * Forwards numResults, timeRange and topic from the query unchanged.
 */
export class TavilySearchEngine implements ISearchEngine {
  constructor(private readonly apiKey: string) {}

  /** Run `req` against Tavily using the stored API key. */
  search(req: SearchQuery): Promise<WebSearchResult[]> {
    const { query, numResults, timeRange, topic } = req;
    return tavilySearch(query, { apiKey: this.apiKey, numResults, timeRange, topic });
  }
}
437
+
/**
 * ISearchEngine adapter over exaSearch().
 * Only the query text and numResults are forwarded.
 */
export class ExaSearchEngine implements ISearchEngine {
  constructor(private readonly apiKey: string) {}

  /** Run `req` against Exa using the stored API key. */
  search(req: SearchQuery): Promise<WebSearchResult[]> {
    const { query, numResults } = req;
    return exaSearch(query, { apiKey: this.apiKey, numResults });
  }
}
446
+
/**
 * ISearchEngine adapter over ddgSearch() (DuckDuckGo Instant Answer).
 * Needs no API key; only the query text and numResults are forwarded.
 */
export class DdgSearchEngine implements ISearchEngine {
  /** Run `req` against the DuckDuckGo Instant Answer API. */
  search(req: SearchQuery): Promise<WebSearchResult[]> {
    const { query, numResults } = req;
    return ddgSearch(query, { numResults });
  }
}
453
+
454
+ // ---------------------------------------------------------------------------
455
+ // FallbackSearchEngine — strategy composite
456
+ // ---------------------------------------------------------------------------
457
+
/** Behaviour switches for FallbackSearchEngine. Both default to true. */
export interface FallbackSearchEngineOptions {
  /**
   * When true, an engine that returns zero results counts as a miss and
   * the next engine is tried.
   * Default: true.
   */
  fallbackOnEmpty?: boolean;
  /**
   * When true, an engine that throws is skipped and the next engine is
   * tried; when false the error propagates immediately.
   * Default: true.
   */
  fallbackOnError?: boolean;
}
470
+
/**
 * A composite ISearchEngine that tries each engine in order, falling back
 * to the next when the current one returns empty results or throws.
 *
 * Because it implements ISearchEngine itself it is fully composable —
 * nest FallbackSearchEngines, wrap them in caches, inject stubs in tests.
 *
 * The engine list is copied at construction time, so later mutation of the
 * array passed in does not affect this instance.
 *
 * @example
 * // Tavily with DDG as zero-cost fallback (tavilyKey is a non-empty string)
 * const engine = new FallbackSearchEngine([
 *   new TavilySearchEngine(tavilyKey),
 *   new DdgSearchEngine(),
 * ]);
 */
export class FallbackSearchEngine implements ISearchEngine {
  private readonly engines: readonly ISearchEngine[];
  private readonly fallbackOnEmpty: boolean;
  private readonly fallbackOnError: boolean;

  /**
   * @param engines - Engines to try, in priority order. Must not be empty.
   * @param opts - Fallback switches; both default to true.
   * @throws Error when `engines` is empty.
   */
  constructor(engines: ISearchEngine[], opts: FallbackSearchEngineOptions = {}) {
    if (engines.length === 0) throw new Error("FallbackSearchEngine requires at least one engine");
    this.engines = [...engines];
    this.fallbackOnEmpty = opts.fallbackOnEmpty ?? true;
    this.fallbackOnError = opts.fallbackOnError ?? true;
  }

  /**
   * Query the engines one after another and return the first acceptable answer.
   *
   * @returns The first non-empty result set (or the first result set of any
   *   size when fallbackOnEmpty is false); `[]` when every engine came back
   *   empty and none threw.
   * @throws The error of the first failing engine when fallbackOnError is
   *   false; otherwise the most recent error once all engines are exhausted
   *   and at least one of them threw.
   */
  async search(req: SearchQuery): Promise<WebSearchResult[]> {
    // A separate flag records that a failure occurred: the thrown value
    // itself may be falsy (e.g. `throw undefined`) and must not be lost.
    let failed = false;
    let lastError: unknown;

    for (const engine of this.engines) {
      try {
        const results = await engine.search(req);
        if (results.length > 0 || !this.fallbackOnEmpty) return results;
        // Empty + fallbackOnEmpty → try next engine
      } catch (err) {
        if (!this.fallbackOnError) throw err;
        failed = true;
        lastError = err;
        // Error + fallbackOnError → try next engine
      }
    }

    // All engines exhausted — surface the last error or return empty
    if (failed) throw lastError;
    return [];
  }
}
518
+
519
+ // ---------------------------------------------------------------------------
520
+ // Wiring — compose engines from environment variables
521
+ // ---------------------------------------------------------------------------
522
+
/**
 * Assemble the default engine chain from whatever API keys are present in
 * the environment.
 *
 * Keyed engines are added in the order Brave → Tavily → Exa, each only if
 * its environment variable is set to a non-empty value. DuckDuckGo is
 * always appended last as the key-less, zero-cost fallback.
 *
 * The result is a FallbackSearchEngine exposed as ISearchEngine, so call
 * sites can swap it for a stub in tests.
 */
export function defaultSearchEngine(): ISearchEngine {
  const braveKey = process.env["BRAVE_SEARCH_API_KEY"];
  const tavilyKey = process.env["TAVILY_API_KEY"];
  const exaKey = process.env["EXA_API_KEY"];

  const chain: ISearchEngine[] = [
    ...(braveKey ? [new BraveSearchEngine(braveKey)] : []),
    ...(tavilyKey ? [new TavilySearchEngine(tavilyKey)] : []),
    ...(exaKey ? [new ExaSearchEngine(exaKey)] : []),
    // DDG always last — no key needed, never throws the "no key" error
    new DdgSearchEngine(),
  ];

  return new FallbackSearchEngine(chain);
}
@@ -0,0 +1,69 @@
1
+ /**
2
+ * TDD tests for toMarkdown() keepImages flag.
3
+ */
4
+
5
+ import { describe, expect, it } from "vitest";
6
+ import { toMarkdown } from "../src/convert.js";
7
+
// Covers toMarkdown()'s keepImages option: images are dropped by default and
// rendered as markdown image syntax when the flag is true.
describe("toMarkdown() keepImages flag", () => {
  const KEEP = { keepImages: true };
  const TWO_IMAGES = '<img src="1.jpg" alt="One"><img src="2.png" alt="Two">';

  it("1. default strips images — no src or alt in output", () => {
    const output = toMarkdown('<img src="a.png" alt="A nice photo">');
    expect(output).not.toContain("a.png");
    expect(output).not.toContain("A nice photo");
    expect(output.trim()).toBe("");
  });

  it("2. keepImages: true preserves image as markdown", () => {
    const output = toMarkdown('<img src="a.png" alt="A nice photo">', KEEP);
    expect(output).toContain("a.png");
    expect(output).toContain("A nice photo");
    expect(output).toMatch(/!\[.*\]\(a\.png\)/);
  });

  it("3. keepImages: true with missing alt produces empty alt", () => {
    const output = toMarkdown('<img src="a.png">', KEEP);
    expect(output).toContain("a.png");
    expect(output).toMatch(/!\[\]\(a\.png\)/);
  });

  it("4. keepImages: false explicitly strips images (same as default)", () => {
    const output = toMarkdown('<img src="b.jpg" alt="Photo">', { keepImages: false });
    expect(output).not.toContain("b.jpg");
  });

  it("5. keepImages does not affect surrounding text content", () => {
    const html = '<p>Before</p><img src="x.png" alt="X"><p>After</p>';

    // Default: text survives, image is gone.
    const withoutImages = toMarkdown(html);
    expect(withoutImages).toContain("Before");
    expect(withoutImages).toContain("After");
    expect(withoutImages).not.toContain("x.png");

    // keepImages: text survives, image is kept.
    const withImages = toMarkdown(html, KEEP);
    expect(withImages).toContain("Before");
    expect(withImages).toContain("After");
    expect(withImages).toContain("x.png");
  });

  it("6. multiple images — all stripped when keepImages: false", () => {
    const output = toMarkdown(TWO_IMAGES);
    expect(output).not.toContain("1.jpg");
    expect(output).not.toContain("2.png");
  });

  it("7. multiple images — all preserved when keepImages: true", () => {
    const output = toMarkdown(TWO_IMAGES, KEEP);
    expect(output).toContain("1.jpg");
    expect(output).toContain("2.png");
  });

  it("8. absolute URL src is preserved as-is", () => {
    const output = toMarkdown('<img src="https://example.com/photo.jpg" alt="Remote">', KEEP);
    expect(output).toContain("https://example.com/photo.jpg");
  });
});