otherwise-cli 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81)
  1. package/README.md +193 -0
  2. package/bin/otherwise.js +5 -0
  3. package/frontend/404.html +84 -0
  4. package/frontend/assets/OpenDyslexic3-Bold-CDyRs55Y.ttf +0 -0
  5. package/frontend/assets/OpenDyslexic3-Regular-CIBXa4WE.ttf +0 -0
  6. package/frontend/assets/__vite-browser-external-BIHI7g3E.js +1 -0
  7. package/frontend/assets/conversational-worker-CeKiciGk.js +2929 -0
  8. package/frontend/assets/dictation-worker-D0aYfq8b.js +29 -0
  9. package/frontend/assets/gemini-color-CgSQmmva.png +0 -0
  10. package/frontend/assets/index-BLux5ps4.js +21 -0
  11. package/frontend/assets/index-Blh8_TEM.js +5272 -0
  12. package/frontend/assets/index-BpQ1PuKu.js +18 -0
  13. package/frontend/assets/index-Df737c8w.css +1 -0
  14. package/frontend/assets/index-xaYHL6wb.js +113 -0
  15. package/frontend/assets/ort-wasm-simd-threaded.asyncify-BynIiDiv.wasm +0 -0
  16. package/frontend/assets/ort-wasm-simd-threaded.jsep-B0T3yYHD.wasm +0 -0
  17. package/frontend/assets/transformers-tULNc5V3.js +31 -0
  18. package/frontend/assets/tts-worker-DPJWqT7N.js +2899 -0
  19. package/frontend/assets/voice-mode-worker-GzvIE_uh.js +2927 -0
  20. package/frontend/assets/worker-2d5ABSLU.js +31 -0
  21. package/frontend/banner.png +0 -0
  22. package/frontend/favicon.svg +3 -0
  23. package/frontend/google55e5ec47ee14a5f8.html +1 -0
  24. package/frontend/index.html +234 -0
  25. package/frontend/manifest.json +17 -0
  26. package/frontend/pdf.worker.min.mjs +21 -0
  27. package/frontend/robots.txt +5 -0
  28. package/frontend/sitemap.xml +27 -0
  29. package/package.json +81 -0
  30. package/src/agent/index.js +1066 -0
  31. package/src/agent/location.js +51 -0
  32. package/src/agent/prompt.js +548 -0
  33. package/src/agent/tools.js +4372 -0
  34. package/src/browser/detect.js +68 -0
  35. package/src/browser/session.js +1109 -0
  36. package/src/config.js +137 -0
  37. package/src/email/client.js +503 -0
  38. package/src/index.js +557 -0
  39. package/src/inference/anthropic.js +113 -0
  40. package/src/inference/google.js +373 -0
  41. package/src/inference/index.js +81 -0
  42. package/src/inference/ollama.js +383 -0
  43. package/src/inference/openai.js +140 -0
  44. package/src/inference/openrouter.js +378 -0
  45. package/src/inference/xai.js +200 -0
  46. package/src/logBridge.js +9 -0
  47. package/src/models.js +146 -0
  48. package/src/remote/client.js +225 -0
  49. package/src/scheduler/cron.js +243 -0
  50. package/src/server.js +3876 -0
  51. package/src/storage/db.js +1135 -0
  52. package/src/storage/supabase.js +364 -0
  53. package/src/tunnel/cloudflare.js +241 -0
  54. package/src/ui/components/App.jsx +687 -0
  55. package/src/ui/components/BrowserSelect.jsx +111 -0
  56. package/src/ui/components/FilePicker.jsx +472 -0
  57. package/src/ui/components/Header.jsx +444 -0
  58. package/src/ui/components/HelpPanel.jsx +173 -0
  59. package/src/ui/components/HistoryPanel.jsx +158 -0
  60. package/src/ui/components/MessageList.jsx +235 -0
  61. package/src/ui/components/ModelSelector.jsx +304 -0
  62. package/src/ui/components/PromptInput.jsx +515 -0
  63. package/src/ui/components/StreamingResponse.jsx +134 -0
  64. package/src/ui/components/ThinkingIndicator.jsx +365 -0
  65. package/src/ui/components/ToolExecution.jsx +714 -0
  66. package/src/ui/components/index.js +82 -0
  67. package/src/ui/context/TerminalContext.jsx +150 -0
  68. package/src/ui/context/index.js +13 -0
  69. package/src/ui/hooks/index.js +16 -0
  70. package/src/ui/hooks/useChatState.js +675 -0
  71. package/src/ui/hooks/useCommands.js +280 -0
  72. package/src/ui/hooks/useFileAttachments.js +216 -0
  73. package/src/ui/hooks/useKeyboardShortcuts.js +173 -0
  74. package/src/ui/hooks/useNotifications.js +185 -0
  75. package/src/ui/hooks/useTerminalSize.js +151 -0
  76. package/src/ui/hooks/useWebSocket.js +273 -0
  77. package/src/ui/index.js +94 -0
  78. package/src/ui/ink-runner.js +22 -0
  79. package/src/ui/utils/formatters.js +424 -0
  80. package/src/ui/utils/index.js +6 -0
  81. package/src/ui/utils/markdown.js +166 -0
@@ -0,0 +1,4372 @@
1
+ import {
2
+ readFileSync,
3
+ writeFileSync,
4
+ readdirSync,
5
+ statSync,
6
+ existsSync,
7
+ mkdirSync,
8
+ lstatSync,
9
+ realpathSync,
10
+ } from "fs";
11
+ import { execSync, spawn } from "child_process";
12
+ import { homedir, tmpdir } from "os";
13
+ import { join, dirname, resolve, normalize, isAbsolute, relative } from "path";
14
+ import {
15
+ createScheduledTask,
16
+ getScheduledTasks,
17
+ deleteScheduledTask,
18
+ } from "../storage/db.js";
19
+ import cron from "node-cron";
20
+ import {
21
+ reloadTask,
22
+ cancelTask,
23
+ scheduleOneTimeTask,
24
+ cancelOneTimeTask,
25
+ } from "../scheduler/cron.js";
26
+ import { ProxyAgent, fetch as undiciFetch } from "undici";
27
+ import {
28
+ getBrowserPage,
29
+ closeBrowser,
30
+ isBrowserActive,
31
+ findElement,
32
+ extractPageContent,
33
+ executeActionSequence,
34
+ fetchHtmlWithBrowser,
35
+ } from "../browser/session.js";
36
+
37
/**
 * Resolve the proxy URL used for web-search traffic (plain fetch and
 * browser-driven requests alike).
 *
 * Precedence: WEBSEARCH_PROXY, then HTTPS_PROXY, then HTTP_PROXY.
 * @returns {string|null} Proxy URL, or null when no proxy is configured.
 */
function getWebSearchProxy() {
  const { WEBSEARCH_PROXY, HTTPS_PROXY, HTTP_PROXY } = process.env;
  return WEBSEARCH_PROXY || HTTPS_PROXY || HTTP_PROXY || null;
}
46
+
47
+ // ============================================
48
+ // Security Configuration
49
+ // ============================================
50
+
51
/**
 * Path substrings that must NEVER be readable or writable by tools,
 * regardless of sandbox settings (checked via isBlockedPath and the
 * filesystem permission helpers). Covers credentials, system
 * configuration, secrets/tokens, and browser/keychain data.
 *
 * Fix: "/Library/Keychains/" was listed twice (once under "Browser data"
 * and once under "macOS specific"); the duplicate is removed.
 */
const BLOCKED_PATHS = [
  // SSH keys and cloud-provider credentials
  "/.ssh/",
  "/.gnupg/",
  "/.aws/",
  "/.azure/",
  "/.gcloud/",
  "/.config/gcloud/",

  // System files
  "/etc/passwd",
  "/etc/shadow",
  "/etc/sudoers",
  "/etc/ssh/",

  // Secrets and tokens
  "/.npmrc",
  "/.pypirc",
  "/.netrc",
  "/.env",
  "/credentials",
  "/secrets/",
  "/tokens/",

  // Browser profile data
  "/.mozilla/",
  "/.chrome/",
  "/.config/google-chrome/",

  // macOS specific
  "/Library/Keychains/",
  "/private/etc/",
];
89
+
90
/**
 * Regex patterns that flag dangerous shell commands before execution.
 * Any match blocks the command (consumed by validateCommand, which
 * execute_command calls before running anything).
 *
 * NOTE(review): several patterns are intentionally broad — e.g. /fdisk/ and
 * /parted/ also match harmless read-only invocations like "fdisk -l".
 * Broad blocking is preferred here over missing a destructive variant.
 */
const DANGEROUS_COMMAND_PATTERNS = [
  // Recursive deletion of the filesystem root, home directory, or their contents
  /rm\s+(-[rfvI]+\s+)*[\/~]\s*$/i,
  /rm\s+(-[rfvI]+\s+)*\/\s/i,
  /rm\s+(-[rfvI]+\s+)*\/\*/i,
  /rm\s+(-[rfvI]+\s+)*~\//i,

  // Disk formatting / overwriting raw block devices
  /mkfs/i,
  /dd\s+.*of\s*=\s*\/dev\//i,
  /wipefs/i,
  /fdisk/i,
  /parted/i,

  // Fork bomb (":(){ ..." definition).
  // NOTE(review): the second pattern matches ANY ". /dev/null" or "./dev/null"
  // sequence, not just fork-bomb bodies — confirm this breadth is intended.
  /:\(\)\s*\{/,
  /\.\s*\/dev\/null/,

  // System modification: world-writable chmod on /, chown of a path ending in /
  /chmod\s+(-[rwxR]+\s+)*777\s+\//i,
  /chown\s+.*\/$/i,

  // Dangerous redirections onto raw disks (sd*/nvme*) or into /etc
  />\s*\/dev\/sd/i,
  />\s*\/dev\/nv/i,
  />\s*\/etc\//i,

  // Privilege escalation into a root shell
  /sudo\s+su/i,
  /sudo\s+-i/i,
  /sudo\s+bash/i,
  /sudo\s+sh/i,

  // Piping downloaded scripts straight into a shell (curl|bash and friends)
  /curl\s+.*\|\s*(sudo\s+)?bash/i,
  /wget\s+.*\|\s*(sudo\s+)?bash/i,
  /curl\s+.*\|\s*(sudo\s+)?sh/i,
  /wget\s+.*\|\s*(sudo\s+)?sh/i,

  // Interpreter one-liners that shell out (python -c "import os; os.system(...)", etc.)
  /python[23]?\s+-c\s+["']import\s+os;?\s*os\.(system|popen|exec)/i,
  /ruby\s+-e\s+["'`].*system/i,
  /node\s+-e\s+["'`].*exec/i,

  // Base64-decoded payloads piped into a shell (obfuscated command execution)
  /base64\s+-d.*\|.*bash/i,
  /echo\s+.*\|\s*base64\s+-d\s*\|/i,

  // Overwriting or appending to shell startup files (persistence)
  />\s*~\/\.(bashrc|zshrc|profile|bash_profile)/i,
  /echo\s+.*>>\s*~\/\.(bashrc|zshrc|profile)/i,

  // Cron manipulation: wiping the crontab or deleting cron files
  /crontab\s+-r/i,
  /rm\s+.*cron/i,
];
150
+
151
/**
 * Default timeout for tool execution (ms).
 * Presumably the fallback for tools without a TOOL_TIMEOUTS entry —
 * confirm against getToolTimeout (defined elsewhere in this file).
 */
const DEFAULT_TOOL_TIMEOUT = 30000;
155
+
156
// ============================================
// Working Directory State Management
// ============================================

/**
 * The agent's current working directory. Module-level state: it persists
 * across tool calls within a session and is read directly by tools such as
 * execute_command and set_working_directory.
 */
let currentWorkingDirectory = process.cwd();

/**
 * Report the agent's current working directory.
 * @returns {string} Path of the session's working directory.
 */
export function getAgentWorkingDirectory() {
  return currentWorkingDirectory;
}

/**
 * Point the agent at a new working directory for subsequent tool calls.
 * @param {string} path - The new working directory path.
 */
export function setAgentWorkingDirectory(path) {
  currentWorkingDirectory = path;
}

/**
 * Restore the working directory to the process's own cwd
 * (used when a new session starts).
 */
export function resetAgentWorkingDirectory() {
  setAgentWorkingDirectory(process.cwd());
}
188
+
189
/**
 * Per-tool timeout overrides, in milliseconds.
 * Tools not listed here presumably fall back to DEFAULT_TOOL_TIMEOUT via
 * getToolTimeout (defined elsewhere in this file) — confirm there.
 */
const TOOL_TIMEOUTS = {
  execute_command: 30000,
  web_search: 25000,
  fetch_url: 15000,
  browser_navigate: 45000, // Increased for retry logic
  browser_click: 25000, // Element detection tries multiple strategies (2s each) + click action
  browser_type: 25000, // Element detection + typing
  browser_read: 10000,
  browser_screenshot: 15000,
  browser_launch: 20000, // Browser startup can be slow
  browser_close: 5000,
  browser_interact: 120000, // Composite tool: can execute multiple actions (2 min max)
  read_file: 5000,
  write_file: 5000,
  list_directory: 5000,
  search_files: 15000,
};
209
+
210
/**
 * Max chars of page content to auto-append after navigate/click/type
 * (avoids an extra browser_read round-trip). Default clip length for
 * getTruncatedPageContent.
 */
const BROWSER_AUTO_READ_MAX_LENGTH = 6000;
212
+
213
/**
 * Read the page's extracted text and clip it so it can be appended to a
 * tool result without ballooning the context.
 * Extraction is best-effort: returns "" on failure so the caller's primary
 * result is still useful.
 * @param {*} page - Browser page handle (passed through to extractPageContent).
 * @param {number} [maxLen] - Clip length; defaults to BROWSER_AUTO_READ_MAX_LENGTH.
 * @returns {Promise<string>} Page text, possibly truncated, or "" on error.
 */
async function getTruncatedPageContent(
  page,
  maxLen = BROWSER_AUTO_READ_MAX_LENGTH,
) {
  let content;
  try {
    content = await extractPageContent(page);
  } catch {
    // Swallow extraction errors deliberately; the main tool output matters more.
    return "";
  }
  if (content.length > maxLen) {
    const clipped = content.substring(0, maxLen);
    return `${clipped}\n\n... (truncated; use browser_read for full content)`;
  }
  return content;
}
232
+
233
/**
 * Default retry policy for operations that may fail transiently
 * (consumed by withRetry): up to maxRetries retries with exponential
 * backoff starting at baseDelayMs and capped at maxDelayMs.
 */
const RETRY_CONFIG = {
  maxRetries: 3,
  baseDelayMs: 1000,
  maxDelayMs: 10000,
};
241
+
242
/**
 * Promise-based delay.
 * @param {number} ms - Milliseconds to wait before resolving.
 * @returns {Promise<void>} Resolves after roughly `ms` milliseconds.
 */
function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
249
+
250
// ============================================
// Search Query Refinement & Date Injection
// ============================================

// Queries matching any of these look time-sensitive (weather, live scores,
// breaking news, prices, ...) and benefit from an explicit date.
const TIME_SENSITIVE_PATTERNS = [
  /\b(weather|forecast|temperature|temp)\b/i,
  /\b(today|tonight|tomorrow|this week|this weekend|right now|currently)\b/i,
  /\b(latest|breaking|recent|current|live)\b/i,
  /\b(stock price|share price|market|trading)\b/i,
  /\b(score|scores|game|match|playing)\b/i,
  /\b(open now|hours|is .+ open|closed)\b/i,
  /\b(news|headlines)\b/i,
  /\b(price of|cost of|how much)\b/i,
];

// Detects a query that already carries a date hint (a 20xx year, or a month
// name/abbreviation), in which case no date should be injected.
const HAS_DATE_PATTERN =
  /\b(20\d{2}|january|february|march|april|may|june|july|august|september|october|november|december|jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\b/i;

/**
 * Append the current month and year to time-sensitive search queries that
 * do not already mention a date, nudging search engines toward fresh results.
 * @param {string} query - Raw user query (may be empty or null).
 * @returns {string} Trimmed query, possibly suffixed with "<Month> <Year>".
 */
function refineSearchQuery(query) {
  const trimmed = (query || "").trim();
  if (!trimmed) return trimmed;

  const needsDate =
    TIME_SENSITIVE_PATTERNS.some((p) => p.test(trimmed)) &&
    !HAS_DATE_PATTERN.test(trimmed);
  if (!needsDate) return trimmed;

  const now = new Date();
  const monthName = now.toLocaleString("en-US", { month: "long" });
  return `${trimmed} ${monthName} ${now.getFullYear()}`;
}
282
+
283
// ============================================
// Search Result Cache (TTL-based)
// ============================================

/**
 * Small in-memory cache for search results with per-entry TTL and a FIFO
 * size cap. Keys are normalized queries (case- and whitespace-folded) so
 * trivially different spellings share one entry.
 */
class SearchCache {
  /**
   * @param {number} [ttlMs] - How long an entry stays valid (default 5 min).
   * @param {number} [maxEntries] - Cap on stored entries (default 50).
   */
  constructor(ttlMs = 5 * 60 * 1000, maxEntries = 50) {
    this._cache = new Map();
    this._ttl = ttlMs;
    this._max = maxEntries;
  }

  // Fold case and collapse whitespace runs to a single space.
  _normalizeKey(query) {
    return (query || "").trim().toLowerCase().replace(/\s+/g, " ");
  }

  /**
   * Look up a cached result. Expired entries are evicted on read.
   * @returns {*} Cached data, or null when absent or stale.
   */
  get(query) {
    const key = this._normalizeKey(query);
    const entry = this._cache.get(key);
    if (!entry) return null;
    const stale = Date.now() - entry.ts > this._ttl;
    if (!stale) return entry.data;
    this._cache.delete(key);
    return null;
  }

  /**
   * Store a result, evicting the oldest entry when the cache is full.
   * (Map iteration order is insertion order, so the first key is oldest.)
   */
  set(query, data) {
    if (this._cache.size >= this._max) {
      const oldestKey = this._cache.keys().next().value;
      this._cache.delete(oldestKey);
    }
    this._cache.set(this._normalizeKey(query), { data, ts: Date.now() });
  }
}

// Module-level cache instance shared across searches in this process.
const searchCache = new SearchCache();
317
+
318
/**
 * Execute an async function with exponential backoff and jitter.
 *
 * @param {Function} fn - Async function to execute; receives the 0-based
 *   attempt index as its argument.
 * @param {object} [options] - Retry options.
 * @param {number} [options.maxRetries] - Retries after the first attempt
 *   (default RETRY_CONFIG.maxRetries). Pass 0 to try exactly once.
 * @param {number} [options.baseDelayMs] - Base backoff delay (ms).
 * @param {number} [options.maxDelayMs] - Upper bound on any single delay (ms).
 * @param {Function} [options.retryOn] - Predicate deciding whether an error
 *   is retryable (default: retry on all errors).
 * @returns {Promise<any>} Result of the first successful call.
 * @throws The last error once retries are exhausted, or immediately when
 *   retryOn rejects the error.
 */
async function withRetry(fn, options = {}) {
  // Fix: use ?? instead of || so explicit zeros are honored —
  // { maxRetries: 0 } previously fell back to the default of 3,
  // and { baseDelayMs: 0 } fell back to 1000ms.
  const maxRetries = options.maxRetries ?? RETRY_CONFIG.maxRetries;
  const baseDelay = options.baseDelayMs ?? RETRY_CONFIG.baseDelayMs;
  const maxDelay = options.maxDelayMs ?? RETRY_CONFIG.maxDelayMs;
  const retryOn = options.retryOn ?? (() => true); // Default: retry on all errors

  let lastError;

  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    try {
      return await fn(attempt);
    } catch (err) {
      lastError = err;

      // Give up on non-retryable errors or when the retry budget is spent.
      if (!retryOn(err) || attempt === maxRetries) {
        throw err;
      }

      // Exponential backoff with up to 1s of random jitter, capped at maxDelay.
      const delay = Math.min(
        baseDelay * Math.pow(2, attempt) + Math.random() * 1000,
        maxDelay,
      );

      console.log(
        `[Retry] Attempt ${attempt + 1} failed: ${err.message}. Retrying in ${Math.round(delay)}ms...`,
      );
      await sleep(delay);
    }
  }

  throw lastError;
}
358
+
359
// ============================================
// Memory Search Utilities
// ============================================

/**
 * Cosine similarity between two equal-length numeric vectors.
 * Invalid input (null/undefined vectors, mismatched lengths) and
 * zero-magnitude vectors yield 0 instead of throwing or dividing by zero.
 * @param {Array<number>} a - First vector
 * @param {Array<number>} b - Second vector
 * @returns {number} Similarity in [-1, 1], or 0 on invalid input
 */
function cosineSimilarity(a, b) {
  if (!a || !b || a.length !== b.length) return 0;
  let dot = 0;
  let magA = 0;
  let magB = 0;
  a.forEach((value, i) => {
    dot += value * b[i];
    magA += value * value;
    magB += b[i] * b[i];
  });
  if (magA === 0 || magB === 0) return 0;
  return dot / (Math.sqrt(magA) * Math.sqrt(magB));
}
382
+
383
/**
 * Produce an embedding vector for a query string, preferring OpenAI's
 * text-embedding-3-small and falling back to Google's text-embedding-004.
 * Provider failures are logged and swallowed so callers can degrade to
 * plain text search.
 * @param {string} query - Text to embed
 * @param {object} config - Config carrying apiKeys.openai / apiKeys.google
 * @returns {Promise<Array<number>|null>} Embedding vector, or null when no
 *   provider is configured or succeeded.
 */
async function generateQueryEmbedding(query, config) {
  // Preferred provider: OpenAI.
  const openaiKey = config.apiKeys?.openai;
  if (openaiKey) {
    try {
      const response = await fetch("https://api.openai.com/v1/embeddings", {
        method: "POST",
        headers: {
          Authorization: `Bearer ${openaiKey}`,
          "Content-Type": "application/json",
        },
        body: JSON.stringify({
          model: "text-embedding-3-small",
          input: query,
        }),
      });
      if (response.ok) {
        const payload = await response.json();
        return payload.data[0].embedding;
      }
    } catch (err) {
      console.warn("[Memory] OpenAI embedding failed:", err.message);
    }
  }

  // Fallback provider: Google.
  const googleKey = config.apiKeys?.google;
  if (googleKey) {
    try {
      const response = await fetch(
        `https://generativelanguage.googleapis.com/v1beta/models/text-embedding-004:embedContent?key=${googleKey}`,
        {
          method: "POST",
          headers: { "Content-Type": "application/json" },
          body: JSON.stringify({
            content: { parts: [{ text: query }] },
          }),
        },
      );
      if (response.ok) {
        const payload = await response.json();
        return payload.embedding.values;
      }
    } catch (err) {
      console.warn("[Memory] Google embedding failed:", err.message);
    }
  }

  // No embedding available; caller will use text search instead.
  return null;
}
437
+
438
+ /**
439
+ * Tool definitions with metadata and execution functions
440
+ */
441
+ export const TOOLS = {
442
+ // ============================================
443
+ // Filesystem Tools
444
+ // ============================================
445
+
446
+ read_file: {
447
+ description: "Read the contents of a file",
448
+ parameters: {
449
+ path: {
450
+ type: "string",
451
+ description: "Path to the file to read",
452
+ required: true,
453
+ },
454
+ },
455
+ execute: async ({ path: inputPath }, config) => {
456
+ try {
457
+ const resolvedPath = resolvePath(inputPath);
458
+
459
+ // Check for path traversal attempts
460
+ if (hasPathTraversal(inputPath, resolvedPath)) {
461
+ return `Error: Path traversal detected`;
462
+ }
463
+
464
+ // Check permissions (throws on failure)
465
+ checkReadPermission(resolvedPath, config);
466
+
467
+ if (!existsSync(resolvedPath)) {
468
+ return `Error: File not found: ${inputPath}`;
469
+ }
470
+
471
+ const content = readFileSync(resolvedPath, "utf-8");
472
+ return content;
473
+ } catch (err) {
474
+ return `Error reading file: ${err.message}`;
475
+ }
476
+ },
477
+ },
478
+
479
+ write_file: {
480
+ description: "Write content to a file (creates directories if needed)",
481
+ parameters: {
482
+ path: {
483
+ type: "string",
484
+ description: "Path to the file to write",
485
+ required: true,
486
+ },
487
+ content: {
488
+ type: "string",
489
+ description: "Content to write",
490
+ required: true,
491
+ },
492
+ },
493
+ execute: async ({ path: inputPath, content }, config, snapshotFn) => {
494
+ try {
495
+ const resolvedPath = resolvePath(inputPath);
496
+
497
+ // Check for path traversal attempts
498
+ if (hasPathTraversal(inputPath, resolvedPath)) {
499
+ return `Error: Path traversal detected`;
500
+ }
501
+
502
+ // Check permissions (throws on failure)
503
+ checkWritePermission(resolvedPath, config);
504
+
505
+ // Also check the directory for write permission
506
+ const dir = dirname(resolvedPath);
507
+ checkWritePermission(dir, config);
508
+
509
+ // Whether we will create the parent dir (for undo on regeneration)
510
+ const dirExisted = existsSync(dir);
511
+ const createdDir = dirExisted ? null : dir;
512
+
513
+ // Capture snapshot BEFORE writing (for undo on regeneration)
514
+ if (snapshotFn) {
515
+ const fileExisted = existsSync(resolvedPath);
516
+ let oldContent = null;
517
+ if (fileExisted) {
518
+ try {
519
+ oldContent = readFileSync(resolvedPath, "utf-8");
520
+ } catch (readErr) {
521
+ // If we can't read (e.g., binary file), skip snapshotting
522
+ console.warn(
523
+ "[write_file] Could not read file for snapshot:",
524
+ readErr.message,
525
+ );
526
+ }
527
+ }
528
+ snapshotFn({
529
+ path: resolvedPath,
530
+ existed: fileExisted,
531
+ content: oldContent,
532
+ createdDir,
533
+ });
534
+ }
535
+
536
+ // Create directory if it doesn't exist
537
+ if (!dirExisted) {
538
+ mkdirSync(dir, { recursive: true });
539
+ }
540
+
541
+ writeFileSync(resolvedPath, content, "utf-8");
542
+ return { path: inputPath, content };
543
+ } catch (err) {
544
+ return `Error writing file: ${err.message}`;
545
+ }
546
+ },
547
+ },
548
+
549
+ edit_file: {
550
+ description:
551
+ "Edit a file by replacing specific text. Use this for precise edits instead of rewriting entire files.",
552
+ parameters: {
553
+ path: {
554
+ type: "string",
555
+ description: "Path to the file to edit",
556
+ required: true,
557
+ },
558
+ old_string: {
559
+ type: "string",
560
+ description:
561
+ "The exact text to find and replace (must match exactly including whitespace)",
562
+ required: true,
563
+ },
564
+ new_string: {
565
+ type: "string",
566
+ description: "The text to replace it with",
567
+ required: true,
568
+ },
569
+ replace_all: {
570
+ type: "boolean",
571
+ description:
572
+ "Replace all occurrences (default: false, only replaces first match)",
573
+ required: false,
574
+ },
575
+ },
576
+ execute: async (
577
+ { path: inputPath, old_string, new_string, replace_all = false },
578
+ config,
579
+ snapshotFn,
580
+ ) => {
581
+ try {
582
+ const resolvedPath = resolvePath(inputPath);
583
+
584
+ // Check for path traversal attempts
585
+ if (hasPathTraversal(inputPath, resolvedPath)) {
586
+ return `Error: Path traversal detected`;
587
+ }
588
+
589
+ // Check permissions (throws on failure)
590
+ checkReadPermission(resolvedPath, config);
591
+ checkWritePermission(resolvedPath, config);
592
+
593
+ if (!existsSync(resolvedPath)) {
594
+ return `Error: File not found: ${inputPath}`;
595
+ }
596
+
597
+ // Read current content
598
+ const originalContent = readFileSync(resolvedPath, "utf-8");
599
+
600
+ // Check if old_string exists in file
601
+ if (!originalContent.includes(old_string)) {
602
+ // Provide helpful error with context
603
+ const lines = originalContent.split("\n");
604
+ const preview = lines.slice(0, 10).join("\n");
605
+ return `Error: Could not find the text to replace in ${inputPath}.\n\nSearched for:\n---\n${old_string.substring(0, 200)}${old_string.length > 200 ? "..." : ""}\n---\n\nFile preview (first 10 lines):\n---\n${preview}\n---\n\nMake sure the old_string matches exactly, including whitespace and indentation.`;
606
+ }
607
+
608
+ // Capture snapshot BEFORE writing (for undo on regeneration)
609
+ if (snapshotFn) {
610
+ snapshotFn({
611
+ path: resolvedPath,
612
+ existed: true, // edit_file only works on existing files
613
+ content: originalContent,
614
+ });
615
+ }
616
+
617
+ // Count occurrences
618
+ const occurrences = (
619
+ originalContent.match(new RegExp(escapeRegExp(old_string), "g")) || []
620
+ ).length;
621
+
622
+ // Perform replacement
623
+ let newContent;
624
+ if (replace_all) {
625
+ newContent = originalContent.split(old_string).join(new_string);
626
+ } else {
627
+ // Replace only first occurrence
628
+ const index = originalContent.indexOf(old_string);
629
+ newContent =
630
+ originalContent.substring(0, index) +
631
+ new_string +
632
+ originalContent.substring(index + old_string.length);
633
+ }
634
+
635
+ // Write the modified content
636
+ writeFileSync(resolvedPath, newContent, "utf-8");
637
+
638
+ // Generate a simple diff for the result
639
+ const replacedCount = replace_all ? occurrences : 1;
640
+ const remainingOccurrences = occurrences - replacedCount;
641
+
642
+ // Create a unified diff-like output
643
+ const diffResult = generateSimpleDiff(
644
+ old_string,
645
+ new_string,
646
+ inputPath,
647
+ );
648
+
649
+ let resultMessage = `Successfully edited ${inputPath}\n`;
650
+ resultMessage += `Replaced ${replacedCount} occurrence${replacedCount > 1 ? "s" : ""}`;
651
+ if (remainingOccurrences > 0) {
652
+ resultMessage += ` (${remainingOccurrences} more occurrence${remainingOccurrences > 1 ? "s" : ""} remain)`;
653
+ }
654
+ resultMessage += `\n\n${diffResult}`;
655
+
656
+ return resultMessage;
657
+ } catch (err) {
658
+ return `Error editing file: ${err.message}`;
659
+ }
660
+ },
661
+ },
662
+
663
+ list_directory: {
664
+ description: "List files and directories in a path",
665
+ parameters: {
666
+ path: {
667
+ type: "string",
668
+ description: "Directory path to list",
669
+ required: true,
670
+ },
671
+ },
672
+ execute: async ({ path: inputPath }, config) => {
673
+ try {
674
+ const resolvedPath = resolvePath(inputPath);
675
+
676
+ // Check for path traversal attempts
677
+ if (hasPathTraversal(inputPath, resolvedPath)) {
678
+ return `Error: Path traversal detected`;
679
+ }
680
+
681
+ // Check permissions (throws on failure)
682
+ checkReadPermission(resolvedPath, config);
683
+
684
+ if (!existsSync(resolvedPath)) {
685
+ return `Error: Directory not found: ${inputPath}`;
686
+ }
687
+
688
+ const entries = readdirSync(resolvedPath);
689
+ const detailed = entries.map((name) => {
690
+ const fullPath = join(resolvedPath, name);
691
+ try {
692
+ const stat = statSync(fullPath);
693
+ return {
694
+ name,
695
+ type: stat.isDirectory() ? "directory" : "file",
696
+ size: stat.isFile() ? stat.size : null,
697
+ };
698
+ } catch {
699
+ return { name, type: "unknown" };
700
+ }
701
+ });
702
+
703
+ // Filter out entries that would lead to blocked paths
704
+ const filtered = detailed.filter((entry) => {
705
+ const fullPath = join(resolvedPath, entry.name);
706
+ return !isBlockedPath(fullPath);
707
+ });
708
+
709
+ return JSON.stringify(filtered, null, 2);
710
+ } catch (err) {
711
+ return `Error listing directory: ${err.message}`;
712
+ }
713
+ },
714
+ },
715
+
716
+ search_files: {
717
+ description: "Search for files by name pattern",
718
+ parameters: {
719
+ query: {
720
+ type: "string",
721
+ description: "Search pattern (supports * wildcard)",
722
+ required: true,
723
+ },
724
+ path: {
725
+ type: "string",
726
+ description: "Directory to search in (default: current)",
727
+ required: false,
728
+ },
729
+ maxdepth: {
730
+ type: "number",
731
+ description: "Maximum directory depth to search (default: 5, max: 10)",
732
+ required: false,
733
+ },
734
+ },
735
+ execute: async ({ query, path: inputPath, maxdepth }, config) => {
736
+ try {
737
+ const searchPath = resolvePath(inputPath || ".");
738
+
739
+ // Check for path traversal attempts
740
+ if (inputPath && hasPathTraversal(inputPath, searchPath)) {
741
+ return `Error: Path traversal detected`;
742
+ }
743
+
744
+ // Check permissions (throws on failure)
745
+ checkReadPermission(searchPath, config);
746
+
747
+ // Sanitize query to prevent command injection
748
+ // Only allow alphanumeric, dots, underscores, hyphens, and wildcards
749
+ const sanitizedQuery = query.replace(/[^a-zA-Z0-9._\-*?]/g, "");
750
+ if (sanitizedQuery !== query) {
751
+ console.warn(
752
+ `[search_files] Query sanitized: "${query}" -> "${sanitizedQuery}"`,
753
+ );
754
+ }
755
+
756
+ // Limit maxdepth to prevent timeout on large directory trees
757
+ const depth = Math.min(Math.max(1, maxdepth || 5), 10);
758
+
759
+ // Use find command with sanitized input and depth limit
760
+ // -maxdepth prevents searching deep nested directories (e.g., node_modules)
761
+ const timeout = getToolTimeout("search_files");
762
+ const result = execSync(
763
+ `find "${searchPath}" -maxdepth ${depth} -name "${sanitizedQuery}" -type f 2>/dev/null | head -50`,
764
+ { encoding: "utf-8", timeout, killSignal: "SIGKILL" },
765
+ );
766
+
767
+ // Filter results to exclude blocked paths
768
+ const lines = result
769
+ .trim()
770
+ .split("\n")
771
+ .filter((line) => {
772
+ return line && !isBlockedPath(line);
773
+ });
774
+
775
+ const resultText =
776
+ lines.join("\n") || "No files found matching the pattern.";
777
+ return depth < 10
778
+ ? `${resultText}\n\n(Searched up to ${depth} levels deep. Use maxdepth parameter to search deeper.)`
779
+ : resultText;
780
+ } catch (err) {
781
+ if (
782
+ err.killed ||
783
+ err.signal === "SIGKILL" ||
784
+ err.code === "ETIMEDOUT"
785
+ ) {
786
+ return `Error: Search timed out. Try searching a more specific directory or reducing maxdepth.`;
787
+ }
788
+ return `Error searching: ${err.message}`;
789
+ }
790
+ },
791
+ },
792
+
793
+ // ============================================
794
+ // Shell Tools
795
+ // ============================================
796
+
797
+ set_working_directory: {
798
+ description:
799
+ "Change the current working directory for future commands. This persists across tool calls.",
800
+ parameters: {
801
+ path: {
802
+ type: "string",
803
+ description:
804
+ "Path to the directory to switch to (absolute or relative)",
805
+ required: true,
806
+ },
807
+ },
808
+ execute: async ({ path: inputPath }, config) => {
809
+ try {
810
+ // Resolve the path (handles ~, relative paths, etc.)
811
+ let resolvedPath;
812
+ if (inputPath.startsWith("~")) {
813
+ resolvedPath = join(homedir(), inputPath.slice(1));
814
+ } else if (isAbsolute(inputPath)) {
815
+ resolvedPath = normalize(inputPath);
816
+ } else {
817
+ // Relative path - resolve from current working directory
818
+ resolvedPath = resolve(currentWorkingDirectory, inputPath);
819
+ }
820
+
821
+ // Check if path exists and is a directory
822
+ if (!existsSync(resolvedPath)) {
823
+ return `Error: Directory not found: ${inputPath}`;
824
+ }
825
+
826
+ const stat = statSync(resolvedPath);
827
+ if (!stat.isDirectory()) {
828
+ return `Error: Path is not a directory: ${inputPath}`;
829
+ }
830
+
831
+ // Check for blocked paths
832
+ if (isBlockedPath(resolvedPath)) {
833
+ return `Error: Access denied - cannot switch to sensitive directory`;
834
+ }
835
+
836
+ // Check read permission
837
+ checkReadPermission(resolvedPath, config);
838
+
839
+ // Update the working directory state
840
+ const previousDir = currentWorkingDirectory;
841
+ currentWorkingDirectory = resolvedPath;
842
+
843
+ return `Changed working directory:\n From: ${previousDir}\n To: ${resolvedPath}\n\nFuture commands will run in this directory.`;
844
+ } catch (err) {
845
+ return `Error changing directory: ${err.message}`;
846
+ }
847
+ },
848
+ },
849
+
850
+ get_working_directory: {
851
+ description: "Get the current working directory",
852
+ parameters: {},
853
+ execute: async (params, config) => {
854
+ return `Current working directory: ${currentWorkingDirectory}`;
855
+ },
856
+ },
857
+
858
  // Runs an arbitrary shell command synchronously in the agent's tracked
  // working directory. Always returns a string (never throws to the caller):
  // stdout on success, or an "Error:" / "Command failed:" message otherwise.
  execute_command: {
    description: "Execute a shell command in the current working directory",
    parameters: {
      command: {
        type: "string",
        description: "Command to execute",
        required: true,
      },
    },
    // `shellUndoFn`, when provided, receives an undo entry for commands that
    // parseReversibleCommand recognizes, enabling rollback on regeneration.
    execute: async ({ command }, config, shellUndoFn) => {
      // Permission gate: shell access must be explicitly enabled in config.
      if (!config.permissions?.shell) {
        return "Error: Shell commands are disabled in configuration.";
      }

      // Validate command against dangerous patterns
      const validation = validateCommand(command);
      if (!validation.valid) {
        // Log only a truncated prefix of the command to keep logs bounded.
        console.warn(
          `[execute_command] Blocked: ${validation.reason} - Command: ${command.substring(0, 100)}`,
        );
        return `Error: This command is blocked for safety reasons (${validation.reason}).`;
      }

      try {
        const timeout = getToolTimeout("execute_command");
        const result = execSync(command, {
          encoding: "utf-8",
          timeout,
          maxBuffer: 1024 * 1024, // 1MB output limit
          cwd: currentWorkingDirectory, // Use the agent's working directory
          // Run in a restricted environment
          env: {
            ...process.env,
            // Prevent some shell behaviors
            HISTFILE: "/dev/null",
            HISTSIZE: "0",
          },
        });
        // Record reversible command for undo on regeneration (Strategy 1)
        if (shellUndoFn) {
          const entry = parseReversibleCommand(command);
          if (entry) {
            shellUndoFn(entry);
          }
        }
        // execSync returns "" for commands with no stdout; substitute a note.
        return result || "(Command completed with no output)";
      } catch (err) {
        // Check if it was a timeout
        // NOTE(review): err.killed is set whenever the child was killed, which
        // includes the execSync timeout path — presumably the only kill source
        // here, but an external signal would report the same message; confirm.
        if (err.killed) {
          return `Error: Command timed out after ${getToolTimeout("execute_command") / 1000} seconds`;
        }
        return `Command failed: ${err.message}\n${err.stderr || ""}`;
      }
    },
  },
913
+
914
+ // ============================================
915
+ // Email Tools (MyMX for receiving, Resend for sending)
916
+ // ============================================
917
+
918
+ check_email: {
919
+ description:
920
+ "List recently received emails. With MyMX, emails are automatically received via webhooks and stored as chats.",
921
+ parameters: {
922
+ limit: {
923
+ type: "number",
924
+ description: "Max emails to show (default: 10)",
925
+ required: false,
926
+ },
927
+ },
928
+ execute: async ({ limit = 10 }, config) => {
929
+ if (!config.permissions?.email) {
930
+ return "Error: Email access is disabled in configuration.";
931
+ }
932
+
933
+ try {
934
+ const { checkEmail } = await import("../email/client.js");
935
+ return await checkEmail(config);
936
+ } catch (err) {
937
+ return err.message;
938
+ }
939
+ },
940
+ },
941
+
942
+ send_email: {
943
+ description:
944
+ "Send an email via Resend API. Requires Resend to be configured in Settings → Email Integration.",
945
+ parameters: {
946
+ to: {
947
+ type: "string",
948
+ description: "Recipient email address",
949
+ required: true,
950
+ },
951
+ subject: { type: "string", description: "Email subject", required: true },
952
+ body: {
953
+ type: "string",
954
+ description: "Email body content (plain text)",
955
+ required: true,
956
+ },
957
+ },
958
+ execute: async ({ to, subject, body }, config) => {
959
+ if (!config.permissions?.email) {
960
+ return "Error: Email access is disabled in configuration.";
961
+ }
962
+
963
+ try {
964
+ const { sendEmail } = await import("../email/client.js");
965
+ const result = await sendEmail(config, to, subject, body);
966
+ return `✅ Email sent successfully to ${to}\nMessage ID: ${result.messageId}`;
967
+ } catch (err) {
968
+ return `Error sending email: ${err.message}`;
969
+ }
970
+ },
971
+ },
972
+
973
+ // ============================================
974
+ // Scheduling Tools
975
+ // ============================================
976
+
977
+ schedule_task: {
978
+ description:
979
+ "Schedule a RECURRING task at specified times (cron syntax). For one-time 'in X minutes', use schedule_task_once instead.",
980
+ parameters: {
981
+ cron_expression: {
982
+ type: "string",
983
+ description:
984
+ 'Cron expression: minute hour day month weekday (e.g. "0 9 * * *" = 9am daily, "*/15 * * * *" = every 15 min)',
985
+ required: true,
986
+ },
987
+ task_description: {
988
+ type: "string",
989
+ description: "What the task should do",
990
+ required: true,
991
+ },
992
+ },
993
+ execute: async ({ cron_expression, task_description }, config) => {
994
+ try {
995
+ if (!cron.validate(cron_expression)) {
996
+ return `Error: Invalid cron expression "${cron_expression}". Use format: minute hour day month weekday (e.g. "0 9 * * *" for 9am daily).`;
997
+ }
998
+ const id = createScheduledTask(cron_expression, task_description);
999
+ const ok = reloadTask(id, config);
1000
+ if (!ok) {
1001
+ return `Task was saved (ID: ${id}) but failed to schedule. Check server logs.`;
1002
+ }
1003
+ return `Recurring task scheduled (ID: ${id}). Will run according to: ${cron_expression}`;
1004
+ } catch (err) {
1005
+ return `Error scheduling task: ${err.message}`;
1006
+ }
1007
+ },
1008
+ },
1009
+
1010
+ schedule_task_once: {
1011
+ description:
1012
+ "Schedule a ONE-TIME task to run once after N minutes. Use this for 'in 5 minutes' or 'run once in 10 minutes'. Does not persist if CLI restarts.",
1013
+ parameters: {
1014
+ run_in_minutes: {
1015
+ type: "number",
1016
+ description:
1017
+ "How many minutes from now to run the task (e.g. 5 for 'in 5 minutes')",
1018
+ required: true,
1019
+ },
1020
+ task_description: {
1021
+ type: "string",
1022
+ description: "What the task should do",
1023
+ required: true,
1024
+ },
1025
+ },
1026
+ execute: async ({ run_in_minutes, task_description }, config) => {
1027
+ try {
1028
+ const minutes = Math.max(
1029
+ 1,
1030
+ Math.min(1440, Number(run_in_minutes) || 1),
1031
+ ); // 1 min to 24h
1032
+ const id = scheduleOneTimeTask(task_description, minutes, config);
1033
+ return `One-time task scheduled (ID: ${id}). Will run once in ${minutes} minute(s).`;
1034
+ } catch (err) {
1035
+ return `Error scheduling one-time task: ${err.message}`;
1036
+ }
1037
+ },
1038
+ },
1039
+
1040
+ list_scheduled_tasks: {
1041
+ description: "List all scheduled tasks",
1042
+ parameters: {},
1043
+ execute: async (params, config) => {
1044
+ try {
1045
+ const tasks = getScheduledTasks(false);
1046
+ if (tasks.length === 0) {
1047
+ return "No scheduled tasks.";
1048
+ }
1049
+ return JSON.stringify(tasks, null, 2);
1050
+ } catch (err) {
1051
+ return `Error listing tasks: ${err.message}`;
1052
+ }
1053
+ },
1054
+ },
1055
+
1056
+ cancel_task: {
1057
+ description: "Cancel a scheduled task",
1058
+ parameters: {
1059
+ task_id: {
1060
+ type: "number",
1061
+ description: "ID of the task to cancel",
1062
+ required: true,
1063
+ },
1064
+ },
1065
+ execute: async ({ task_id }, config) => {
1066
+ try {
1067
+ // Stop the running cron job first
1068
+ cancelTask(task_id);
1069
+ // Then remove from database
1070
+ deleteScheduledTask(task_id);
1071
+ return `Task ${task_id} cancelled successfully.`;
1072
+ } catch (err) {
1073
+ return `Error cancelling task: ${err.message}`;
1074
+ }
1075
+ },
1076
+ },
1077
+
1078
+ // ============================================
1079
+ // Web Tools
1080
+ // ============================================
1081
+
1082
  // Web search pipeline: refine query -> cache lookup -> backend search ->
  // direct-scrape fallback -> content enrichment -> formatted output.
  // Always returns a string; errors are reported inline, never thrown.
  web_search: {
    description:
      "Search the web for current information. Returns titles, URLs, summaries, and enriched page content for top results. For comprehensive data (like earnings reports, event lists), use numResults=10+ and consider making multiple searches with different date/keyword variations.",
    parameters: {
      query: {
        type: "string",
        description:
          "Search query - be specific with dates, company names, or keywords",
        required: true,
      },
      numResults: {
        type: "number",
        description:
          "Number of results (default: 8, max: 15). Use higher values for comprehensive data gathering.",
        required: false,
      },
    },
    execute: async ({ query, numResults = 8 }, config) => {
      try {
        // Hard cap to keep responses and enrichment work bounded.
        const maxResults = Math.min(numResults, 15);

        // Refine query with date injection for time-sensitive searches
        const refinedQuery = refineSearchQuery(query);
        if (refinedQuery !== query) {
          console.log("[WebSearch] Refined query:", refinedQuery);
        }

        // Check cache first
        // Cache key includes the result count, so the same query with a
        // different numResults is a separate cache entry.
        const cacheKey = `${refinedQuery}:${maxResults}`;
        const cached = searchCache.get(cacheKey);
        if (cached) {
          console.log("[WebSearch] Cache hit for:", refinedQuery);
          return cached;
        }

        // Try backend route if available (unified search with Exa/Tavily/Marginalia)
        const backendUrl = config?.remote?.backendUrl;
        let results = null;
        if (backendUrl) {
          try {
            const resp = await fetchWithTimeout(
              `${backendUrl.replace(/\/$/, "")}/api/web-search`,
              {
                method: "POST",
                headers: { "Content-Type": "application/json" },
                body: JSON.stringify({
                  query: refinedQuery,
                  numResults: maxResults,
                  maxPages: maxResults,
                  enrichTop: Math.min(3, maxResults),
                }),
              },
              12000,
            );
            const data = await resp.json();
            if (data.success && data.pages?.length) {
              // Normalize backend pages into the local result shape.
              results = data.pages.map((p) => ({
                url: p.url,
                title: p.title || p.searchTitle || "",
                snippet: p.searchSnippet || p.excerpt || "",
                source: p.source || "",
                content: p.content || p.excerpt || "",
                engines: p.engines || [],
              }));
              console.log(
                "[WebSearch] Backend returned",
                results.length,
                "enriched results",
              );
            }
          } catch (e) {
            // Backend failure is non-fatal; we fall through to scraping.
            console.log(
              "[WebSearch] Backend unavailable, falling back to direct scraping:",
              e?.message,
            );
          }
        }

        // Fallback to direct scraping if backend unavailable or returned nothing
        if (!results || results.length === 0) {
          results = await performWebSearch(refinedQuery, maxResults);
        }

        if (results.length === 0) {
          return (
            `No results found for: "${query}"\n\nSuggestions:\n` +
            `1. Try more specific keywords or add dates (e.g., "January 2026")\n` +
            `2. Try alternative sources (e.g., "earnings calendar site:yahoo.com")\n` +
            `3. Break down the query into smaller parts`
          );
        }

        // Auto-enrich top 3 results with page content if not already enriched
        const enrichCount = Math.min(3, results.length);
        const enrichPromises = [];
        for (let i = 0; i < enrichCount; i++) {
          const r = results[i];
          // Skip results the backend already enriched with real content.
          if (r.content && r.content.length > 200) continue;
          enrichPromises.push(
            fetchUrlContent(r.url, 8000)
              .then((content) => {
                if (content && content.length > 100) {
                  r.content = content.slice(0, 4000);
                  if (!r.snippet || r.snippet.length < 50) {
                    r.snippet = extractBestSnippet(content, refinedQuery, 400);
                  }
                }
              })
              // Enrichment is best-effort; a failed fetch leaves the result as-is.
              .catch(() => {}),
          );
        }
        if (enrichPromises.length > 0) {
          // Fetch the top results in parallel; allSettled so one failure
          // never blocks the others.
          await Promise.allSettled(enrichPromises);
        }

        // Fix empty snippets from enriched content
        for (const r of results) {
          if ((!r.snippet || r.snippet.length < 20) && r.content) {
            r.snippet = extractBestSnippet(
              r.content,
              refinedQuery,
              400,
            );
          }
        }

        // Format results with enriched content
        let output = `Search results for "${query}" (${results.length} results):\n\n`;
        output += results
          .map((r, i) => {
            let entry = `${i + 1}. ${r.title}\n Source: ${r.source}\n URL: ${r.url}\n Summary: ${r.snippet || "No summary available"}`;
            // Only the enriched head of the list carries a content preview.
            if (i < enrichCount && r.content && r.content.length > 200) {
              entry += `\n Content preview: ${r.content.slice(0, 1500)}`;
            }
            return entry;
          })
          .join("\n\n");

        // Heuristic: append research tips for finance/calendar-style queries.
        if (
          query.toLowerCase().includes("earnings") ||
          query.toLowerCase().includes("quarterly") ||
          query.toLowerCase().includes("reports") ||
          query.toLowerCase().includes("calendar")
        ) {
          output +=
            "\n\n💡 TIP: For complete earnings/calendar data, consider:\n";
          output +=
            '- Searching for specific dates (e.g., "earnings January 27 2026")\n';
          output +=
            '- Using financial aggregator sites in your query (e.g., "site:yahoo.com/calendar" or "site:zacks.com")\n';
          output += "- Fetching promising URLs for detailed company lists\n";
          output += "- Making multiple searches for different days of the week";
        }

        // Cache the result
        searchCache.set(cacheKey, output);

        return output;
      } catch (err) {
        return `Error searching: ${err.message}\n\nTIP: Try rephrasing your query or breaking it into smaller parts.`;
      }
    },
  },
1245
+
1246
+ fetch_url: {
1247
+ description:
1248
+ "Fetch full text content from a URL. Use this to get detailed data from earnings calendars, company lists, or news articles. Extracts and formats the text content.",
1249
+ parameters: {
1250
+ url: {
1251
+ type: "string",
1252
+ description:
1253
+ "URL to fetch - works best with news sites, financial data pages, and text-heavy content",
1254
+ required: true,
1255
+ },
1256
+ },
1257
+ execute: async ({ url }, config) => {
1258
+ try {
1259
+ const content = await fetchUrlContent(url, 15000); // Slightly longer timeout
1260
+
1261
+ if (!content || content.length < 100) {
1262
+ return (
1263
+ `No substantial content extracted from URL.\n\n` +
1264
+ `This might happen if:\n` +
1265
+ `1. The page requires JavaScript to render content\n` +
1266
+ `2. The site blocks automated access\n` +
1267
+ `3. The content is behind a login\n\n` +
1268
+ `TIP: Try web_search for related content from other sources.`
1269
+ );
1270
+ }
1271
+
1272
+ // Add summary of content length
1273
+ const wordCount = content.split(/\s+/).length;
1274
+ return (
1275
+ `Fetched content from: ${url}\n` +
1276
+ `Content length: ~${wordCount} words\n\n` +
1277
+ `---\n\n${content}`
1278
+ );
1279
+ } catch (err) {
1280
+ return (
1281
+ `Error fetching URL: ${err.message}\n\n` +
1282
+ `TIP: This site may block automated access. Try:\n` +
1283
+ `1. web_search for similar content from other sources\n` +
1284
+ `2. A different URL from the search results`
1285
+ );
1286
+ }
1287
+ },
1288
+ },
1289
+
1290
+ // ============================================
1291
+ // Browser Control Tools
1292
+ // ============================================
1293
+
1294
+ browser_launch: {
1295
+ description:
1296
+ "Launch a browser for interactive web navigation. Call this before using other browser tools.",
1297
+ parameters: {
1298
+ headless: {
1299
+ type: "boolean",
1300
+ description: "Run browser invisibly (default: true)",
1301
+ required: false,
1302
+ },
1303
+ },
1304
+ execute: async ({ headless }, config) => {
1305
+ try {
1306
+ // Override config headless setting if parameter provided
1307
+ const browserConfig = { ...config };
1308
+ if (headless !== undefined) {
1309
+ browserConfig.browserHeadless = headless;
1310
+ }
1311
+
1312
+ const page = await getBrowserPage(browserConfig);
1313
+ return `Browser launched successfully. Ready for navigation.`;
1314
+ } catch (err) {
1315
+ return `Error launching browser: ${err.message}`;
1316
+ }
1317
+ },
1318
+ },
1319
+
1320
  // Navigates the shared browser to a URL with retry/backoff, bot-block
  // detection, and (by default) an inlined page-content read. Always returns
  // a string; failures come back as "Error navigating: ..." with hints.
  browser_navigate: {
    description:
      "Navigate the browser to a URL. Automatically retries on network errors with exponential backoff. By default returns page content (title, URL, interactive elements, text) so you often don't need a separate browser_read; use readAfter: false to skip.",
    parameters: {
      url: {
        type: "string",
        description: "URL to navigate to",
        required: true,
      },
      readAfter: {
        type: "boolean",
        description:
          "Include page content in the result after navigation (default: true). Set false to only get title/URL.",
        required: false,
      },
    },
    execute: async ({ url, readAfter = true }, config) => {
      // Ensure URL has protocol
      let fullUrl = url;
      if (!url.startsWith("http://") && !url.startsWith("https://")) {
        fullUrl = "https://" + url;
      }

      // Check for commonly blocked sites and suggest alternatives
      const blockedSiteAlternatives = {
        "nasdaq.com":
          'TIP: nasdaq.com often blocks automated browsers. Try using web_search for "NASDAQ earnings calendar [date]" or fetch_url with a financial news article instead.',
        "bloomberg.com":
          "TIP: bloomberg.com has strict bot detection. Try web_search or fetch_url with a news aggregator.",
        "linkedin.com":
          "TIP: linkedin.com blocks automated access. Try web_search for LinkedIn profile information.",
      };

      // Extract the bare host so known-hostile domains can be matched below.
      const domain = fullUrl
        .replace(/^https?:\/\//, "")
        .split("/")[0]
        .replace("www.", "");
      const alternative = Object.entries(blockedSiteAlternatives).find(([d]) =>
        domain.includes(d),
      );

      try {
        const page = await getBrowserPage(config);

        // Use retry logic with exponential backoff
        const result = await withRetry(
          async (attempt) => {
            console.log(
              `[Browser] Navigating to: ${fullUrl} (attempt ${attempt + 1})`,
            );

            // Try different wait strategies on retries
            // Attempt 0: domcontentloaded (fastest), 1: load, 2+: networkidle.
            const waitStrategy =
              attempt === 0
                ? "domcontentloaded"
                : attempt === 1
                  ? "load"
                  : "networkidle";

            await page.goto(fullUrl, {
              waitUntil: waitStrategy,
              timeout: 20000,
            });

            // Wait a bit for dynamic content (reduced for speed; auto-read still gets content)
            await page.waitForTimeout(1000);

            const title = await page.title();
            const currentUrl = page.url();

            // Check for bot/block pages only with specific signals (avoid false positives on normal content)
            const pageContent = await page.content();
            const lower = pageContent.toLowerCase();
            const isBlockPage =
              lower.includes("checking your browser") ||
              lower.includes("cf-browser-verification") ||
              /access\s+denied/i.test(pageContent) ||
              lower.includes("unusual traffic from your computer") ||
              lower.includes("please complete the security check") ||
              lower.includes("g-recaptcha") ||
              (lower.includes("challenge-platform") &&
                lower.includes("cloudflare")) ||
              // Short page with clear block wording (avoids "blocked" in article text)
              (pageContent.length < 8000 &&
                /(you have been blocked|your (access|request) (has been )?blocked|blocked (by|from) )/i.test(
                  pageContent,
                ));
            if (isBlockPage) {
              // Throwing here feeds back into withRetry's retryOn predicate.
              throw new Error(
                "Bot detection triggered - site is blocking automated access",
              );
            }

            return { title, currentUrl };
          },
          {
            maxRetries: 2,
            baseDelayMs: 2000,
            // Retry on network and HTTP2 errors
            retryOn: (err) => {
              const retryableErrors = [
                "ERR_HTTP2",
                "ERR_CONNECTION",
                "ERR_TIMED_OUT",
                "ETIMEDOUT",
                "ECONNRESET",
                "Navigation timeout",
                "net::ERR_",
              ];
              return retryableErrors.some((e) => err.message.includes(e));
            },
          },
        );

        let out = `Navigated to: ${result.title}\nURL: ${result.currentUrl}`;
        if (readAfter) {
          // Inline a truncated read so the agent usually needs no follow-up call.
          const content = await getTruncatedPageContent(page);
          if (content) out += `\n\n--- Page content ---\n${content}`;
        }
        return out;
      } catch (err) {
        let errorMsg = `Error navigating: ${err.message}`;

        // Add helpful alternative suggestion
        if (alternative) {
          errorMsg += `\n\n${alternative[1]}`;
        } else if (
          err.message.includes("Bot detection") ||
          err.message.includes("ERR_HTTP2")
        ) {
          errorMsg +=
            "\n\nThis site may be blocking automated browsers. Consider using:\n";
          errorMsg +=
            "1. web_search to find the information from other sources\n";
          errorMsg +=
            "2. fetch_url to get content from a news article about this topic\n";
          errorMsg +=
            "3. A more specific search query to find direct links to the data";
        }

        return errorMsg;
      }
    },
  },
1464
+
1465
  // Clicks an element on the current page. Links get navigation-aware
  // handling (click + waitForNavigation raced together); covered/hidden
  // elements trigger one automatic force-click retry. Returns a string.
  browser_click: {
    description:
      'Click an element on the current page. For links, waits for navigation and returns the new URL. By default returns page content so you often don\'t need a separate browser_read. Prefer a[href="/path"] from browser_read output for reliable link clicks. Supports CSS, :has-text(), or link text. Auto-retries with force for overlays.',
    parameters: {
      selector: {
        type: "string",
        description:
          "CSS selector, Playwright selector (:has-text(), :text()), text content, or element name to click",
        required: true,
      },
      force: {
        type: "boolean",
        description:
          "Force click even if element is covered or not visible (default: false)",
        required: false,
      },
      readAfter: {
        type: "boolean",
        description:
          "Include page content in the result after click (default: true). Set false to only get title/URL.",
        required: false,
      },
    },
    execute: async ({ selector, force = false, readAfter = true }, config) => {
      try {
        const page = await getBrowserPage(config);

        console.log("[Browser] Clicking:", selector, force ? "(forced)" : "");
        const element = await findElement(page, selector);
        await element.scrollIntoViewIfNeeded();

        const clickOptions = { timeout: 5000, force };
        // Click helper: on specific "unclickable" failures, retry once with
        // force: true (only when the caller didn't already request force).
        const performClick = async () => {
          try {
            await element.click(clickOptions);
          } catch (clickErr) {
            if (!force) {
              const retryWithForce =
                clickErr.message.includes("intercepts pointer events") ||
                clickErr.message.includes("element is not visible") ||
                clickErr.message.includes("element is outside the viewport");
              if (retryWithForce) {
                console.log(
                  "[Browser] Element not clickable (overlay/not visible/viewport), retrying with force: true",
                );
                await element.click({ ...clickOptions, force: true });
                return;
              }
            }
            throw clickErr;
          }
        };

        // If it's a link, wait for navigation so we return the new page and agent knows the click worked
        // Anchors with only a "#" fragment don't navigate, so they are excluded.
        const isLink = await element
          .evaluate(
            (el) =>
              el.tagName === "A" &&
              el.getAttribute("href") &&
              !el.getAttribute("href").startsWith("#"),
          )
          .catch(() => false);

        if (isLink) {
          // Start waiting for navigation *before* clicking so a fast
          // navigation isn't missed; both failures are tolerated
          // (waitForNavigation timeout resolves to null).
          await Promise.all([
            page
              .waitForNavigation({
                waitUntil: "domcontentloaded",
                timeout: 6000,
              })
              .catch(() => null),
            performClick(),
          ]);
          const title = await page.title();
          const url = page.url();
          let out = `Clicked "${selector}". Navigated to: ${title}\nURL: ${url}`;
          if (readAfter) {
            const content = await getTruncatedPageContent(page);
            if (content) out += `\n\n--- Page content ---\n${content}`;
          }
          return out;
        }

        // Non-link elements: click, then give the page a moment to react.
        await performClick();
        await page.waitForTimeout(400);
        const title = await page.title();
        let out = `Clicked "${selector}". Current page: ${title}`;
        if (readAfter) {
          const content = await getTruncatedPageContent(page);
          if (content) out += `\n\n--- Page content ---\n${content}`;
        }
        return out;
      } catch (err) {
        // Provide more helpful error messages
        if (err.message.includes("element is not visible")) {
          return `Error: Element "${selector}" exists but is not visible. Try using force: true or scrolling to the element first.`;
        }
        if (err.message.includes("element is outside of the viewport")) {
          return `Error: Element "${selector}" is outside the viewport. Try scrolling to it first.`;
        }
        if (err.message.includes("intercepts pointer events")) {
          return `Error: Another element is covering "${selector}". Try using force: true or closing any modals/popups first.`;
        }
        if (err.message.includes("Could not find element")) {
          return `Error: ${err.message}\n\nTip: Try using different selector strategies:\n- Text: "Submit" or "button:Submit"\n- Playwright: 'button:has-text("Submit")'\n- CSS: "#submit-btn" or ".submit-button"`;
        }
        return `Error clicking element: ${err.message}`;
      }
    },
  },
1575
+
1576
+ browser_type: {
1577
+ description:
1578
+ "Type text into an input field on the current page. By default returns page content after typing so you often don't need a separate browser_read.",
1579
+ parameters: {
1580
+ selector: {
1581
+ type: "string",
1582
+ description:
1583
+ "CSS selector, placeholder text, or label of the input field",
1584
+ required: true,
1585
+ },
1586
+ text: { type: "string", description: "Text to type", required: true },
1587
+ submit: {
1588
+ type: "boolean",
1589
+ description: "Press Enter after typing to submit (default: false)",
1590
+ required: false,
1591
+ },
1592
+ readAfter: {
1593
+ type: "boolean",
1594
+ description:
1595
+ "Include page content in the result after typing (default: true). Set false to only get title.",
1596
+ required: false,
1597
+ },
1598
+ },
1599
+ execute: async (
1600
+ { selector, text, submit = false, readAfter = true },
1601
+ config,
1602
+ ) => {
1603
+ try {
1604
+ const page = await getBrowserPage(config);
1605
+
1606
+ console.log("[Browser] Typing into:", selector);
1607
+ const element = await findElement(page, selector);
1608
+
1609
+ // Clear existing content and type new text
1610
+ await element.fill(text);
1611
+
1612
+ if (submit) {
1613
+ await element.press("Enter");
1614
+ await page.waitForTimeout(1500); // Wait for form submission
1615
+ }
1616
+
1617
+ const title = await page.title();
1618
+ let out = `Typed "${text}" into "${selector}"${submit ? " and submitted" : ""}. Current page: ${title}`;
1619
+ if (readAfter) {
1620
+ const content = await getTruncatedPageContent(page);
1621
+ if (content) out += `\n\n--- Page content ---\n${content}`;
1622
+ }
1623
+ return out;
1624
+ } catch (err) {
1625
+ return `Error typing: ${err.message}`;
1626
+ }
1627
+ },
1628
+ },
1629
+
1630
+ browser_read: {
1631
+ description:
1632
+ 'Read and extract full content from the current page. Returns page title, URL, interactive elements (with href for links so you can click via a[href="..."]), and text content. Use when you need full content or when navigate/click/type didn\'t include content (e.g. readAfter was false). For scoped read use a simple CSS selector (e.g. "#section_id"); Playwright selectors like :has-text() are not supported for the selector param.',
1633
+ parameters: {
1634
+ selector: {
1635
+ type: "string",
1636
+ description:
1637
+ "Optional CSS selector to scope content extraction (e.g. #section_id). Standard CSS only; no :has-text() etc.",
1638
+ required: false,
1639
+ },
1640
+ },
1641
+ execute: async ({ selector }, config) => {
1642
+ try {
1643
+ const page = await getBrowserPage(config);
1644
+
1645
+ console.log(
1646
+ "[Browser] Reading page content",
1647
+ selector ? `(selector: ${selector})` : "",
1648
+ );
1649
+ const content = await extractPageContent(page, selector);
1650
+
1651
+ return content;
1652
+ } catch (err) {
1653
+ return `Error reading page: ${err.message}`;
1654
+ }
1655
+ },
1656
+ },
1657
+
1658
+ browser_screenshot: {
1659
+ description: "Take a screenshot of the current page",
1660
+ parameters: {
1661
+ path: {
1662
+ type: "string",
1663
+ description:
1664
+ "File path to save screenshot (default: screenshot.png in current directory)",
1665
+ required: false,
1666
+ },
1667
+ fullPage: {
1668
+ type: "boolean",
1669
+ description: "Capture full scrollable page (default: false)",
1670
+ required: false,
1671
+ },
1672
+ },
1673
+ execute: async ({ path, fullPage = false }, config) => {
1674
+ try {
1675
+ const page = await getBrowserPage(config);
1676
+
1677
+ const screenshotPath = path || `screenshot-${Date.now()}.png`;
1678
+ const resolvedPath = resolve(screenshotPath);
1679
+
1680
+ console.log("[Browser] Taking screenshot:", resolvedPath);
1681
+ await page.screenshot({
1682
+ path: resolvedPath,
1683
+ fullPage,
1684
+ });
1685
+
1686
+ return `Screenshot saved to: ${resolvedPath}`;
1687
+ } catch (err) {
1688
+ return `Error taking screenshot: ${err.message}`;
1689
+ }
1690
+ },
1691
+ },
1692
+
1693
+ browser_close: {
1694
+ description:
1695
+ "Close the browser and free resources. Call when done with browser tasks.",
1696
+ parameters: {},
1697
+ execute: async (params, config) => {
1698
+ try {
1699
+ if (!isBrowserActive()) {
1700
+ return "Browser is not currently running.";
1701
+ }
1702
+
1703
+ await closeBrowser();
1704
+ return "Browser closed successfully.";
1705
+ } catch (err) {
1706
+ return `Error closing browser: ${err.message}`;
1707
+ }
1708
+ },
1709
+ },
1710
+
1711
+ browser_interact: {
1712
+ description: `Execute multiple browser actions in a single call. Reduces latency by batching operations. By default appends final page content so you often don't need a separate browser_read. Use a trailing read action or readAfter for content.
1713
+
1714
+ Supported action types:
1715
+ - click: { type: "click", selector: "...", force?: boolean, waitForNavigation?: boolean }
1716
+ - type: { type: "type", selector: "...", text: "...", clear?: boolean, submit?: boolean, delay?: number }
1717
+ - fill: { type: "fill", selector: "...", text: "...", submit?: boolean }
1718
+ - wait: { type: "wait", selector?: "...", ms?: number, url?: "...", load?: "domcontentloaded|networkidle" }
1719
+ - scroll: { type: "scroll", selector?: "...", direction?: "up|down|top|bottom", amount?: number }
1720
+ - hover: { type: "hover", selector: "..." }
1721
+ - select: { type: "select", selector: "...", value: "..." }
1722
+ - check: { type: "check", selector: "...", uncheck?: boolean }
1723
+ - press: { type: "press", key: "Enter|Tab|...", selector?: "..." }
1724
+ - navigate: { type: "navigate", url: "...", waitUntil?: "domcontentloaded|networkidle" }
1725
+ - read: { type: "read", selector?: "...", maxLength?: number } - Extract page content (use at end of sequence)
1726
+ - evaluate: { type: "evaluate", script: "return document.title" }
1727
+ - screenshot: { type: "screenshot", path?: "...", fullPage?: boolean }
1728
+
1729
+ Each action can have:
1730
+ - continueOnError: boolean - If true, continue sequence even if this action fails
1731
+ - delayAfter: number - Wait N ms after this action completes
1732
+ - waitForNavigation: boolean - For click actions, wait for page navigation to complete`,
1733
+ parameters: {
1734
+ actions: {
1735
+ type: "array",
1736
+ description:
1737
+ "Array of action objects to execute in sequence (can be JSON string or array)",
1738
+ required: true,
1739
+ },
1740
+ readAfter: {
1741
+ type: "boolean",
1742
+ description:
1743
+ "Append final page content to the result after all actions (default: true). Set false to only get URL/title and action results.",
1744
+ required: false,
1745
+ },
1746
+ },
1747
+ execute: async (args, config) => {
1748
+ const readAfter = args.readAfter !== false;
1749
+ // Parse actions from args - handle both direct array and JSON string
1750
+ let actions = args.actions;
1751
+
1752
+ // If args itself is a string, try to parse it
1753
+ if (typeof args === "string") {
1754
+ try {
1755
+ const parsed = JSON.parse(args);
1756
+ actions = parsed.actions || parsed;
1757
+ } catch (e) {
1758
+ return `Error: Could not parse arguments as JSON: ${e.message}`;
1759
+ }
1760
+ }
1761
+
1762
+ // If actions is a string, try to parse it as JSON
1763
+ if (typeof actions === "string") {
1764
+ try {
1765
+ actions = JSON.parse(actions);
1766
+ } catch (e) {
1767
+ return `Error: Could not parse actions as JSON array: ${e.message}\nReceived: ${actions.substring(0, 200)}`;
1768
+ }
1769
+ }
1770
+
1771
+ // Validate actions array
1772
+ if (!actions || !Array.isArray(actions)) {
1773
+ return `Error: actions must be a non-empty array of action objects. Received type: ${typeof actions}, value: ${JSON.stringify(actions).substring(0, 200)}`;
1774
+ }
1775
+
1776
+ if (actions.length === 0) {
1777
+ return "Error: actions array is empty";
1778
+ }
1779
+
1780
+ try {
1781
+ const page = await getBrowserPage(config);
1782
+
1783
+ console.log(
1784
+ `[Browser] Executing ${actions.length} actions in sequence`,
1785
+ );
1786
+ const result = await executeActionSequence(page, actions);
1787
+
1788
+ // Format the result for readability
1789
+ let output = `**Browser Interaction Results**\n`;
1790
+ output += `Completed: ${result.completed}/${result.total} actions\n\n`;
1791
+
1792
+ result.results.forEach((r, i) => {
1793
+ const status = r.success ? "✓" : "✗";
1794
+ output += `${i + 1}. ${status} ${r.action}: ${r.message || r.error || "done"}\n`;
1795
+ if (r.result !== undefined) {
1796
+ output += ` Result: ${JSON.stringify(r.result).substring(0, 200)}\n`;
1797
+ }
1798
+ });
1799
+
1800
+ output += `\n**Final State:**\n`;
1801
+ output += `URL: ${result.finalState.url}\n`;
1802
+ output += `Title: ${result.finalState.title}\n`;
1803
+
1804
+ if (readAfter) {
1805
+ const page = await getBrowserPage(config);
1806
+ const content = await getTruncatedPageContent(page);
1807
+ if (content) output += `\n--- Page content ---\n${content}`;
1808
+ }
1809
+
1810
+ if (result.lastError) {
1811
+ output += `\n⚠️ Last Error: ${result.lastError}`;
1812
+ }
1813
+
1814
+ return output;
1815
+ } catch (err) {
1816
+ return `Error executing browser actions: ${err.message}`;
1817
+ }
1818
+ },
1819
+ },
1820
+
1821
+ // ============================================
1822
+ // Memory Tools
1823
+ // ============================================
1824
+
1825
  // Tool: search_memory — keyword/semantic search over past conversations.
  // Merges hits from the Supabase cloud store (when authenticated) and the
  // local SQLite store (vector similarity first, plain-text fallback).
  search_memory: {
    description:
      "Search through your chat memory/history to find past conversations. Searches both local CLI history and Supabase cloud chats (when authenticated). Use simple, specific keywords (names, topics, single words). Returns matching chats with context snippets showing where the match was found. Use read_memory to get full content.",
    parameters: {
      query: {
        type: "string",
        description:
          "Search query - use SIMPLE keywords like a name, topic, or single word. Avoid long phrases.",
        required: true,
      },
      limit: {
        type: "number",
        description: "Max results to return (default: 10)",
        required: false,
      },
    },
    // Each hit is collected as { source, score, line }; sources are scored on
    // different scales (Supabase similarity, local cosine similarity, or a
    // fixed 0.5/0.7 for text matches) and merged by a single sort at the end.
    execute: async ({ query, limit = 10 }, config) => {
      const { getAllEmbeddings, searchChatsText, getChat } =
        await import("../storage/db.js");

      const allResults = [];

      // --- 1. Search Supabase (cloud/CLI tier chats) when authenticated ---
      // Failures here are logged and swallowed so local search still runs.
      if (config.accessToken) {
        try {
          const { verifyToken, searchChatsInSupabase } =
            await import("../storage/supabase.js");
          const user = await verifyToken(config.accessToken);
          if (user?.id) {
            const supaResults = await searchChatsInSupabase(
              user.id,
              query,
              limit,
            );
            for (const r of supaResults) {
              const dateStr = r.updated_at
                ? new Date(r.updated_at).toLocaleDateString()
                : "Unknown date";
              const snippetLine = r.snippet ? `\n ${r.snippet}` : "";
              allResults.push({
                source: "cloud",
                score: r.similarity,
                line: `- [Cloud Chat ${r.chatId}] "${r.title || "Untitled"}" (${dateStr}) - match: ${r.matchSource}${snippetLine}`,
              });
            }
          }
        } catch (err) {
          console.warn("[search_memory] Supabase search failed:", err.message);
        }
      }

      // --- 2. Search local SQLite (vector search first, then text) ---
      try {
        const allEmbeddings = getAllEmbeddings();
        if (allEmbeddings.length > 0) {
          // Vector search only runs if we can embed the query (needs API key).
          const queryEmbed = await generateQueryEmbedding(query, config);
          if (queryEmbed) {
            const scored = allEmbeddings
              .map((e) => ({
                chatId: e.chat_id,
                similarity: cosineSimilarity(queryEmbed, e.embedding),
              }))
              .sort((a, b) => b.similarity - a.similarity)
              .slice(0, limit);

            // 0.2 is the minimum cosine similarity considered relevant.
            const relevant = scored.filter((s) => s.similarity > 0.2);
            for (const s of relevant) {
              const chat = getChat(s.chatId);
              if (!chat) continue;
              const dateStr = chat.updated_at
                ? new Date(chat.updated_at).toLocaleDateString()
                : "Unknown date";
              allResults.push({
                source: "local",
                score: s.similarity,
                line: `- [Local Chat ${s.chatId}] "${chat.title || "Untitled"}" (${dateStr}) - relevance: ${(s.similarity * 100).toFixed(0)}%`,
              });
            }
          }
        }

        // Text-search fallback: only when vector search produced no local hits.
        if (!allResults.some((r) => r.source === "local")) {
          const textResults = searchChatsText(query, limit);
          for (const r of textResults) {
            const dateStr = r.updated_at
              ? new Date(r.updated_at).toLocaleDateString()
              : "Unknown date";
            const matchInfo =
              r.match_count > 1 ? ` (${r.match_count} matches)` : "";
            const snippet = r.matching_snippet || "";
            allResults.push({
              source: "local",
              // Fixed heuristic scores so text hits rank below strong
              // similarity hits but still surface in the merged list.
              score: r.match_count > 1 ? 0.7 : 0.5,
              line: `- [Local Chat ${r.id}] "${r.title || "Untitled"}" (${dateStr})${matchInfo}\n ${snippet}`,
            });
          }
        }
      } catch (err) {
        console.warn("[search_memory] Local search failed:", err.message);
      }

      // --- 3. Merge and return ---
      if (allResults.length === 0) {
        return `No chats found matching "${query}". Try a different keyword - simpler, single words often work better than phrases.`;
      }

      allResults.sort((a, b) => b.score - a.score);
      const formatted = allResults.slice(0, limit).map((r) => r.line);
      return `Found ${formatted.length} chats matching "${query}":\n\n${formatted.join("\n\n")}\n\nUse read_memory with a chat ID to see the full conversation.`;
    },
  },
1936
+
1937
+ read_memory: {
1938
+ description:
1939
+ "Read the full contents of a specific chat from your memory. Use search_memory first to find relevant chat IDs.",
1940
+ parameters: {
1941
+ chat_id: {
1942
+ type: "string",
1943
+ description: "The chat ID (UUID) to read (from search_memory results)",
1944
+ required: true,
1945
+ },
1946
+ },
1947
+ execute: async ({ chat_id }, config) => {
1948
+ const { getChatWithMessages } = await import("../storage/db.js");
1949
+
1950
+ try {
1951
+ const chat = getChatWithMessages(chat_id);
1952
+ if (!chat) {
1953
+ return `Error: Chat ${chat_id} not found.`;
1954
+ }
1955
+
1956
+ // Format the conversation header
1957
+ const dateStr = chat.updated_at
1958
+ ? new Date(chat.updated_at).toLocaleDateString()
1959
+ : "Unknown date";
1960
+ const header = `Chat: "${chat.title || "Untitled"}" (ID: ${chat.id})\nDate: ${dateStr}\nMessages: ${chat.messages.length}\n\n`;
1961
+
1962
+ // Format each message
1963
+ const conversation = chat.messages
1964
+ .map((m) => {
1965
+ const role = m.role === "user" ? "User" : "Assistant";
1966
+ const time = m.created_at
1967
+ ? new Date(m.created_at).toLocaleTimeString()
1968
+ : "";
1969
+ // Truncate very long messages to avoid context overflow
1970
+ const content =
1971
+ m.content.length > 2000
1972
+ ? m.content.substring(0, 2000) +
1973
+ "\n... [truncated - message continues for " +
1974
+ (m.content.length - 2000) +
1975
+ " more characters]"
1976
+ : m.content;
1977
+ return `[${time}] ${role}:\n${content}`;
1978
+ })
1979
+ .join("\n\n---\n\n");
1980
+
1981
+ return header + conversation;
1982
+ } catch (err) {
1983
+ return `Error reading chat: ${err.message}`;
1984
+ }
1985
+ },
1986
+ },
1987
+
1988
+ // ============================================
1989
+ // RAG (Document Search) Tools
1990
+ // ============================================
1991
+
1992
  // Tool: rag_search — vector search over locally indexed document chunks.
  // Only enabled when the user @-mentions an uploaded document.
  rag_search: {
    description:
      "Search through uploaded documents for relevant information. Use this when the user mentions documents with @ (like @guidebook). Returns relevant excerpts with citations.",
    parameters: {
      query: {
        type: "string",
        description: "What to search for in the documents",
        required: true,
      },
      documents: {
        type: "string",
        description:
          "Comma-separated list of document names to search (from @mentions)",
        required: false,
      },
      top_k: {
        type: "number",
        description: "Number of results to return (default: 8)",
        required: false,
      },
    },
    // Mark as conditionally available - only enabled when RAG documents are mentioned
    isConditional: true,
    execute: async ({ query, documents, top_k = 8 }, config) => {
      const { getAllRagDocuments, getRagChunksByDocuments } =
        await import("../storage/db.js");

      try {
        // Get all RAG documents
        const allDocs = getAllRagDocuments();
        if (allDocs.length === 0) {
          return "No documents have been uploaded yet. Ask the user to upload documents using the Storage dropdown in the sidebar.";
        }

        // Filter to requested documents if specified.
        // Matching is case-insensitive and bidirectional-substring, so
        // "@guide" matches "guidebook.pdf" and vice versa.
        let targetDocs = allDocs;
        if (documents) {
          const docNames = documents
            .split(",")
            .map((d) => d.trim().toLowerCase());
          targetDocs = allDocs.filter((doc) => {
            const docNameLower = doc.name.toLowerCase();
            return docNames.some(
              (name) =>
                docNameLower.includes(name) || name.includes(docNameLower),
            );
          });

          if (targetDocs.length === 0) {
            const availableDocs = allDocs.map((d) => d.name).join(", ");
            return `No documents found matching: ${documents}\n\nAvailable documents: ${availableDocs}`;
          }
        }

        // Generate query embedding (requires a configured embedding API key)
        const queryEmbedding = await generateQueryEmbedding(query, config);
        if (!queryEmbedding) {
          return "Error: Could not generate query embedding. Make sure an OpenAI or Google API key is configured.";
        }

        // Get chunks from target documents
        const docIds = targetDocs.map((d) => d.id);
        const chunks = getRagChunksByDocuments(docIds);

        if (chunks.length === 0) {
          return `No content found in the specified documents. The documents may not have been properly indexed.`;
        }

        // Build a map of doc IDs to names for faster lookup
        // Use Number() to ensure consistent types (SQLite can return strings or ints)
        const docIdToName = new Map();
        targetDocs.forEach((d) => {
          docIdToName.set(Number(d.id), d.name);
        });

        console.log(
          "[rag_search] Document ID map:",
          Object.fromEntries(docIdToName),
        );

        // Score chunks by similarity
        const scored = chunks.map((chunk) => {
          // Parse embedding from blob: the DB may hand back a Node Buffer, a
          // Uint8Array, or an already-decoded array of floats.
          let embedding;
          if (Buffer.isBuffer(chunk.embedding)) {
            embedding = Array.from(
              new Float32Array(
                chunk.embedding.buffer,
                chunk.embedding.byteOffset,
                // Buffer length is in bytes; Float32 elements are 4 bytes.
                chunk.embedding.length / 4,
              ),
            );
          } else if (chunk.embedding instanceof Uint8Array) {
            embedding = Array.from(new Float32Array(chunk.embedding.buffer));
          } else {
            embedding = chunk.embedding;
          }

          const similarity = cosineSimilarity(queryEmbedding, embedding);

          // Find document name for this chunk (ensure both IDs are numbers for comparison)
          // Note: DB returns docId (camelCase), pageNumber, chunkIndex
          const chunkDocId = Number(chunk.docId || chunk.doc_id);
          const docName = docIdToName.get(chunkDocId) || "Unknown Document";

          return {
            text: chunk.text,
            docId: chunkDocId,
            docName,
            // NOTE(review): `||` falls through to the snake_case field when the
            // camelCase one is 0/undefined — a legitimate pageNumber/chunkIndex
            // of 0 would also fall through; verify DB column casing.
            pageNumber: chunk.pageNumber || chunk.page_number,
            chunkIndex: chunk.chunkIndex || chunk.chunk_index,
            similarity,
          };
        });

        // Sort by similarity and take top results
        // Use a lower threshold (25%) to catch more potentially relevant content
        const topResults = scored
          .filter((s) => s.similarity > 0.25) // Minimum similarity threshold (25%)
          .sort((a, b) => b.similarity - a.similarity)
          .slice(0, top_k);

        console.log(
          "[rag_search] Top results:",
          topResults.map((r) => ({
            docName: r.docName,
            similarity: Math.round(r.similarity * 100),
            textPreview: r.text.substring(0, 50),
          })),
        );

        if (topResults.length === 0) {
          return `No relevant content found for "${query}" in the specified documents. Try rephrasing your question or using different keywords.`;
        }

        // Format results with citations - make it very clear this is real document content
        const docNamesUsed = [
          ...new Set(topResults.map((r) => r.docName)),
        ].join(", ");
        let output = `📚 DOCUMENT SEARCH RESULTS from: ${docNamesUsed}\n`;
        output += `Found ${topResults.length} relevant sections. USE THIS CONTENT TO ANSWER:\n\n`;

        topResults.forEach((result, i) => {
          const pageInfo = result.pageNumber
            ? `, Page ${result.pageNumber}`
            : "";
          const relevance = Math.round(result.similarity * 100);

          output += `━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n`;
          output += `📄 SOURCE [${i + 1}]: ${result.docName}${pageInfo}\n`;
          output += ` Relevance: ${relevance}%\n\n`;
          output += ` CONTENT:\n`;
          output += ` ${result.text.trim().split("\n").join("\n ")}\n\n`;
        });

        output += `━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n`;
        output += `⚠️ IMPORTANT: Use the CONTENT above to answer the user's question.\n`;
        output += `Cite sources as: "According to [${docNamesUsed}, Page X]..." or quote directly.`;

        return output;
      } catch (err) {
        console.error("[rag_search] Error:", err);
        return `Error searching documents: ${err.message}`;
      }
    },
  },
2158
+ };
2159
+
2160
+ // ============================================
2161
+ // Web Search Helpers (exported for server use)
2162
+ // ============================================
2163
+
2164
// Verbose web-search logging, opt-in via WEBSEARCH_DEBUG=1 (or "true").
const WEBSEARCH_DEBUG = ["1", "true"].includes(process.env.WEBSEARCH_DEBUG);

// Log a namespaced debug line for one engine; `extra` (if non-empty) is
// appended as JSON. No-op unless WEBSEARCH_DEBUG is enabled.
function webSearchDebug(engine, msg, extra = {}) {
  if (!WEBSEARCH_DEBUG) return;
  let line = `[WebSearch:${engine}] ${msg}`;
  if (Object.keys(extra).length) {
    line += ` ${JSON.stringify(extra)}`;
  }
  console.log(line);
}
2173
+
2174
/** When an engine returns 0 results: always log first 2.5k chars of HTML; if WEBSEARCH_DEBUG=1, write full HTML to a file and log path. */
function logZeroResultHtml(engine, html) {
  const sampleLen = 2500;
  const wasTruncated = html.length > sampleLen;
  const sample = wasTruncated
    ? html.substring(0, sampleLen) + "\n... (truncated)"
    : html;

  console.log(
    "[WebSearch] " +
      engine +
      " HTML (first " +
      Math.min(html.length, sampleLen) +
      " chars):",
  );
  console.log(sample);

  // Full dump only when debugging and there is more than the sample shows.
  if (WEBSEARCH_DEBUG && wasTruncated) {
    const path = join(
      tmpdir(),
      `otherwise-websearch-${engine.toLowerCase()}-${Date.now()}.html`,
    );
    writeFileSync(path, html, "utf8");
    console.log("[WebSearch] " + engine + " full HTML written to: " + path);
  }
}
2198
+
2199
// Rotating browser-like User-Agents and full headers to reduce bot detection
const BROWSER_USER_AGENTS = [
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0",
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.2 Safari/605.1.15",
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0",
];

// Build a realistic browser header set with a randomly chosen User-Agent.
// When `origin` is given, Referer/Sec-Fetch-Site pretend a same-origin visit.
function getBrowserHeaders(origin) {
  const pick = Math.floor(Math.random() * BROWSER_USER_AGENTS.length);
  return {
    "User-Agent": BROWSER_USER_AGENTS[pick],
    Accept:
      "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    Referer: origin ? `${origin}/` : undefined,
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": origin ? "same-origin" : "none",
    "Sec-Fetch-User": "?1",
    "Upgrade-Insecure-Requests": "1",
    "sec-ch-ua":
      '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"Windows"',
  };
}
2228
+
2229
/** Google: use AdsBot UA so Google may serve static HTML instead of JS-only/captcha (bypass technique). */
function getGoogleSearchHeaders() {
  const headers = {
    "User-Agent": "AdsBot-Google (+http://www.google.com/adsbot.html)",
    Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
  };
  return headers;
}
2237
+
2238
/** Bing: use bingbot UA so Bing may treat request as crawler and serve SERP HTML. */
function getBingSearchHeaders() {
  const headers = {
    "User-Agent":
      "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
    Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
  };
  return headers;
}
2247
+
2248
/** Extract destination URL from Bing click redirect (bing.com/ck/a?....&u=BASE64). Supports URL-safe base64. */
function resolveBingCkUrl(href) {
  if (!href || !href.includes("bing.com/ck/a")) return null;
  try {
    const encoded = new URL(href).searchParams.get("u");
    if (!encoded) return null;
    // Bing may use URL-safe base64 (- and _ instead of + and /)
    const standard = encoded.replace(/-/g, "+").replace(/_/g, "/");
    const target = Buffer.from(standard, "base64").toString("utf8");
    if (target.startsWith("a1")) return target.slice(2); // Bing sometimes prefixes with a1
    if (/^https?:\/\//i.test(target)) return target;
    return null;
  } catch {
    return null;
  }
}
2265
+
2266
/** Extract destination URL from Yahoo redirect (r.search.yahoo.com/.../RU=encoded/RK=...). */
function resolveYahooRedirect(href) {
  if (!href || !href.includes("r.search.yahoo")) return null;
  try {
    const hit = /[?/]RU=([^/]+)(?:\/|$)/i.exec(href);
    if (!hit) return null;
    const target = decodeURIComponent(hit[1].replace(/\+/g, " "));
    return /^https?:\/\//i.test(target) ? target : null;
  } catch {
    return null;
  }
}
2279
+
2280
// fetch() with an AbortController-based timeout. When options.useWebSearchProxy
// is set and a proxy is configured, routes through undici's ProxyAgent instead
// of the global fetch. Follows redirects; the internal flag is stripped before
// the request is made.
async function fetchWithTimeout(url, options = {}, timeoutMs = 10000) {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), timeoutMs);

  const { useWebSearchProxy, ...rest } = options || {};
  const proxyUrl = useWebSearchProxy ? getWebSearchProxy() : null;

  const fetchOpts = {
    ...rest,
    signal: controller.signal,
    redirect: "follow",
  };

  try {
    if (proxyUrl) {
      return await undiciFetch(url, {
        ...fetchOpts,
        dispatcher: new ProxyAgent(proxyUrl),
      });
    }
    return await fetch(url, fetchOpts);
  } finally {
    // Always release the timer, on success and on abort/error alike.
    clearTimeout(timer);
  }
}
2305
+
2306
/**
 * Normalize URL for deduplication (strip fragment, trailing slash, lowercase host)
 * Query parameters are sorted so equivalent URLs compare equal.
 * Unparseable input is returned unchanged.
 */
function normalizeUrlForDedup(url) {
  try {
    const parsed = new URL(url);
    parsed.hash = "";
    parsed.searchParams.sort();
    parsed.pathname = parsed.pathname.replace(/\/+$/, "") || "/";
    parsed.hostname = parsed.hostname.toLowerCase();
    return parsed.toString();
  } catch {
    return url;
  }
}
2322
+
2323
/**
 * Extract the most relevant snippet from page content by scoring paragraphs
 * against query terms instead of blindly taking the first N chars.
 * Scoring: +10 per query term present, +5 if it appears in the first 50
 * chars, +3 for comfortably mid-length paragraphs; nav/boilerplate skipped.
 */
function extractBestSnippet(content, query, maxLen = 400) {
  if (!content || !query) return (content || "").slice(0, maxLen);

  const terms = new Set(
    query
      .toLowerCase()
      .split(/\W+/)
      .filter((t) => t.length > 1),
  );
  if (terms.size === 0) return content.slice(0, maxLen);

  // Candidate paragraphs: blank-line separated, whitespace-collapsed,
  // neither trivially short nor enormous.
  const candidates = content
    .split(/\n{2,}|\r\n{2,}/)
    .map((p) => p.replace(/\s+/g, " ").trim())
    .filter((p) => p.length > 30 && p.length < 2000);
  if (candidates.length === 0) return content.slice(0, maxLen);

  let best = { score: -1, text: candidates[0] };
  for (const candidate of candidates) {
    // Skip nav/boilerplate-like paragraphs
    if (/^(menu|navigation|skip to|cookie|accept|sign in|log in)/i.test(candidate)) {
      continue;
    }
    const lower = candidate.toLowerCase();
    let score = 0;
    for (const term of terms) {
      const at = lower.indexOf(term);
      if (at === -1) continue;
      score += 10;
      if (at < 50) score += 5; // Term appears early
    }
    // Prefer mid-length paragraphs (not too short, not too long)
    if (candidate.length > 60 && candidate.length < 800) score += 3;
    if (score > best.score) {
      best = { score, text: candidate };
    }
  }

  return best.text.slice(0, maxLen);
}
2371
+
2372
/**
 * Run multiple search engines in parallel, aggregate and dedupe results.
 * Uses Google, Bing, Brave, DuckDuckGo, and Startpage (all no-API scrape);
 * results appearing in multiple engines are ranked higher.
 *
 * @param {string} query - the search query
 * @param {number} [numResults=8] - max results to return after dedupe
 * @returns {Promise<Array<{url, title, snippet, source, engines}>>}
 */
export async function performWebSearch(query, numResults = 8) {
  console.log(
    "[WebSearch] Searching for:",
    query,
    "(requesting",
    numResults,
    "results)",
  );

  // Per-engine result cap (request more so we have enough after dedupe)
  const perEngine = Math.min(numResults + 8, 20);

  const engineFns = [
    { name: "Google", fn: () => searchGoogle(query, perEngine) },
    { name: "Bing", fn: () => searchBing(query, perEngine) },
    { name: "Brave", fn: () => searchBrave(query, perEngine) },
    { name: "DuckDuckGo", fn: () => searchDuckDuckGo(query, perEngine) },
    { name: "Startpage", fn: () => searchStartpage(query, perEngine) },
    { name: "Yahoo", fn: () => searchYahoo(query, perEngine) },
    { name: "Ecosia", fn: () => searchEcosia(query, perEngine) },
  ];

  // allSettled so one failing engine never sinks the whole search.
  const settled = await Promise.allSettled(engineFns.map((e) => e.fn()));

  const byNormalized = new Map(); // normalizedUrl -> { result, engines[], firstSeenOrder }

  // Merge pass: dedupe by normalized URL, remember which engines agree,
  // and backfill missing title/snippet from later engines.
  let order = 0;
  for (let i = 0; i < settled.length; i++) {
    const status = settled[i];
    const engineName = engineFns[i].name;
    if (status.status === "rejected") {
      console.error(
        "[WebSearch]",
        engineName,
        "error:",
        status.reason?.message,
      );
      continue;
    }
    const list = status.value || [];
    console.log("[WebSearch]", engineName, "returned", list.length, "results");
    for (const r of list) {
      const key = normalizeUrlForDedup(r.url);
      const existing = byNormalized.get(key);
      if (existing) {
        existing.engines.push(engineName);
        if (!existing.result.snippet && r.snippet) {
          existing.result.snippet = r.snippet;
        }
        if (!existing.result.title && r.title) {
          existing.result.title = r.title;
        }
      } else {
        byNormalized.set(key, {
          result: {
            url: r.url,
            title: r.title,
            snippet: r.snippet,
            source: r.source,
          },
          engines: [engineName],
          firstSeenOrder: order++,
        });
      }
    }
  }

  // Relevance scoring: query-term matches in title/snippet + cross-engine count + order
  const queryTerms = new Set(
    query
      .toLowerCase()
      .split(/\W+/)
      .filter((t) => t.length > 1),
  );

  // Cross-engine agreement dominates (x1000), term hits next (x500),
  // earlier first-seen position breaks ties.
  const sorted = [...byNormalized.values()]
    .sort((a, b) => {
      const textA = `${a.result.title} ${a.result.snippet}`.toLowerCase();
      const textB = `${b.result.title} ${b.result.snippet}`.toLowerCase();
      const termHitsA = [...queryTerms].filter((t) => textA.includes(t)).length;
      const termHitsB = [...queryTerms].filter((t) => textB.includes(t)).length;
      const aScore =
        termHitsA * 500 + a.engines.length * 1000 - a.firstSeenOrder;
      const bScore =
        termHitsB * 500 + b.engines.length * 1000 - b.firstSeenOrder;
      return bScore - aScore;
    })
    .slice(0, numResults)
    .map((x) => ({ ...x.result, engines: x.engines }));

  console.log("[WebSearch] Total unique results after dedupe:", sorted.length);
  return sorted;
}
2470
+
2471
/** Parse DuckDuckGo HTML SERP and return results array. */
function parseDuckDuckGoResults(html, numResults) {
  const collected = [];
  const seen = new Set();

  // Shared collector: unwraps DDG's uddg= redirect, filters non-http and
  // duplicate URLs, and fills in host-derived defaults for title/snippet.
  const push = (rawUrl, title, snippet) => {
    let url = rawUrl;
    if (url.includes("uddg=")) {
      const wrapped = url.match(/uddg=([^&]+)/);
      if (wrapped) url = decodeURIComponent(wrapped[1]);
    }
    if (!url.startsWith("http") || seen.has(url)) return;
    seen.add(url);
    let source = "";
    try {
      source = new URL(url).hostname.replace("www.", "");
    } catch {}
    collected.push({
      url,
      title: title || source,
      snippet: snippet || `From ${source}`,
      source,
    });
  };

  // Primary: paired result__a (title link) + result__snippet anchors.
  const primary =
    /<a[^>]*class="result__a"[^>]*href="([^"]+)"[^>]*>([^<]+)<\/a>[\s\S]*?<a[^>]*class="result__snippet"[^>]*>([\s\S]*?)<\/a>/gi;
  for (
    let m;
    (m = primary.exec(html)) !== null && collected.length < numResults;

  ) {
    const cleaned = decodeHtmlEntities(m[3].replace(/<[^>]+>/g, " ").trim())
      .replace(/\s+/g, " ")
      .slice(0, 250);
    push(m[1], decodeHtmlEntities(m[2].trim()), cleaned);
  }

  // Fallback 1: any anchor whose href carries a uddg= redirect.
  if (collected.length < numResults) {
    const uddgLinks = /<a[^>]*href="([^"]*uddg=[^"]+)"[^>]*>([\s\S]*?)<\/a>/gi;
    for (
      let m;
      (m = uddgLinks.exec(html)) !== null && collected.length < numResults;

    ) {
      const text = decodeHtmlEntities(m[2].replace(/<[^>]+>/g, "").trim());
      if (text.length < 3 || text.length > 300) continue;
      push(m[1], text, "");
    }
  }

  // Fallback 2: split on result__body containers and grab the first link.
  if (collected.length < numResults) {
    const chunks = html.split(/class="[^"]*result__body[^"]*"[^>]*>/i);
    for (let i = 1; i < chunks.length && collected.length < numResults; i++) {
      const head = chunks[i].substring(0, 2000);
      const link = head.match(/<a[^>]*href="([^"]+)"[^>]*>([^<]+)<\/a>/i);
      if (link) push(link[1], decodeHtmlEntities(link[2].trim()), "");
    }
  }

  return collected;
}
2527
+
2528
/** Search DuckDuckGo HTML version. Retries with headless browser if fetch returns 0 results. */
async function searchDuckDuckGo(query, numResults) {
  const url = `https://html.duckduckgo.com/html/?q=${encodeURIComponent(query)}`;
  const headers = getBrowserHeaders("https://html.duckduckgo.com");
  headers["Sec-Fetch-Site"] = "none";

  const response = await fetchWithTimeout(
    url,
    { headers, useWebSearchProxy: true },
    12000,
  );
  let html = await response.text();
  let results = parseDuckDuckGoResults(html, numResults);

  // Fallback: a real headless browser, unless explicitly disabled.
  if (results.length === 0 && process.env.WEBSEARCH_USE_BROWSER !== "0") {
    try {
      console.log("[WebSearch] DuckDuckGo retrying with headless browser...");
      html = await fetchHtmlWithBrowser(url);
      results = parseDuckDuckGoResults(html, numResults);
    } catch (e) {
      console.log(
        "[WebSearch] DuckDuckGo browser fetch failed:",
        e?.message || e,
      );
    }
  }

  // Still empty: log marker counts so parser drift is diagnosable.
  if (results.length === 0) {
    const uddgCount = (html.match(/uddg=/g) || []).length;
    const resultACount = (html.match(/result__a/g) || []).length;
    const resultBodyCount = (html.match(/result__body/g) || []).length;
    console.log(
      "[WebSearch] DuckDuckGo: 0 results | htmlLen=",
      html.length,
      "| uddg=",
      uddgCount,
      "result__a=",
      resultACount,
      "result__body=",
      resultBodyCount,
    );
    logZeroResultHtml("DuckDuckGo", html);
  }

  console.log("[WebSearch] DuckDuckGo found:", results.length, "results");
  return results;
}
2575
+
2576
/**
 * Decode HTML entities.
 *
 * Handles the common named entities, plus hex (&#x..;) and decimal (&#..;)
 * numeric references. `&amp;` is decoded LAST so that double-encoded input
 * like `&amp;quot;` correctly yields `&quot;` instead of being decoded
 * twice to `"`. Numeric references use String.fromCodePoint so astral
 * characters (e.g. emoji, &#x1F600;) decode correctly — fromCharCode only
 * supports the BMP.
 *
 * @param {string} text - raw text containing HTML entities
 * @returns {string} decoded text
 */
function decodeHtmlEntities(text) {
  return text
    .replace(/&lt;/g, "<")
    .replace(/&gt;/g, ">")
    .replace(/&quot;/g, '"')
    .replace(/&#39;/g, "'")
    .replace(/&apos;/g, "'")
    .replace(/&nbsp;/g, " ")
    .replace(/&#x([0-9a-f]+);/gi, (_, hex) =>
      String.fromCodePoint(parseInt(hex, 16)),
    )
    .replace(/&#(\d+);/g, (_, dec) => String.fromCodePoint(Number(dec)))
    .replace(/&amp;/g, "&"); // must come last to avoid double-decoding
}
2593
+
2594
/** Parse Brave SERP HTML and return results array. */
function parseBraveResults(html, numResults) {
  const found = [];
  const taken = new Set();

  // Hostname (sans www.) of a URL, or "" when unparseable.
  const hostOf = (u) => {
    try {
      return new URL(u).hostname.replace("www.", "");
    } catch {
      return "";
    }
  };

  // Primary pass: each organic hit sits in a data-type="web" container;
  // only the first 3000 chars of each container are inspected.
  const sections = html.split(/data-type="web"/);
  for (let i = 1; i < sections.length && found.length < numResults; i++) {
    const section = sections[i].substring(0, 3000);
    const href = section.match(/href="(https?:\/\/[^"]+)"/);
    if (!href) continue;
    const target = href[1];
    if (target.includes("brave.com") || taken.has(target)) continue;

    // Title: prefer the title="" attribute, fall back to an inner element
    // whose class mentions "title".
    let title = "";
    const attrTitle = section.match(/title="([^"]{10,300})"/);
    if (attrTitle) title = decodeHtmlEntities(attrTitle[1].trim());
    if (!title) {
      const innerTitle = section.match(
        /class="[^"]*title[^"]*"[^>]*>([^<]+)</,
      );
      if (innerTitle) title = decodeHtmlEntities(innerTitle[1].trim());
    }
    if (!title || title.length < 5) continue;

    let snippet = "";
    const desc = section.match(
      /class="[^"]*(?:snippet-description|description)[^"]*"[^>]*>([^<]+)/,
    );
    if (desc) {
      snippet = decodeHtmlEntities(desc[1].trim())
        .replace(/\s+/g, " ")
        .slice(0, 300);
    }

    taken.add(target);
    const source = hostOf(target);
    found.push({
      url: target,
      title,
      snippet: snippet || `From ${source}`,
      source,
    });
  }

  // Fallback pass: enrichment cards (news/article carousels).
  if (found.length < numResults) {
    const cardRe =
      /class="enrichment-card-item[^"]*"[^>]*href="(https?:\/\/[^"]+)"/g;
    let hit;
    while ((hit = cardRe.exec(html)) !== null && found.length < numResults) {
      const cardUrl = hit[1];
      if (taken.has(cardUrl) || cardUrl.includes("brave.com")) continue;
      taken.add(cardUrl);
      const source = hostOf(cardUrl);
      // Look for a title attribute within 500 chars of the card anchor.
      const context = html.substring(hit.index, hit.index + 500);
      const cardTitle = context.match(/title="([^"]+)"/);
      const title = cardTitle
        ? decodeHtmlEntities(cardTitle[1].trim())
        : `Article from ${source}`;
      found.push({ url: cardUrl, title, snippet: `From ${source}`, source });
    }
  }

  return found;
}
2663
+
2664
/**
 * Search Brave - reliable, privacy-focused. Retries with headless browser if fetch returns 0 results.
 */
async function searchBrave(query, numResults) {
  const url = `https://search.brave.com/search?q=${encodeURIComponent(query)}&source=web`;
  const headers = {
    "User-Agent":
      "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
    Accept:
      "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
  };

  const response = await fetchWithTimeout(
    url,
    { headers, useWebSearchProxy: true },
    12000,
  );
  let html = await response.text();
  console.log("[WebSearch] Brave response:", html.length, "bytes");

  // A tiny response is a block/captcha page — bail before parsing.
  if (html.length < 5000) return [];

  let results = parseBraveResults(html, numResults);

  // Fallback: a real headless browser, unless explicitly disabled.
  if (results.length === 0 && process.env.WEBSEARCH_USE_BROWSER !== "0") {
    try {
      console.log("[WebSearch] Brave retrying with headless browser...");
      html = await fetchHtmlWithBrowser(url);
      results = parseBraveResults(html, numResults);
    } catch (e) {
      console.log("[WebSearch] Brave browser fetch failed:", e?.message || e);
    }
  }

  console.log("[WebSearch] Brave found:", results.length, "results");
  return results;
}
2703
+
2704
/**
 * Parse Startpage SERP HTML and return results array.
 *
 * Strategy: try three increasingly loose class-based patterns; if fewer than
 * half the requested results are found, fall back to pairing generic outbound
 * links with description paragraphs by index. Stage order matters — looser
 * patterns only run after stricter ones have been given a chance.
 *
 * @param {string} html - Raw SERP HTML (from fetch or headless browser).
 * @param {number} numResults - Maximum number of results to return.
 * @returns {Array<{url:string,title:string,snippet:string,source:string}>}
 */
function parseStartpageResults(html, numResults) {
  const results = [];
  const seenUrls = new Set(); // dedupe across all stages
  // Each pattern captures (1) href, (2) title HTML, (3) snippet HTML.
  // Ordered strict-to-loose: known result-title classes first, then the
  // w-gl__ layout variant, then any non-Startpage link near a <p>.
  const patterns = [
    /<a[^>]*class="[^"]*result-title[^"]*"[^>]*href="([^"]+)"[^>]*>[\s\S]*?<h2[^>]*>([^<]+)<\/h2>[\s\S]*?<\/a>[\s\S]*?<p[^>]*class="[^"]*description[^"]*"[^>]*>([\s\S]*?)<\/p>/gi,
    /<a[^>]*class="[^"]*w-gl__result-title[^"]*"[^>]*href="([^"]+)"[^>]*>([\s\S]*?)<\/a>[\s\S]*?<p[^>]*class="[^"]*w-gl__description[^"]*"[^>]*>([\s\S]*?)<\/p>/gi,
    /<a[^>]*href="(https?:\/\/(?!.*startpage)[^"]+)"[^>]*class="[^"]*result[^"]*"[^>]*>([\s\S]*?)<\/a>[\s\S]{0,500}?<p[^>]*>([\s\S]*?)<\/p>/gi,
  ];
  for (const pattern of patterns) {
    if (results.length >= numResults) break;
    let match;
    // /g regexes carry stateful lastIndex; reset before reuse.
    pattern.lastIndex = 0;
    while (
      (match = pattern.exec(html)) !== null &&
      results.length < numResults
    ) {
      const resultUrl = match[1];
      // Strip inner tags, then decode HTML entities.
      let title = match[2].replace(/<[^>]+>/g, "").trim();
      title = decodeHtmlEntities(title);
      let snippet = match[3].replace(/<[^>]+>/g, " ").trim();
      snippet = decodeHtmlEntities(snippet).replace(/\s+/g, " ").slice(0, 300);
      // Skip non-http links, Startpage-internal links, duplicates, and
      // entries without a meaningful title.
      if (
        !resultUrl.startsWith("http") ||
        resultUrl.includes("startpage.com") ||
        seenUrls.has(resultUrl) ||
        !title ||
        title.length < 3
      )
        continue;
      seenUrls.add(resultUrl);
      let source = "";
      try {
        source = new URL(resultUrl).hostname.replace("www.", "");
      } catch {}
      results.push({
        url: resultUrl,
        title,
        snippet: snippet || `From ${source}`,
        source,
      });
    }
  }
  // Last-resort stage: harvest plain outbound links and description
  // paragraphs separately, then pair them positionally (descs[i] may not
  // correspond to links[i] — best effort only).
  if (results.length < numResults / 2) {
    const linkPattern =
      /<a[^>]*href="(https?:\/\/(?!.*startpage)[^"]+)"[^>]*>([^<]*(?:<[^>]*>[^<]*)*)<\/a>/gi;
    const descPattern =
      /<p[^>]*class="[^"]*(?:description|snippet|abstract)[^"]*"[^>]*>([\s\S]*?)<\/p>/gi;
    const links = [];
    const descs = [];
    let m;
    while ((m = linkPattern.exec(html)) !== null) {
      const url = m[1];
      const text = m[2].replace(/<[^>]+>/g, "").trim();
      // Length bounds filter out icons/nav chrome and giant wrapped blocks.
      if (
        url &&
        text &&
        text.length > 10 &&
        text.length < 200 &&
        !seenUrls.has(url)
      )
        links.push({ url, title: decodeHtmlEntities(text) });
    }
    while ((m = descPattern.exec(html)) !== null) {
      let d = m[1].replace(/<[^>]+>/g, " ").trim();
      descs.push(decodeHtmlEntities(d).replace(/\s+/g, " ").slice(0, 300));
    }
    for (let i = 0; i < links.length && results.length < numResults; i++) {
      if (seenUrls.has(links[i].url)) continue;
      seenUrls.add(links[i].url);
      let source = "";
      try {
        source = new URL(links[i].url).hostname.replace("www.", "");
      } catch {}
      results.push({
        url: links[i].url,
        title: links[i].title,
        snippet: descs[i] || `From ${source}`,
        source,
      });
    }
  }
  return results;
}
2788
+
2789
/**
 * Search Startpage - reliable, privacy-focused. Retries with headless browser if fetch returns 0 results.
 *
 * @param {string} query - Search query text.
 * @param {number} numResults - Maximum number of results to return.
 * @returns {Promise<Array<{url:string,title:string,snippet:string,source:string}>>}
 */
async function searchStartpage(query, numResults) {
  const searchUrl = `https://www.startpage.com/sp/search?q=${encodeURIComponent(query)}&cat=web&pl=ext-ff&language=english`;
  // Realistic desktop-Chrome identity; scripted UAs get blocked.
  const requestHeaders = {
    "User-Agent":
      "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
    Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
  };
  const response = await fetchWithTimeout(
    searchUrl,
    { headers: requestHeaders, useWebSearchProxy: true },
    12000,
  );
  let serpHtml = await response.text();
  console.log("[WebSearch] Startpage response:", serpHtml.length, "bytes");
  // Too small to be a real SERP — likely an error or block page.
  if (serpHtml.length < 5000) return [];

  let found = parseStartpageResults(serpHtml, numResults);

  // Retry through a headless browser when plain fetch parsed nothing,
  // unless explicitly disabled via WEBSEARCH_USE_BROWSER=0.
  if (found.length === 0 && process.env.WEBSEARCH_USE_BROWSER !== "0") {
    try {
      console.log("[WebSearch] Startpage retrying with headless browser...");
      serpHtml = await fetchHtmlWithBrowser(searchUrl);
      found = parseStartpageResults(serpHtml, numResults);
    } catch (err) {
      console.log(
        "[WebSearch] Startpage browser fetch failed:",
        err?.message || err,
      );
    }
  }

  console.log("[WebSearch] Startpage found:", found.length, "results");
  return found;
}
2831
+
2832
/**
 * Extract destination URL from Google's /url?q=... redirect wrapper.
 *
 * @param {string|null|undefined} href - Raw href attribute from a SERP link.
 * @returns {string|null|undefined} The decoded destination URL, or the input
 *   unchanged when it is not a redirect wrapper or cannot be decoded.
 */
function extractGoogleResultUrl(href) {
  if (!href || !href.includes("url?")) return href;
  const m = href.match(/[?&]q=([^&]+)/);
  if (!m) return href;
  try {
    return decodeURIComponent(m[1]);
  } catch {
    // Malformed percent-encoding (e.g. a truncated "%E0%A") makes
    // decodeURIComponent throw URIError; some callers invoke this outside a
    // try/catch, so fail soft and return the raw wrapper URL instead of
    // aborting the whole SERP parse.
    return href;
  }
}
2841
+
2842
/**
 * Parse Google SERP HTML and return results array (used by fetch and browser fallback).
 *
 * Runs up to five extraction stages, each only when the previous ones have
 * not yet filled `numResults`:
 *   1. classic `div.g` result blocks,
 *   2. `data-ved` attribute blocks,
 *   3. quoted `/url?q=` redirect hrefs,
 *   4. bare `/url?q=` occurrences anywhere in the HTML,
 *   5. `<cite>` elements paired with the nearest preceding href/h3.
 * Stage order matters: earlier stages are higher precision.
 *
 * @param {string} html - Raw SERP HTML.
 * @param {number} numResults - Maximum number of results to return.
 * @returns {Array<{url:string,title:string,snippet:string,source:string}>}
 */
function parseGoogleResults(html, numResults) {
  const results = [];
  const seenUrls = new Set(); // dedupe across all stages
  // Shared validator/appender. Returns true when the entry was accepted.
  // Drops non-http URLs, google.com-internal links, and duplicates; derives
  // `source` from the hostname when not supplied.
  function addResult(resultUrl, title, snippet, source) {
    try {
      resultUrl = decodeURIComponent(resultUrl);
    } catch {}
    if (
      !resultUrl.startsWith("http") ||
      resultUrl.includes("google.com") ||
      seenUrls.has(resultUrl)
    )
      return false;
    let s = source;
    if (!s) {
      try {
        s = new URL(resultUrl).hostname.replace("www.", "");
      } catch {}
    }
    // Fall back to hostname (or the URL itself) when the title is missing.
    const finalTitle =
      title && title.length >= 2 ? title.slice(0, 300) : s || resultUrl;
    seenUrls.add(resultUrl);
    results.push({
      url: resultUrl,
      title: finalTitle,
      snippet: (snippet || `From ${s}`).slice(0, 300),
      source: s,
    });
    return true;
  }
  // Stage 1: split on classic result containers (class containing word "g");
  // each fragment starts just after one container's opening tag.
  const gBlocks = html.split(/<div[^>]*class="[^"]*\bg\b[^"]*"[^>]*>/i);
  for (let i = 1; i < gBlocks.length && results.length < numResults; i++) {
    const block = gBlocks[i].substring(0, 4000); // cap per-block scan cost
    const hrefMatch = block.match(
      /href="(\/url\?q=([^"]+)|(https?:\/\/[^"]+))"/,
    );
    if (!hrefMatch) continue;
    // Direct absolute link, or unwrap Google's /url?q= redirect.
    let resultUrl = hrefMatch[1].startsWith("http")
      ? hrefMatch[1]
      : extractGoogleResultUrl(hrefMatch[1]);
    // LC20lb is Google's (obfuscated but stable-ish) title class.
    const titleMatch =
      block.match(/<h3[^>]*>([\s\S]*?)<\/h3>/i) ||
      block.match(/class="[^"]*LC20lb[^"]*"[^>]*>([^<]+)/i);
    const title = titleMatch
      ? decodeHtmlEntities(titleMatch[1].replace(/<[^>]+>/g, "").trim())
      : "";
    // VwiC3b is the snippet container class; older markup used span.st/.s.
    const snippetMatch =
      block.match(/class="[^"]*VwiC3b[^"]*"[^>]*>([\s\S]*?)<\/div>/i) ||
      block.match(
        /<span[^>]*class="[^"]*\b(?:st|s)\b[^"]*"[^>]*>([\s\S]*?)<\/span>/i,
      );
    let snippet = snippetMatch
      ? snippetMatch[1].replace(/<[^>]+>/g, " ").trim()
      : "";
    snippet = decodeHtmlEntities(snippet).replace(/\s+/g, " ").slice(0, 300);
    addResult(resultUrl, title, snippet, "");
  }
  // Stage 2: data-ved attributes mark individual result nodes in newer markup.
  if (results.length < numResults && /data-ved=/.test(html)) {
    const vedBlocks = html.split(/data-ved="[^"]*"/);
    for (let i = 1; i < vedBlocks.length && results.length < numResults; i++) {
      const block = vedBlocks[i].substring(0, 3000);
      const hrefMatch = block.match(
        /href="(\/url\?q=([^"]+)|(https?:\/\/[^"]+))"/,
      );
      if (!hrefMatch) continue;
      let resultUrl = hrefMatch[1].startsWith("http")
        ? hrefMatch[1]
        : extractGoogleResultUrl(hrefMatch[1]);
      const titleMatch = block.match(/<h3[^>]*>([\s\S]*?)<\/h3>/i);
      const title = titleMatch
        ? decodeHtmlEntities(titleMatch[1].replace(/<[^>]+>/g, "").trim())
        : "";
      addResult(resultUrl, title, "", "");
    }
  }
  // Stage 3: quoted /url?q= redirect hrefs; title taken from trailing context.
  if (results.length < numResults) {
    const urlQRegex = /href="(\/url\?q=)([^"]+)"/gi;
    let urlMatch;
    while (
      (urlMatch = urlQRegex.exec(html)) !== null &&
      results.length < numResults
    ) {
      const qValue = urlMatch[2];
      let resultUrl = "";
      try {
        // &amp; must be unescaped before percent-decoding the q value.
        resultUrl = decodeURIComponent(qValue.replace(/&amp;/g, "&"));
      } catch {
        continue;
      }
      if (
        !resultUrl.startsWith("http") ||
        resultUrl.includes("google.com") ||
        seenUrls.has(resultUrl)
      )
        continue;
      // Look ahead a bounded window for the link's title text.
      const after = html.substring(
        urlMatch.index,
        Math.min(html.length, urlMatch.index + 1200),
      );
      const titleMatch =
        after.match(/<h3[^>]*>([\s\S]*?)<\/h3>/i) ||
        after.match(/<span[^>]*>([^<]{5,200})<\/span>/);
      const title = titleMatch
        ? decodeHtmlEntities(
            titleMatch[1].replace(/<[^>]+>/g, "").trim(),
          ).slice(0, 300)
        : "";
      addResult(resultUrl, title, "", "");
    }
  }
  // Stage 4: bare /url?q= occurrences (e.g. unquoted or JS-embedded).
  if (results.length < numResults) {
    const urlQRegex2 = /\/url\?q=([^"&]+)/gi;
    let urlMatch2;
    while (
      (urlMatch2 = urlQRegex2.exec(html)) !== null &&
      results.length < numResults
    ) {
      let resultUrl = "";
      try {
        resultUrl = decodeURIComponent(urlMatch2[1].replace(/&amp;/g, "&"));
      } catch {
        continue;
      }
      if (
        !resultUrl.startsWith("http") ||
        resultUrl.includes("google.com") ||
        seenUrls.has(resultUrl)
      )
        continue;
      const after = html.substring(
        urlMatch2.index,
        Math.min(html.length, urlMatch2.index + 800),
      );
      // Any nearby text node of plausible title length.
      const textBlock = after.match(/>([^<]{5,200})</);
      const title = textBlock
        ? decodeHtmlEntities(textBlock[1].trim()).slice(0, 300)
        : "";
      addResult(resultUrl, title, "", "");
    }
  }
  // Stage 5: anchor on <cite> elements and scan *backwards* for the last
  // redirect href and <h3> that precede each citation.
  if (results.length < numResults) {
    const citeRegex = /<cite[^>]*>([^<]+)<\/cite>/gi;
    let citeMatch;
    while (
      (citeMatch = citeRegex.exec(html)) !== null &&
      results.length < numResults
    ) {
      const before = html.substring(
        Math.max(0, citeMatch.index - 1200),
        citeMatch.index,
      );
      const hrefAll = [...before.matchAll(/href="(\/url\?q=([^"]+))"/g)];
      const hrefMatch = hrefAll[hrefAll.length - 1]; // closest preceding href
      if (!hrefMatch) continue;
      // NOTE(review): this regex only matches /url?q= forms, so the
      // startsWith("http") branch here can never be taken — harmless.
      let resultUrl = hrefMatch[1].startsWith("http")
        ? hrefMatch[1]
        : extractGoogleResultUrl(hrefMatch[1]);
      try {
        resultUrl = decodeURIComponent(resultUrl);
      } catch {
        continue;
      }
      if (
        !resultUrl.startsWith("http") ||
        resultUrl.includes("google.com") ||
        seenUrls.has(resultUrl)
      )
        continue;
      const h3All = [...before.matchAll(/<h3[^>]*>([\s\S]*?)<\/h3>/gi)];
      const titleMatch = h3All[h3All.length - 1]; // closest preceding title
      const title = titleMatch
        ? decodeHtmlEntities(
            titleMatch[1].replace(/<[^>]+>/g, "").trim(),
          ).slice(0, 300)
        : "";
      addResult(resultUrl, title, "", "");
    }
  }
  return results;
}
3023
+
3024
/**
 * Fetch Google results via SerpAPI (no captcha). Set SERPAPI_API_KEY to use.
 *
 * @param {string} query - Search query text.
 * @param {number} numResults - Maximum number of results to return.
 * @returns {Promise<Array<{url:string,title:string,snippet:string,source:string}>|null>}
 *   Parsed organic results, or null when no API key is configured or the
 *   request/parse fails (callers treat null as "fall back to scraping").
 */
async function fetchSerpApiGoogle(query, numResults) {
  const apiKey = process.env.SERPAPI_API_KEY;
  if (!apiKey) return null;
  // Build the request URL with URL/searchParams so every value — including
  // the API key, which the old string-concat version left unencoded — is
  // safely percent-encoded.
  const endpoint = new URL("https://serpapi.com/search");
  endpoint.searchParams.set("engine", "google");
  endpoint.searchParams.set("q", query);
  endpoint.searchParams.set("num", String(Math.min(numResults + 5, 30)));
  endpoint.searchParams.set("api_key", apiKey);
  try {
    const response = await fetchWithTimeout(endpoint.toString(), {}, 15000);
    if (!response.ok) return null;
    const data = await response.json();
    const organic = data.organic_results || [];
    const results = [];
    for (const r of organic) {
      if (results.length >= numResults) break;
      const link = r.link || r.redirect_link;
      // Skip missing links and Google-internal redirects.
      if (!link || link.includes("google.com")) continue;
      let source = "";
      try {
        source = new URL(link).hostname.replace("www.", "");
      } catch {}
      results.push({
        url: link,
        title: (r.title || "").slice(0, 300),
        snippet: (r.snippet || `From ${source}`).slice(0, 300),
        source,
      });
    }
    if (results.length > 0)
      console.log(
        "[WebSearch] Google (SerpAPI) found:",
        results.length,
        "results",
      );
    return results;
  } catch (e) {
    // Network/timeout/JSON failures: log and signal "use fallback engine".
    console.log("[WebSearch] SerpAPI Google error:", e?.message || e);
    return null;
  }
}
3064
+
3065
/**
 * Fetch Bing results via SerpAPI. Set SERPAPI_API_KEY to use.
 *
 * @param {string} query - Search query text.
 * @param {number} numResults - Maximum number of results to return.
 * @returns {Promise<Array<{url:string,title:string,snippet:string,source:string}>|null>}
 *   Parsed organic results, or null when no API key is configured or the
 *   request/parse fails (callers treat null as "fall back to scraping").
 */
async function fetchSerpApiBing(query, numResults) {
  const apiKey = process.env.SERPAPI_API_KEY;
  if (!apiKey) return null;
  // Build the request URL with URL/searchParams so every value — including
  // the API key, which the old string-concat version left unencoded — is
  // safely percent-encoded.
  const endpoint = new URL("https://serpapi.com/search");
  endpoint.searchParams.set("engine", "bing");
  endpoint.searchParams.set("q", query);
  endpoint.searchParams.set("count", String(Math.min(numResults + 5, 30)));
  endpoint.searchParams.set("api_key", apiKey);
  try {
    const response = await fetchWithTimeout(endpoint.toString(), {}, 15000);
    if (!response.ok) return null;
    const data = await response.json();
    const organic = data.organic_results || [];
    const results = [];
    for (const r of organic) {
      if (results.length >= numResults) break;
      const link = r.link;
      // Skip missing links and Bing/Microsoft-internal destinations.
      if (!link || link.includes("bing.com") || link.includes("microsoft.com"))
        continue;
      let source = "";
      try {
        source = new URL(link).hostname.replace("www.", "");
      } catch {}
      results.push({
        url: link,
        title: (r.title || "").slice(0, 300),
        snippet: (r.snippet || `From ${source}`).slice(0, 300),
        source,
      });
    }
    if (results.length > 0)
      console.log(
        "[WebSearch] Bing (SerpAPI) found:",
        results.length,
        "results",
      );
    return results;
  } catch (e) {
    // Network/timeout/JSON failures: log and signal "use fallback engine".
    console.log("[WebSearch] SerpAPI Bing error:", e?.message || e);
    return null;
  }
}
3106
+
3107
/**
 * Search Google (no API) - scrape SERP HTML. Uses SerpAPI if SERPAPI_API_KEY set; else proxy if WEBSEARCH_PROXY/HTTPS_PROXY set. Retries with headless browser if fetch returns 0 results.
 *
 * @param {string} query - Search query text.
 * @param {number} numResults - Maximum number of results to return.
 * @returns {Promise<Array<{url:string,title:string,snippet:string,source:string}>>}
 */
async function searchGoogle(query, numResults) {
  // Prefer SerpAPI when configured — no captcha risk.
  const serpApiResults = await fetchSerpApiGoogle(query, numResults);
  if (serpApiResults && serpApiResults.length > 0) return serpApiResults;

  // Over-request slightly so post-filtering still yields enough results.
  const num = Math.min(numResults + 5, 30);
  const url = `https://www.google.com/search?q=${encodeURIComponent(query)}&num=${num}&ncr=1`;
  let headers = getGoogleSearchHeaders();
  const fetchOptions = { headers, useWebSearchProxy: true };

  let response = await fetchWithTimeout(url, fetchOptions, 15000);
  let html = await response.text();

  // Google sometimes serves a short "update your browser" interstitial for
  // scripted clients; retry once with full browser-like headers.
  if (
    /Update your browser|isn't supported anymore/i.test(html) &&
    html.length < 15000
  ) {
    headers = getBrowserHeaders("https://www.google.com");
    // Direct navigation has no referring site. (The original code also
    // issued a pointless `delete` of this key immediately before assigning
    // it; plain assignment overwrites any existing value.)
    headers["Sec-Fetch-Site"] = "none";
    response = await fetchWithTimeout(
      url,
      { headers, useWebSearchProxy: true },
      15000,
    );
    html = await response.text();
  }

  // Short pages mentioning captcha/consent are block pages — bail early so
  // we don't waste a headless-browser attempt on the same block.
  const isBlocked =
    /unusual\s+traffic|we've\s+detected|detected\s+unusual|captcha|blocked|before you continue|consent\.google/i.test(
      html,
    ) && html.length < 20000;
  if (isBlocked) {
    console.log(
      "[WebSearch] Google: page appears blocked/captcha (length=",
      html.length,
      ")",
    );
    logZeroResultHtml("Google", html);
    return [];
  }

  let results = parseGoogleResults(html, numResults);

  // Skip browser retry when we already got captcha (browser would get same)
  const isCaptcha =
    /recaptcha|captcha-form|solveSimpleChallenge|data-sitekey/i.test(html);
  if (
    results.length === 0 &&
    process.env.WEBSEARCH_USE_BROWSER !== "0" &&
    !isCaptcha
  ) {
    try {
      console.log("[WebSearch] Google retrying with headless browser...");
      html = await fetchHtmlWithBrowser(url);
      // Browser also hit a captcha — return whatever we have (empty).
      if (/recaptcha|captcha-form|solveSimpleChallenge/i.test(html))
        return results;
      results = parseGoogleResults(html, numResults);
    } catch (e) {
      console.log("[WebSearch] Google browser fetch failed:", e?.message || e);
    }
  }

  if (results.length === 0) {
    // Log marker counts so markup drift on Google's side is diagnosable.
    const gCount = (
      html.match(/<div[^>]*class="[^"]*\bg\b[^"]*"[^>]*>/gi) || []
    ).length;
    const urlQCount = (html.match(/\/url\?q=/gi) || []).length;
    const citeCount = (html.match(/<cite[^>]*>/gi) || []).length;
    const h3Count = (html.match(/<h3[^>]*>/gi) || []).length;
    console.log(
      "[WebSearch] Google: 0 results | htmlLen=",
      html.length,
      "status=",
      response?.status,
      "| gBlocks=",
      gCount,
      "urlQ=",
      urlQCount,
      "cite=",
      citeCount,
      "h3=",
      h3Count,
    );
    logZeroResultHtml("Google", html);
  } else {
    console.log("[WebSearch] Google found:", results.length, "results");
  }
  return results;
}
3199
+
3200
/**
 * Parse Bing SERP HTML and return results array (used by fetch and browser fallback).
 *
 * Four extraction stages, each run only while fewer than `numResults` have
 * been collected:
 *   1. indexed b_algo/b_ans result blocks (bounded window per block),
 *   2. split on b_algo containers (looser re-scan),
 *   3. <h2><a>…</a></h2> + following <p> pattern,
 *   4. any absolute link with a plausible title (last resort).
 *
 * @param {string} html - Raw SERP HTML.
 * @param {number} numResults - Maximum number of results to return.
 * @returns {Array<{url:string,title:string,snippet:string,source:string}>}
 */
function parseBingResults(html, numResults) {
  const results = [];
  const seenUrls = new Set(); // dedupe across all stages
  // Shared validator/appender; returns true when the entry was accepted.
  function addResult(resultUrl, title, snippet) {
    if (
      resultUrl.includes("bing.com") ||
      resultUrl.includes("microsoft.com") ||
      seenUrls.has(resultUrl)
    )
      return false;
    if (!title || title.length < 2) return false;
    seenUrls.add(resultUrl);
    let source = "";
    try {
      source = new URL(resultUrl).hostname.replace("www.", "");
    } catch {}
    results.push({
      url: resultUrl,
      title: title.slice(0, 300),
      snippet: (snippet || `From ${source}`).slice(0, 300),
      source,
    });
    return true;
  }
  // Unwrap Bing's click-tracking links (bing.com/ck/a); keep the original
  // href when resolution fails or still points at Bing/Microsoft.
  function resolveUrl(href) {
    if (href && href.includes("bing.com/ck/a")) {
      const resolved = resolveBingCkUrl(href);
      if (
        resolved &&
        !resolved.includes("bing.com") &&
        !resolved.includes("microsoft.com")
      )
        return resolved;
    }
    return href;
  }
  // Match either double- or single-quoted href (browser-rendered HTML may use either)
  const linkRegexBing =
    /<a[^>]*href=["'](https?:\/\/[^"']+)["'][^>]*>([\s\S]*?)<\/a>/gi;
  // li or div (browser-rendered Bing often uses div for result blocks)
  const algoRegex = /<(?:li|div)[^>]*class="[^"]*b_(?:algo|ans)[^"]*"[^>]*>/gi;
  let algoMatch;
  const blocks = [];
  // Stage 1: collect the start offset of every b_algo/b_ans container.
  while ((algoMatch = algoRegex.exec(html)) !== null)
    blocks.push({ start: algoMatch.index, end: html.length });
  for (let i = 0; i < blocks.length && results.length < numResults; i++) {
    const start = blocks[i].start;
    // Block ends where the next container starts (or at EOF), capped at 3500.
    const end = i + 1 < blocks.length ? blocks[i + 1].start : html.length;
    const block = html.substring(start, Math.min(start + 3500, end));
    // /g regex shared across stages — reset stateful lastIndex before use.
    linkRegexBing.lastIndex = 0;
    const allLinks = [...block.matchAll(linkRegexBing)];
    for (const linkMatch of allLinks) {
      let resultUrl = resolveUrl(linkMatch[1]);
      if (resultUrl.includes("bing.com") || resultUrl.includes("microsoft.com"))
        continue;
      let title = linkMatch[2].replace(/<[^>]+>/g, "").trim();
      title = decodeHtmlEntities(title);
      if (title.length < 2) continue;
      // NOTE(review): the first alternative matches any <p>, so the
      // b_caption alternative is effectively unreachable.
      const snippetMatch =
        block.match(/<p[^>]*>([\s\S]*?)<\/p>/i) ||
        block.match(
          /class="[^"]*b_caption[^"]*"[^>]*>[\s\S]*?<p[^>]*>([\s\S]*?)<\/p>/i,
        );
      let snippet = snippetMatch
        ? snippetMatch[1].replace(/<[^>]+>/g, " ").trim()
        : "";
      snippet = decodeHtmlEntities(snippet).replace(/\s+/g, " ").slice(0, 300);
      // One accepted result per block: stop scanning its links on success.
      if (addResult(resultUrl, title, snippet)) break;
    }
  }
  // Stage 2: looser re-scan splitting only on b_algo containers.
  if (results.length < numResults) {
    const algoBlocks = html.split(
      /<(?:li|div)[^>]*class="[^"]*b_algo[^"]*"[^>]*>/i,
    );
    for (let i = 1; i < algoBlocks.length && results.length < numResults; i++) {
      const block = algoBlocks[i].substring(0, 3500);
      linkRegexBing.lastIndex = 0;
      const allLinks = [...block.matchAll(linkRegexBing)];
      for (const linkMatch of allLinks) {
        let resultUrl = resolveUrl(linkMatch[1]);
        if (
          resultUrl.includes("bing.com") ||
          resultUrl.includes("microsoft.com")
        )
          continue;
        let title = linkMatch[2].replace(/<[^>]+>/g, "").trim();
        title = decodeHtmlEntities(title);
        if (title.length < 2) continue;
        const snippetMatch = block.match(/<p[^>]*>([\s\S]*?)<\/p>/i);
        let snippet = snippetMatch
          ? snippetMatch[1].replace(/<[^>]+>/g, " ").trim()
          : "";
        snippet = decodeHtmlEntities(snippet)
          .replace(/\s+/g, " ")
          .slice(0, 300);
        if (addResult(resultUrl, title, snippet)) break;
      }
    }
  }
  // Stage 3: h2-wrapped link followed (within 800 chars) by a <p> snippet.
  if (results.length < numResults) {
    const h2Pattern =
      /<h2[^>]*>[\s\S]*?<a[^>]*href=["'](https?:\/\/[^"']+)["'][^>]*>([\s\S]*?)<\/a>[\s\S]*?<\/h2>[\s\S]{0,800}?<p[^>]*>([\s\S]*?)<\/p>/gi;
    let m;
    while ((m = h2Pattern.exec(html)) !== null && results.length < numResults) {
      let resultUrl = resolveUrl(m[1]);
      if (resultUrl.includes("bing.com") || resultUrl.includes("microsoft.com"))
        continue;
      const title = decodeHtmlEntities(m[2].replace(/<[^>]+>/g, "").trim());
      let snippet = m[3].replace(/<[^>]+>/g, " ").trim();
      snippet = decodeHtmlEntities(snippet).replace(/\s+/g, " ").slice(0, 300);
      addResult(resultUrl, title, snippet);
    }
  }
  // Stage 4 (last resort): any absolute link whose anchor text looks like a
  // title (length bounds + nav-word blacklist). No snippet available here.
  if (results.length < numResults) {
    linkRegexBing.lastIndex = 0;
    let lm;
    while (
      (lm = linkRegexBing.exec(html)) !== null &&
      results.length < numResults
    ) {
      let url = resolveUrl(lm[1]);
      if (
        url.includes("bing.com") ||
        url.includes("microsoft.com") ||
        seenUrls.has(url)
      )
        continue;
      let title = lm[2].replace(/<[^>]+>/g, "").trim();
      title = decodeHtmlEntities(title);
      if (title.length < 4 || title.length > 250) continue;
      if (/^(http|www|search|images|video|news|maps)/i.test(title)) continue;
      // NOTE(review): addResult takes 3 params; the trailing "" is a stray
      // extra argument and is ignored.
      addResult(url, title, "", "");
    }
  }
  return results;
}
3337
+
3338
/**
 * Search Bing (no API) - scrape SERP HTML. Uses SerpAPI if SERPAPI_API_KEY set; else proxy if WEBSEARCH_PROXY/HTTPS_PROXY set. Retries with headless browser if fetch returns 0 results.
 *
 * @param {string} query - Search query text.
 * @param {number} numResults - Maximum number of results to return.
 * @returns {Promise<Array<{url:string,title:string,snippet:string,source:string}>>}
 */
async function searchBing(query, numResults) {
  // SerpAPI first when a key is configured — no scraping needed.
  const serpApiResults = await fetchSerpApiBing(query, numResults);
  if (serpApiResults && serpApiResults.length > 0) return serpApiResults;

  // Over-request a little so filtering still yields enough results.
  const requestCount = Math.min(numResults + 5, 30);
  const searchUrl = `https://www.bing.com/search?q=${encodeURIComponent(query)}&count=${requestCount}`;

  const response = await fetchWithTimeout(
    searchUrl,
    { headers: getBingSearchHeaders(), useWebSearchProxy: true },
    15000,
  );
  let serpHtml = await response.text();

  // A tiny body is an error/redirect page, never a real SERP.
  if (serpHtml.length < 2000) {
    console.log(
      "[WebSearch] Bing: response too short (length=",
      serpHtml.length,
      ", status=",
      response.status,
      ")",
    );
    logZeroResultHtml("Bing", serpHtml);
    return [];
  }

  let found = parseBingResults(serpHtml, numResults);

  // Zero parsed results: markup was probably JS-rendered — retry via
  // headless browser unless explicitly disabled.
  if (found.length === 0 && process.env.WEBSEARCH_USE_BROWSER !== "0") {
    try {
      console.log("[WebSearch] Bing retrying with headless browser...");
      serpHtml = await fetchHtmlWithBrowser(searchUrl);
      found = parseBingResults(serpHtml, numResults);
    } catch (err) {
      console.log("[WebSearch] Bing browser fetch failed:", err?.message || err);
    }
  }

  if (found.length === 0) {
    // Marker counts help diagnose markup drift on Bing's side.
    const countOf = (re) => (serpHtml.match(re) || []).length;
    console.log(
      "[WebSearch] Bing: 0 results | htmlLen=",
      serpHtml.length,
      "status=",
      response.status,
      "| b_algo=",
      countOf(/b_algo|b_ans/gi),
      "h2=",
      countOf(/<h2[^>]*>/gi),
      "links=",
      countOf(/<a[^>]*href="https?:\/\//gi),
    );
    logZeroResultHtml("Bing", serpHtml);
  } else {
    console.log("[WebSearch] Bing found:", found.length, "results");
  }
  return found;
}
3401
+
3402
/**
 * Parse Yahoo SERP HTML and return results array (used by fetch and browser fallback).
 *
 * Pipeline: (1) collect candidate links by scanning all absolute anchors,
 * unwrapping Yahoo redirect URLs and filtering nav chrome; (2) if none found
 * and result-like classes exist, retry with class-anchored anchors; (3) turn
 * candidates into results, pulling a snippet from the text that follows each
 * link; (4) two looser fallbacks when still short. Stage order matters.
 *
 * @param {string} html - Raw SERP HTML.
 * @param {number} numResults - Maximum number of results to return.
 * @returns {Array<{url:string,title:string,snippet:string,source:string}>}
 */
function parseYahooResults(html, numResults) {
  const results = [];
  const seenUrls = new Set(); // dedupe across all stages
  // Double- or single-quoted absolute hrefs with their anchor body.
  const linkRegex =
    /<a[^>]*href=["'](https?:\/\/[^"']+)["'][^>]*>([\s\S]*?)<\/a>/gi;
  // Search-engine/ad/CDN hosts we never want as a destination.
  const avoidHosts =
    /yahoo\.com|bing\.com|google\.com|doubleclick|yimg\.com|search\.yahoo/i;
  let m;
  const candidates = [];
  // Stage 1: harvest every plausible outbound link as a candidate.
  while ((m = linkRegex.exec(html)) !== null) {
    let resultUrl = m[1];
    if (avoidHosts.test(resultUrl)) {
      // Yahoo wraps destinations in redirect URLs; try to unwrap before
      // discarding the link.
      const resolved = resolveYahooRedirect(resultUrl);
      if (resolved && !avoidHosts.test(resolved)) resultUrl = resolved;
      else continue;
    }
    let title = m[2].replace(/<[^>]+>/g, "").trim();
    title = decodeHtmlEntities(title);
    if (!title) {
      // Fall back to the hostname when the anchor has no text.
      try {
        title = new URL(resultUrl).hostname.replace("www.", "");
      } catch {
        continue;
      }
    }
    if (title.length < 2 || title.length > 400) continue;
    // Drop Yahoo's own navigation chrome.
    if (
      /^(Sign in|Mail|News|Sports|Finance|Weather|Settings|Help)$/i.test(title)
    )
      continue;
    candidates.push({ url: resultUrl, title, index: m.index });
  }
  // Stage 2: nothing harvested but the page shows result-like markers —
  // retry with anchors that carry result classes (title/dd).
  if (
    candidates.length === 0 &&
    /#web|\.dd\s|class="[^"]*title[^"]*"/.test(html)
  ) {
    const altHref =
      /<a[^>]*class="[^"]*(?:title|dd)[^"]*"[^>]*href=["'](https?:\/\/[^"']+)["'][^>]*>([\s\S]*?)<\/a>/gi;
    altHref.lastIndex = 0;
    while (
      (m = altHref.exec(html)) !== null &&
      candidates.length < numResults * 2
    ) {
      let resultUrl = m[1];
      if (avoidHosts.test(resultUrl)) {
        const resolved = resolveYahooRedirect(resultUrl);
        if (resolved && !avoidHosts.test(resolved)) resultUrl = resolved;
        else continue;
      }
      let title = m[2].replace(/<[^>]+>/g, "").trim();
      title = decodeHtmlEntities(title);
      if (!title)
        try {
          title = new URL(resultUrl).hostname.replace("www.", "");
        } catch {
          continue;
        }
      if (title.length < 2 || title.length > 400) continue;
      candidates.push({ url: resultUrl, title, index: m.index });
    }
  }
  // Stage 3: materialize candidates into results, scraping a snippet from a
  // bounded window of HTML following each link.
  for (const c of candidates) {
    if (seenUrls.has(c.url)) continue;
    seenUrls.add(c.url);
    const context = html.substring(
      c.index,
      Math.min(html.length, c.index + 600),
    );
    const snippetMatch =
      context.match(/<p[^>]*>([\s\S]*?)<\/p>/i) ||
      context.match(/class="[^"]*desc[^"]*"[^>]*>([\s\S]*?)<\//);
    let snippet = snippetMatch
      ? snippetMatch[1].replace(/<[^>]+>/g, " ").trim()
      : "";
    snippet = decodeHtmlEntities(snippet).replace(/\s+/g, " ").slice(0, 300);
    let source = "";
    try {
      source = new URL(c.url).hostname.replace("www.", "");
    } catch {}
    results.push({
      url: c.url,
      title: c.title.slice(0, 300),
      snippet: snippet || `From ${source}`,
      source,
    });
    if (results.length >= numResults) break;
  }
  // Stage 4: only when both harvest stages produced nothing — accept any
  // outbound link with minimal title filtering (looser bounds than stage 1).
  if (results.length < numResults && candidates.length === 0) {
    const linkRegex2 =
      /<a[^>]*href=["'](https?:\/\/[^"']+)["'][^>]*>([\s\S]*?)<\/a>/gi;
    let m2;
    while (
      (m2 = linkRegex2.exec(html)) !== null &&
      results.length < numResults
    ) {
      let resultUrl = m2[1];
      if (avoidHosts.test(resultUrl)) {
        const resolved = resolveYahooRedirect(resultUrl);
        if (resolved && !avoidHosts.test(resolved)) resultUrl = resolved;
        else continue;
      }
      if (seenUrls.has(resultUrl)) continue;
      let title = m2[2].replace(/<[^>]+>/g, "").trim();
      title = decodeHtmlEntities(title);
      if (!title) {
        try {
          title = new URL(resultUrl).hostname.replace("www.", "");
        } catch {
          continue;
        }
      }
      if (title.length < 1 || title.length > 500) continue;
      if (
        /^(Sign in|Mail|News|Sports|Finance|Weather|Settings|Help)$/i.test(
          title,
        )
      )
        continue;
      seenUrls.add(resultUrl);
      let source = "";
      try {
        source = new URL(resultUrl).hostname.replace("www.", "");
      } catch {}
      results.push({
        url: resultUrl,
        title: title.slice(0, 300),
        snippet: `From ${source}`,
        source,
      });
    }
  }
  // Stage 5: split on result-container classes (algo/srch/dd) and take the
  // first acceptable link in each fragment. No snippet available here.
  if (results.length < numResults) {
    const blocks = html.split(/class="[^"]*(?:algo|srch|dd)[^"]*"[^>]*>/i);
    for (let i = 1; i < blocks.length && results.length < numResults; i++) {
      const block = blocks[i].substring(0, 1500);
      const linkMatch = block.match(
        /<a[^>]*href=["'](https?:\/\/[^"']+)["'][^>]*>([\s\S]*?)<\/a>/i,
      );
      if (!linkMatch) continue;
      let resultUrl = linkMatch[1];
      if (avoidHosts.test(resultUrl)) {
        const resolved = resolveYahooRedirect(resultUrl);
        if (resolved && !avoidHosts.test(resolved)) resultUrl = resolved;
        else continue;
      }
      let title = linkMatch[2].replace(/<[^>]+>/g, "").trim();
      title = decodeHtmlEntities(title);
      if (!title) {
        try {
          title = new URL(resultUrl).hostname.replace("www.", "");
        } catch {
          continue;
        }
      }
      if (title.length < 2) continue;
      if (seenUrls.has(resultUrl)) continue;
      seenUrls.add(resultUrl);
      let source = "";
      try {
        source = new URL(resultUrl).hostname.replace("www.", "");
      } catch {}
      results.push({
        url: resultUrl,
        title: title.slice(0, 300),
        snippet: `From ${source}`,
        source,
      });
    }
  }
  return results;
}
3574
+
3575
/**
 * Search Yahoo (no API) - scrape SERP HTML. Retries with headless browser if fetch returns 0 results.
 * @param {string} query - Search query
 * @param {number} numResults - Maximum number of results to return
 * @returns {Promise<Array<{url: string, title: string, snippet: string, source: string}>>}
 */
async function searchYahoo(query, numResults) {
  const serpUrl = `https://search.yahoo.com/search?p=${encodeURIComponent(query)}`;
  const requestHeaders = getBrowserHeaders("https://search.yahoo.com");
  requestHeaders["Sec-Fetch-Site"] = "none";

  const response = await fetchWithTimeout(
    serpUrl,
    { headers: requestHeaders, useWebSearchProxy: true },
    12000,
  );
  let html = await response.text();
  // Responses this small are error/blocked pages, not a real SERP.
  if (html.length < 2000) return [];

  let results = parseYahooResults(html, numResults);

  const browserAllowed = process.env.WEBSEARCH_USE_BROWSER !== "0";
  if (results.length === 0 && browserAllowed) {
    try {
      console.log("[WebSearch] Yahoo retrying with headless browser...");
      html = await fetchHtmlWithBrowser(serpUrl);
      results = parseYahooResults(html, numResults);
    } catch (e) {
      console.log("[WebSearch] Yahoo browser fetch failed:", e?.message || e);
    }
  }

  if (results.length === 0) {
    // Diagnostics: count result containers ("algo" class) and outbound links.
    const algoCount = (html.match(/class="[^"]*algo[^"]*"/gi) || []).length;
    const linkCount = (html.match(/<a[^>]*href="https?:\/\//gi) || []).length;
    console.log(
      "[WebSearch] Yahoo: 0 results | htmlLen=",
      html.length,
      "| algo=",
      algoCount,
      "links=",
      linkCount,
    );
    logZeroResultHtml("Yahoo", html);
  }
  console.log("[WebSearch] Yahoo found:", results.length, "results");
  return results;
}
3619
+
3620
/**
 * Parse Ecosia SERP HTML and return results array (used by fetch and browser fallback).
 * @param {string} html - Raw SERP HTML
 * @param {number} numResults - Maximum number of results to return
 * @returns {Array<{url: string, title: string, snippet: string, source: string}>}
 */
function parseEcosiaResults(html, numResults) {
  const results = [];
  const seenUrls = new Set(); // dedupe URLs across both parsing passes
  // Never surface the search engines themselves as results.
  const avoidEcosia = /ecosia\.org|duckduckgo\.com/i;
  // Pass 1: split on likely result-container class names and scan each chunk.
  const resultBlocks = html.split(
    /class="[^"]*(?:result|card|organic|abstract)[^"]*"[^>]*>/i,
  );
  for (let i = 1; i < resultBlocks.length && results.length < numResults; i++) {
    // Only inspect the head of each chunk; link and snippet appear early.
    const block = resultBlocks[i].substring(0, 2500);
    const linkMatch = block.match(
      /<a[^>]*href="(https?:\/\/[^"]+)"[^>]*>([\s\S]*?)<\/a>/i,
    );
    if (!linkMatch) continue;
    const resultUrl = linkMatch[1];
    if (avoidEcosia.test(resultUrl) || seenUrls.has(resultUrl)) continue;
    // Candidate title = anchor text with tags stripped.
    let title = linkMatch[2].replace(/<[^>]+>/g, "").trim();
    title = decodeHtmlEntities(title);
    if (!title || title.length < 3) continue;
    // Prefer an explicit snippet/description element; fall back to first <p>.
    const snippetMatch =
      block.match(
        /class="[^"]*(?:snippet|description|abstract)[^"]*"[^>]*>([\s\S]*?)<\//i,
      ) || block.match(/<p[^>]*>([\s\S]*?)<\/p>/i);
    let snippet = snippetMatch
      ? snippetMatch[1].replace(/<[^>]+>/g, " ").trim()
      : "";
    snippet = decodeHtmlEntities(snippet).replace(/\s+/g, " ").slice(0, 300);
    seenUrls.add(resultUrl);
    let source = "";
    try {
      source = new URL(resultUrl).hostname.replace("www.", "");
    } catch {} // unparsable URL -> leave source blank
    results.push({
      url: resultUrl,
      title: title.slice(0, 300),
      snippet: snippet || `From ${source}`,
      source,
    });
  }
  // Pass 2: if still short, scan every external link on the page and keep
  // plausible titles; the prefix regex filters out nav/branding anchors.
  if (results.length < numResults) {
    const linkRegex = /<a[^>]*href="(https?:\/\/[^"]+)"[^>]*>([\s\S]*?)<\/a>/gi;
    let lm;
    // NOTE: linkRegex is /g — each exec() call advances lastIndex, so
    // `continue` here moves on to the next link, not an infinite loop.
    while (
      (lm = linkRegex.exec(html)) !== null &&
      results.length < numResults
    ) {
      const resultUrl = lm[1];
      if (avoidEcosia.test(resultUrl) || seenUrls.has(resultUrl)) continue;
      let title = lm[2].replace(/<[^>]+>/g, "").trim();
      title = decodeHtmlEntities(title);
      if (title.length < 3 || title.length > 250) continue;
      if (/^(Ecosia|Plant|Privacy|Settings|Donate|Sign)/i.test(title)) continue;
      seenUrls.add(resultUrl);
      let source = "";
      try {
        source = new URL(resultUrl).hostname.replace("www.", "");
      } catch {}
      results.push({
        url: resultUrl,
        title: title.slice(0, 300),
        snippet: `From ${source}`,
        source,
      });
    }
  }
  return results;
}
3687
+
3688
/**
 * Search Ecosia (no API) - scrape SERP HTML. Retries with headless browser if fetch returns 0 (e.g. Cloudflare).
 * @param {string} query - Search query
 * @param {number} numResults - Maximum number of results to return
 * @returns {Promise<Array<{url: string, title: string, snippet: string, source: string}>>}
 */
async function searchEcosia(query, numResults) {
  const url = `https://www.ecosia.org/search?q=${encodeURIComponent(query)}`;
  const headers = getBrowserHeaders("https://www.ecosia.org");
  headers["Sec-Fetch-Site"] = "none";

  // Single source of truth for Cloudflare-challenge detection (was duplicated,
  // including a redundant "cF_chl_opt" alternative already covered by /i).
  const isCloudflareChallenge = (h) =>
    /Just a moment|Enable JavaScript and cookies|cf_chl_opt/i.test(h);

  const response = await fetchWithTimeout(
    url,
    { headers, useWebSearchProxy: true },
    12000,
  );
  let html = await response.text();
  // Responses this small are error/blocked pages, not a real SERP.
  if (html.length < 2000) return [];

  let results = parseEcosiaResults(html, numResults);

  // Skip browser retry when Cloudflare challenge is present (page never settles, would timeout)
  if (
    results.length === 0 &&
    process.env.WEBSEARCH_USE_BROWSER !== "0" &&
    !isCloudflareChallenge(html)
  ) {
    try {
      console.log("[WebSearch] Ecosia retrying with headless browser...");
      html = await fetchHtmlWithBrowser(url);
      results = parseEcosiaResults(html, numResults);
    } catch (e) {
      console.log("[WebSearch] Ecosia browser fetch failed:", e?.message || e);
    }
  }

  if (results.length === 0) {
    // Re-check deliberately: html may have been replaced by the browser fetch.
    if (isCloudflareChallenge(html))
      console.log(
        "[WebSearch] Ecosia: Cloudflare challenge (page requires JavaScript; try VPN or browser)",
      );
    const resultBlockCount = (
      html.match(/class="[^"]*(?:result|card|organic|abstract)[^"]*"/gi) || []
    ).length;
    const linkCount = (html.match(/<a[^>]*href="https?:\/\//gi) || []).length;
    console.log(
      "[WebSearch] Ecosia: 0 results | htmlLen=",
      html.length,
      "| resultBlocks=",
      resultBlockCount,
      "links=",
      linkCount,
    );
    logZeroResultHtml("Ecosia", html);
  }
  console.log("[WebSearch] Ecosia found:", results.length, "results");
  return results;
}
3751
+
3752
/**
 * Fetch and extract text content from a URL.
 * Enhanced to better handle structured data like tables and lists.
 * Output format: optional "Title:" header, then extracted tables (max 5),
 * then extracted lists, then the page text (truncated at 12000 chars).
 * @param {string} url - URL to fetch
 * @param {number} [timeoutMs=15000] - Fetch timeout in milliseconds
 * @returns {Promise<string>} - Human-readable extracted content
 */
export async function fetchUrlContent(url, timeoutMs = 15000) {
  console.log("[FetchURL] Fetching:", url);

  // Browser-like headers reduce the chance of being served a bot page.
  const response = await fetchWithTimeout(
    url,
    {
      headers: {
        "User-Agent":
          "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
        Accept:
          "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Cache-Control": "no-cache",
      },
    },
    timeoutMs,
  );

  const html = await response.text();

  // Extract title
  const titleMatch = html.match(/<title[^>]*>([^<]*)<\/title>/i);
  const title = titleMatch ? decodeHtmlEntities(titleMatch[1].trim()) : "";

  // Extract tables and convert to readable (markdown-ish "a | b") format.
  // Capped at 5 tables, 50 data rows per table.
  let tables = [];
  const tablePattern = /<table[^>]*>([\s\S]*?)<\/table>/gi;
  let tableMatch;
  while ((tableMatch = tablePattern.exec(html)) !== null && tables.length < 5) {
    const tableHtml = tableMatch[1];
    const rows = [];

    // Extract headers (all <th> cells in the table, regardless of row)
    const headerPattern = /<th[^>]*>([\s\S]*?)<\/th>/gi;
    const headers = [];
    let headerMatch;
    while ((headerMatch = headerPattern.exec(tableHtml)) !== null) {
      headers.push(
        decodeHtmlEntities(headerMatch[1].replace(/<[^>]+>/g, "").trim()),
      );
    }
    if (headers.length > 0) {
      rows.push(headers.join(" | "));
      rows.push(headers.map(() => "---").join(" | "));
    }

    // Extract rows; header-only rows yield no <td> cells and are skipped below
    const rowPattern = /<tr[^>]*>([\s\S]*?)<\/tr>/gi;
    let rowMatch;
    let rowCount = 0;
    while ((rowMatch = rowPattern.exec(tableHtml)) !== null && rowCount < 50) {
      const cellPattern = /<td[^>]*>([\s\S]*?)<\/td>/gi;
      const cells = [];
      let cellMatch;
      while ((cellMatch = cellPattern.exec(rowMatch[1])) !== null) {
        cells.push(
          decodeHtmlEntities(cellMatch[1].replace(/<[^>]+>/g, "").trim()),
        );
      }
      if (cells.length > 0) {
        rows.push(cells.join(" | "));
        rowCount++;
      }
    }

    // Require more than one line so a bare header row is not emitted alone
    if (rows.length > 1) {
      tables.push(rows.join("\n"));
    }
  }

  // Extract lists (ul/ol) and convert to readable format
  // (max 10 lists, 30 items each; item text must be 6-499 chars).
  let lists = [];
  const listPattern = /<(?:ul|ol)[^>]*>([\s\S]*?)<\/(?:ul|ol)>/gi;
  let listMatch;
  while ((listMatch = listPattern.exec(html)) !== null && lists.length < 10) {
    const listHtml = listMatch[1];
    const items = [];
    const itemPattern = /<li[^>]*>([\s\S]*?)<\/li>/gi;
    let itemMatch;
    while (
      (itemMatch = itemPattern.exec(listHtml)) !== null &&
      items.length < 30
    ) {
      const itemText = decodeHtmlEntities(
        itemMatch[1]
          .replace(/<[^>]+>/g, " ")
          .replace(/\s+/g, " ")
          .trim(),
      );
      if (itemText.length > 5 && itemText.length < 500) {
        items.push(`• ${itemText}`);
      }
    }
    // Keep only lists with at least 3 items (filters tiny nav lists)
    if (items.length > 2) {
      lists.push(items.join("\n"));
    }
  }

  // Convert main HTML to text (removing scripts, styles, nav, footer).
  // NOTE: replacement order matters — tags are stripped before entities are
  // decoded, and only a handful of common entities are handled here.
  let text = html
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, "")
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, "")
    .replace(/<nav[^>]*>[\s\S]*?<\/nav>/gi, "")
    .replace(/<footer[^>]*>[\s\S]*?<\/footer>/gi, "")
    .replace(/<header[^>]*>[\s\S]*?<\/header>/gi, "")
    .replace(/<aside[^>]*>[\s\S]*?<\/aside>/gi, "")
    .replace(/<!--[\s\S]*?-->/g, "")
    // Preserve some structure
    .replace(/<\/h[1-6]>/gi, "\n\n")
    .replace(/<\/p>/gi, "\n")
    .replace(/<br\s*\/?>/gi, "\n")
    .replace(/<\/div>/gi, "\n")
    .replace(/<\/li>/gi, "\n")
    .replace(/<[^>]+>/g, " ")
    .replace(/&nbsp;/g, " ")
    .replace(/&amp;/g, "&")
    .replace(/&lt;/g, "<")
    .replace(/&gt;/g, ">")
    .replace(/&quot;/g, '"')
    .replace(/&#39;/g, "'")
    .replace(/\n\s*\n\s*\n/g, "\n\n")
    .replace(/[ \t]+/g, " ")
    .trim();

  // Limit main text but keep more for data-rich pages
  const maxTextLength = 12000;
  if (text.length > maxTextLength) {
    text = text.slice(0, maxTextLength) + "\n\n... (content truncated)";
  }

  // Build the output
  let output = title ? `Title: ${title}\n\n` : "";

  // Include structured data if found
  if (tables.length > 0) {
    output += `=== TABLES FOUND (${tables.length}) ===\n\n`;
    tables.forEach((table, i) => {
      output += `Table ${i + 1}:\n${table}\n\n`;
    });
  }

  // Only emit lists that have more than 3 lines; cap at 5 lists
  if (lists.length > 0 && lists.some((l) => l.split("\n").length > 3)) {
    output += `=== LISTS FOUND ===\n\n`;
    lists
      .filter((l) => l.split("\n").length > 3)
      .slice(0, 5)
      .forEach((list, i) => {
        output += `List ${i + 1}:\n${list}\n\n`;
      });
  }

  output += `=== PAGE CONTENT ===\n\n${text}`;

  console.log(
    "[FetchURL] Extracted",
    output.length,
    "chars,",
    tables.length,
    "tables,",
    lists.length,
    "lists",
  );
  return output;
}
3920
+
3921
/**
 * Deep research - search and fetch multiple pages.
 * @param {string} query - Search query
 * @param {number} [numPages=5] - Maximum number of pages to include
 * @returns {Promise<{success: boolean, query: string, pages: Array<{title: string, url: string, content: string, excerpt: string}>, error?: string}>}
 */
export async function deepWebResearch(query, numPages = 5) {
  const results = { success: false, query, pages: [] };

  try {
    // Over-fetch a couple of results so failed page fetches can be absorbed.
    const searchResults = await performWebSearch(query, numPages + 2);
    if (searchResults.length === 0) {
      results.error = "No search results found";
      return results;
    }

    for (const result of searchResults) {
      if (results.pages.length >= numPages) break;
      try {
        const content = await fetchUrlContent(result.url, 10000);
        // Skip pages whose extracted text is too short to be useful.
        if (content && content.length > 100) {
          results.pages.push({
            title: result.title,
            url: result.url,
            content: content.slice(0, 5000),
            excerpt: content.slice(0, 500),
          });
        }
      } catch (err) {
        // Include result even if fetch fails — fall back to the search snippet
        results.pages.push({
          title: result.title,
          url: result.url,
          content: result.snippet || "",
          excerpt: result.snippet || "",
        });
      }
    }

    results.success = results.pages.length > 0;
    return results;
  } catch (err) {
    console.error("[DeepResearch] Error:", err);
    // Fix: surface the failure reason to callers (previously only logged,
    // leaving callers with { success: false } and no explanation).
    results.error = err?.message || String(err);
    return results;
  }
}
3964
+
3965
+ // ============================================
3966
+ // Helper Functions
3967
+ // ============================================
3968
+
3969
/**
 * Escape special regex characters in a string.
 * @param {string} string - String to escape
 * @returns {string} - Escaped string safe for use in RegExp
 */
function escapeRegExp(string) {
  // Characters that carry special meaning inside a regular expression.
  const SPECIAL = new Set([
    ".", "*", "+", "?", "^", "$", "{", "}", "(", ")", "|", "[", "]", "\\",
  ]);
  let escaped = "";
  for (const ch of string) {
    escaped += SPECIAL.has(ch) ? `\\${ch}` : ch;
  }
  return escaped;
}
3977
+
3978
/**
 * Generate a simple unified diff-like output for display.
 * Shows at most 20 lines from each side, adding an elision note beyond that.
 * @param {string} oldStr - Original text
 * @param {string} newStr - New text
 * @param {string} filePath - Path to the file being edited
 * @returns {string} - Diff-like output
 */
function generateSimpleDiff(oldStr, newStr, filePath) {
  const LINE_CAP = 20; // limit shown lines for readability
  const removed = oldStr.split("\n");
  const added = newStr.split("\n");

  const out = [`--- a/${filePath}`, `+++ b/${filePath}`];
  out.push(`@@ -1,${removed.length} +1,${added.length} @@`);

  // Removed lines (old content), capped
  for (const line of removed.slice(0, LINE_CAP)) {
    out.push(`-${line}`);
  }
  if (removed.length > LINE_CAP) {
    out.push(`... (${removed.length - LINE_CAP} more lines removed)`);
  }

  // Added lines (new content), capped
  for (const line of added.slice(0, LINE_CAP)) {
    out.push(`+${line}`);
  }
  if (added.length > LINE_CAP) {
    out.push(`... (${added.length - LINE_CAP} more lines added)`);
  }

  return out.join("\n") + "\n";
}
4015
+
4016
/**
 * Resolve path with home directory expansion and normalization.
 * Only "~" and "~/..." are expanded to the current user's home directory;
 * "~otheruser/..." style paths are NOT the current user's home and are
 * resolved literally (the previous startsWith("~") check wrongly joined
 * them onto homedir()).
 * @param {string} path - Path to resolve
 * @returns {string} - Normalized absolute path
 * @throws {Error} - If path is missing or not a string
 */
function resolvePath(path) {
  if (!path || typeof path !== "string") {
    throw new Error("Invalid path");
  }

  let resolved;
  if (path === "~" || path.startsWith("~/")) {
    resolved = join(homedir(), path.slice(1));
  } else {
    resolved = resolve(path);
  }

  // Normalize to remove . and .. components
  return normalize(resolved);
}
4036
+
4037
/**
 * Check if a path contains path traversal attempts.
 * @param {string} inputPath - Original user-provided path
 * @param {string} resolvedPath - Resolved absolute path
 * @returns {boolean} - True if path traversal detected
 */
function hasPathTraversal(inputPath, resolvedPath) {
  // NOTE(review): resolvedPath is currently unused — detection relies solely
  // on the raw input; confirm whether it should participate in the check.
  if (!inputPath.includes("..")) return false;

  const baseDir = process.cwd();
  const inputResolved = resolve(baseDir, inputPath);

  // Flag relative paths whose ".." segments escape the working directory;
  // absolute and "~" inputs are exempted here.
  const escapesBase = !inputResolved.startsWith(baseDir);
  const isRelative = !inputPath.startsWith("/") && !inputPath.startsWith("~");
  return escapesBase && isRelative;
}
4062
+
4063
/**
 * Check if a path is in the blocked list.
 * Also resolves symlinks so a link pointing into a blocked location is caught.
 * @param {string} path - Resolved absolute path
 * @returns {boolean} - True if path is blocked
 */
function isBlockedPath(path) {
  // Case-insensitive substring match against every blocked entry.
  const matchesBlocked = (candidate) =>
    BLOCKED_PATHS.some((blocked) => candidate.includes(blocked.toLowerCase()));

  if (matchesBlocked(normalize(path).toLowerCase())) {
    return true;
  }

  // A symlink may point into a blocked location even when its own path
  // looks harmless — check the real target too.
  try {
    if (existsSync(path) && lstatSync(path).isSymbolicLink()) {
      return matchesBlocked(realpathSync(path).toLowerCase());
    }
  } catch (err) {
    // If we can't stat, let the actual operation fail
  }

  return false;
}
4096
+
4097
/**
 * Check if path is allowed for reading.
 * @param {string} path - Resolved absolute path
 * @param {object} config - Configuration with permissions
 * @returns {boolean} - True when access is permitted
 * @throws {Error} - If path is blocked, reads are disabled, or path is outside the allow-list
 */
function checkReadPermission(path, config) {
  // Hard block list always wins.
  if (isBlockedPath(path)) {
    throw new Error(`Access denied: Cannot read sensitive path`);
  }

  // Global switch from configuration.
  if (config?.permissions?.fileRead === false) {
    throw new Error("File read access is disabled in configuration");
  }

  // Optional allow-list: path must live under one of the configured roots.
  const allowList = config?.permissions?.allowedReadPaths;
  if (allowList) {
    const permitted = allowList.some((entry) =>
      path.startsWith(resolvePath(entry)),
    );
    if (!permitted) {
      throw new Error(`Access denied: Path not in allowed read paths`);
    }
  }

  return true;
}
4128
+
4129
/**
 * Check if path is allowed for writing.
 * @param {string} path - Resolved absolute path
 * @param {object} config - Configuration with permissions
 * @returns {boolean} - True when access is permitted
 * @throws {Error} - If path is blocked, writes are disabled, or path is outside the allow-list
 */
function checkWritePermission(path, config) {
  // Hard block list always wins.
  if (isBlockedPath(path)) {
    throw new Error(`Access denied: Cannot write to sensitive path`);
  }

  // Global switch from configuration.
  if (config?.permissions?.fileWrite === false) {
    throw new Error("File write access is disabled in configuration");
  }

  // Optional allow-list: path must live under one of the configured roots.
  const allowList = config?.permissions?.allowedWritePaths;
  if (allowList) {
    const permitted = allowList.some((entry) =>
      path.startsWith(resolvePath(entry)),
    );
    if (!permitted) {
      throw new Error(`Access denied: Path not in allowed write paths`);
    }
  }

  return true;
}
4160
+
4161
/**
 * Validate and sanitize a shell command.
 * @param {string} command - Command to validate
 * @returns {{ valid: boolean, reason?: string }}
 */
function validateCommand(command) {
  if (typeof command !== "string" || !command) {
    return { valid: false, reason: "Invalid command" };
  }

  // Reject anything matching a known-dangerous pattern.
  const dangerous = DANGEROUS_COMMAND_PATTERNS.some((pattern) =>
    pattern.test(command),
  );
  if (dangerous) {
    return { valid: false, reason: "Command matches dangerous pattern" };
  }

  // Null bytes are a classic command-injection vector.
  if (command.includes("\0")) {
    return { valid: false, reason: "Command contains null bytes" };
  }

  // Shell substitution has legitimate uses, so warn rather than reject.
  if (command.includes("$(") || command.includes("`")) {
    console.warn(
      "[validateCommand] Command contains shell substitution - use with caution",
    );
  }

  return { valid: true };
}
4193
+
4194
/**
 * Parse a single reversible shell command for undo on regeneration (Strategy 1).
 * Returns { op, path?, path_src?, path_dest?, cwd } or null if not reversible.
 * Paths are resolved against currentWorkingDirectory and must stay within it.
 * @param {string} command - Shell command line to inspect
 * @returns {{op: string, path?: string, path_src?: string, path_dest?: string, cwd: string}|null}
 */
function parseReversibleCommand(command) {
  if (!command || typeof command !== "string") return null;
  const trimmed = command.trim();
  // Piped or chained commands are never treated as reversible.
  if (trimmed.includes("|") || trimmed.includes(";")) return null;

  const cwd = currentWorkingDirectory;
  // Resolve an argument against cwd and reject anything escaping it.
  // The containment check is segment-aware: a bare startsWith(cwd) would
  // wrongly accept sibling dirs like "/work-other" when cwd is "/work".
  const resolveSafe = (arg) => {
    const resolved = resolve(cwd, arg);
    if (
      resolved !== cwd &&
      !resolved.startsWith(cwd + "/") &&
      !resolved.startsWith(cwd + "\\")
    ) {
      return null;
    }
    return resolved;
  };

  // mkdir [ -p ] path
  const mkdirMatch = trimmed.match(/^mkdir\s+(-p\s+)?(.+)$/);
  if (mkdirMatch) {
    const path = resolveSafe(mkdirMatch[2].trim());
    if (path == null) return null;
    return { op: "mkdir", path, cwd };
  }

  // touch path
  const touchMatch = trimmed.match(/^touch\s+(.+)$/);
  if (touchMatch) {
    const path = resolveSafe(touchMatch[1].trim());
    if (path == null) return null;
    return { op: "touch", path, cwd };
  }

  // cp [ -r ] src dest — undo only needs to know the destination
  const cpMatch = trimmed.match(/^cp\s+(-[rR]\s+)?(\S+)\s+(\S+)$/);
  if (cpMatch) {
    const pathDest = resolveSafe(cpMatch[3]);
    if (pathDest == null) return null;
    return { op: "cp", path_dest: pathDest, cwd };
  }

  // mv src dest
  const mvMatch = trimmed.match(/^mv\s+(\S+)\s+(\S+)$/);
  if (mvMatch) {
    const pathSrc = resolveSafe(mvMatch[1]);
    const pathDest = resolveSafe(mvMatch[2]);
    if (pathSrc == null || pathDest == null) return null;
    return { op: "mv", path_src: pathSrc, path_dest: pathDest, cwd };
  }

  return null;
}
4246
+
4247
/**
 * Get timeout for a specific tool.
 * @param {string} toolName - Name of the tool
 * @returns {number} - Timeout in milliseconds
 */
function getToolTimeout(toolName) {
  // "??" rather than "||" so an explicitly configured timeout of 0 is
  // honored instead of silently falling back to the default.
  return TOOL_TIMEOUTS[toolName] ?? DEFAULT_TOOL_TIMEOUT;
}
4255
+
4256
/**
 * Tools that support snapshot callbacks for undo on regeneration.
 * executeTool only forwards a snapshotFn to tool names listed here.
 */
const SNAPSHOT_TOOLS = ["write_file", "edit_file"];
4260
+
4261
/**
 * Execute a tool by name with timeout and error handling.
 * @param {string} name - Tool name
 * @param {object} args - Tool arguments
 * @param {object} config - Configuration
 * @param {function} snapshotFn - Optional callback for capturing file snapshots (for undo on regeneration)
 * @param {function} shellUndoFn - Optional callback for recording reversible shell commands (Strategy 1)
 * @param {string|null} toolCallId - Optional ID for this tool call (for snapshot/shell undo traceability)
 * @returns {Promise<string>} - Tool result or error message (never throws)
 */
export async function executeTool(
  name,
  args,
  config,
  snapshotFn = null,
  shellUndoFn = null,
  toolCallId = null,
) {
  const tool = TOOLS[name];
  if (!tool) {
    return `Error: Unknown tool "${name}"`;
  }

  // Validate required parameters before invoking the tool.
  for (const [param, spec] of Object.entries(tool.parameters || {})) {
    if (spec.required && (args[param] === undefined || args[param] === null)) {
      return `Error: Missing required parameter "${param}" for tool "${name}"`;
    }
  }

  const timeout = getToolTimeout(name);

  // Only pass snapshotFn to tools that support it; wrap to pass toolCallId (Strategy 3)
  const shouldPassSnapshot = SNAPSHOT_TOOLS.includes(name) && snapshotFn;
  const wrappedSnapshotFn =
    shouldPassSnapshot && toolCallId
      ? (snapshot) => snapshotFn(snapshot, toolCallId)
      : snapshotFn;
  // execute_command gets shellUndoFn for reversible-command undo (Strategy 3: pass toolCallId)
  const shouldPassShellUndo = name === "execute_command" && shellUndoFn;
  const wrappedShellUndoFn =
    shouldPassShellUndo && toolCallId
      ? (entry) => shellUndoFn(entry, toolCallId)
      : shellUndoFn;

  const executeArgs = () => {
    if (shouldPassSnapshot)
      return tool.execute(args, config, wrappedSnapshotFn);
    if (shouldPassShellUndo)
      return tool.execute(args, config, wrappedShellUndoFn);
    return tool.execute(args, config);
  };

  // Fix: keep a handle on the timeout timer so it can be cleared — the
  // previous Promise.race left the timer pending after fast completions,
  // keeping the event loop alive for up to the full timeout.
  let timer;
  try {
    const result = await Promise.race([
      executeArgs(),
      new Promise((_, reject) => {
        timer = setTimeout(
          () =>
            reject(
              new Error(`Tool "${name}" timed out after ${timeout / 1000}s`),
            ),
          timeout,
        );
      }),
    ]);

    return result;
  } catch (err) {
    // Fix: tolerate non-Error throwables (err.message may be undefined).
    const message = err?.message ?? String(err);

    // Handle permission errors specifically
    if (message.includes("Access denied")) {
      console.warn(`[executeTool] Permission denied for ${name}:`, message);
      return `Error: ${message}`;
    }

    // Handle timeout
    if (message.includes("timed out")) {
      console.warn(`[executeTool] Timeout for ${name}:`, message);
      return `Error: ${message}`;
    }

    // Generic error
    console.error(`[executeTool] Error executing ${name}:`, err);
    return `Error executing ${name}: ${message}`;
  } finally {
    clearTimeout(timer);
  }
}
4348
+
4349
/**
 * Get tool descriptions for system prompt.
 * One line per tool: "- name(param: desc, param?: desc): description",
 * where "?" marks optional parameters.
 * @returns {string}
 */
export function getToolDescriptions() {
  const lines = [];
  for (const [name, tool] of Object.entries(TOOLS)) {
    const paramList = Object.entries(tool.parameters || {}).map(
      ([param, spec]) =>
        `${param}${spec.required ? "" : "?"}: ${spec.description}`,
    );
    lines.push(`- ${name}(${paramList.join(", ")}): ${tool.description}`);
  }
  return lines.join("\n");
}
4364
+
4365
// Aggregate default export: exposes the module's public API as one object
// for consumers that prefer `import tools from ...` over named imports.
export default {
  TOOLS,
  executeTool,
  getToolDescriptions,
  getAgentWorkingDirectory,
  setAgentWorkingDirectory,
  resetAgentWorkingDirectory,
};