otherwise-cli 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +193 -0
- package/bin/otherwise.js +5 -0
- package/frontend/404.html +84 -0
- package/frontend/assets/OpenDyslexic3-Bold-CDyRs55Y.ttf +0 -0
- package/frontend/assets/OpenDyslexic3-Regular-CIBXa4WE.ttf +0 -0
- package/frontend/assets/__vite-browser-external-BIHI7g3E.js +1 -0
- package/frontend/assets/conversational-worker-CeKiciGk.js +2929 -0
- package/frontend/assets/dictation-worker-D0aYfq8b.js +29 -0
- package/frontend/assets/gemini-color-CgSQmmva.png +0 -0
- package/frontend/assets/index-BLux5ps4.js +21 -0
- package/frontend/assets/index-Blh8_TEM.js +5272 -0
- package/frontend/assets/index-BpQ1PuKu.js +18 -0
- package/frontend/assets/index-Df737c8w.css +1 -0
- package/frontend/assets/index-xaYHL6wb.js +113 -0
- package/frontend/assets/ort-wasm-simd-threaded.asyncify-BynIiDiv.wasm +0 -0
- package/frontend/assets/ort-wasm-simd-threaded.jsep-B0T3yYHD.wasm +0 -0
- package/frontend/assets/transformers-tULNc5V3.js +31 -0
- package/frontend/assets/tts-worker-DPJWqT7N.js +2899 -0
- package/frontend/assets/voice-mode-worker-GzvIE_uh.js +2927 -0
- package/frontend/assets/worker-2d5ABSLU.js +31 -0
- package/frontend/banner.png +0 -0
- package/frontend/favicon.svg +3 -0
- package/frontend/google55e5ec47ee14a5f8.html +1 -0
- package/frontend/index.html +234 -0
- package/frontend/manifest.json +17 -0
- package/frontend/pdf.worker.min.mjs +21 -0
- package/frontend/robots.txt +5 -0
- package/frontend/sitemap.xml +27 -0
- package/package.json +81 -0
- package/src/agent/index.js +1066 -0
- package/src/agent/location.js +51 -0
- package/src/agent/prompt.js +548 -0
- package/src/agent/tools.js +4372 -0
- package/src/browser/detect.js +68 -0
- package/src/browser/session.js +1109 -0
- package/src/config.js +137 -0
- package/src/email/client.js +503 -0
- package/src/index.js +557 -0
- package/src/inference/anthropic.js +113 -0
- package/src/inference/google.js +373 -0
- package/src/inference/index.js +81 -0
- package/src/inference/ollama.js +383 -0
- package/src/inference/openai.js +140 -0
- package/src/inference/openrouter.js +378 -0
- package/src/inference/xai.js +200 -0
- package/src/logBridge.js +9 -0
- package/src/models.js +146 -0
- package/src/remote/client.js +225 -0
- package/src/scheduler/cron.js +243 -0
- package/src/server.js +3876 -0
- package/src/storage/db.js +1135 -0
- package/src/storage/supabase.js +364 -0
- package/src/tunnel/cloudflare.js +241 -0
- package/src/ui/components/App.jsx +687 -0
- package/src/ui/components/BrowserSelect.jsx +111 -0
- package/src/ui/components/FilePicker.jsx +472 -0
- package/src/ui/components/Header.jsx +444 -0
- package/src/ui/components/HelpPanel.jsx +173 -0
- package/src/ui/components/HistoryPanel.jsx +158 -0
- package/src/ui/components/MessageList.jsx +235 -0
- package/src/ui/components/ModelSelector.jsx +304 -0
- package/src/ui/components/PromptInput.jsx +515 -0
- package/src/ui/components/StreamingResponse.jsx +134 -0
- package/src/ui/components/ThinkingIndicator.jsx +365 -0
- package/src/ui/components/ToolExecution.jsx +714 -0
- package/src/ui/components/index.js +82 -0
- package/src/ui/context/TerminalContext.jsx +150 -0
- package/src/ui/context/index.js +13 -0
- package/src/ui/hooks/index.js +16 -0
- package/src/ui/hooks/useChatState.js +675 -0
- package/src/ui/hooks/useCommands.js +280 -0
- package/src/ui/hooks/useFileAttachments.js +216 -0
- package/src/ui/hooks/useKeyboardShortcuts.js +173 -0
- package/src/ui/hooks/useNotifications.js +185 -0
- package/src/ui/hooks/useTerminalSize.js +151 -0
- package/src/ui/hooks/useWebSocket.js +273 -0
- package/src/ui/index.js +94 -0
- package/src/ui/ink-runner.js +22 -0
- package/src/ui/utils/formatters.js +424 -0
- package/src/ui/utils/index.js +6 -0
- package/src/ui/utils/markdown.js +166 -0
|
@@ -0,0 +1,4372 @@
|
|
|
1
|
+
import {
|
|
2
|
+
readFileSync,
|
|
3
|
+
writeFileSync,
|
|
4
|
+
readdirSync,
|
|
5
|
+
statSync,
|
|
6
|
+
existsSync,
|
|
7
|
+
mkdirSync,
|
|
8
|
+
lstatSync,
|
|
9
|
+
realpathSync,
|
|
10
|
+
} from "fs";
|
|
11
|
+
import { execSync, spawn } from "child_process";
|
|
12
|
+
import { homedir, tmpdir } from "os";
|
|
13
|
+
import { join, dirname, resolve, normalize, isAbsolute, relative } from "path";
|
|
14
|
+
import {
|
|
15
|
+
createScheduledTask,
|
|
16
|
+
getScheduledTasks,
|
|
17
|
+
deleteScheduledTask,
|
|
18
|
+
} from "../storage/db.js";
|
|
19
|
+
import cron from "node-cron";
|
|
20
|
+
import {
|
|
21
|
+
reloadTask,
|
|
22
|
+
cancelTask,
|
|
23
|
+
scheduleOneTimeTask,
|
|
24
|
+
cancelOneTimeTask,
|
|
25
|
+
} from "../scheduler/cron.js";
|
|
26
|
+
import { ProxyAgent, fetch as undiciFetch } from "undici";
|
|
27
|
+
import {
|
|
28
|
+
getBrowserPage,
|
|
29
|
+
closeBrowser,
|
|
30
|
+
isBrowserActive,
|
|
31
|
+
findElement,
|
|
32
|
+
extractPageContent,
|
|
33
|
+
executeActionSequence,
|
|
34
|
+
fetchHtmlWithBrowser,
|
|
35
|
+
} from "../browser/session.js";
|
|
36
|
+
|
|
37
|
+
/** Proxy URL for web search requests (fetch + browser). Use WEBSEARCH_PROXY or HTTPS_PROXY. */
function getWebSearchProxy() {
  // Highest-priority variable wins; unset or empty values fall through.
  for (const name of ["WEBSEARCH_PROXY", "HTTPS_PROXY", "HTTP_PROXY"]) {
    const value = process.env[name];
    if (value) return value;
  }
  return null;
}
|
|
46
|
+
|
|
47
|
+
// ============================================
// Security Configuration
// ============================================

/**
 * Sensitive paths that should NEVER be accessed
 * These are blocked regardless of sandbox settings
 * (presumably matched as path substrings by isBlockedPath — confirm there).
 *
 * Fix: "/Library/Keychains/" previously appeared twice (under both
 * "Browser data" and "macOS specific"); the duplicate is removed.
 */
const BLOCKED_PATHS = [
  // SSH keys and credentials
  "/.ssh/",
  "/.gnupg/",
  "/.aws/",
  "/.azure/",
  "/.gcloud/",
  "/.config/gcloud/",

  // System files
  "/etc/passwd",
  "/etc/shadow",
  "/etc/sudoers",
  "/etc/ssh/",

  // Secrets and tokens
  "/.npmrc",
  "/.pypirc",
  "/.netrc",
  "/.env",
  "/credentials",
  "/secrets/",
  "/tokens/",

  // Browser data
  "/.mozilla/",
  "/.chrome/",
  "/.config/google-chrome/",

  // macOS specific
  "/Library/Keychains/",
  "/private/etc/",
];
|
|
89
|
+
|
|
90
|
+
/**
 * Patterns that indicate dangerous commands (regex)
 * More comprehensive than simple string matching
 *
 * Each pattern is tested against a candidate shell command string; any match
 * flags the command as dangerous. NOTE(review): several patterns match
 * anywhere in the string (e.g. /fdisk/, /parted/, /rm\s+.*cron/), so benign
 * commands that merely mention these words are also blocked — confirm that
 * trade-off is intended.
 */
const DANGEROUS_COMMAND_PATTERNS = [
  // Recursive deletion of root or system directories
  /rm\s+(-[rfvI]+\s+)*[\/~]\s*$/i, // "rm -rf /" or "rm -rf ~" at end of command
  /rm\s+(-[rfvI]+\s+)*\/\s/i, // "rm -rf / <more args>"
  /rm\s+(-[rfvI]+\s+)*\/\*/i, // "rm -rf /*"
  /rm\s+(-[rfvI]+\s+)*~\//i, // "rm -rf ~/<path>"

  // Format/destroy disks
  /mkfs/i,
  /dd\s+.*of\s*=\s*\/dev\//i, // dd writing directly to a device node
  /wipefs/i,
  /fdisk/i, // substring match — also hits e.g. "man fdisk"
  /parted/i,

  // Fork bomb
  /:\(\)\s*\{/, // classic ":(){ ...; }" function definition
  /\.\s*\/dev\/null/, // NOTE(review): matches ". /dev/null" / "./dev/null"; confirm this is the intended fork-bomb fragment

  // System modification
  /chmod\s+(-[rwxR]+\s+)*777\s+\//i, // world-writable permissions on root paths
  /chown\s+.*\/$/i, // chown whose target ends at a directory root

  // Dangerous redirections
  />\s*\/dev\/sd/i, // overwrite SATA/SCSI block devices
  />\s*\/dev\/nv/i, // overwrite NVMe block devices
  />\s*\/etc\//i, // clobber system configuration files

  // Privilege escalation attempts
  /sudo\s+su/i,
  /sudo\s+-i/i,
  /sudo\s+bash/i,
  /sudo\s+sh/i,

  // Network attacks
  /curl\s+.*\|\s*(sudo\s+)?bash/i, // pipe a remote script straight into a shell
  /wget\s+.*\|\s*(sudo\s+)?bash/i,
  /curl\s+.*\|\s*(sudo\s+)?sh/i,
  /wget\s+.*\|\s*(sudo\s+)?sh/i,

  // Python/Ruby/Node one-liners for system commands
  /python[23]?\s+-c\s+["']import\s+os;?\s*os\.(system|popen|exec)/i,
  /ruby\s+-e\s+["'`].*system/i,
  /node\s+-e\s+["'`].*exec/i,

  // Encoded command execution
  /base64\s+-d.*\|.*bash/i, // decode-and-execute
  /echo\s+.*\|\s*base64\s+-d\s*\|/i,

  // Modifying shell configs
  />\s*~\/\.(bashrc|zshrc|profile|bash_profile)/i, // overwrite shell startup files
  /echo\s+.*>>\s*~\/\.(bashrc|zshrc|profile)/i, // append persistence hooks

  // Cron manipulation
  /crontab\s+-r/i, // wipe the user's crontab
  /rm\s+.*cron/i, // substring match — also hits unrelated files named "*cron*"
];
|
|
150
|
+
|
|
151
|
+
/**
 * Default timeout for tool execution (ms)
 * Presumably the fallback when TOOL_TIMEOUTS has no per-tool entry —
 * confirm in getToolTimeout.
 */
const DEFAULT_TOOL_TIMEOUT = 30000;

// ============================================
// Working Directory State Management
// ============================================

/**
 * Current working directory for the agent session
 * This persists across tool calls within a session
 * Module-level mutable state shared by the tools in this file.
 */
let currentWorkingDirectory = process.cwd();

/**
 * Get the current working directory
 * @returns {string} - The current working directory path
 */
export function getAgentWorkingDirectory() {
  return currentWorkingDirectory;
}

/**
 * Set the current working directory
 * The path is stored as-is (no validation here); callers are expected to
 * pass an already-resolved, validated path.
 * @param {string} path - The new working directory path
 */
export function setAgentWorkingDirectory(path) {
  currentWorkingDirectory = path;
}

/**
 * Reset working directory to process cwd (for new sessions)
 */
export function resetAgentWorkingDirectory() {
  currentWorkingDirectory = process.cwd();
}
|
|
188
|
+
|
|
189
|
+
/**
 * Per-tool timeout overrides
 * Values are in milliseconds. Tools not listed here presumably fall back to
 * DEFAULT_TOOL_TIMEOUT — confirm in getToolTimeout.
 */
const TOOL_TIMEOUTS = {
  execute_command: 30000,
  web_search: 25000,
  fetch_url: 15000,
  browser_navigate: 45000, // Increased for retry logic
  browser_click: 25000, // Element detection tries multiple strategies (2s each) + click action
  browser_type: 25000, // Element detection + typing
  browser_read: 10000,
  browser_screenshot: 15000,
  browser_launch: 20000, // Browser startup can be slow
  browser_close: 5000,
  browser_interact: 120000, // Composite tool: can execute multiple actions (2 min max)
  read_file: 5000,
  write_file: 5000,
  list_directory: 5000,
  search_files: 15000,
};

/** Max chars of page content to auto-append after navigate/click/type (avoids extra browser_read round-trip). */
const BROWSER_AUTO_READ_MAX_LENGTH = 6000;
|
|
212
|
+
|
|
213
|
+
/**
 * Extract page content and truncate for inclusion in tool output.
 * Returns empty string on error so the main result is still useful.
 */
async function getTruncatedPageContent(
  page,
  maxLen = BROWSER_AUTO_READ_MAX_LENGTH,
) {
  try {
    const text = await extractPageContent(page);
    if (text.length > maxLen) {
      // Keep the head of the page and point the caller at browser_read
      // for the untruncated version.
      return (
        text.substring(0, maxLen) +
        "\n\n... (truncated; use browser_read for full content)"
      );
    }
    return text;
  } catch {
    // Extraction failures are non-fatal: the primary tool result is still
    // returned, just without auto-appended page content.
    return "";
  }
}
|
|
232
|
+
|
|
233
|
+
/**
 * Retry configuration for operations that may fail transiently
 */
const RETRY_CONFIG = {
  maxRetries: 3,
  baseDelayMs: 1000,
  maxDelayMs: 10000,
};

/**
 * Sleep for a specified duration
 * @param {number} ms - Milliseconds to sleep
 * @returns {Promise<void>} resolves after roughly `ms` milliseconds
 */
function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
|
|
249
|
+
|
|
250
|
+
// ============================================
// Search Query Refinement & Date Injection
// ============================================

/** Queries matching any of these patterns likely need fresh, dated results. */
const TIME_SENSITIVE_PATTERNS = [
  /\b(weather|forecast|temperature|temp)\b/i,
  /\b(today|tonight|tomorrow|this week|this weekend|right now|currently)\b/i,
  /\b(latest|breaking|recent|current|live)\b/i,
  /\b(stock price|share price|market|trading)\b/i,
  /\b(score|scores|game|match|playing)\b/i,
  /\b(open now|hours|is .+ open|closed)\b/i,
  /\b(news|headlines)\b/i,
  /\b(price of|cost of|how much)\b/i,
];

/** Detects an explicit year or month name already present in a query. */
const HAS_DATE_PATTERN =
  /\b(20\d{2}|january|february|march|april|may|june|july|august|september|october|november|december|jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\b/i;

/**
 * Append the current month and year to time-sensitive queries that do not
 * already mention a date, so search results skew toward fresh content.
 * Queries that are empty, not time-sensitive, or already dated are returned
 * trimmed but otherwise untouched.
 * @param {string} query - Raw search query
 * @returns {string} The refined query
 */
function refineSearchQuery(query) {
  const cleaned = (query || "").trim();
  if (!cleaned) return cleaned;

  const needsDate =
    TIME_SENSITIVE_PATTERNS.some((pattern) => pattern.test(cleaned)) &&
    !HAS_DATE_PATTERN.test(cleaned);
  if (!needsDate) return cleaned;

  const now = new Date();
  const monthName = now.toLocaleString("en-US", { month: "long" });
  return `${cleaned} ${monthName} ${now.getFullYear()}`;
}
|
|
282
|
+
|
|
283
|
+
// ============================================
// Search Result Cache (TTL-based)
// ============================================

/**
 * Small in-memory cache for search results.
 * Entries expire after a TTL, and the cache holds a bounded number of
 * entries, evicting the oldest (first-inserted) one when full.
 */
class SearchCache {
  constructor(ttlMs = 5 * 60 * 1000, maxEntries = 50) {
    this._cache = new Map();
    this._ttl = ttlMs;
    this._max = maxEntries;
  }

  /** Collapse whitespace and case so equivalent queries share one entry. */
  _normalizeKey(query) {
    return (query || "").trim().toLowerCase().replace(/\s+/g, " ");
  }

  /** Return the cached data for a query, or null if absent or expired. */
  get(query) {
    const key = this._normalizeKey(query);
    const hit = this._cache.get(key);
    if (!hit) return null;

    const isStale = Date.now() - hit.ts > this._ttl;
    if (isStale) {
      this._cache.delete(key);
      return null;
    }
    return hit.data;
  }

  /** Store data for a query, evicting the oldest entry when at capacity. */
  set(query, data) {
    const key = this._normalizeKey(query);
    if (this._cache.size >= this._max) {
      // Map iteration order is insertion order, so the first key is oldest.
      const oldestKey = this._cache.keys().next().value;
      this._cache.delete(oldestKey);
    }
    this._cache.set(key, { data, ts: Date.now() });
  }
}

/** Shared module-level cache instance. */
const searchCache = new SearchCache();
|
|
317
|
+
|
|
318
|
+
/**
 * Execute a function with exponential backoff retry
 * @param {Function} fn - Async function to execute; receives the 0-based attempt index
 * @param {object} options - Retry options: maxRetries, baseDelayMs, maxDelayMs, retryOn(err)
 * @returns {Promise<any>} - Result of the function
 * @throws The last error once retries are exhausted or retryOn rejects the error
 */
async function withRetry(fn, options = {}) {
  // Use ?? so an explicit 0 is honored (e.g. maxRetries: 0 = "no retries",
  // baseDelayMs: 0 = "retry immediately"); || would silently replace 0 with
  // the defaults.
  const maxRetries = options.maxRetries ?? RETRY_CONFIG.maxRetries;
  const baseDelay = options.baseDelayMs ?? RETRY_CONFIG.baseDelayMs;
  const maxDelay = options.maxDelayMs ?? RETRY_CONFIG.maxDelayMs;
  const retryOn = options.retryOn ?? (() => true); // Default: retry on all errors

  let lastError;

  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    try {
      return await fn(attempt);
    } catch (err) {
      lastError = err;

      // Stop when the error is non-retryable or the retry budget is spent.
      if (!retryOn(err) || attempt === maxRetries) {
        throw err;
      }

      // Exponential backoff with up to 1s of random jitter, capped at maxDelay.
      const delay = Math.min(
        baseDelay * Math.pow(2, attempt) + Math.random() * 1000,
        maxDelay,
      );

      console.log(
        `[Retry] Attempt ${attempt + 1} failed: ${err.message}. Retrying in ${Math.round(delay)}ms...`,
      );
      await sleep(delay);
    }
  }

  // Unreachable in practice (the loop either returns or throws), kept as a
  // defensive fallback.
  throw lastError;
}
|
|
358
|
+
|
|
359
|
+
// ============================================
// Memory Search Utilities
// ============================================

/**
 * Cosine similarity between two vectors
 * Returns 0 for null/undefined inputs, mismatched lengths, or zero-norm
 * vectors, so callers never divide by zero.
 * @param {Array<number>} a - First vector
 * @param {Array<number>} b - Second vector
 * @returns {number} Similarity score between 0 and 1
 */
function cosineSimilarity(a, b) {
  if (!a || !b || a.length !== b.length) return 0;

  let dot = 0;
  let sumSqA = 0;
  let sumSqB = 0;
  a.forEach((value, i) => {
    const other = b[i];
    dot += value * other;
    sumSqA += value * value;
    sumSqB += other * other;
  });

  if (sumSqA === 0 || sumSqB === 0) return 0;
  return dot / (Math.sqrt(sumSqA) * Math.sqrt(sumSqB));
}
|
|
382
|
+
|
|
383
|
+
/**
 * Generate embedding for a query string using OpenAI or Google API
 * Tries OpenAI's text-embedding-3-small first; on any failure (network error
 * or non-OK response) falls back to Google's text-embedding-004.
 * NOTE(review): this uses the global fetch rather than the imported
 * undiciFetch, so the web-search proxy is presumably not applied to
 * embedding calls — confirm that is intended.
 * @param {string} query - Text to embed
 * @param {object} config - Config with API keys (config.apiKeys.openai / .google)
 * @returns {Array<number>|null} Embedding vector or null if unavailable
 */
async function generateQueryEmbedding(query, config) {
  // Try OpenAI first
  if (config.apiKeys?.openai) {
    try {
      const response = await fetch("https://api.openai.com/v1/embeddings", {
        method: "POST",
        headers: {
          Authorization: `Bearer ${config.apiKeys.openai}`,
          "Content-Type": "application/json",
        },
        body: JSON.stringify({
          model: "text-embedding-3-small",
          input: query,
        }),
      });
      if (response.ok) {
        const data = await response.json();
        return data.data[0].embedding;
      }
      // Non-OK responses fall through to the Google fallback below.
    } catch (err) {
      // Swallow and fall through: embedding is best-effort.
      console.warn("[Memory] OpenAI embedding failed:", err.message);
    }
  }

  // Fallback to Google
  if (config.apiKeys?.google) {
    try {
      const response = await fetch(
        `https://generativelanguage.googleapis.com/v1beta/models/text-embedding-004:embedContent?key=${config.apiKeys.google}`,
        {
          method: "POST",
          headers: { "Content-Type": "application/json" },
          body: JSON.stringify({
            content: { parts: [{ text: query }] },
          }),
        },
      );
      if (response.ok) {
        const data = await response.json();
        return data.embedding.values;
      }
    } catch (err) {
      console.warn("[Memory] Google embedding failed:", err.message);
    }
  }

  return null; // No embedding available, will use text search
}
|
|
437
|
+
|
|
438
|
+
/**
|
|
439
|
+
* Tool definitions with metadata and execution functions
|
|
440
|
+
*/
|
|
441
|
+
export const TOOLS = {
|
|
442
|
+
// ============================================
|
|
443
|
+
// Filesystem Tools
|
|
444
|
+
// ============================================
|
|
445
|
+
|
|
446
|
+
read_file: {
|
|
447
|
+
description: "Read the contents of a file",
|
|
448
|
+
parameters: {
|
|
449
|
+
path: {
|
|
450
|
+
type: "string",
|
|
451
|
+
description: "Path to the file to read",
|
|
452
|
+
required: true,
|
|
453
|
+
},
|
|
454
|
+
},
|
|
455
|
+
execute: async ({ path: inputPath }, config) => {
|
|
456
|
+
try {
|
|
457
|
+
const resolvedPath = resolvePath(inputPath);
|
|
458
|
+
|
|
459
|
+
// Check for path traversal attempts
|
|
460
|
+
if (hasPathTraversal(inputPath, resolvedPath)) {
|
|
461
|
+
return `Error: Path traversal detected`;
|
|
462
|
+
}
|
|
463
|
+
|
|
464
|
+
// Check permissions (throws on failure)
|
|
465
|
+
checkReadPermission(resolvedPath, config);
|
|
466
|
+
|
|
467
|
+
if (!existsSync(resolvedPath)) {
|
|
468
|
+
return `Error: File not found: ${inputPath}`;
|
|
469
|
+
}
|
|
470
|
+
|
|
471
|
+
const content = readFileSync(resolvedPath, "utf-8");
|
|
472
|
+
return content;
|
|
473
|
+
} catch (err) {
|
|
474
|
+
return `Error reading file: ${err.message}`;
|
|
475
|
+
}
|
|
476
|
+
},
|
|
477
|
+
},
|
|
478
|
+
|
|
479
|
+
write_file: {
  description: "Write content to a file (creates directories if needed)",
  parameters: {
    path: {
      type: "string",
      description: "Path to the file to write",
      required: true,
    },
    content: {
      type: "string",
      description: "Content to write",
      required: true,
    },
  },
  // On success returns { path, content } (NOTE(review): unlike most tools
  // here, which return strings — confirm callers expect the object form).
  // On failure returns an "Error: ..." string; never throws.
  // snapshotFn, when provided, records pre-write state so a regeneration
  // can undo this write.
  execute: async ({ path: inputPath, content }, config, snapshotFn) => {
    try {
      const resolvedPath = resolvePath(inputPath);

      // Check for path traversal attempts
      if (hasPathTraversal(inputPath, resolvedPath)) {
        return `Error: Path traversal detected`;
      }

      // Check permissions (throws on failure)
      checkWritePermission(resolvedPath, config);

      // Also check the directory for write permission
      const dir = dirname(resolvedPath);
      checkWritePermission(dir, config);

      // Whether we will create the parent dir (for undo on regeneration)
      const dirExisted = existsSync(dir);
      const createdDir = dirExisted ? null : dir;

      // Capture snapshot BEFORE writing (for undo on regeneration)
      if (snapshotFn) {
        const fileExisted = existsSync(resolvedPath);
        let oldContent = null;
        if (fileExisted) {
          try {
            oldContent = readFileSync(resolvedPath, "utf-8");
          } catch (readErr) {
            // If we can't read (e.g., binary file), skip snapshotting
            console.warn(
              "[write_file] Could not read file for snapshot:",
              readErr.message,
            );
          }
        }
        snapshotFn({
          path: resolvedPath,
          existed: fileExisted,
          content: oldContent,
          createdDir,
        });
      }

      // Create directory if it doesn't exist
      if (!dirExisted) {
        mkdirSync(dir, { recursive: true });
      }

      writeFileSync(resolvedPath, content, "utf-8");
      return { path: inputPath, content };
    } catch (err) {
      return `Error writing file: ${err.message}`;
    }
  },
},
|
|
548
|
+
|
|
549
|
+
edit_file: {
  description:
    "Edit a file by replacing specific text. Use this for precise edits instead of rewriting entire files.",
  parameters: {
    path: {
      type: "string",
      description: "Path to the file to edit",
      required: true,
    },
    old_string: {
      type: "string",
      description:
        "The exact text to find and replace (must match exactly including whitespace)",
      required: true,
    },
    new_string: {
      type: "string",
      description: "The text to replace it with",
      required: true,
    },
    replace_all: {
      type: "boolean",
      description:
        "Replace all occurrences (default: false, only replaces first match)",
      required: false,
    },
  },
  // Returns a human-readable success message (with a simple diff) or an
  // "Error: ..." string; never throws. snapshotFn, when provided, captures
  // the pre-edit content for undo on regeneration.
  execute: async (
    { path: inputPath, old_string, new_string, replace_all = false },
    config,
    snapshotFn,
  ) => {
    try {
      const resolvedPath = resolvePath(inputPath);

      // Check for path traversal attempts
      if (hasPathTraversal(inputPath, resolvedPath)) {
        return `Error: Path traversal detected`;
      }

      // Check permissions (throws on failure)
      checkReadPermission(resolvedPath, config);
      checkWritePermission(resolvedPath, config);

      if (!existsSync(resolvedPath)) {
        return `Error: File not found: ${inputPath}`;
      }

      // Read current content
      const originalContent = readFileSync(resolvedPath, "utf-8");

      // Check if old_string exists in file
      if (!originalContent.includes(old_string)) {
        // Provide helpful error with context
        const lines = originalContent.split("\n");
        const preview = lines.slice(0, 10).join("\n");
        return `Error: Could not find the text to replace in ${inputPath}.\n\nSearched for:\n---\n${old_string.substring(0, 200)}${old_string.length > 200 ? "..." : ""}\n---\n\nFile preview (first 10 lines):\n---\n${preview}\n---\n\nMake sure the old_string matches exactly, including whitespace and indentation.`;
      }

      // Capture snapshot BEFORE writing (for undo on regeneration)
      if (snapshotFn) {
        snapshotFn({
          path: resolvedPath,
          existed: true, // edit_file only works on existing files
          content: originalContent,
        });
      }

      // Count occurrences (old_string is regex-escaped, so this is a
      // literal, non-overlapping count)
      const occurrences = (
        originalContent.match(new RegExp(escapeRegExp(old_string), "g")) || []
      ).length;

      // Perform replacement
      let newContent;
      if (replace_all) {
        // split/join replaces every literal occurrence without regex escaping
        newContent = originalContent.split(old_string).join(new_string);
      } else {
        // Replace only first occurrence
        const index = originalContent.indexOf(old_string);
        newContent =
          originalContent.substring(0, index) +
          new_string +
          originalContent.substring(index + old_string.length);
      }

      // Write the modified content
      writeFileSync(resolvedPath, newContent, "utf-8");

      // Generate a simple diff for the result
      const replacedCount = replace_all ? occurrences : 1;
      const remainingOccurrences = occurrences - replacedCount;

      // Create a unified diff-like output
      const diffResult = generateSimpleDiff(
        old_string,
        new_string,
        inputPath,
      );

      let resultMessage = `Successfully edited ${inputPath}\n`;
      resultMessage += `Replaced ${replacedCount} occurrence${replacedCount > 1 ? "s" : ""}`;
      if (remainingOccurrences > 0) {
        resultMessage += ` (${remainingOccurrences} more occurrence${remainingOccurrences > 1 ? "s" : ""} remain)`;
      }
      resultMessage += `\n\n${diffResult}`;

      return resultMessage;
    } catch (err) {
      return `Error editing file: ${err.message}`;
    }
  },
},
|
|
662
|
+
|
|
663
|
+
list_directory: {
|
|
664
|
+
description: "List files and directories in a path",
|
|
665
|
+
parameters: {
|
|
666
|
+
path: {
|
|
667
|
+
type: "string",
|
|
668
|
+
description: "Directory path to list",
|
|
669
|
+
required: true,
|
|
670
|
+
},
|
|
671
|
+
},
|
|
672
|
+
execute: async ({ path: inputPath }, config) => {
|
|
673
|
+
try {
|
|
674
|
+
const resolvedPath = resolvePath(inputPath);
|
|
675
|
+
|
|
676
|
+
// Check for path traversal attempts
|
|
677
|
+
if (hasPathTraversal(inputPath, resolvedPath)) {
|
|
678
|
+
return `Error: Path traversal detected`;
|
|
679
|
+
}
|
|
680
|
+
|
|
681
|
+
// Check permissions (throws on failure)
|
|
682
|
+
checkReadPermission(resolvedPath, config);
|
|
683
|
+
|
|
684
|
+
if (!existsSync(resolvedPath)) {
|
|
685
|
+
return `Error: Directory not found: ${inputPath}`;
|
|
686
|
+
}
|
|
687
|
+
|
|
688
|
+
const entries = readdirSync(resolvedPath);
|
|
689
|
+
const detailed = entries.map((name) => {
|
|
690
|
+
const fullPath = join(resolvedPath, name);
|
|
691
|
+
try {
|
|
692
|
+
const stat = statSync(fullPath);
|
|
693
|
+
return {
|
|
694
|
+
name,
|
|
695
|
+
type: stat.isDirectory() ? "directory" : "file",
|
|
696
|
+
size: stat.isFile() ? stat.size : null,
|
|
697
|
+
};
|
|
698
|
+
} catch {
|
|
699
|
+
return { name, type: "unknown" };
|
|
700
|
+
}
|
|
701
|
+
});
|
|
702
|
+
|
|
703
|
+
// Filter out entries that would lead to blocked paths
|
|
704
|
+
const filtered = detailed.filter((entry) => {
|
|
705
|
+
const fullPath = join(resolvedPath, entry.name);
|
|
706
|
+
return !isBlockedPath(fullPath);
|
|
707
|
+
});
|
|
708
|
+
|
|
709
|
+
return JSON.stringify(filtered, null, 2);
|
|
710
|
+
} catch (err) {
|
|
711
|
+
return `Error listing directory: ${err.message}`;
|
|
712
|
+
}
|
|
713
|
+
},
|
|
714
|
+
},
|
|
715
|
+
|
|
716
|
+
search_files: {
  description: "Search for files by name pattern",
  parameters: {
    query: {
      type: "string",
      description: "Search pattern (supports * wildcard)",
      required: true,
    },
    path: {
      type: "string",
      description: "Directory to search in (default: current)",
      required: false,
    },
    maxdepth: {
      type: "number",
      description: "Maximum directory depth to search (default: 5, max: 10)",
      required: false,
    },
  },
  // Shells out to `find`, so this tool is POSIX-only. Returns up to 50
  // matching paths (one per line) or an "Error: ..." string; never throws.
  execute: async ({ query, path: inputPath, maxdepth }, config) => {
    try {
      const searchPath = resolvePath(inputPath || ".");

      // Check for path traversal attempts
      if (inputPath && hasPathTraversal(inputPath, searchPath)) {
        return `Error: Path traversal detected`;
      }

      // Check permissions (throws on failure)
      checkReadPermission(searchPath, config);

      // Sanitize query to prevent command injection
      // Only allow alphanumeric, dots, underscores, hyphens, and wildcards
      const sanitizedQuery = query.replace(/[^a-zA-Z0-9._\-*?]/g, "");
      if (sanitizedQuery !== query) {
        console.warn(
          `[search_files] Query sanitized: "${query}" -> "${sanitizedQuery}"`,
        );
      }

      // Limit maxdepth to prevent timeout on large directory trees
      const depth = Math.min(Math.max(1, maxdepth || 5), 10);

      // Use find command with sanitized input and depth limit
      // -maxdepth prevents searching deep nested directories (e.g., node_modules)
      // NOTE(review): searchPath is interpolated into the shell string with
      // only double quotes — a path containing `"` or `$` would break out.
      // Presumably resolvePath normalizes it; confirm, or switch to
      // execFileSync with an argument array.
      const timeout = getToolTimeout("search_files");
      const result = execSync(
        `find "${searchPath}" -maxdepth ${depth} -name "${sanitizedQuery}" -type f 2>/dev/null | head -50`,
        { encoding: "utf-8", timeout, killSignal: "SIGKILL" },
      );

      // Filter results to exclude blocked paths
      const lines = result
        .trim()
        .split("\n")
        .filter((line) => {
          return line && !isBlockedPath(line);
        });

      const resultText =
        lines.join("\n") || "No files found matching the pattern.";
      return depth < 10
        ? `${resultText}\n\n(Searched up to ${depth} levels deep. Use maxdepth parameter to search deeper.)`
        : resultText;
    } catch (err) {
      // Distinguish timeouts (killed by SIGKILL or ETIMEDOUT) from other failures
      if (
        err.killed ||
        err.signal === "SIGKILL" ||
        err.code === "ETIMEDOUT"
      ) {
        return `Error: Search timed out. Try searching a more specific directory or reducing maxdepth.`;
      }
      return `Error searching: ${err.message}`;
    }
  },
},
|
|
792
|
+
|
|
793
|
+
// ============================================
|
|
794
|
+
// Shell Tools
|
|
795
|
+
// ============================================
|
|
796
|
+
|
|
797
|
+
set_working_directory: {
|
|
798
|
+
description:
|
|
799
|
+
"Change the current working directory for future commands. This persists across tool calls.",
|
|
800
|
+
parameters: {
|
|
801
|
+
path: {
|
|
802
|
+
type: "string",
|
|
803
|
+
description:
|
|
804
|
+
"Path to the directory to switch to (absolute or relative)",
|
|
805
|
+
required: true,
|
|
806
|
+
},
|
|
807
|
+
},
|
|
808
|
+
execute: async ({ path: inputPath }, config) => {
|
|
809
|
+
try {
|
|
810
|
+
// Resolve the path (handles ~, relative paths, etc.)
|
|
811
|
+
let resolvedPath;
|
|
812
|
+
if (inputPath.startsWith("~")) {
|
|
813
|
+
resolvedPath = join(homedir(), inputPath.slice(1));
|
|
814
|
+
} else if (isAbsolute(inputPath)) {
|
|
815
|
+
resolvedPath = normalize(inputPath);
|
|
816
|
+
} else {
|
|
817
|
+
// Relative path - resolve from current working directory
|
|
818
|
+
resolvedPath = resolve(currentWorkingDirectory, inputPath);
|
|
819
|
+
}
|
|
820
|
+
|
|
821
|
+
// Check if path exists and is a directory
|
|
822
|
+
if (!existsSync(resolvedPath)) {
|
|
823
|
+
return `Error: Directory not found: ${inputPath}`;
|
|
824
|
+
}
|
|
825
|
+
|
|
826
|
+
const stat = statSync(resolvedPath);
|
|
827
|
+
if (!stat.isDirectory()) {
|
|
828
|
+
return `Error: Path is not a directory: ${inputPath}`;
|
|
829
|
+
}
|
|
830
|
+
|
|
831
|
+
// Check for blocked paths
|
|
832
|
+
if (isBlockedPath(resolvedPath)) {
|
|
833
|
+
return `Error: Access denied - cannot switch to sensitive directory`;
|
|
834
|
+
}
|
|
835
|
+
|
|
836
|
+
// Check read permission
|
|
837
|
+
checkReadPermission(resolvedPath, config);
|
|
838
|
+
|
|
839
|
+
// Update the working directory state
|
|
840
|
+
const previousDir = currentWorkingDirectory;
|
|
841
|
+
currentWorkingDirectory = resolvedPath;
|
|
842
|
+
|
|
843
|
+
return `Changed working directory:\n From: ${previousDir}\n To: ${resolvedPath}\n\nFuture commands will run in this directory.`;
|
|
844
|
+
} catch (err) {
|
|
845
|
+
return `Error changing directory: ${err.message}`;
|
|
846
|
+
}
|
|
847
|
+
},
|
|
848
|
+
},
|
|
849
|
+
|
|
850
|
+
get_working_directory: {
|
|
851
|
+
description: "Get the current working directory",
|
|
852
|
+
parameters: {},
|
|
853
|
+
execute: async (params, config) => {
|
|
854
|
+
return `Current working directory: ${currentWorkingDirectory}`;
|
|
855
|
+
},
|
|
856
|
+
},
|
|
857
|
+
|
|
858
|
+
execute_command: {
|
|
859
|
+
description: "Execute a shell command in the current working directory",
|
|
860
|
+
parameters: {
|
|
861
|
+
command: {
|
|
862
|
+
type: "string",
|
|
863
|
+
description: "Command to execute",
|
|
864
|
+
required: true,
|
|
865
|
+
},
|
|
866
|
+
},
|
|
867
|
+
execute: async ({ command }, config, shellUndoFn) => {
|
|
868
|
+
if (!config.permissions?.shell) {
|
|
869
|
+
return "Error: Shell commands are disabled in configuration.";
|
|
870
|
+
}
|
|
871
|
+
|
|
872
|
+
// Validate command against dangerous patterns
|
|
873
|
+
const validation = validateCommand(command);
|
|
874
|
+
if (!validation.valid) {
|
|
875
|
+
console.warn(
|
|
876
|
+
`[execute_command] Blocked: ${validation.reason} - Command: ${command.substring(0, 100)}`,
|
|
877
|
+
);
|
|
878
|
+
return `Error: This command is blocked for safety reasons (${validation.reason}).`;
|
|
879
|
+
}
|
|
880
|
+
|
|
881
|
+
try {
|
|
882
|
+
const timeout = getToolTimeout("execute_command");
|
|
883
|
+
const result = execSync(command, {
|
|
884
|
+
encoding: "utf-8",
|
|
885
|
+
timeout,
|
|
886
|
+
maxBuffer: 1024 * 1024, // 1MB output limit
|
|
887
|
+
cwd: currentWorkingDirectory, // Use the agent's working directory
|
|
888
|
+
// Run in a restricted environment
|
|
889
|
+
env: {
|
|
890
|
+
...process.env,
|
|
891
|
+
// Prevent some shell behaviors
|
|
892
|
+
HISTFILE: "/dev/null",
|
|
893
|
+
HISTSIZE: "0",
|
|
894
|
+
},
|
|
895
|
+
});
|
|
896
|
+
// Record reversible command for undo on regeneration (Strategy 1)
|
|
897
|
+
if (shellUndoFn) {
|
|
898
|
+
const entry = parseReversibleCommand(command);
|
|
899
|
+
if (entry) {
|
|
900
|
+
shellUndoFn(entry);
|
|
901
|
+
}
|
|
902
|
+
}
|
|
903
|
+
return result || "(Command completed with no output)";
|
|
904
|
+
} catch (err) {
|
|
905
|
+
// Check if it was a timeout
|
|
906
|
+
if (err.killed) {
|
|
907
|
+
return `Error: Command timed out after ${getToolTimeout("execute_command") / 1000} seconds`;
|
|
908
|
+
}
|
|
909
|
+
return `Command failed: ${err.message}\n${err.stderr || ""}`;
|
|
910
|
+
}
|
|
911
|
+
},
|
|
912
|
+
},
|
|
913
|
+
|
|
914
|
+
// ============================================
|
|
915
|
+
// Email Tools (MyMX for receiving, Resend for sending)
|
|
916
|
+
// ============================================
|
|
917
|
+
|
|
918
|
+
check_email: {
|
|
919
|
+
description:
|
|
920
|
+
"List recently received emails. With MyMX, emails are automatically received via webhooks and stored as chats.",
|
|
921
|
+
parameters: {
|
|
922
|
+
limit: {
|
|
923
|
+
type: "number",
|
|
924
|
+
description: "Max emails to show (default: 10)",
|
|
925
|
+
required: false,
|
|
926
|
+
},
|
|
927
|
+
},
|
|
928
|
+
execute: async ({ limit = 10 }, config) => {
|
|
929
|
+
if (!config.permissions?.email) {
|
|
930
|
+
return "Error: Email access is disabled in configuration.";
|
|
931
|
+
}
|
|
932
|
+
|
|
933
|
+
try {
|
|
934
|
+
const { checkEmail } = await import("../email/client.js");
|
|
935
|
+
return await checkEmail(config);
|
|
936
|
+
} catch (err) {
|
|
937
|
+
return err.message;
|
|
938
|
+
}
|
|
939
|
+
},
|
|
940
|
+
},
|
|
941
|
+
|
|
942
|
+
send_email: {
|
|
943
|
+
description:
|
|
944
|
+
"Send an email via Resend API. Requires Resend to be configured in Settings → Email Integration.",
|
|
945
|
+
parameters: {
|
|
946
|
+
to: {
|
|
947
|
+
type: "string",
|
|
948
|
+
description: "Recipient email address",
|
|
949
|
+
required: true,
|
|
950
|
+
},
|
|
951
|
+
subject: { type: "string", description: "Email subject", required: true },
|
|
952
|
+
body: {
|
|
953
|
+
type: "string",
|
|
954
|
+
description: "Email body content (plain text)",
|
|
955
|
+
required: true,
|
|
956
|
+
},
|
|
957
|
+
},
|
|
958
|
+
execute: async ({ to, subject, body }, config) => {
|
|
959
|
+
if (!config.permissions?.email) {
|
|
960
|
+
return "Error: Email access is disabled in configuration.";
|
|
961
|
+
}
|
|
962
|
+
|
|
963
|
+
try {
|
|
964
|
+
const { sendEmail } = await import("../email/client.js");
|
|
965
|
+
const result = await sendEmail(config, to, subject, body);
|
|
966
|
+
return `✅ Email sent successfully to ${to}\nMessage ID: ${result.messageId}`;
|
|
967
|
+
} catch (err) {
|
|
968
|
+
return `Error sending email: ${err.message}`;
|
|
969
|
+
}
|
|
970
|
+
},
|
|
971
|
+
},
|
|
972
|
+
|
|
973
|
+
// ============================================
|
|
974
|
+
// Scheduling Tools
|
|
975
|
+
// ============================================
|
|
976
|
+
|
|
977
|
+
schedule_task: {
|
|
978
|
+
description:
|
|
979
|
+
"Schedule a RECURRING task at specified times (cron syntax). For one-time 'in X minutes', use schedule_task_once instead.",
|
|
980
|
+
parameters: {
|
|
981
|
+
cron_expression: {
|
|
982
|
+
type: "string",
|
|
983
|
+
description:
|
|
984
|
+
'Cron expression: minute hour day month weekday (e.g. "0 9 * * *" = 9am daily, "*/15 * * * *" = every 15 min)',
|
|
985
|
+
required: true,
|
|
986
|
+
},
|
|
987
|
+
task_description: {
|
|
988
|
+
type: "string",
|
|
989
|
+
description: "What the task should do",
|
|
990
|
+
required: true,
|
|
991
|
+
},
|
|
992
|
+
},
|
|
993
|
+
execute: async ({ cron_expression, task_description }, config) => {
|
|
994
|
+
try {
|
|
995
|
+
if (!cron.validate(cron_expression)) {
|
|
996
|
+
return `Error: Invalid cron expression "${cron_expression}". Use format: minute hour day month weekday (e.g. "0 9 * * *" for 9am daily).`;
|
|
997
|
+
}
|
|
998
|
+
const id = createScheduledTask(cron_expression, task_description);
|
|
999
|
+
const ok = reloadTask(id, config);
|
|
1000
|
+
if (!ok) {
|
|
1001
|
+
return `Task was saved (ID: ${id}) but failed to schedule. Check server logs.`;
|
|
1002
|
+
}
|
|
1003
|
+
return `Recurring task scheduled (ID: ${id}). Will run according to: ${cron_expression}`;
|
|
1004
|
+
} catch (err) {
|
|
1005
|
+
return `Error scheduling task: ${err.message}`;
|
|
1006
|
+
}
|
|
1007
|
+
},
|
|
1008
|
+
},
|
|
1009
|
+
|
|
1010
|
+
schedule_task_once: {
|
|
1011
|
+
description:
|
|
1012
|
+
"Schedule a ONE-TIME task to run once after N minutes. Use this for 'in 5 minutes' or 'run once in 10 minutes'. Does not persist if CLI restarts.",
|
|
1013
|
+
parameters: {
|
|
1014
|
+
run_in_minutes: {
|
|
1015
|
+
type: "number",
|
|
1016
|
+
description:
|
|
1017
|
+
"How many minutes from now to run the task (e.g. 5 for 'in 5 minutes')",
|
|
1018
|
+
required: true,
|
|
1019
|
+
},
|
|
1020
|
+
task_description: {
|
|
1021
|
+
type: "string",
|
|
1022
|
+
description: "What the task should do",
|
|
1023
|
+
required: true,
|
|
1024
|
+
},
|
|
1025
|
+
},
|
|
1026
|
+
execute: async ({ run_in_minutes, task_description }, config) => {
|
|
1027
|
+
try {
|
|
1028
|
+
const minutes = Math.max(
|
|
1029
|
+
1,
|
|
1030
|
+
Math.min(1440, Number(run_in_minutes) || 1),
|
|
1031
|
+
); // 1 min to 24h
|
|
1032
|
+
const id = scheduleOneTimeTask(task_description, minutes, config);
|
|
1033
|
+
return `One-time task scheduled (ID: ${id}). Will run once in ${minutes} minute(s).`;
|
|
1034
|
+
} catch (err) {
|
|
1035
|
+
return `Error scheduling one-time task: ${err.message}`;
|
|
1036
|
+
}
|
|
1037
|
+
},
|
|
1038
|
+
},
|
|
1039
|
+
|
|
1040
|
+
list_scheduled_tasks: {
|
|
1041
|
+
description: "List all scheduled tasks",
|
|
1042
|
+
parameters: {},
|
|
1043
|
+
execute: async (params, config) => {
|
|
1044
|
+
try {
|
|
1045
|
+
const tasks = getScheduledTasks(false);
|
|
1046
|
+
if (tasks.length === 0) {
|
|
1047
|
+
return "No scheduled tasks.";
|
|
1048
|
+
}
|
|
1049
|
+
return JSON.stringify(tasks, null, 2);
|
|
1050
|
+
} catch (err) {
|
|
1051
|
+
return `Error listing tasks: ${err.message}`;
|
|
1052
|
+
}
|
|
1053
|
+
},
|
|
1054
|
+
},
|
|
1055
|
+
|
|
1056
|
+
cancel_task: {
|
|
1057
|
+
description: "Cancel a scheduled task",
|
|
1058
|
+
parameters: {
|
|
1059
|
+
task_id: {
|
|
1060
|
+
type: "number",
|
|
1061
|
+
description: "ID of the task to cancel",
|
|
1062
|
+
required: true,
|
|
1063
|
+
},
|
|
1064
|
+
},
|
|
1065
|
+
execute: async ({ task_id }, config) => {
|
|
1066
|
+
try {
|
|
1067
|
+
// Stop the running cron job first
|
|
1068
|
+
cancelTask(task_id);
|
|
1069
|
+
// Then remove from database
|
|
1070
|
+
deleteScheduledTask(task_id);
|
|
1071
|
+
return `Task ${task_id} cancelled successfully.`;
|
|
1072
|
+
} catch (err) {
|
|
1073
|
+
return `Error cancelling task: ${err.message}`;
|
|
1074
|
+
}
|
|
1075
|
+
},
|
|
1076
|
+
},
|
|
1077
|
+
|
|
1078
|
+
// ============================================
|
|
1079
|
+
// Web Tools
|
|
1080
|
+
// ============================================
|
|
1081
|
+
|
|
1082
|
+
  web_search: {
    description:
      "Search the web for current information. Returns titles, URLs, summaries, and enriched page content for top results. For comprehensive data (like earnings reports, event lists), use numResults=10+ and consider making multiple searches with different date/keyword variations.",
    parameters: {
      query: {
        type: "string",
        description:
          "Search query - be specific with dates, company names, or keywords",
        required: true,
      },
      numResults: {
        type: "number",
        description:
          "Number of results (default: 8, max: 15). Use higher values for comprehensive data gathering.",
        required: false,
      },
    },
    // Pipeline: refine query -> cache lookup -> backend search (if a remote
    // backend is configured) -> direct-scrape fallback -> enrich top results
    // with fetched page content -> format -> cache. Returns a formatted text
    // report; never throws (errors become an advisory string).
    execute: async ({ query, numResults = 8 }, config) => {
      try {
        // Hard cap results at 15 regardless of what the caller asked for.
        const maxResults = Math.min(numResults, 15);

        // Refine query with date injection for time-sensitive searches
        const refinedQuery = refineSearchQuery(query);
        if (refinedQuery !== query) {
          console.log("[WebSearch] Refined query:", refinedQuery);
        }

        // Check cache first (keyed on refined query + result count, so the
        // same logical search with different sizes is cached separately)
        const cacheKey = `${refinedQuery}:${maxResults}`;
        const cached = searchCache.get(cacheKey);
        if (cached) {
          console.log("[WebSearch] Cache hit for:", refinedQuery);
          return cached;
        }

        // Try backend route if available (unified search with Exa/Tavily/Marginalia)
        const backendUrl = config?.remote?.backendUrl;
        let results = null;
        if (backendUrl) {
          try {
            const resp = await fetchWithTimeout(
              `${backendUrl.replace(/\/$/, "")}/api/web-search`,
              {
                method: "POST",
                headers: { "Content-Type": "application/json" },
                body: JSON.stringify({
                  query: refinedQuery,
                  numResults: maxResults,
                  maxPages: maxResults,
                  enrichTop: Math.min(3, maxResults),
                }),
              },
              12000,
            );
            const data = await resp.json();
            if (data.success && data.pages?.length) {
              // Normalize backend pages into the shared result shape used by
              // the formatter below.
              results = data.pages.map((p) => ({
                url: p.url,
                title: p.title || p.searchTitle || "",
                snippet: p.searchSnippet || p.excerpt || "",
                source: p.source || "",
                content: p.content || p.excerpt || "",
                engines: p.engines || [],
              }));
              console.log(
                "[WebSearch] Backend returned",
                results.length,
                "enriched results",
              );
            }
          } catch (e) {
            // Backend errors are non-fatal; fall through to local scraping.
            console.log(
              "[WebSearch] Backend unavailable, falling back to direct scraping:",
              e?.message,
            );
          }
        }

        // Fallback to direct scraping if backend unavailable or returned nothing
        if (!results || results.length === 0) {
          results = await performWebSearch(refinedQuery, maxResults);
        }

        if (results.length === 0) {
          return (
            `No results found for: "${query}"\n\nSuggestions:\n` +
            `1. Try more specific keywords or add dates (e.g., "January 2026")\n` +
            `2. Try alternative sources (e.g., "earnings calendar site:yahoo.com")\n` +
            `3. Break down the query into smaller parts`
          );
        }

        // Auto-enrich top 3 results with page content if not already enriched
        // (backend results with >200 chars of content are left as-is).
        // Enrichment mutates result objects in place; failures are ignored.
        const enrichCount = Math.min(3, results.length);
        const enrichPromises = [];
        for (let i = 0; i < enrichCount; i++) {
          const r = results[i];
          if (r.content && r.content.length > 200) continue;
          enrichPromises.push(
            fetchUrlContent(r.url, 8000)
              .then((content) => {
                if (content && content.length > 100) {
                  r.content = content.slice(0, 4000);
                  if (!r.snippet || r.snippet.length < 50) {
                    r.snippet = extractBestSnippet(content, refinedQuery, 400);
                  }
                }
              })
              .catch(() => {}),
          );
        }
        if (enrichPromises.length > 0) {
          // allSettled: one failed fetch must not abort the others
          await Promise.allSettled(enrichPromises);
        }

        // Fix empty snippets from enriched content
        for (const r of results) {
          if ((!r.snippet || r.snippet.length < 20) && r.content) {
            r.snippet = extractBestSnippet(
              r.content,
              refinedQuery,
              400,
            );
          }
        }

        // Format results with enriched content; only the enriched top
        // results get an inline content preview (capped at 1500 chars).
        let output = `Search results for "${query}" (${results.length} results):\n\n`;
        output += results
          .map((r, i) => {
            let entry = `${i + 1}. ${r.title}\n Source: ${r.source}\n URL: ${r.url}\n Summary: ${r.snippet || "No summary available"}`;
            if (i < enrichCount && r.content && r.content.length > 200) {
              entry += `\n Content preview: ${r.content.slice(0, 1500)}`;
            }
            return entry;
          })
          .join("\n\n");

        // Heuristic: financial/calendar queries get extra research tips.
        if (
          query.toLowerCase().includes("earnings") ||
          query.toLowerCase().includes("quarterly") ||
          query.toLowerCase().includes("reports") ||
          query.toLowerCase().includes("calendar")
        ) {
          output +=
            "\n\n💡 TIP: For complete earnings/calendar data, consider:\n";
          output +=
            '- Searching for specific dates (e.g., "earnings January 27 2026")\n';
          output +=
            '- Using financial aggregator sites in your query (e.g., "site:yahoo.com/calendar" or "site:zacks.com")\n';
          output += "- Fetching promising URLs for detailed company lists\n";
          output += "- Making multiple searches for different days of the week";
        }

        // Cache the formatted result (not the raw result objects)
        searchCache.set(cacheKey, output);

        return output;
      } catch (err) {
        return `Error searching: ${err.message}\n\nTIP: Try rephrasing your query or breaking it into smaller parts.`;
      }
    },
  },
|
|
1245
|
+
|
|
1246
|
+
fetch_url: {
|
|
1247
|
+
description:
|
|
1248
|
+
"Fetch full text content from a URL. Use this to get detailed data from earnings calendars, company lists, or news articles. Extracts and formats the text content.",
|
|
1249
|
+
parameters: {
|
|
1250
|
+
url: {
|
|
1251
|
+
type: "string",
|
|
1252
|
+
description:
|
|
1253
|
+
"URL to fetch - works best with news sites, financial data pages, and text-heavy content",
|
|
1254
|
+
required: true,
|
|
1255
|
+
},
|
|
1256
|
+
},
|
|
1257
|
+
execute: async ({ url }, config) => {
|
|
1258
|
+
try {
|
|
1259
|
+
const content = await fetchUrlContent(url, 15000); // Slightly longer timeout
|
|
1260
|
+
|
|
1261
|
+
if (!content || content.length < 100) {
|
|
1262
|
+
return (
|
|
1263
|
+
`No substantial content extracted from URL.\n\n` +
|
|
1264
|
+
`This might happen if:\n` +
|
|
1265
|
+
`1. The page requires JavaScript to render content\n` +
|
|
1266
|
+
`2. The site blocks automated access\n` +
|
|
1267
|
+
`3. The content is behind a login\n\n` +
|
|
1268
|
+
`TIP: Try web_search for related content from other sources.`
|
|
1269
|
+
);
|
|
1270
|
+
}
|
|
1271
|
+
|
|
1272
|
+
// Add summary of content length
|
|
1273
|
+
const wordCount = content.split(/\s+/).length;
|
|
1274
|
+
return (
|
|
1275
|
+
`Fetched content from: ${url}\n` +
|
|
1276
|
+
`Content length: ~${wordCount} words\n\n` +
|
|
1277
|
+
`---\n\n${content}`
|
|
1278
|
+
);
|
|
1279
|
+
} catch (err) {
|
|
1280
|
+
return (
|
|
1281
|
+
`Error fetching URL: ${err.message}\n\n` +
|
|
1282
|
+
`TIP: This site may block automated access. Try:\n` +
|
|
1283
|
+
`1. web_search for similar content from other sources\n` +
|
|
1284
|
+
`2. A different URL from the search results`
|
|
1285
|
+
);
|
|
1286
|
+
}
|
|
1287
|
+
},
|
|
1288
|
+
},
|
|
1289
|
+
|
|
1290
|
+
// ============================================
|
|
1291
|
+
// Browser Control Tools
|
|
1292
|
+
// ============================================
|
|
1293
|
+
|
|
1294
|
+
browser_launch: {
|
|
1295
|
+
description:
|
|
1296
|
+
"Launch a browser for interactive web navigation. Call this before using other browser tools.",
|
|
1297
|
+
parameters: {
|
|
1298
|
+
headless: {
|
|
1299
|
+
type: "boolean",
|
|
1300
|
+
description: "Run browser invisibly (default: true)",
|
|
1301
|
+
required: false,
|
|
1302
|
+
},
|
|
1303
|
+
},
|
|
1304
|
+
execute: async ({ headless }, config) => {
|
|
1305
|
+
try {
|
|
1306
|
+
// Override config headless setting if parameter provided
|
|
1307
|
+
const browserConfig = { ...config };
|
|
1308
|
+
if (headless !== undefined) {
|
|
1309
|
+
browserConfig.browserHeadless = headless;
|
|
1310
|
+
}
|
|
1311
|
+
|
|
1312
|
+
const page = await getBrowserPage(browserConfig);
|
|
1313
|
+
return `Browser launched successfully. Ready for navigation.`;
|
|
1314
|
+
} catch (err) {
|
|
1315
|
+
return `Error launching browser: ${err.message}`;
|
|
1316
|
+
}
|
|
1317
|
+
},
|
|
1318
|
+
},
|
|
1319
|
+
|
|
1320
|
+
  browser_navigate: {
    description:
      "Navigate the browser to a URL. Automatically retries on network errors with exponential backoff. By default returns page content (title, URL, interactive elements, text) so you often don't need a separate browser_read; use readAfter: false to skip.",
    parameters: {
      url: {
        type: "string",
        description: "URL to navigate to",
        required: true,
      },
      readAfter: {
        type: "boolean",
        description:
          "Include page content in the result after navigation (default: true). Set false to only get title/URL.",
        required: false,
      },
    },
    // Navigates the shared browser page to `url` with retry + bot-detection
    // heuristics. On success returns title/URL (plus page content when
    // readAfter); on failure returns an error string with site-specific or
    // generic fallback suggestions. Never throws.
    execute: async ({ url, readAfter = true }, config) => {
      // Ensure URL has protocol (default to https)
      let fullUrl = url;
      if (!url.startsWith("http://") && !url.startsWith("https://")) {
        fullUrl = "https://" + url;
      }

      // Check for commonly blocked sites and suggest alternatives
      // (used only to enrich the error message if navigation fails).
      const blockedSiteAlternatives = {
        "nasdaq.com":
          'TIP: nasdaq.com often blocks automated browsers. Try using web_search for "NASDAQ earnings calendar [date]" or fetch_url with a financial news article instead.',
        "bloomberg.com":
          "TIP: bloomberg.com has strict bot detection. Try web_search or fetch_url with a news aggregator.",
        "linkedin.com":
          "TIP: linkedin.com blocks automated access. Try web_search for LinkedIn profile information.",
      };

      // Extract the bare host (strip scheme, path, and a leading "www.")
      // for matching against the known-blocked list above.
      const domain = fullUrl
        .replace(/^https?:\/\//, "")
        .split("/")[0]
        .replace("www.", "");
      const alternative = Object.entries(blockedSiteAlternatives).find(([d]) =>
        domain.includes(d),
      );

      try {
        const page = await getBrowserPage(config);

        // Use retry logic with exponential backoff
        const result = await withRetry(
          async (attempt) => {
            console.log(
              `[Browser] Navigating to: ${fullUrl} (attempt ${attempt + 1})`,
            );

            // Try different wait strategies on retries: fastest first, then
            // progressively stricter load conditions.
            const waitStrategy =
              attempt === 0
                ? "domcontentloaded"
                : attempt === 1
                  ? "load"
                  : "networkidle";

            await page.goto(fullUrl, {
              waitUntil: waitStrategy,
              timeout: 20000,
            });

            // Wait a bit for dynamic content (reduced for speed; auto-read still gets content)
            await page.waitForTimeout(1000);

            const title = await page.title();
            const currentUrl = page.url();

            // Check for bot/block pages only with specific signals (avoid false positives on normal content)
            const pageContent = await page.content();
            const lower = pageContent.toLowerCase();
            const isBlockPage =
              lower.includes("checking your browser") ||
              lower.includes("cf-browser-verification") ||
              /access\s+denied/i.test(pageContent) ||
              lower.includes("unusual traffic from your computer") ||
              lower.includes("please complete the security check") ||
              lower.includes("g-recaptcha") ||
              (lower.includes("challenge-platform") &&
                lower.includes("cloudflare")) ||
              // Short page with clear block wording (avoids "blocked" in article text)
              (pageContent.length < 8000 &&
                /(you have been blocked|your (access|request) (has been )?blocked|blocked (by|from) )/i.test(
                  pageContent,
                ));
            if (isBlockPage) {
              // Throwing here lets withRetry decide whether to retry; the
              // message is also matched in the catch below for advice.
              throw new Error(
                "Bot detection triggered - site is blocking automated access",
              );
            }

            return { title, currentUrl };
          },
          {
            maxRetries: 2,
            baseDelayMs: 2000,
            // Retry on network and HTTP2 errors (substring match on message)
            retryOn: (err) => {
              const retryableErrors = [
                "ERR_HTTP2",
                "ERR_CONNECTION",
                "ERR_TIMED_OUT",
                "ETIMEDOUT",
                "ECONNRESET",
                "Navigation timeout",
                "net::ERR_",
              ];
              return retryableErrors.some((e) => err.message.includes(e));
            },
          },
        );

        let out = `Navigated to: ${result.title}\nURL: ${result.currentUrl}`;
        if (readAfter) {
          // Append a truncated text dump of the page so a separate
          // browser_read call is usually unnecessary.
          const content = await getTruncatedPageContent(page);
          if (content) out += `\n\n--- Page content ---\n${content}`;
        }
        return out;
      } catch (err) {
        let errorMsg = `Error navigating: ${err.message}`;

        // Add helpful alternative suggestion
        if (alternative) {
          errorMsg += `\n\n${alternative[1]}`;
        } else if (
          err.message.includes("Bot detection") ||
          err.message.includes("ERR_HTTP2")
        ) {
          errorMsg +=
            "\n\nThis site may be blocking automated browsers. Consider using:\n";
          errorMsg +=
            "1. web_search to find the information from other sources\n";
          errorMsg +=
            "2. fetch_url to get content from a news article about this topic\n";
          errorMsg +=
            "3. A more specific search query to find direct links to the data";
        }

        return errorMsg;
      }
    },
  },
|
|
1464
|
+
|
|
1465
|
+
  browser_click: {
    description:
      'Click an element on the current page. For links, waits for navigation and returns the new URL. By default returns page content so you often don\'t need a separate browser_read. Prefer a[href="/path"] from browser_read output for reliable link clicks. Supports CSS, :has-text(), or link text. Auto-retries with force for overlays.',
    parameters: {
      selector: {
        type: "string",
        description:
          "CSS selector, Playwright selector (:has-text(), :text()), text content, or element name to click",
        required: true,
      },
      force: {
        type: "boolean",
        description:
          "Force click even if element is covered or not visible (default: false)",
        required: false,
      },
      readAfter: {
        type: "boolean",
        description:
          "Include page content in the result after click (default: true). Set false to only get title/URL.",
        required: false,
      },
    },
    // Finds an element via findElement's multi-strategy lookup and clicks it.
    // Link clicks are raced against waitForNavigation so the new page's
    // title/URL can be reported; non-link clicks just settle briefly.
    // Retries once with force:true for overlay/visibility failures.
    // Returns a status string (with optional page content); never throws.
    execute: async ({ selector, force = false, readAfter = true }, config) => {
      try {
        const page = await getBrowserPage(config);

        console.log("[Browser] Clicking:", selector, force ? "(forced)" : "");
        const element = await findElement(page, selector);
        await element.scrollIntoViewIfNeeded();

        const clickOptions = { timeout: 5000, force };
        // Click with one automatic force-retry for the three common
        // "element exists but isn't plainly clickable" Playwright errors.
        const performClick = async () => {
          try {
            await element.click(clickOptions);
          } catch (clickErr) {
            if (!force) {
              const retryWithForce =
                clickErr.message.includes("intercepts pointer events") ||
                clickErr.message.includes("element is not visible") ||
                clickErr.message.includes("element is outside the viewport");
              if (retryWithForce) {
                console.log(
                  "[Browser] Element not clickable (overlay/not visible/viewport), retrying with force: true",
                );
                await element.click({ ...clickOptions, force: true });
                return;
              }
            }
            throw clickErr;
          }
        };

        // If it's a link, wait for navigation so we return the new page and agent knows the click worked.
        // Anchor-only hrefs ("#...") don't navigate, so they're excluded.
        const isLink = await element
          .evaluate(
            (el) =>
              el.tagName === "A" &&
              el.getAttribute("href") &&
              !el.getAttribute("href").startsWith("#"),
          )
          .catch(() => false);

        if (isLink) {
          // Start waiting for navigation BEFORE clicking (Promise.all) so a
          // fast navigation isn't missed; .catch(null) tolerates same-page
          // links that never navigate within the 6s window.
          await Promise.all([
            page
              .waitForNavigation({
                waitUntil: "domcontentloaded",
                timeout: 6000,
              })
              .catch(() => null),
            performClick(),
          ]);
          const title = await page.title();
          const url = page.url();
          let out = `Clicked "${selector}". Navigated to: ${title}\nURL: ${url}`;
          if (readAfter) {
            const content = await getTruncatedPageContent(page);
            if (content) out += `\n\n--- Page content ---\n${content}`;
          }
          return out;
        }

        // Non-link element: click, give the page a moment to react, report.
        await performClick();
        await page.waitForTimeout(400);
        const title = await page.title();
        let out = `Clicked "${selector}". Current page: ${title}`;
        if (readAfter) {
          const content = await getTruncatedPageContent(page);
          if (content) out += `\n\n--- Page content ---\n${content}`;
        }
        return out;
      } catch (err) {
        // Provide more helpful error messages for common Playwright failures
        if (err.message.includes("element is not visible")) {
          return `Error: Element "${selector}" exists but is not visible. Try using force: true or scrolling to the element first.`;
        }
        if (err.message.includes("element is outside of the viewport")) {
          return `Error: Element "${selector}" is outside the viewport. Try scrolling to it first.`;
        }
        if (err.message.includes("intercepts pointer events")) {
          return `Error: Another element is covering "${selector}". Try using force: true or closing any modals/popups first.`;
        }
        if (err.message.includes("Could not find element")) {
          return `Error: ${err.message}\n\nTip: Try using different selector strategies:\n- Text: "Submit" or "button:Submit"\n- Playwright: 'button:has-text("Submit")'\n- CSS: "#submit-btn" or ".submit-button"`;
        }
        return `Error clicking element: ${err.message}`;
      }
    },
  },
|
|
1575
|
+
|
|
1576
|
+
browser_type: {
|
|
1577
|
+
description:
|
|
1578
|
+
"Type text into an input field on the current page. By default returns page content after typing so you often don't need a separate browser_read.",
|
|
1579
|
+
parameters: {
|
|
1580
|
+
selector: {
|
|
1581
|
+
type: "string",
|
|
1582
|
+
description:
|
|
1583
|
+
"CSS selector, placeholder text, or label of the input field",
|
|
1584
|
+
required: true,
|
|
1585
|
+
},
|
|
1586
|
+
text: { type: "string", description: "Text to type", required: true },
|
|
1587
|
+
submit: {
|
|
1588
|
+
type: "boolean",
|
|
1589
|
+
description: "Press Enter after typing to submit (default: false)",
|
|
1590
|
+
required: false,
|
|
1591
|
+
},
|
|
1592
|
+
readAfter: {
|
|
1593
|
+
type: "boolean",
|
|
1594
|
+
description:
|
|
1595
|
+
"Include page content in the result after typing (default: true). Set false to only get title.",
|
|
1596
|
+
required: false,
|
|
1597
|
+
},
|
|
1598
|
+
},
|
|
1599
|
+
execute: async (
|
|
1600
|
+
{ selector, text, submit = false, readAfter = true },
|
|
1601
|
+
config,
|
|
1602
|
+
) => {
|
|
1603
|
+
try {
|
|
1604
|
+
const page = await getBrowserPage(config);
|
|
1605
|
+
|
|
1606
|
+
console.log("[Browser] Typing into:", selector);
|
|
1607
|
+
const element = await findElement(page, selector);
|
|
1608
|
+
|
|
1609
|
+
// Clear existing content and type new text
|
|
1610
|
+
await element.fill(text);
|
|
1611
|
+
|
|
1612
|
+
if (submit) {
|
|
1613
|
+
await element.press("Enter");
|
|
1614
|
+
await page.waitForTimeout(1500); // Wait for form submission
|
|
1615
|
+
}
|
|
1616
|
+
|
|
1617
|
+
const title = await page.title();
|
|
1618
|
+
let out = `Typed "${text}" into "${selector}"${submit ? " and submitted" : ""}. Current page: ${title}`;
|
|
1619
|
+
if (readAfter) {
|
|
1620
|
+
const content = await getTruncatedPageContent(page);
|
|
1621
|
+
if (content) out += `\n\n--- Page content ---\n${content}`;
|
|
1622
|
+
}
|
|
1623
|
+
return out;
|
|
1624
|
+
} catch (err) {
|
|
1625
|
+
return `Error typing: ${err.message}`;
|
|
1626
|
+
}
|
|
1627
|
+
},
|
|
1628
|
+
},
|
|
1629
|
+
|
|
1630
|
+
browser_read: {
|
|
1631
|
+
description:
|
|
1632
|
+
'Read and extract full content from the current page. Returns page title, URL, interactive elements (with href for links so you can click via a[href="..."]), and text content. Use when you need full content or when navigate/click/type didn\'t include content (e.g. readAfter was false). For scoped read use a simple CSS selector (e.g. "#section_id"); Playwright selectors like :has-text() are not supported for the selector param.',
|
|
1633
|
+
parameters: {
|
|
1634
|
+
selector: {
|
|
1635
|
+
type: "string",
|
|
1636
|
+
description:
|
|
1637
|
+
"Optional CSS selector to scope content extraction (e.g. #section_id). Standard CSS only; no :has-text() etc.",
|
|
1638
|
+
required: false,
|
|
1639
|
+
},
|
|
1640
|
+
},
|
|
1641
|
+
execute: async ({ selector }, config) => {
|
|
1642
|
+
try {
|
|
1643
|
+
const page = await getBrowserPage(config);
|
|
1644
|
+
|
|
1645
|
+
console.log(
|
|
1646
|
+
"[Browser] Reading page content",
|
|
1647
|
+
selector ? `(selector: ${selector})` : "",
|
|
1648
|
+
);
|
|
1649
|
+
const content = await extractPageContent(page, selector);
|
|
1650
|
+
|
|
1651
|
+
return content;
|
|
1652
|
+
} catch (err) {
|
|
1653
|
+
return `Error reading page: ${err.message}`;
|
|
1654
|
+
}
|
|
1655
|
+
},
|
|
1656
|
+
},
|
|
1657
|
+
|
|
1658
|
+
browser_screenshot: {
|
|
1659
|
+
description: "Take a screenshot of the current page",
|
|
1660
|
+
parameters: {
|
|
1661
|
+
path: {
|
|
1662
|
+
type: "string",
|
|
1663
|
+
description:
|
|
1664
|
+
"File path to save screenshot (default: screenshot.png in current directory)",
|
|
1665
|
+
required: false,
|
|
1666
|
+
},
|
|
1667
|
+
fullPage: {
|
|
1668
|
+
type: "boolean",
|
|
1669
|
+
description: "Capture full scrollable page (default: false)",
|
|
1670
|
+
required: false,
|
|
1671
|
+
},
|
|
1672
|
+
},
|
|
1673
|
+
execute: async ({ path, fullPage = false }, config) => {
|
|
1674
|
+
try {
|
|
1675
|
+
const page = await getBrowserPage(config);
|
|
1676
|
+
|
|
1677
|
+
const screenshotPath = path || `screenshot-${Date.now()}.png`;
|
|
1678
|
+
const resolvedPath = resolve(screenshotPath);
|
|
1679
|
+
|
|
1680
|
+
console.log("[Browser] Taking screenshot:", resolvedPath);
|
|
1681
|
+
await page.screenshot({
|
|
1682
|
+
path: resolvedPath,
|
|
1683
|
+
fullPage,
|
|
1684
|
+
});
|
|
1685
|
+
|
|
1686
|
+
return `Screenshot saved to: ${resolvedPath}`;
|
|
1687
|
+
} catch (err) {
|
|
1688
|
+
return `Error taking screenshot: ${err.message}`;
|
|
1689
|
+
}
|
|
1690
|
+
},
|
|
1691
|
+
},
|
|
1692
|
+
|
|
1693
|
+
browser_close: {
|
|
1694
|
+
description:
|
|
1695
|
+
"Close the browser and free resources. Call when done with browser tasks.",
|
|
1696
|
+
parameters: {},
|
|
1697
|
+
execute: async (params, config) => {
|
|
1698
|
+
try {
|
|
1699
|
+
if (!isBrowserActive()) {
|
|
1700
|
+
return "Browser is not currently running.";
|
|
1701
|
+
}
|
|
1702
|
+
|
|
1703
|
+
await closeBrowser();
|
|
1704
|
+
return "Browser closed successfully.";
|
|
1705
|
+
} catch (err) {
|
|
1706
|
+
return `Error closing browser: ${err.message}`;
|
|
1707
|
+
}
|
|
1708
|
+
},
|
|
1709
|
+
},
|
|
1710
|
+
|
|
1711
|
+
// Batch tool: runs a sequence of browser actions in one call via
// executeActionSequence, then reports per-action results plus final URL/title
// (and, by default, the final page content).
browser_interact: {
  description: `Execute multiple browser actions in a single call. Reduces latency by batching operations. By default appends final page content so you often don't need a separate browser_read. Use a trailing read action or readAfter for content.

Supported action types:
- click: { type: "click", selector: "...", force?: boolean, waitForNavigation?: boolean }
- type: { type: "type", selector: "...", text: "...", clear?: boolean, submit?: boolean, delay?: number }
- fill: { type: "fill", selector: "...", text: "...", submit?: boolean }
- wait: { type: "wait", selector?: "...", ms?: number, url?: "...", load?: "domcontentloaded|networkidle" }
- scroll: { type: "scroll", selector?: "...", direction?: "up|down|top|bottom", amount?: number }
- hover: { type: "hover", selector: "..." }
- select: { type: "select", selector: "...", value: "..." }
- check: { type: "check", selector: "...", uncheck?: boolean }
- press: { type: "press", key: "Enter|Tab|...", selector?: "..." }
- navigate: { type: "navigate", url: "...", waitUntil?: "domcontentloaded|networkidle" }
- read: { type: "read", selector?: "...", maxLength?: number } - Extract page content (use at end of sequence)
- evaluate: { type: "evaluate", script: "return document.title" }
- screenshot: { type: "screenshot", path?: "...", fullPage?: boolean }

Each action can have:
- continueOnError: boolean - If true, continue sequence even if this action fails
- delayAfter: number - Wait N ms after this action completes
- waitForNavigation: boolean - For click actions, wait for page navigation to complete`,
  parameters: {
    actions: {
      type: "array",
      description:
        "Array of action objects to execute in sequence (can be JSON string or array)",
      required: true,
    },
    readAfter: {
      type: "boolean",
      description:
        "Append final page content to the result after all actions (default: true). Set false to only get URL/title and action results.",
      required: false,
    },
  },
  // Models sometimes send the whole argument object, or just the actions,
  // as a JSON string instead of structured data — the parsing below is
  // deliberately tolerant of both shapes before validating.
  execute: async (args, config) => {
    // NOTE: if args is a string, args.readAfter is undefined and this
    // defaults to true, which matches the documented default.
    const readAfter = args.readAfter !== false;
    // Parse actions from args - handle both direct array and JSON string
    let actions = args.actions;

    // If args itself is a string, try to parse it
    if (typeof args === "string") {
      try {
        const parsed = JSON.parse(args);
        // Accept either { actions: [...] } or a bare array.
        actions = parsed.actions || parsed;
      } catch (e) {
        return `Error: Could not parse arguments as JSON: ${e.message}`;
      }
    }

    // If actions is a string, try to parse it as JSON
    if (typeof actions === "string") {
      try {
        actions = JSON.parse(actions);
      } catch (e) {
        return `Error: Could not parse actions as JSON array: ${e.message}\nReceived: ${actions.substring(0, 200)}`;
      }
    }

    // Validate actions array
    if (!actions || !Array.isArray(actions)) {
      return `Error: actions must be a non-empty array of action objects. Received type: ${typeof actions}, value: ${JSON.stringify(actions).substring(0, 200)}`;
    }

    if (actions.length === 0) {
      return "Error: actions array is empty";
    }

    try {
      const page = await getBrowserPage(config);

      console.log(
        `[Browser] Executing ${actions.length} actions in sequence`,
      );
      // result shape (from executeActionSequence usage below):
      // { completed, total, results: [{ success, action, message?, error?, result? }],
      //   finalState: { url, title }, lastError? }
      const result = await executeActionSequence(page, actions);

      // Format the result for readability
      let output = `**Browser Interaction Results**\n`;
      output += `Completed: ${result.completed}/${result.total} actions\n\n`;

      result.results.forEach((r, i) => {
        const status = r.success ? "✓" : "✗";
        output += `${i + 1}. ${status} ${r.action}: ${r.message || r.error || "done"}\n`;
        if (r.result !== undefined) {
          output += ` Result: ${JSON.stringify(r.result).substring(0, 200)}\n`;
        }
      });

      output += `\n**Final State:**\n`;
      output += `URL: ${result.finalState.url}\n`;
      output += `Title: ${result.finalState.title}\n`;

      if (readAfter) {
        // Re-fetch the page handle: an action (e.g. navigate/close) may have
        // changed which page is current — presumably getBrowserPage returns
        // the live one; TODO confirm against its implementation.
        const page = await getBrowserPage(config);
        const content = await getTruncatedPageContent(page);
        if (content) output += `\n--- Page content ---\n${content}`;
      }

      if (result.lastError) {
        output += `\n⚠️ Last Error: ${result.lastError}`;
      }

      return output;
    } catch (err) {
      return `Error executing browser actions: ${err.message}`;
    }
  },
},
|
|
1820
|
+
|
|
1821
|
+
// ============================================
|
|
1822
|
+
// Memory Tools
|
|
1823
|
+
// ============================================
|
|
1824
|
+
|
|
1825
|
+
// Memory search over past conversations. Three tiers, merged by score:
//   1. Supabase cloud chats (only when config.accessToken verifies),
//   2. local SQLite vector search over stored embeddings,
//   3. local plain-text search — used only if (2) produced no local hits.
// Each tier failing is logged and tolerated; results from the others still
// come back.
search_memory: {
  description:
    "Search through your chat memory/history to find past conversations. Searches both local CLI history and Supabase cloud chats (when authenticated). Use simple, specific keywords (names, topics, single words). Returns matching chats with context snippets showing where the match was found. Use read_memory to get full content.",
  parameters: {
    query: {
      type: "string",
      description:
        "Search query - use SIMPLE keywords like a name, topic, or single word. Avoid long phrases.",
      required: true,
    },
    limit: {
      type: "number",
      description: "Max results to return (default: 10)",
      required: false,
    },
  },
  execute: async ({ query, limit = 10 }, config) => {
    const { getAllEmbeddings, searchChatsText, getChat } =
      await import("../storage/db.js");

    // Accumulates { source, score, line } entries from every tier.
    const allResults = [];

    // --- 1. Search Supabase (cloud/CLI tier chats) when authenticated ---
    if (config.accessToken) {
      try {
        const { verifyToken, searchChatsInSupabase } =
          await import("../storage/supabase.js");
        const user = await verifyToken(config.accessToken);
        if (user?.id) {
          const supaResults = await searchChatsInSupabase(
            user.id,
            query,
            limit,
          );
          for (const r of supaResults) {
            const dateStr = r.updated_at
              ? new Date(r.updated_at).toLocaleDateString()
              : "Unknown date";
            const snippetLine = r.snippet ? `\n ${r.snippet}` : "";
            allResults.push({
              source: "cloud",
              // presumably r.similarity is on the same 0..1 scale as the
              // local cosine scores below — TODO confirm in supabase.js
              score: r.similarity,
              line: `- [Cloud Chat ${r.chatId}] "${r.title || "Untitled"}" (${dateStr}) - match: ${r.matchSource}${snippetLine}`,
            });
          }
        }
      } catch (err) {
        // Cloud search is best-effort; fall through to local search.
        console.warn("[search_memory] Supabase search failed:", err.message);
      }
    }

    // --- 2. Search local SQLite (vector search first, then text) ---
    try {
      const allEmbeddings = getAllEmbeddings();
      if (allEmbeddings.length > 0) {
        const queryEmbed = await generateQueryEmbedding(query, config);
        if (queryEmbed) {
          const scored = allEmbeddings
            .map((e) => ({
              chatId: e.chat_id,
              similarity: cosineSimilarity(queryEmbed, e.embedding),
            }))
            .sort((a, b) => b.similarity - a.similarity)
            .slice(0, limit);

          // Drop weak matches (cosine similarity <= 0.2).
          const relevant = scored.filter((s) => s.similarity > 0.2);
          for (const s of relevant) {
            const chat = getChat(s.chatId);
            if (!chat) continue; // embedding may outlive a deleted chat
            const dateStr = chat.updated_at
              ? new Date(chat.updated_at).toLocaleDateString()
              : "Unknown date";
            allResults.push({
              source: "local",
              score: s.similarity,
              line: `- [Local Chat ${s.chatId}] "${chat.title || "Untitled"}" (${dateStr}) - relevance: ${(s.similarity * 100).toFixed(0)}%`,
            });
          }
        }
      }

      // Fallback: only run the plain-text search when the vector pass
      // produced no local results at all.
      if (!allResults.some((r) => r.source === "local")) {
        const textResults = searchChatsText(query, limit);
        for (const r of textResults) {
          const dateStr = r.updated_at
            ? new Date(r.updated_at).toLocaleDateString()
            : "Unknown date";
          const matchInfo =
            r.match_count > 1 ? ` (${r.match_count} matches)` : "";
          const snippet = r.matching_snippet || "";
          allResults.push({
            source: "local",
            // Heuristic score so text hits rank below strong vector hits:
            // multi-match chats get 0.7, single-match chats 0.5.
            score: r.match_count > 1 ? 0.7 : 0.5,
            line: `- [Local Chat ${r.id}] "${r.title || "Untitled"}" (${dateStr})${matchInfo}\n ${snippet}`,
          });
        }
      }
    } catch (err) {
      console.warn("[search_memory] Local search failed:", err.message);
    }

    // --- 3. Merge and return ---
    if (allResults.length === 0) {
      return `No chats found matching "${query}". Try a different keyword - simpler, single words often work better than phrases.`;
    }

    // Highest score first, then cap the merged list at `limit`.
    allResults.sort((a, b) => b.score - a.score);
    const formatted = allResults.slice(0, limit).map((r) => r.line);
    return `Found ${formatted.length} chats matching "${query}":\n\n${formatted.join("\n\n")}\n\nUse read_memory with a chat ID to see the full conversation.`;
  },
},
|
|
1936
|
+
|
|
1937
|
+
read_memory: {
|
|
1938
|
+
description:
|
|
1939
|
+
"Read the full contents of a specific chat from your memory. Use search_memory first to find relevant chat IDs.",
|
|
1940
|
+
parameters: {
|
|
1941
|
+
chat_id: {
|
|
1942
|
+
type: "string",
|
|
1943
|
+
description: "The chat ID (UUID) to read (from search_memory results)",
|
|
1944
|
+
required: true,
|
|
1945
|
+
},
|
|
1946
|
+
},
|
|
1947
|
+
execute: async ({ chat_id }, config) => {
|
|
1948
|
+
const { getChatWithMessages } = await import("../storage/db.js");
|
|
1949
|
+
|
|
1950
|
+
try {
|
|
1951
|
+
const chat = getChatWithMessages(chat_id);
|
|
1952
|
+
if (!chat) {
|
|
1953
|
+
return `Error: Chat ${chat_id} not found.`;
|
|
1954
|
+
}
|
|
1955
|
+
|
|
1956
|
+
// Format the conversation header
|
|
1957
|
+
const dateStr = chat.updated_at
|
|
1958
|
+
? new Date(chat.updated_at).toLocaleDateString()
|
|
1959
|
+
: "Unknown date";
|
|
1960
|
+
const header = `Chat: "${chat.title || "Untitled"}" (ID: ${chat.id})\nDate: ${dateStr}\nMessages: ${chat.messages.length}\n\n`;
|
|
1961
|
+
|
|
1962
|
+
// Format each message
|
|
1963
|
+
const conversation = chat.messages
|
|
1964
|
+
.map((m) => {
|
|
1965
|
+
const role = m.role === "user" ? "User" : "Assistant";
|
|
1966
|
+
const time = m.created_at
|
|
1967
|
+
? new Date(m.created_at).toLocaleTimeString()
|
|
1968
|
+
: "";
|
|
1969
|
+
// Truncate very long messages to avoid context overflow
|
|
1970
|
+
const content =
|
|
1971
|
+
m.content.length > 2000
|
|
1972
|
+
? m.content.substring(0, 2000) +
|
|
1973
|
+
"\n... [truncated - message continues for " +
|
|
1974
|
+
(m.content.length - 2000) +
|
|
1975
|
+
" more characters]"
|
|
1976
|
+
: m.content;
|
|
1977
|
+
return `[${time}] ${role}:\n${content}`;
|
|
1978
|
+
})
|
|
1979
|
+
.join("\n\n---\n\n");
|
|
1980
|
+
|
|
1981
|
+
return header + conversation;
|
|
1982
|
+
} catch (err) {
|
|
1983
|
+
return `Error reading chat: ${err.message}`;
|
|
1984
|
+
}
|
|
1985
|
+
},
|
|
1986
|
+
},
|
|
1987
|
+
|
|
1988
|
+
// ============================================
|
|
1989
|
+
// RAG (Document Search) Tools
|
|
1990
|
+
// ============================================
|
|
1991
|
+
|
|
1992
|
+
// RAG search over locally indexed documents: embed the query, cosine-score
// every chunk of the targeted documents, and return the top_k excerpts
// (above a 25% similarity floor) formatted with source citations.
rag_search: {
  description:
    "Search through uploaded documents for relevant information. Use this when the user mentions documents with @ (like @guidebook). Returns relevant excerpts with citations.",
  parameters: {
    query: {
      type: "string",
      description: "What to search for in the documents",
      required: true,
    },
    documents: {
      type: "string",
      description:
        "Comma-separated list of document names to search (from @mentions)",
      required: false,
    },
    top_k: {
      type: "number",
      description: "Number of results to return (default: 8)",
      required: false,
    },
  },
  // Mark as conditionally available - only enabled when RAG documents are mentioned
  isConditional: true,
  execute: async ({ query, documents, top_k = 8 }, config) => {
    const { getAllRagDocuments, getRagChunksByDocuments } =
      await import("../storage/db.js");

    try {
      // Get all RAG documents
      const allDocs = getAllRagDocuments();
      if (allDocs.length === 0) {
        return "No documents have been uploaded yet. Ask the user to upload documents using the Storage dropdown in the sidebar.";
      }

      // Filter to requested documents if specified
      let targetDocs = allDocs;
      if (documents) {
        const docNames = documents
          .split(",")
          .map((d) => d.trim().toLowerCase());
        // Fuzzy match in both directions so "@guide" matches "guidebook.pdf"
        // and vice versa.
        targetDocs = allDocs.filter((doc) => {
          const docNameLower = doc.name.toLowerCase();
          return docNames.some(
            (name) =>
              docNameLower.includes(name) || name.includes(docNameLower),
          );
        });

        if (targetDocs.length === 0) {
          const availableDocs = allDocs.map((d) => d.name).join(", ");
          return `No documents found matching: ${documents}\n\nAvailable documents: ${availableDocs}`;
        }
      }

      // Generate query embedding
      const queryEmbedding = await generateQueryEmbedding(query, config);
      if (!queryEmbedding) {
        return "Error: Could not generate query embedding. Make sure an OpenAI or Google API key is configured.";
      }

      // Get chunks from target documents
      const docIds = targetDocs.map((d) => d.id);
      const chunks = getRagChunksByDocuments(docIds);

      if (chunks.length === 0) {
        return `No content found in the specified documents. The documents may not have been properly indexed.`;
      }

      // Build a map of doc IDs to names for faster lookup
      // Use Number() to ensure consistent types (SQLite can return strings or ints)
      const docIdToName = new Map();
      targetDocs.forEach((d) => {
        docIdToName.set(Number(d.id), d.name);
      });

      console.log(
        "[rag_search] Document ID map:",
        Object.fromEntries(docIdToName),
      );

      // Score chunks by similarity
      const scored = chunks.map((chunk) => {
        // Parse embedding from blob
        // assumes stored blobs are raw float32 arrays in native byte order
        // — TODO confirm against the indexing code in storage/db.js
        let embedding;
        if (Buffer.isBuffer(chunk.embedding)) {
          // Buffer.length is in bytes; /4 gives the float32 element count.
          embedding = Array.from(
            new Float32Array(
              chunk.embedding.buffer,
              chunk.embedding.byteOffset,
              chunk.embedding.length / 4,
            ),
          );
        } else if (chunk.embedding instanceof Uint8Array) {
          // NOTE(review): this branch ignores byteOffset/length and views the
          // whole backing buffer — fine only if the view spans it exactly.
          embedding = Array.from(new Float32Array(chunk.embedding.buffer));
        } else {
          // Already a plain array (or array-like) of numbers.
          embedding = chunk.embedding;
        }

        const similarity = cosineSimilarity(queryEmbedding, embedding);

        // Find document name for this chunk (ensure both IDs are numbers for comparison)
        // Note: DB returns docId (camelCase), pageNumber, chunkIndex
        const chunkDocId = Number(chunk.docId || chunk.doc_id);
        const docName = docIdToName.get(chunkDocId) || "Unknown Document";

        return {
          text: chunk.text,
          docId: chunkDocId,
          docName,
          pageNumber: chunk.pageNumber || chunk.page_number,
          chunkIndex: chunk.chunkIndex || chunk.chunk_index,
          similarity,
        };
      });

      // Sort by similarity and take top results
      // Use a lower threshold (25%) to catch more potentially relevant content
      const topResults = scored
        .filter((s) => s.similarity > 0.25) // Minimum similarity threshold (25%)
        .sort((a, b) => b.similarity - a.similarity)
        .slice(0, top_k);

      console.log(
        "[rag_search] Top results:",
        topResults.map((r) => ({
          docName: r.docName,
          similarity: Math.round(r.similarity * 100),
          textPreview: r.text.substring(0, 50),
        })),
      );

      if (topResults.length === 0) {
        return `No relevant content found for "${query}" in the specified documents. Try rephrasing your question or using different keywords.`;
      }

      // Format results with citations - make it very clear this is real document content
      const docNamesUsed = [
        ...new Set(topResults.map((r) => r.docName)),
      ].join(", ");
      let output = `📚 DOCUMENT SEARCH RESULTS from: ${docNamesUsed}\n`;
      output += `Found ${topResults.length} relevant sections. USE THIS CONTENT TO ANSWER:\n\n`;

      topResults.forEach((result, i) => {
        const pageInfo = result.pageNumber
          ? `, Page ${result.pageNumber}`
          : "";
        const relevance = Math.round(result.similarity * 100);

        output += `━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n`;
        output += `📄 SOURCE [${i + 1}]: ${result.docName}${pageInfo}\n`;
        output += ` Relevance: ${relevance}%\n\n`;
        output += ` CONTENT:\n`;
        output += ` ${result.text.trim().split("\n").join("\n ")}\n\n`;
      });

      output += `━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n`;
      output += `⚠️ IMPORTANT: Use the CONTENT above to answer the user's question.\n`;
      output += `Cite sources as: "According to [${docNamesUsed}, Page X]..." or quote directly.`;

      return output;
    } catch (err) {
      console.error("[rag_search] Error:", err);
      return `Error searching documents: ${err.message}`;
    }
  },
},
|
|
2158
|
+
};
|
|
2159
|
+
|
|
2160
|
+
// ============================================
|
|
2161
|
+
// Web Search Helpers (exported for server use)
|
|
2162
|
+
// ============================================
|
|
2163
|
+
|
|
2164
|
+
// Verbose web-search logging, enabled via WEBSEARCH_DEBUG=1 or WEBSEARCH_DEBUG=true.
const WEBSEARCH_DEBUG = ["1", "true"].includes(process.env.WEBSEARCH_DEBUG);
|
|
2166
|
+
|
|
2167
|
+
/**
 * Emit one debug log line for a web-search engine, but only when the
 * WEBSEARCH_DEBUG flag is on. Non-empty `extra` objects are appended as JSON.
 */
function webSearchDebug(engine, msg, extra = {}) {
  if (!WEBSEARCH_DEBUG) return;
  let line = `[WebSearch:${engine}] ${msg}`;
  if (Object.keys(extra).length) {
    line += ` ${JSON.stringify(extra)}`;
  }
  console.log(line);
}
|
|
2173
|
+
|
|
2174
|
+
/** When an engine returns 0 results: always log first 2.5k chars of HTML; if WEBSEARCH_DEBUG=1, write full HTML to a file and log path. */
|
|
2175
|
+
/**
 * Diagnostics for an engine that parsed to zero results: always log the
 * first 2.5k chars of the response HTML; when WEBSEARCH_DEBUG is on and the
 * HTML is longer than the sample, also write the full HTML to a temp file
 * and log its path.
 */
function logZeroResultHtml(engine, html) {
  const sampleLen = 2500;
  const truncated = html.length > sampleLen;
  const sample = truncated
    ? html.substring(0, sampleLen) + "\n... (truncated)"
    : html;
  const shown = Math.min(html.length, sampleLen);
  console.log(`[WebSearch] ${engine} HTML (first ${shown} chars):`);
  console.log(sample);
  if (WEBSEARCH_DEBUG && truncated) {
    const outPath = join(
      tmpdir(),
      `otherwise-websearch-${engine.toLowerCase()}-${Date.now()}.html`,
    );
    writeFileSync(outPath, html, "utf8");
    console.log(`[WebSearch] ${engine} full HTML written to: ${outPath}`);
  }
}
|
|
2198
|
+
|
|
2199
|
+
// Rotating browser-like User-Agents and full headers to reduce bot detection
|
|
2200
|
+
const BROWSER_USER_AGENTS = [
  // Chrome 131 on Windows
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
  // Chrome 131 on macOS
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
  // Firefox 133 on Windows
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0",
  // Safari 18.2 on macOS
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.2 Safari/605.1.15",
  // Edge 131 on Windows
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0",
];
|
|
2207
|
+
/**
 * Build realistic desktop-browser request headers with a randomly picked
 * User-Agent from BROWSER_USER_AGENTS, to reduce bot detection.
 * @param {string} [origin] - When provided, used as the Referer
 *   (`${origin}/`) and marks the request as same-origin in Sec-Fetch-Site;
 *   otherwise Referer is undefined and Sec-Fetch-Site is "none".
 */
function getBrowserHeaders(origin) {
  const idx = Math.floor(Math.random() * BROWSER_USER_AGENTS.length);
  const userAgent = BROWSER_USER_AGENTS[idx];
  return {
    "User-Agent": userAgent,
    Accept:
      "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    Referer: origin ? `${origin}/` : undefined,
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": origin ? "same-origin" : "none",
    "Sec-Fetch-User": "?1",
    "Upgrade-Insecure-Requests": "1",
    "sec-ch-ua":
      '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"Windows"',
  };
}
|
|
2228
|
+
|
|
2229
|
+
/** Google: use AdsBot UA so Google may serve static HTML instead of JS-only/captcha (bypass technique). */
|
|
2230
|
+
/** Google: use AdsBot UA so Google may serve static HTML instead of JS-only/captcha (bypass technique). */
function getGoogleSearchHeaders() {
  const headers = {};
  headers["User-Agent"] = "AdsBot-Google (+http://www.google.com/adsbot.html)";
  headers.Accept =
    "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
  headers["Accept-Language"] = "en-US,en;q=0.9";
  return headers;
}
|
|
2237
|
+
|
|
2238
|
+
/** Bing: use bingbot UA so Bing may treat request as crawler and serve SERP HTML. */
|
|
2239
|
+
/** Bing: use bingbot UA so Bing may treat request as crawler and serve SERP HTML. */
function getBingSearchHeaders() {
  const headers = {};
  headers["User-Agent"] =
    "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)";
  headers.Accept =
    "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
  headers["Accept-Language"] = "en-US,en;q=0.9";
  return headers;
}
|
|
2247
|
+
|
|
2248
|
+
/** Extract destination URL from Bing click redirect (bing.com/ck/a?....&u=BASE64). Supports URL-safe base64. */
|
|
2249
|
+
/**
 * Decode a Bing click-tracking link (bing.com/ck/a?...&u=BASE64) into its
 * real destination URL. Handles URL-safe base64 and Bing's "a1" prefix.
 * @param {string|null|undefined} href - Candidate link from a Bing SERP.
 * @returns {string|null} The destination URL, or null when href is not a
 *   ck link or the payload doesn't decode to an http(s) URL.
 */
function resolveBingCkUrl(href) {
  if (!href || !href.includes("bing.com/ck/a")) return null;
  try {
    const encoded = new URL(href).searchParams.get("u");
    if (!encoded) return null;
    // Convert URL-safe base64 (- and _) back to the standard alphabet.
    const standard = encoded.replace(/-/g, "+").replace(/_/g, "/");
    const target = Buffer.from(standard, "base64").toString("utf8");
    if (target.startsWith("a1")) return target.slice(2); // Bing sometimes prefixes with a1
    return /^https?:\/\//i.test(target) ? target : null;
  } catch {
    return null;
  }
}
|
|
2265
|
+
|
|
2266
|
+
/** Extract destination URL from Yahoo redirect (r.search.yahoo.com/.../RU=encoded/RK=...). */
|
|
2267
|
+
/**
 * Extract the destination URL from a Yahoo search redirect link
 * (r.search.yahoo.com/.../RU=<percent-encoded URL>/RK=...).
 * @returns {string|null} Decoded destination, or null when href is not a
 *   Yahoo redirect or doesn't contain a valid http(s) RU segment.
 */
function resolveYahooRedirect(href) {
  if (!href || !href.includes("r.search.yahoo")) return null;
  try {
    const m = /[?/]RU=([^/]+)(?:\/|$)/i.exec(href);
    if (m === null) return null;
    // '+' inside the RU segment encodes a space.
    const target = decodeURIComponent(m[1].replace(/\+/g, " "));
    return /^https?:\/\//i.test(target) ? target : null;
  } catch {
    return null;
  }
}
|
|
2279
|
+
|
|
2280
|
+
/**
 * fetch() with an AbortController-based timeout and optional routing through
 * a web-search proxy.
 *
 * @param {string} url - request URL.
 * @param {object} [options] - fetch options, plus the extra flag
 *   `useWebSearchProxy`: when truthy AND getWebSearchProxy() returns a proxy
 *   URL, the request goes through undici's fetch with a ProxyAgent dispatcher.
 *   The flag itself is stripped before being passed to fetch.
 * @param {number} [timeoutMs=10000] - abort the request after this long.
 * @returns {Promise<Response>} the fetch response (redirects are followed).
 * @throws re-throws any fetch error, including the AbortError raised by the
 *   timeout firing.
 */
async function fetchWithTimeout(url, options = {}, timeoutMs = 10000) {
  const controller = new AbortController();
  // Arm the timeout before issuing the request; abort() rejects the fetch.
  const timeout = setTimeout(() => controller.abort(), timeoutMs);
  // Proxy is used only when the caller opted in AND one is configured.
  // NOTE(review): getWebSearchProxy presumably returns a proxy URL string —
  // defined elsewhere in this file; confirm its contract there.
  const useProxy = options && options.useWebSearchProxy && getWebSearchProxy();
  // Copy options so the caller's object is not mutated by the delete below.
  const opts = { ...options };
  delete opts.useWebSearchProxy;
  const fetchOpts = {
    ...opts,
    signal: controller.signal,
    redirect: "follow",
  };
  try {
    const response = useProxy
      ? await undiciFetch(url, {
          ...fetchOpts,
          // A fresh ProxyAgent per request; undici routes via this dispatcher.
          dispatcher: new ProxyAgent(useProxy),
        })
      : await fetch(url, fetchOpts);
    // Disarm the timer as soon as headers arrive so it cannot abort the
    // (still-streaming) body read later.
    clearTimeout(timeout);
    return response;
  } catch (err) {
    clearTimeout(timeout);
    throw err;
  }
}
|
|
2305
|
+
|
|
2306
|
+
/**
 * Normalize a URL for deduplication: strip the fragment, sort query
 * parameters, drop trailing slashes from the path, and lowercase the host.
 * Unparseable input is returned unchanged.
 * @param {string} url
 * @returns {string} canonical form suitable for use as a dedupe key
 */
function normalizeUrlForDedup(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    return url; // not a URL — pass through as its own key
  }
  parsed.hash = "";
  parsed.searchParams.sort(); // make param order irrelevant
  const trimmedPath = parsed.pathname.replace(/\/+$/, "");
  parsed.pathname = trimmedPath === "" ? "/" : trimmedPath;
  parsed.hostname = parsed.hostname.toLowerCase();
  return parsed.toString();
}
|
|
2322
|
+
|
|
2323
|
+
/**
 * Pick the most query-relevant snippet from page text by scoring paragraphs
 * against the query terms, rather than blindly taking the first N characters.
 *
 * Scoring per paragraph: +10 for each query term present (+5 extra when the
 * term appears within the first 50 chars), +3 for mid-length paragraphs
 * (60–800 chars). Obvious boilerplate (menus, cookie banners, sign-in) is
 * skipped. Falls back to a plain prefix of `content` when there is no query,
 * no usable terms, or no qualifying paragraphs.
 *
 * @param {string} content - extracted page text
 * @param {string} query - search query
 * @param {number} [maxLen=400] - maximum snippet length
 * @returns {string} best-matching paragraph, truncated to maxLen
 */
function extractBestSnippet(content, query, maxLen = 400) {
  if (!content || !query) return (content || "").slice(0, maxLen);

  const terms = new Set(
    query
      .toLowerCase()
      .split(/\W+/)
      .filter((term) => term.length > 1),
  );
  if (terms.size === 0) return content.slice(0, maxLen);

  const candidates = content
    .split(/\n{2,}|\r\n{2,}/)
    .map((chunk) => chunk.replace(/\s+/g, " ").trim())
    .filter((chunk) => chunk.length > 30 && chunk.length < 2000);
  if (candidates.length === 0) return content.slice(0, maxLen);

  const boilerplate = /^(menu|navigation|skip to|cookie|accept|sign in|log in)/i;

  const scoreOf = (chunk) => {
    const lowered = chunk.toLowerCase();
    let score = 0;
    for (const term of terms) {
      const pos = lowered.indexOf(term);
      if (pos === -1) continue;
      score += 10;
      if (pos < 50) score += 5; // term appears early in the paragraph
    }
    if (chunk.length > 60 && chunk.length < 800) score += 3; // mid-length bonus
    return score;
  };

  let winner = candidates[0];
  let winnerScore = -1;
  for (const chunk of candidates) {
    if (boilerplate.test(chunk)) continue; // skip nav/banner-like text
    const score = scoreOf(chunk);
    if (score > winnerScore) {
      winnerScore = score;
      winner = chunk;
    }
  }
  return winner.slice(0, maxLen);
}
|
|
2371
|
+
|
|
2372
|
+
/**
 * Run multiple search engines in parallel, then aggregate and dedupe the
 * results. Uses Google, Bing, Brave, DuckDuckGo, Startpage, Yahoo and Ecosia
 * (all no-API scraping); results appearing in multiple engines are ranked
 * higher.
 *
 * @param {string} query - search query text.
 * @param {number} [numResults=8] - maximum number of results to return.
 * @returns {Promise<Array<{url:string,title:string,snippet:string,source:string,engines:string[]}>>}
 *   up to numResults unique results, best first; `engines` lists every
 *   engine that reported the URL.
 */
export async function performWebSearch(query, numResults = 8) {
  console.log(
    "[WebSearch] Searching for:",
    query,
    "(requesting",
    numResults,
    "results)",
  );

  // Per-engine result cap: ask each engine for more than we need so enough
  // survives cross-engine deduplication.
  const perEngine = Math.min(numResults + 8, 20);

  // NOTE(review): searchGoogle/searchBing/searchYahoo/searchEcosia are
  // defined elsewhere in this file; each is expected to resolve to an array
  // of {url, title, snippet, source}.
  const engineFns = [
    { name: "Google", fn: () => searchGoogle(query, perEngine) },
    { name: "Bing", fn: () => searchBing(query, perEngine) },
    { name: "Brave", fn: () => searchBrave(query, perEngine) },
    { name: "DuckDuckGo", fn: () => searchDuckDuckGo(query, perEngine) },
    { name: "Startpage", fn: () => searchStartpage(query, perEngine) },
    { name: "Yahoo", fn: () => searchYahoo(query, perEngine) },
    { name: "Ecosia", fn: () => searchEcosia(query, perEngine) },
  ];

  // allSettled: one engine failing must not sink the whole search.
  const settled = await Promise.allSettled(engineFns.map((e) => e.fn()));

  const byNormalized = new Map(); // normalizedUrl -> { result, engines[], firstSeenOrder }

  let order = 0;
  for (let i = 0; i < settled.length; i++) {
    const status = settled[i];
    const engineName = engineFns[i].name;
    if (status.status === "rejected") {
      console.error(
        "[WebSearch]",
        engineName,
        "error:",
        status.reason?.message,
      );
      continue;
    }
    const list = status.value || [];
    console.log("[WebSearch]", engineName, "returned", list.length, "results");
    for (const r of list) {
      const key = normalizeUrlForDedup(r.url);
      const existing = byNormalized.get(key);
      if (existing) {
        // Same URL from another engine: record the vote and backfill any
        // missing title/snippet from this engine's copy.
        existing.engines.push(engineName);
        if (!existing.result.snippet && r.snippet) {
          existing.result.snippet = r.snippet;
        }
        if (!existing.result.title && r.title) {
          existing.result.title = r.title;
        }
      } else {
        byNormalized.set(key, {
          result: {
            url: r.url,
            title: r.title,
            snippet: r.snippet,
            source: r.source,
          },
          engines: [engineName],
          // Insertion order across all engines; used as a tie-break so
          // earlier-seen results win when scores are otherwise equal.
          firstSeenOrder: order++,
        });
      }
    }
  }

  // Relevance scoring: query-term matches in title/snippet (x500) +
  // cross-engine agreement (x1000) - arrival order.
  const queryTerms = new Set(
    query
      .toLowerCase()
      .split(/\W+/)
      .filter((t) => t.length > 1),
  );

  const sorted = [...byNormalized.values()]
    .sort((a, b) => {
      const textA = `${a.result.title} ${a.result.snippet}`.toLowerCase();
      const textB = `${b.result.title} ${b.result.snippet}`.toLowerCase();
      const termHitsA = [...queryTerms].filter((t) => textA.includes(t)).length;
      const termHitsB = [...queryTerms].filter((t) => textB.includes(t)).length;
      const aScore =
        termHitsA * 500 + a.engines.length * 1000 - a.firstSeenOrder;
      const bScore =
        termHitsB * 500 + b.engines.length * 1000 - b.firstSeenOrder;
      return bScore - aScore;
    })
    .slice(0, numResults)
    .map((x) => ({ ...x.result, engines: x.engines }));

  console.log("[WebSearch] Total unique results after dedupe:", sorted.length);
  return sorted;
}
|
|
2470
|
+
|
|
2471
|
+
/**
 * Parse a DuckDuckGo HTML SERP (html.duckduckgo.com) and return results.
 *
 * Tries three extraction strategies in order, stopping once numResults is
 * reached:
 *   1. well-formed result anchors (result__a + result__snippet classes);
 *   2. any anchor carrying a "uddg=" redirect parameter;
 *   3. the first link inside each result__body block.
 *
 * @param {string} html - raw SERP HTML.
 * @param {number} numResults - maximum results to collect.
 * @returns {Array<{url:string,title:string,snippet:string,source:string}>}
 */
function parseDuckDuckGoResults(html, numResults) {
  const results = [];
  const seenUrls = new Set();
  // Shared collector: unwraps DDG's uddg= redirect, skips non-http and
  // duplicate URLs, and fills title/snippet fallbacks from the hostname.
  function add(url, title, snippet) {
    if (url.includes("uddg=")) {
      const uddgMatch = url.match(/uddg=([^&]+)/);
      if (uddgMatch) url = decodeURIComponent(uddgMatch[1]);
    }
    if (!url.startsWith("http") || seenUrls.has(url)) return;
    seenUrls.add(url);
    let source = "";
    try {
      source = new URL(url).hostname.replace("www.", "");
    } catch {}
    results.push({
      url,
      title: title || source,
      snippet: snippet || `From ${source}`,
      source,
    });
  }
  // Strategy 1: canonical result anchor followed by its snippet anchor.
  const resultPattern =
    /<a[^>]*class="result__a"[^>]*href="([^"]+)"[^>]*>([^<]+)<\/a>[\s\S]*?<a[^>]*class="result__snippet"[^>]*>([\s\S]*?)<\/a>/gi;
  let match;
  while (
    (match = resultPattern.exec(html)) !== null &&
    results.length < numResults
  ) {
    let snippet = match[3].replace(/<[^>]+>/g, " ").trim();
    snippet = decodeHtmlEntities(snippet).replace(/\s+/g, " ").slice(0, 250);
    add(match[1], decodeHtmlEntities(match[2].trim()), snippet);
  }
  // Strategy 2: any anchor whose href contains a uddg= redirect parameter.
  if (results.length < numResults) {
    const uddgRegex = /<a[^>]*href="([^"]*uddg=[^"]+)"[^>]*>([\s\S]*?)<\/a>/gi;
    let um;
    while (
      (um = uddgRegex.exec(html)) !== null &&
      results.length < numResults
    ) {
      const title = decodeHtmlEntities(um[2].replace(/<[^>]+>/g, "").trim());
      // Reject empty/overlong anchor text (icons, wrappers, nav).
      if (title.length < 3 || title.length > 300) continue;
      add(um[1], title, "");
    }
  }
  // Strategy 3: first link inside each result__body container.
  if (results.length < numResults) {
    const blocks = html.split(/class="[^"]*result__body[^"]*"[^>]*>/i);
    for (let i = 1; i < blocks.length && results.length < numResults; i++) {
      // Only scan the head of each block to bound regex work.
      const block = blocks[i].substring(0, 2000);
      const linkMatch = block.match(/<a[^>]*href="([^"]+)"[^>]*>([^<]+)<\/a>/i);
      if (linkMatch)
        add(linkMatch[1], decodeHtmlEntities(linkMatch[2].trim()), "");
    }
  }
  return results;
}
|
|
2527
|
+
|
|
2528
|
+
/**
 * Search DuckDuckGo's HTML-only frontend (html.duckduckgo.com).
 *
 * Fetches the SERP with browser-like headers (optionally via the web-search
 * proxy); if zero results parse out and WEBSEARCH_USE_BROWSER is not "0",
 * retries once with a headless browser. Logs diagnostics when the page
 * still yields nothing.
 *
 * @param {string} query
 * @param {number} numResults
 * @returns {Promise<Array<{url:string,title:string,snippet:string,source:string}>>}
 */
async function searchDuckDuckGo(query, numResults) {
  const url = `https://html.duckduckgo.com/html/?q=${encodeURIComponent(query)}`;
  // NOTE(review): getBrowserHeaders is defined elsewhere in this file;
  // presumably returns realistic browser headers for the given origin.
  const headers = getBrowserHeaders("https://html.duckduckgo.com");
  headers["Sec-Fetch-Site"] = "none";

  const response = await fetchWithTimeout(
    url,
    { headers, useWebSearchProxy: true },
    12000,
  );
  let html = await response.text();

  let results = parseDuckDuckGoResults(html, numResults);

  // Fallback: DDG sometimes serves a JS/captcha page to plain fetch.
  if (results.length === 0 && process.env.WEBSEARCH_USE_BROWSER !== "0") {
    try {
      console.log("[WebSearch] DuckDuckGo retrying with headless browser...");
      html = await fetchHtmlWithBrowser(url);
      results = parseDuckDuckGoResults(html, numResults);
    } catch (e) {
      console.log(
        "[WebSearch] DuckDuckGo browser fetch failed:",
        e?.message || e,
      );
    }
  }

  // Diagnostics: count the markers each parse strategy keys off, so logs
  // show whether the SERP markup changed or the page was empty/blocked.
  if (results.length === 0) {
    const uddgCount = (html.match(/uddg=/g) || []).length;
    const resultACount = (html.match(/result__a/g) || []).length;
    const resultBodyCount = (html.match(/result__body/g) || []).length;
    console.log(
      "[WebSearch] DuckDuckGo: 0 results | htmlLen=",
      html.length,
      "| uddg=",
      uddgCount,
      "result__a=",
      resultACount,
      "result__body=",
      resultBodyCount,
    );
    logZeroResultHtml("DuckDuckGo", html);
  }
  console.log("[WebSearch] DuckDuckGo found:", results.length, "results");
  return results;
}
|
|
2575
|
+
|
|
2576
|
+
/**
 * Decode the HTML entities commonly found in SERP markup into plain text.
 *
 * Handles the named entities &lt; &gt; &quot; &#39; &apos; &nbsp; plus
 * numeric character references in hex (&#x..;) and decimal (&#..;) form.
 * `&amp;` is decoded LAST so that double-escaped input such as "&amp;lt;"
 * becomes "&lt;" (one level of decoding) rather than "<".
 *
 * @param {string} text - HTML-escaped text.
 * @returns {string} decoded text.
 */
function decodeHtmlEntities(text) {
  return text
    .replace(/&lt;/g, "<")
    .replace(/&gt;/g, ">")
    .replace(/&quot;/g, '"')
    .replace(/&#39;/g, "'")
    .replace(/&apos;/g, "'")
    .replace(/&nbsp;/g, " ")
    .replace(/&#x([0-9a-f]+);/gi, (_, hex) =>
      String.fromCharCode(parseInt(hex, 16)),
    )
    .replace(/&#(\d+);/g, (_, dec) => String.fromCharCode(dec))
    .replace(/&amp;/g, "&"); // last: avoid double-decoding escaped entities
}
|
|
2593
|
+
|
|
2594
|
+
/**
 * Parse a Brave Search SERP and return results.
 *
 * Primary pass: split on the data-type="web" attribute that marks organic
 * web results and pull href/title/snippet out of each fragment. Secondary
 * pass (only if still short): harvest enrichment-card links (news/article
 * cards) with hostname-derived titles.
 *
 * @param {string} html - raw SERP HTML.
 * @param {number} numResults - maximum results to collect.
 * @returns {Array<{url:string,title:string,snippet:string,source:string}>}
 */
function parseBraveResults(html, numResults) {
  const results = [];
  const seenUrls = new Set();
  // Each organic web result is tagged data-type="web"; fragment i>0 starts
  // just after one such tag.
  const blocks = html.split(/data-type="web"/);
  for (let i = 1; i < blocks.length && results.length < numResults; i++) {
    // Bound per-fragment scanning to keep regex work cheap.
    const block = blocks[i].substring(0, 3000);
    const hrefMatch = block.match(/href="(https?:\/\/[^"]+)"/);
    if (!hrefMatch) continue;
    const resultUrl = hrefMatch[1];
    // Skip Brave's own links and duplicates.
    if (resultUrl.includes("brave.com") || seenUrls.has(resultUrl)) continue;
    // Prefer the title="" attribute; fall back to inner text of any
    // element whose class contains "title".
    const titleMatch = block.match(/title="([^"]{10,300})"/);
    let title = titleMatch ? decodeHtmlEntities(titleMatch[1].trim()) : "";
    if (!title) {
      const innerTitleMatch = block.match(
        /class="[^"]*title[^"]*"[^>]*>([^<]+)</,
      );
      if (innerTitleMatch)
        title = decodeHtmlEntities(innerTitleMatch[1].trim());
    }
    if (!title || title.length < 5) continue;
    let snippet = "";
    const snippetMatch = block.match(
      /class="[^"]*(?:snippet-description|description)[^"]*"[^>]*>([^<]+)/,
    );
    if (snippetMatch)
      snippet = decodeHtmlEntities(snippetMatch[1].trim())
        .replace(/\s+/g, " ")
        .slice(0, 300);
    seenUrls.add(resultUrl);
    let source = "";
    try {
      source = new URL(resultUrl).hostname.replace("www.", "");
    } catch {}
    results.push({
      url: resultUrl,
      title,
      snippet: snippet || `From ${source}`,
      source,
    });
  }
  // Fallback pass: enrichment cards (e.g. news carousels) when organic
  // results alone did not fill the quota.
  if (results.length < numResults) {
    const cardPattern =
      /class="enrichment-card-item[^"]*"[^>]*href="(https?:\/\/[^"]+)"/g;
    let cardMatch;
    while (
      (cardMatch = cardPattern.exec(html)) !== null &&
      results.length < numResults
    ) {
      const cardUrl = cardMatch[1];
      if (seenUrls.has(cardUrl) || cardUrl.includes("brave.com")) continue;
      seenUrls.add(cardUrl);
      let source = "";
      try {
        source = new URL(cardUrl).hostname.replace("www.", "");
      } catch {}
      // Look a short distance past the match for a title attribute.
      const cardContext = html.substring(
        cardMatch.index,
        cardMatch.index + 500,
      );
      const cardTitleMatch = cardContext.match(/title="([^"]+)"/);
      const title = cardTitleMatch
        ? decodeHtmlEntities(cardTitleMatch[1].trim())
        : `Article from ${source}`;
      results.push({ url: cardUrl, title, snippet: `From ${source}`, source });
    }
  }
  return results;
}
|
|
2663
|
+
|
|
2664
|
+
/**
 * Search Brave (no API key; scrapes the public SERP).
 *
 * Fetches search.brave.com with desktop-Chrome headers (optionally via the
 * web-search proxy); pages under 5000 bytes are treated as blocked/empty
 * and yield no results. If parsing finds nothing and WEBSEARCH_USE_BROWSER
 * is not "0", retries once with a headless browser.
 *
 * @param {string} query
 * @param {number} numResults
 * @returns {Promise<Array<{url:string,title:string,snippet:string,source:string}>>}
 */
async function searchBrave(query, numResults) {
  const url = `https://search.brave.com/search?q=${encodeURIComponent(query)}&source=web`;
  const response = await fetchWithTimeout(
    url,
    {
      headers: {
        "User-Agent":
          "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
        Accept:
          "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
      },
      useWebSearchProxy: true,
    },
    12000,
  );
  let html = await response.text();
  console.log("[WebSearch] Brave response:", html.length, "bytes");
  // A real SERP is always larger than this; short bodies are interstitials.
  if (html.length < 5000) return [];

  let results = parseBraveResults(html, numResults);

  // Fallback: render the page in a headless browser when plain fetch got
  // a page our parser cannot read.
  if (results.length === 0 && process.env.WEBSEARCH_USE_BROWSER !== "0") {
    try {
      console.log("[WebSearch] Brave retrying with headless browser...");
      html = await fetchHtmlWithBrowser(url);
      results = parseBraveResults(html, numResults);
    } catch (e) {
      console.log("[WebSearch] Brave browser fetch failed:", e?.message || e);
    }
  }

  console.log("[WebSearch] Brave found:", results.length, "results");
  return results;
}
|
|
2703
|
+
|
|
2704
|
+
/**
 * Parse a Startpage SERP and return results.
 *
 * Tries three known result-markup patterns (current "result-title" layout,
 * older "w-gl__" layout, then a generic anchor+paragraph shape). If fewer
 * than half the requested results are found, falls back to pairing generic
 * external links with description-like paragraphs by index.
 *
 * @param {string} html - raw SERP HTML.
 * @param {number} numResults - maximum results to collect.
 * @returns {Array<{url:string,title:string,snippet:string,source:string}>}
 */
function parseStartpageResults(html, numResults) {
  const results = [];
  const seenUrls = new Set();
  // Each pattern captures (url, titleHtml, snippetHtml).
  const patterns = [
    /<a[^>]*class="[^"]*result-title[^"]*"[^>]*href="([^"]+)"[^>]*>[\s\S]*?<h2[^>]*>([^<]+)<\/h2>[\s\S]*?<\/a>[\s\S]*?<p[^>]*class="[^"]*description[^"]*"[^>]*>([\s\S]*?)<\/p>/gi,
    /<a[^>]*class="[^"]*w-gl__result-title[^"]*"[^>]*href="([^"]+)"[^>]*>([\s\S]*?)<\/a>[\s\S]*?<p[^>]*class="[^"]*w-gl__description[^"]*"[^>]*>([\s\S]*?)<\/p>/gi,
    /<a[^>]*href="(https?:\/\/(?!.*startpage)[^"]+)"[^>]*class="[^"]*result[^"]*"[^>]*>([\s\S]*?)<\/a>[\s\S]{0,500}?<p[^>]*>([\s\S]*?)<\/p>/gi,
  ];
  for (const pattern of patterns) {
    if (results.length >= numResults) break;
    let match;
    // These are shared /g regexes: reset lastIndex before reuse.
    pattern.lastIndex = 0;
    while (
      (match = pattern.exec(html)) !== null &&
      results.length < numResults
    ) {
      const resultUrl = match[1];
      let title = match[2].replace(/<[^>]+>/g, "").trim();
      title = decodeHtmlEntities(title);
      let snippet = match[3].replace(/<[^>]+>/g, " ").trim();
      snippet = decodeHtmlEntities(snippet).replace(/\s+/g, " ").slice(0, 300);
      // Reject internal links, duplicates, and junk titles.
      if (
        !resultUrl.startsWith("http") ||
        resultUrl.includes("startpage.com") ||
        seenUrls.has(resultUrl) ||
        !title ||
        title.length < 3
      )
        continue;
      seenUrls.add(resultUrl);
      let source = "";
      try {
        source = new URL(resultUrl).hostname.replace("www.", "");
      } catch {}
      results.push({
        url: resultUrl,
        title,
        snippet: snippet || `From ${source}`,
        source,
      });
    }
  }
  // Loose fallback: collect external links and description paragraphs
  // independently, then zip them together by position. Only used when the
  // structured patterns recovered less than half the quota.
  if (results.length < numResults / 2) {
    const linkPattern =
      /<a[^>]*href="(https?:\/\/(?!.*startpage)[^"]+)"[^>]*>([^<]*(?:<[^>]*>[^<]*)*)<\/a>/gi;
    const descPattern =
      /<p[^>]*class="[^"]*(?:description|snippet|abstract)[^"]*"[^>]*>([\s\S]*?)<\/p>/gi;
    const links = [];
    const descs = [];
    let m;
    while ((m = linkPattern.exec(html)) !== null) {
      const url = m[1];
      const text = m[2].replace(/<[^>]+>/g, "").trim();
      // Plausible title length filters out icons and nav chrome.
      if (
        url &&
        text &&
        text.length > 10 &&
        text.length < 200 &&
        !seenUrls.has(url)
      )
        links.push({ url, title: decodeHtmlEntities(text) });
    }
    while ((m = descPattern.exec(html)) !== null) {
      let d = m[1].replace(/<[^>]+>/g, " ").trim();
      descs.push(decodeHtmlEntities(d).replace(/\s+/g, " ").slice(0, 300));
    }
    for (let i = 0; i < links.length && results.length < numResults; i++) {
      if (seenUrls.has(links[i].url)) continue;
      seenUrls.add(links[i].url);
      let source = "";
      try {
        source = new URL(links[i].url).hostname.replace("www.", "");
      } catch {}
      results.push({
        url: links[i].url,
        title: links[i].title,
        // Index-based pairing: assumes link i corresponds to description i.
        snippet: descs[i] || `From ${source}`,
        source,
      });
    }
  }
  return results;
}
|
|
2788
|
+
|
|
2789
|
+
/**
 * Search Startpage (no API key; scrapes the public SERP).
 *
 * Fetches startpage.com with desktop-Chrome headers (optionally via the
 * web-search proxy); pages under 5000 bytes are treated as blocked/empty
 * and yield no results. If parsing finds nothing and WEBSEARCH_USE_BROWSER
 * is not "0", retries once with a headless browser.
 *
 * @param {string} query
 * @param {number} numResults
 * @returns {Promise<Array<{url:string,title:string,snippet:string,source:string}>>}
 */
async function searchStartpage(query, numResults) {
  const url = `https://www.startpage.com/sp/search?q=${encodeURIComponent(query)}&cat=web&pl=ext-ff&language=english`;
  const response = await fetchWithTimeout(
    url,
    {
      headers: {
        "User-Agent":
          "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
        Accept:
          "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
      },
      useWebSearchProxy: true,
    },
    12000,
  );
  let html = await response.text();
  console.log("[WebSearch] Startpage response:", html.length, "bytes");
  // A real SERP is always larger than this; short bodies are interstitials.
  if (html.length < 5000) return [];

  let results = parseStartpageResults(html, numResults);

  // Fallback: render the page in a headless browser when plain fetch got
  // a page our parser cannot read.
  if (results.length === 0 && process.env.WEBSEARCH_USE_BROWSER !== "0") {
    try {
      console.log("[WebSearch] Startpage retrying with headless browser...");
      html = await fetchHtmlWithBrowser(url);
      results = parseStartpageResults(html, numResults);
    } catch (e) {
      console.log(
        "[WebSearch] Startpage browser fetch failed:",
        e?.message || e,
      );
    }
  }

  console.log("[WebSearch] Startpage found:", results.length, "results");
  return results;
}
|
|
2831
|
+
|
|
2832
|
+
/**
 * Extract the destination URL from Google's "/url?q=..." redirect wrapper.
 * Non-redirect hrefs (and null/undefined) are returned unchanged.
 * @param {string|null|undefined} href
 * @returns {string|null|undefined} the unwrapped destination URL, or the
 *   input itself when it is not a /url?q= wrapper
 */
function extractGoogleResultUrl(href) {
  if (!href || !href.includes("url?")) return href;
  const m = href.match(/[?&]q=([^&]+)/);
  if (!m) return href;
  // decodeURIComponent throws URIError on malformed percent-encoding
  // (e.g. a truncated "%E0%A4%A"); fall back to the raw value so one bad
  // link cannot abort SERP parsing in callers.
  try {
    return decodeURIComponent(m[1]);
  } catch {
    return m[1];
  }
}
|
|
2841
|
+
|
|
2842
|
+
/**
 * Parse a Google SERP (used for both plain-fetch and headless-browser HTML)
 * and return results.
 *
 * Five extraction passes run in order until numResults is reached:
 *   1. classic <div class="g"> result containers;
 *   2. fragments delimited by data-ved attributes;
 *   3. href="/url?q=..." redirect anchors;
 *   4. bare /url?q=... occurrences anywhere in the HTML;
 *   5. <cite> elements, pairing each with the nearest preceding /url?q= link.
 *
 * Fix vs. previous revision: the attribute un-escaping steps in passes 3/4
 * now replace the HTML entity "&amp;" with "&" (they previously performed an
 * identity replace, leaving entity-escaped query strings undecodable).
 *
 * @param {string} html - raw SERP HTML.
 * @param {number} numResults - maximum results to collect.
 * @returns {Array<{url:string,title:string,snippet:string,source:string}>}
 */
function parseGoogleResults(html, numResults) {
  const results = [];
  const seenUrls = new Set();
  // Shared collector: percent-decodes, drops non-http / google.com /
  // duplicate URLs, and derives title/snippet fallbacks from the hostname.
  function addResult(resultUrl, title, snippet, source) {
    try {
      resultUrl = decodeURIComponent(resultUrl);
    } catch {}
    if (
      !resultUrl.startsWith("http") ||
      resultUrl.includes("google.com") ||
      seenUrls.has(resultUrl)
    )
      return false;
    let s = source;
    if (!s) {
      try {
        s = new URL(resultUrl).hostname.replace("www.", "");
      } catch {}
    }
    const finalTitle =
      title && title.length >= 2 ? title.slice(0, 300) : s || resultUrl;
    seenUrls.add(resultUrl);
    results.push({
      url: resultUrl,
      title: finalTitle,
      snippet: (snippet || `From ${s}`).slice(0, 300),
      source: s,
    });
    return true;
  }

  // Pass 1: classic <div class="g"> organic result containers.
  const gBlocks = html.split(/<div[^>]*class="[^"]*\bg\b[^"]*"[^>]*>/i);
  for (let i = 1; i < gBlocks.length && results.length < numResults; i++) {
    const block = gBlocks[i].substring(0, 4000);
    const hrefMatch = block.match(
      /href="(\/url\?q=([^"]+)|(https?:\/\/[^"]+))"/,
    );
    if (!hrefMatch) continue;
    let resultUrl = hrefMatch[1].startsWith("http")
      ? hrefMatch[1]
      : extractGoogleResultUrl(hrefMatch[1]);
    // Title: <h3> or the LC20lb class Google uses for result headings.
    const titleMatch =
      block.match(/<h3[^>]*>([\s\S]*?)<\/h3>/i) ||
      block.match(/class="[^"]*LC20lb[^"]*"[^>]*>([^<]+)/i);
    const title = titleMatch
      ? decodeHtmlEntities(titleMatch[1].replace(/<[^>]+>/g, "").trim())
      : "";
    // Snippet: VwiC3b class (current layout) or legacy st/s span classes.
    const snippetMatch =
      block.match(/class="[^"]*VwiC3b[^"]*"[^>]*>([\s\S]*?)<\/div>/i) ||
      block.match(
        /<span[^>]*class="[^"]*\b(?:st|s)\b[^"]*"[^>]*>([\s\S]*?)<\/span>/i,
      );
    let snippet = snippetMatch
      ? snippetMatch[1].replace(/<[^>]+>/g, " ").trim()
      : "";
    snippet = decodeHtmlEntities(snippet).replace(/\s+/g, " ").slice(0, 300);
    addResult(resultUrl, title, snippet, "");
  }

  // Pass 2: fragments after each data-ved attribute (result tracking IDs).
  if (results.length < numResults && /data-ved=/.test(html)) {
    const vedBlocks = html.split(/data-ved="[^"]*"/);
    for (let i = 1; i < vedBlocks.length && results.length < numResults; i++) {
      const block = vedBlocks[i].substring(0, 3000);
      const hrefMatch = block.match(
        /href="(\/url\?q=([^"]+)|(https?:\/\/[^"]+))"/,
      );
      if (!hrefMatch) continue;
      let resultUrl = hrefMatch[1].startsWith("http")
        ? hrefMatch[1]
        : extractGoogleResultUrl(hrefMatch[1]);
      const titleMatch = block.match(/<h3[^>]*>([\s\S]*?)<\/h3>/i);
      const title = titleMatch
        ? decodeHtmlEntities(titleMatch[1].replace(/<[^>]+>/g, "").trim())
        : "";
      addResult(resultUrl, title, "", "");
    }
  }

  // Pass 3: any href="/url?q=..." anchor; title searched in the 1200 chars
  // following the anchor.
  if (results.length < numResults) {
    const urlQRegex = /href="(\/url\?q=)([^"]+)"/gi;
    let urlMatch;
    while (
      (urlMatch = urlQRegex.exec(html)) !== null &&
      results.length < numResults
    ) {
      const qValue = urlMatch[2];
      let resultUrl = "";
      try {
        // Un-escape the HTML-attribute "&amp;" before percent-decoding.
        resultUrl = decodeURIComponent(qValue.replace(/&amp;/g, "&"));
      } catch {
        continue;
      }
      if (
        !resultUrl.startsWith("http") ||
        resultUrl.includes("google.com") ||
        seenUrls.has(resultUrl)
      )
        continue;
      const after = html.substring(
        urlMatch.index,
        Math.min(html.length, urlMatch.index + 1200),
      );
      const titleMatch =
        after.match(/<h3[^>]*>([\s\S]*?)<\/h3>/i) ||
        after.match(/<span[^>]*>([^<]{5,200})<\/span>/);
      const title = titleMatch
        ? decodeHtmlEntities(
            titleMatch[1].replace(/<[^>]+>/g, "").trim(),
          ).slice(0, 300)
        : "";
      addResult(resultUrl, title, "", "");
    }
  }

  // Pass 4: bare /url?q=... occurrences (outside href attributes); first
  // text node within the next 800 chars is used as the title.
  if (results.length < numResults) {
    const urlQRegex2 = /\/url\?q=([^"&]+)/gi;
    let urlMatch2;
    while (
      (urlMatch2 = urlQRegex2.exec(html)) !== null &&
      results.length < numResults
    ) {
      let resultUrl = "";
      try {
        resultUrl = decodeURIComponent(urlMatch2[1].replace(/&amp;/g, "&"));
      } catch {
        continue;
      }
      if (
        !resultUrl.startsWith("http") ||
        resultUrl.includes("google.com") ||
        seenUrls.has(resultUrl)
      )
        continue;
      const after = html.substring(
        urlMatch2.index,
        Math.min(html.length, urlMatch2.index + 800),
      );
      const textBlock = after.match(/>([^<]{5,200})</);
      const title = textBlock
        ? decodeHtmlEntities(textBlock[1].trim()).slice(0, 300)
        : "";
      addResult(resultUrl, title, "", "");
    }
  }

  // Pass 5: <cite> elements; pair each with the LAST /url?q= link and <h3>
  // appearing in the 1200 chars before it.
  if (results.length < numResults) {
    const citeRegex = /<cite[^>]*>([^<]+)<\/cite>/gi;
    let citeMatch;
    while (
      (citeMatch = citeRegex.exec(html)) !== null &&
      results.length < numResults
    ) {
      const before = html.substring(
        Math.max(0, citeMatch.index - 1200),
        citeMatch.index,
      );
      const hrefAll = [...before.matchAll(/href="(\/url\?q=([^"]+))"/g)];
      const hrefMatch = hrefAll[hrefAll.length - 1];
      if (!hrefMatch) continue;
      let resultUrl = hrefMatch[1].startsWith("http")
        ? hrefMatch[1]
        : extractGoogleResultUrl(hrefMatch[1]);
      try {
        resultUrl = decodeURIComponent(resultUrl);
      } catch {
        continue;
      }
      if (
        !resultUrl.startsWith("http") ||
        resultUrl.includes("google.com") ||
        seenUrls.has(resultUrl)
      )
        continue;
      const h3All = [...before.matchAll(/<h3[^>]*>([\s\S]*?)<\/h3>/gi)];
      const titleMatch = h3All[h3All.length - 1];
      const title = titleMatch
        ? decodeHtmlEntities(
            titleMatch[1].replace(/<[^>]+>/g, "").trim(),
          ).slice(0, 300)
        : "";
      addResult(resultUrl, title, "", "");
    }
  }
  return results;
}
|
|
3023
|
+
|
|
3024
|
+
/**
 * Fetch Google results via SerpAPI (paid API; avoids captcha). Only active
 * when the SERPAPI_API_KEY environment variable is set.
 *
 * @param {string} query
 * @param {number} numResults - maximum results to return.
 * @returns {Promise<Array<{url:string,title:string,snippet:string,source:string}>|null>}
 *   results array on success; null when the key is missing, the request
 *   fails, or the HTTP status is not ok (lets callers fall back to scraping).
 */
async function fetchSerpApiGoogle(query, numResults) {
  const apiKey = process.env.SERPAPI_API_KEY;
  if (!apiKey) return null;
  // Over-request slightly (capped at 30) so filtered entries don't leave
  // the caller short.
  const url = `https://serpapi.com/search?engine=google&q=${encodeURIComponent(query)}&num=${Math.min(numResults + 5, 30)}&api_key=${apiKey}`;
  try {
    const response = await fetchWithTimeout(url, {}, 15000);
    if (!response.ok) return null;
    const data = await response.json();
    const organic = data.organic_results || [];
    const results = [];
    for (const r of organic) {
      if (results.length >= numResults) break;
      const link = r.link || r.redirect_link;
      // Drop entries without a link and Google-internal links.
      if (!link || link.includes("google.com")) continue;
      let source = "";
      try {
        source = new URL(link).hostname.replace("www.", "");
      } catch {}
      results.push({
        url: link,
        title: (r.title || "").slice(0, 300),
        snippet: (r.snippet || `From ${source}`).slice(0, 300),
        source,
      });
    }
    if (results.length > 0)
      console.log(
        "[WebSearch] Google (SerpAPI) found:",
        results.length,
        "results",
      );
    return results;
  } catch (e) {
    console.log("[WebSearch] SerpAPI Google error:", e?.message || e);
    return null;
  }
}
|
|
3064
|
+
|
|
3065
|
+
/**
 * Query Bing through SerpAPI.
 * Requires the SERPAPI_API_KEY environment variable; returns null when the
 * key is absent, the request fails, or the response is not OK — so callers
 * can fall through to the scraping path.
 *
 * @param {string} query - Search terms (URL-encoded before use).
 * @param {number} numResults - Maximum number of results to return.
 * @returns {Promise<Array<{url: string, title: string, snippet: string, source: string}>|null>}
 */
async function fetchSerpApiBing(query, numResults) {
  const key = process.env.SERPAPI_API_KEY;
  if (!key) return null;
  // Over-fetch slightly so filtered links still leave enough results.
  const cap = Math.min(numResults + 5, 30);
  const endpoint = `https://serpapi.com/search?engine=bing&q=${encodeURIComponent(query)}&count=${cap}&api_key=${key}`;
  try {
    const resp = await fetchWithTimeout(endpoint, {}, 15000);
    if (!resp.ok) return null;
    const payload = await resp.json();
    const collected = [];
    for (const entry of payload.organic_results || []) {
      if (collected.length >= numResults) break;
      const link = entry.link;
      if (!link || link.includes("bing.com") || link.includes("microsoft.com"))
        continue;
      let source = "";
      try {
        source = new URL(link).hostname.replace("www.", "");
      } catch {}
      collected.push({
        url: link,
        title: (entry.title || "").slice(0, 300),
        snippet: (entry.snippet || `From ${source}`).slice(0, 300),
        source,
      });
    }
    if (collected.length > 0)
      console.log(
        "[WebSearch] Bing (SerpAPI) found:",
        collected.length,
        "results",
      );
    return collected;
  } catch (e) {
    console.log("[WebSearch] SerpAPI Bing error:", e?.message || e);
    return null;
  }
}
|
|
3106
|
+
|
|
3107
|
+
/**
 * Search Google (no API) - scrape SERP HTML. Uses SerpAPI if SERPAPI_API_KEY set; else proxy if WEBSEARCH_PROXY/HTTPS_PROXY set. Retries with headless browser if fetch returns 0 results.
 *
 * Attempt order:
 *   1. SerpAPI (when SERPAPI_API_KEY is set and returns results).
 *   2. Direct fetch of the SERP; one header-downgrade retry if Google serves
 *      its tiny "unsupported browser" page.
 *   3. Headless-browser retry when parsing yields 0 results — skipped when a
 *      captcha was already served or WEBSEARCH_USE_BROWSER=0.
 *
 * @param {string} query - Search terms (URL-encoded before use).
 * @param {number} numResults - Maximum number of results to return.
 * @returns {Promise<Array<{url: string, title: string, snippet: string, source: string}>>} May be empty.
 */
async function searchGoogle(query, numResults) {
  const serpApiResults = await fetchSerpApiGoogle(query, numResults);
  if (serpApiResults && serpApiResults.length > 0) return serpApiResults;

  // Over-fetch a little so filtering (google.com links, dupes) still leaves enough.
  const num = Math.min(numResults + 5, 30);
  const url = `https://www.google.com/search?q=${encodeURIComponent(query)}&num=${num}&ncr=1`;
  let headers = getGoogleSearchHeaders();
  const fetchOptions = { headers, useWebSearchProxy: true };

  let response = await fetchWithTimeout(url, fetchOptions, 15000);
  let html = await response.text();

  // Small "update your browser" page: retry once with plain browser headers
  // as if the URL were typed directly into the address bar.
  if (
    /Update your browser|isn't supported anymore/i.test(html) &&
    html.length < 15000
  ) {
    headers = getBrowserHeaders("https://www.google.com");
    // NOTE(review): delete-then-reassign moves the key to the end of the
    // object's insertion order — presumably intentional for header-order
    // fingerprinting; confirm before simplifying to a single assignment.
    delete headers["Sec-Fetch-Site"];
    headers["Sec-Fetch-Site"] = "none";
    response = await fetchWithTimeout(
      url,
      { headers, useWebSearchProxy: true },
      15000,
    );
    html = await response.text();
  }

  // Heuristic block/consent detection: keyword match AND a short page, so a
  // real SERP that merely mentions "blocked" isn't misclassified.
  const isBlocked =
    /unusual\s+traffic|we've\s+detected|detected\s+unusual|captcha|blocked|before you continue|consent\.google/i.test(
      html,
    ) && html.length < 20000;
  if (isBlocked) {
    console.log(
      "[WebSearch] Google: page appears blocked/captcha (length=",
      html.length,
      ")",
    );
    logZeroResultHtml("Google", html);
    return [];
  }

  let results = parseGoogleResults(html, numResults);

  // Skip browser retry when we already got captcha (browser would get same)
  const isCaptcha =
    /recaptcha|captcha-form|solveSimpleChallenge|data-sitekey/i.test(html);
  if (
    results.length === 0 &&
    process.env.WEBSEARCH_USE_BROWSER !== "0" &&
    !isCaptcha
  ) {
    try {
      console.log("[WebSearch] Google retrying with headless browser...");
      html = await fetchHtmlWithBrowser(url);
      // Browser also hit a captcha: keep whatever we had (empty) and stop.
      if (/recaptcha|captcha-form|solveSimpleChallenge/i.test(html))
        return results;
      results = parseGoogleResults(html, numResults);
    } catch (e) {
      console.log("[WebSearch] Google browser fetch failed:", e?.message || e);
    }
  }

  if (results.length === 0) {
    // Diagnostic counts of the SERP markers the parser looks for, to help
    // debug layout changes from the logs alone.
    const gCount = (
      html.match(/<div[^>]*class="[^"]*\bg\b[^"]*"[^>]*>/gi) || []
    ).length;
    const urlQCount = (html.match(/\/url\?q=/gi) || []).length;
    const citeCount = (html.match(/<cite[^>]*>/gi) || []).length;
    const h3Count = (html.match(/<h3[^>]*>/gi) || []).length;
    console.log(
      "[WebSearch] Google: 0 results | htmlLen=",
      html.length,
      "status=",
      response?.status,
      "| gBlocks=",
      gCount,
      "urlQ=",
      urlQCount,
      "cite=",
      citeCount,
      "h3=",
      h3Count,
    );
    logZeroResultHtml("Google", html);
  } else {
    console.log("[WebSearch] Google found:", results.length, "results");
  }
  return results;
}
|
|
3199
|
+
|
|
3200
|
+
/**
 * Parse Bing SERP HTML and return results array (used by fetch and browser fallback).
 *
 * Extraction passes, in order, stopping once numResults are collected:
 *   1. b_algo/b_ans result containers located by match index (li or div).
 *   2. Legacy layout: plain split on b_algo containers.
 *   3. <h2><a>…</a></h2> headings with a nearby <p> snippet.
 *   4. Last resort: any remaining external anchor with a plausible title.
 *
 * @param {string} html - Raw SERP HTML.
 * @param {number} numResults - Maximum number of results to return.
 * @returns {Array<{url: string, title: string, snippet: string, source: string}>}
 */
function parseBingResults(html, numResults) {
  const results = [];
  const seenUrls = new Set();
  // Accept a result unless it is a Bing/Microsoft link, a duplicate, or
  // effectively untitled. Returns true when the result was pushed.
  function addResult(resultUrl, title, snippet) {
    if (
      resultUrl.includes("bing.com") ||
      resultUrl.includes("microsoft.com") ||
      seenUrls.has(resultUrl)
    )
      return false;
    if (!title || title.length < 2) return false;
    seenUrls.add(resultUrl);
    let source = "";
    try {
      source = new URL(resultUrl).hostname.replace("www.", "");
    } catch {}
    results.push({
      url: resultUrl,
      title: title.slice(0, 300),
      snippet: (snippet || `From ${source}`).slice(0, 300),
      source,
    });
    return true;
  }
  // Decode Bing's /ck/a click-tracking redirect into the real target URL;
  // falls back to the raw href when resolution fails or stays on Bing.
  function resolveUrl(href) {
    if (href && href.includes("bing.com/ck/a")) {
      const resolved = resolveBingCkUrl(href);
      if (
        resolved &&
        !resolved.includes("bing.com") &&
        !resolved.includes("microsoft.com")
      )
        return resolved;
    }
    return href;
  }
  // Match either double- or single-quoted href (browser-rendered HTML may use either)
  const linkRegexBing =
    /<a[^>]*href=["'](https?:\/\/[^"']+)["'][^>]*>([\s\S]*?)<\/a>/gi;
  // li or div (browser-rendered Bing often uses div for result blocks)
  const algoRegex = /<(?:li|div)[^>]*class="[^"]*b_(?:algo|ans)[^"]*"[^>]*>/gi;
  let algoMatch;
  // Fixed: store only the start offsets; the previous `{start, end}` objects
  // carried a dead `end: html.length` field that was always recomputed below.
  const blockStarts = [];
  while ((algoMatch = algoRegex.exec(html)) !== null)
    blockStarts.push(algoMatch.index);
  for (let i = 0; i < blockStarts.length && results.length < numResults; i++) {
    const start = blockStarts[i];
    // Each block runs to the next container (capped at 3500 chars).
    const end = i + 1 < blockStarts.length ? blockStarts[i + 1] : html.length;
    const block = html.substring(start, Math.min(start + 3500, end));
    linkRegexBing.lastIndex = 0; // shared /g regex is stateful — reset before reuse
    const allLinks = [...block.matchAll(linkRegexBing)];
    for (const linkMatch of allLinks) {
      let resultUrl = resolveUrl(linkMatch[1]);
      if (resultUrl.includes("bing.com") || resultUrl.includes("microsoft.com"))
        continue;
      let title = linkMatch[2].replace(/<[^>]+>/g, "").trim();
      title = decodeHtmlEntities(title);
      if (title.length < 2) continue;
      const snippetMatch =
        block.match(/<p[^>]*>([\s\S]*?)<\/p>/i) ||
        block.match(
          /class="[^"]*b_caption[^"]*"[^>]*>[\s\S]*?<p[^>]*>([\s\S]*?)<\/p>/i,
        );
      let snippet = snippetMatch
        ? snippetMatch[1].replace(/<[^>]+>/g, " ").trim()
        : "";
      snippet = decodeHtmlEntities(snippet).replace(/\s+/g, " ").slice(0, 300);
      if (addResult(resultUrl, title, snippet)) break; // one result per block
    }
  }
  // Pass 2: legacy layout fallback — split on b_algo containers.
  if (results.length < numResults) {
    const algoBlocks = html.split(
      /<(?:li|div)[^>]*class="[^"]*b_algo[^"]*"[^>]*>/i,
    );
    for (let i = 1; i < algoBlocks.length && results.length < numResults; i++) {
      const block = algoBlocks[i].substring(0, 3500);
      linkRegexBing.lastIndex = 0;
      const allLinks = [...block.matchAll(linkRegexBing)];
      for (const linkMatch of allLinks) {
        let resultUrl = resolveUrl(linkMatch[1]);
        if (
          resultUrl.includes("bing.com") ||
          resultUrl.includes("microsoft.com")
        )
          continue;
        let title = linkMatch[2].replace(/<[^>]+>/g, "").trim();
        title = decodeHtmlEntities(title);
        if (title.length < 2) continue;
        const snippetMatch = block.match(/<p[^>]*>([\s\S]*?)<\/p>/i);
        let snippet = snippetMatch
          ? snippetMatch[1].replace(/<[^>]+>/g, " ").trim()
          : "";
        snippet = decodeHtmlEntities(snippet)
          .replace(/\s+/g, " ")
          .slice(0, 300);
        if (addResult(resultUrl, title, snippet)) break;
      }
    }
  }
  // Pass 3: h2-heading/anchor/paragraph triples.
  if (results.length < numResults) {
    const h2Pattern =
      /<h2[^>]*>[\s\S]*?<a[^>]*href=["'](https?:\/\/[^"']+)["'][^>]*>([\s\S]*?)<\/a>[\s\S]*?<\/h2>[\s\S]{0,800}?<p[^>]*>([\s\S]*?)<\/p>/gi;
    let m;
    while ((m = h2Pattern.exec(html)) !== null && results.length < numResults) {
      let resultUrl = resolveUrl(m[1]);
      if (resultUrl.includes("bing.com") || resultUrl.includes("microsoft.com"))
        continue;
      const title = decodeHtmlEntities(m[2].replace(/<[^>]+>/g, "").trim());
      let snippet = m[3].replace(/<[^>]+>/g, " ").trim();
      snippet = decodeHtmlEntities(snippet).replace(/\s+/g, " ").slice(0, 300);
      addResult(resultUrl, title, snippet);
    }
  }
  // Pass 4: any remaining external anchor whose text looks like a title.
  if (results.length < numResults) {
    linkRegexBing.lastIndex = 0;
    let lm;
    while (
      (lm = linkRegexBing.exec(html)) !== null &&
      results.length < numResults
    ) {
      let url = resolveUrl(lm[1]);
      if (
        url.includes("bing.com") ||
        url.includes("microsoft.com") ||
        seenUrls.has(url)
      )
        continue;
      let title = lm[2].replace(/<[^>]+>/g, "").trim();
      title = decodeHtmlEntities(title);
      if (title.length < 4 || title.length > 250) continue;
      if (/^(http|www|search|images|video|news|maps)/i.test(title)) continue;
      // Fixed: addResult takes (url, title, snippet) — stray 4th argument removed.
      addResult(url, title, "");
    }
  }
  return results;
}
|
|
3337
|
+
|
|
3338
|
+
/**
 * Search Bing (no API) - scrape SERP HTML. Uses SerpAPI if SERPAPI_API_KEY set; else proxy if WEBSEARCH_PROXY/HTTPS_PROXY set. Retries with headless browser if fetch returns 0 results.
 *
 * @param {string} query - Search terms (URL-encoded before use).
 * @param {number} numResults - Maximum number of results to return.
 * @returns {Promise<Array<{url: string, title: string, snippet: string, source: string}>>} May be empty.
 */
async function searchBing(query, numResults) {
  const viaSerpApi = await fetchSerpApiBing(query, numResults);
  if (viaSerpApi && viaSerpApi.length > 0) return viaSerpApi;

  // Over-fetch a little so filtered links still leave enough results.
  const count = Math.min(numResults + 5, 30);
  const url = `https://www.bing.com/search?q=${encodeURIComponent(query)}&count=${count}`;
  const headers = getBingSearchHeaders();

  const response = await fetchWithTimeout(
    url,
    { headers, useWebSearchProxy: true },
    15000,
  );
  let html = await response.text();

  // Pages under ~2KB are interstitials/errors, never a real SERP.
  if (html.length < 2000) {
    console.log(
      "[WebSearch] Bing: response too short (length=",
      html.length,
      ", status=",
      response.status,
      ")",
    );
    logZeroResultHtml("Bing", html);
    return [];
  }

  let results = parseBingResults(html, numResults);

  const browserAllowed = process.env.WEBSEARCH_USE_BROWSER !== "0";
  if (results.length === 0 && browserAllowed) {
    try {
      console.log("[WebSearch] Bing retrying with headless browser...");
      html = await fetchHtmlWithBrowser(url);
      results = parseBingResults(html, numResults);
    } catch (e) {
      console.log("[WebSearch] Bing browser fetch failed:", e?.message || e);
    }
  }

  if (results.length > 0) {
    console.log("[WebSearch] Bing found:", results.length, "results");
  } else {
    // Diagnostic counts of the SERP markers the parser looks for.
    const algoCount = (html.match(/b_algo|b_ans/gi) || []).length;
    const h2Count = (html.match(/<h2[^>]*>/gi) || []).length;
    const linkCount = (html.match(/<a[^>]*href="https?:\/\//gi) || []).length;
    console.log(
      "[WebSearch] Bing: 0 results | htmlLen=",
      html.length,
      "status=",
      response.status,
      "| b_algo=",
      algoCount,
      "h2=",
      h2Count,
      "links=",
      linkCount,
    );
    logZeroResultHtml("Bing", html);
  }
  return results;
}
|
|
3401
|
+
|
|
3402
|
+
/**
 * Parse Yahoo SERP HTML and return results array (used by fetch and browser fallback).
 *
 * Strategy: collect candidate anchors (generic external anchors first, then
 * Yahoo-specific "title"/"dd"-classed anchors if nothing matched), attach a
 * snippet from the HTML immediately following each candidate, then fall back
 * to progressively looser scans until numResults are collected.
 *
 * @param {string} html - Raw SERP HTML.
 * @param {number} numResults - Maximum number of results to return.
 * @returns {Array<{url: string, title: string, snippet: string, source: string}>}
 */
function parseYahooResults(html, numResults) {
  const results = [];
  const seenUrls = new Set();
  const linkRegex =
    /<a[^>]*href=["'](https?:\/\/[^"']+)["'][^>]*>([\s\S]*?)<\/a>/gi;
  // Hosts that are search-engine chrome, ads, or redirect infrastructure.
  const avoidHosts =
    /yahoo\.com|bing\.com|google\.com|doubleclick|yimg\.com|search\.yahoo/i;
  let m;
  const candidates = [];
  // Pass 1: every external anchor on the page becomes a candidate.
  while ((m = linkRegex.exec(html)) !== null) {
    let resultUrl = m[1];
    if (avoidHosts.test(resultUrl)) {
      // Yahoo wraps organic links in search.yahoo redirects; try to unwrap.
      const resolved = resolveYahooRedirect(resultUrl);
      if (resolved && !avoidHosts.test(resolved)) resultUrl = resolved;
      else continue;
    }
    let title = m[2].replace(/<[^>]+>/g, "").trim();
    title = decodeHtmlEntities(title);
    if (!title) {
      // Untitled anchor: use the hostname as a stand-in title.
      try {
        title = new URL(resultUrl).hostname.replace("www.", "");
      } catch {
        continue; // unparseable URL — drop the candidate
      }
    }
    if (title.length < 2 || title.length > 400) continue;
    // Skip Yahoo's own navigation links.
    if (
      /^(Sign in|Mail|News|Sports|Finance|Weather|Settings|Help)$/i.test(title)
    )
      continue;
    candidates.push({ url: resultUrl, title, index: m.index });
  }
  // Pass 2: nothing matched, but the page looks like a Yahoo results layout —
  // try anchors carrying Yahoo's "title"/"dd" result classes.
  if (
    candidates.length === 0 &&
    /#web|\.dd\s|class="[^"]*title[^"]*"/.test(html)
  ) {
    const altHref =
      /<a[^>]*class="[^"]*(?:title|dd)[^"]*"[^>]*href=["'](https?:\/\/[^"']+)["'][^>]*>([\s\S]*?)<\/a>/gi;
    altHref.lastIndex = 0;
    while (
      (m = altHref.exec(html)) !== null &&
      candidates.length < numResults * 2
    ) {
      let resultUrl = m[1];
      if (avoidHosts.test(resultUrl)) {
        const resolved = resolveYahooRedirect(resultUrl);
        if (resolved && !avoidHosts.test(resolved)) resultUrl = resolved;
        else continue;
      }
      let title = m[2].replace(/<[^>]+>/g, "").trim();
      title = decodeHtmlEntities(title);
      if (!title)
        try {
          title = new URL(resultUrl).hostname.replace("www.", "");
        } catch {
          continue;
        }
      if (title.length < 2 || title.length > 400) continue;
      candidates.push({ url: resultUrl, title, index: m.index });
    }
  }
  // Promote candidates to results, pulling a snippet from the 600 chars of
  // HTML that follow each anchor.
  for (const c of candidates) {
    if (seenUrls.has(c.url)) continue;
    seenUrls.add(c.url);
    const context = html.substring(
      c.index,
      Math.min(html.length, c.index + 600),
    );
    const snippetMatch =
      context.match(/<p[^>]*>([\s\S]*?)<\/p>/i) ||
      context.match(/class="[^"]*desc[^"]*"[^>]*>([\s\S]*?)<\//);
    let snippet = snippetMatch
      ? snippetMatch[1].replace(/<[^>]+>/g, " ").trim()
      : "";
    snippet = decodeHtmlEntities(snippet).replace(/\s+/g, " ").slice(0, 300);
    let source = "";
    try {
      source = new URL(c.url).hostname.replace("www.", "");
    } catch {}
    results.push({
      url: c.url,
      title: c.title.slice(0, 300),
      snippet: snippet || `From ${source}`,
      source,
    });
    if (results.length >= numResults) break;
  }
  // Pass 3 (only when no candidates at all): rescan anchors with looser
  // title-length bounds (1–500 vs 2–400), snippets defaulted to the host.
  if (results.length < numResults && candidates.length === 0) {
    const linkRegex2 =
      /<a[^>]*href=["'](https?:\/\/[^"']+)["'][^>]*>([\s\S]*?)<\/a>/gi;
    let m2;
    while (
      (m2 = linkRegex2.exec(html)) !== null &&
      results.length < numResults
    ) {
      let resultUrl = m2[1];
      if (avoidHosts.test(resultUrl)) {
        const resolved = resolveYahooRedirect(resultUrl);
        if (resolved && !avoidHosts.test(resolved)) resultUrl = resolved;
        else continue;
      }
      if (seenUrls.has(resultUrl)) continue;
      let title = m2[2].replace(/<[^>]+>/g, "").trim();
      title = decodeHtmlEntities(title);
      if (!title) {
        try {
          title = new URL(resultUrl).hostname.replace("www.", "");
        } catch {
          continue;
        }
      }
      if (title.length < 1 || title.length > 500) continue;
      if (
        /^(Sign in|Mail|News|Sports|Finance|Weather|Settings|Help)$/i.test(
          title,
        )
      )
        continue;
      seenUrls.add(resultUrl);
      let source = "";
      try {
        source = new URL(resultUrl).hostname.replace("www.", "");
      } catch {}
      results.push({
        url: resultUrl,
        title: title.slice(0, 300),
        snippet: `From ${source}`,
        source,
      });
    }
  }
  // Pass 4: split on result-container class names (algo/srch/dd) and take the
  // first usable link in each block.
  if (results.length < numResults) {
    const blocks = html.split(/class="[^"]*(?:algo|srch|dd)[^"]*"[^>]*>/i);
    for (let i = 1; i < blocks.length && results.length < numResults; i++) {
      const block = blocks[i].substring(0, 1500);
      const linkMatch = block.match(
        /<a[^>]*href=["'](https?:\/\/[^"']+)["'][^>]*>([\s\S]*?)<\/a>/i,
      );
      if (!linkMatch) continue;
      let resultUrl = linkMatch[1];
      if (avoidHosts.test(resultUrl)) {
        const resolved = resolveYahooRedirect(resultUrl);
        if (resolved && !avoidHosts.test(resolved)) resultUrl = resolved;
        else continue;
      }
      let title = linkMatch[2].replace(/<[^>]+>/g, "").trim();
      title = decodeHtmlEntities(title);
      if (!title) {
        try {
          title = new URL(resultUrl).hostname.replace("www.", "");
        } catch {
          continue;
        }
      }
      if (title.length < 2) continue;
      if (seenUrls.has(resultUrl)) continue;
      seenUrls.add(resultUrl);
      let source = "";
      try {
        source = new URL(resultUrl).hostname.replace("www.", "");
      } catch {}
      results.push({
        url: resultUrl,
        title: title.slice(0, 300),
        snippet: `From ${source}`,
        source,
      });
    }
  }
  return results;
}
|
|
3574
|
+
|
|
3575
|
+
/**
 * Search Yahoo (no API) - scrape SERP HTML. Retries with headless browser if fetch returns 0 results.
 *
 * @param {string} query - Search terms (URL-encoded before use).
 * @param {number} numResults - Maximum number of results to return.
 * @returns {Promise<Array<{url: string, title: string, snippet: string, source: string}>>} May be empty.
 */
async function searchYahoo(query, numResults) {
  const url = `https://search.yahoo.com/search?p=${encodeURIComponent(query)}`;
  const headers = getBrowserHeaders("https://search.yahoo.com");
  headers["Sec-Fetch-Site"] = "none"; // direct navigation, not cross-site

  const response = await fetchWithTimeout(
    url,
    { headers, useWebSearchProxy: true },
    12000,
  );
  let html = await response.text();
  // Pages under ~2KB are interstitials/errors, never a real SERP.
  if (html.length < 2000) return [];

  let results = parseYahooResults(html, numResults);

  if (results.length === 0 && process.env.WEBSEARCH_USE_BROWSER !== "0") {
    try {
      console.log("[WebSearch] Yahoo retrying with headless browser...");
      html = await fetchHtmlWithBrowser(url);
      results = parseYahooResults(html, numResults);
    } catch (e) {
      console.log("[WebSearch] Yahoo browser fetch failed:", e?.message || e);
    }
  }

  if (results.length === 0) {
    // Diagnostic counts of the SERP markers the parser looks for.
    const algoCount = (html.match(/class="[^"]*algo[^"]*"/gi) || []).length;
    const linkCount = (html.match(/<a[^>]*href="https?:\/\//gi) || []).length;
    console.log(
      "[WebSearch] Yahoo: 0 results | htmlLen=",
      html.length,
      "| algo=",
      algoCount,
      "links=",
      linkCount,
    );
    logZeroResultHtml("Yahoo", html);
  } else {
    // Fixed: only log "found" on success, matching searchGoogle/searchBing
    // (previously this also printed "Yahoo found: 0 results" after the
    // zero-result diagnostics above).
    console.log("[WebSearch] Yahoo found:", results.length, "results");
  }
  return results;
}
|
|
3619
|
+
|
|
3620
|
+
/**
 * Parse Ecosia SERP HTML and return results array (used by fetch and browser fallback).
 *
 * Pass 1 splits the page on result-container class names and takes the first
 * link of each chunk (with a nearby snippet). Pass 2 is a generic anchor scan
 * that fills any remaining slots.
 *
 * @param {string} html - Raw SERP HTML.
 * @param {number} numResults - Maximum number of results to return.
 * @returns {Array<{url: string, title: string, snippet: string, source: string}>}
 */
function parseEcosiaResults(html, numResults) {
  const results = [];
  const seen = new Set();
  const skipHosts = /ecosia\.org|duckduckgo\.com/i;
  // Hostname with a leading "www." removed; empty string when unparseable.
  const hostOf = (u) => {
    try {
      return new URL(u).hostname.replace("www.", "");
    } catch {
      return "";
    }
  };
  // Anchor text -> plain decoded title.
  const toTitle = (s) => decodeHtmlEntities(s.replace(/<[^>]+>/g, "").trim());

  // Pass 1: result-container chunks.
  const chunks = html.split(
    /class="[^"]*(?:result|card|organic|abstract)[^"]*"[^>]*>/i,
  );
  for (let i = 1; i < chunks.length; i++) {
    if (results.length >= numResults) break;
    const chunk = chunks[i].substring(0, 2500);
    const link = chunk.match(
      /<a[^>]*href="(https?:\/\/[^"]+)"[^>]*>([\s\S]*?)<\/a>/i,
    );
    if (!link) continue;
    const pageUrl = link[1];
    if (skipHosts.test(pageUrl) || seen.has(pageUrl)) continue;
    const title = toTitle(link[2]);
    if (!title || title.length < 3) continue;
    const descMatch =
      chunk.match(
        /class="[^"]*(?:snippet|description|abstract)[^"]*"[^>]*>([\s\S]*?)<\//i,
      ) || chunk.match(/<p[^>]*>([\s\S]*?)<\/p>/i);
    let desc = descMatch ? descMatch[1].replace(/<[^>]+>/g, " ").trim() : "";
    desc = decodeHtmlEntities(desc).replace(/\s+/g, " ").slice(0, 300);
    seen.add(pageUrl);
    const source = hostOf(pageUrl);
    results.push({
      url: pageUrl,
      title: title.slice(0, 300),
      snippet: desc || `From ${source}`,
      source,
    });
  }

  // Pass 2: generic anchor scan over the whole page for any remaining slots.
  if (results.length < numResults) {
    const anchorRe = /<a[^>]*href="(https?:\/\/[^"]+)"[^>]*>([\s\S]*?)<\/a>/gi;
    for (let hit; (hit = anchorRe.exec(html)) !== null; ) {
      if (results.length >= numResults) break;
      const pageUrl = hit[1];
      if (skipHosts.test(pageUrl) || seen.has(pageUrl)) continue;
      const title = toTitle(hit[2]);
      if (title.length < 3 || title.length > 250) continue;
      // Skip Ecosia chrome/navigation anchors.
      if (/^(Ecosia|Plant|Privacy|Settings|Donate|Sign)/i.test(title)) continue;
      seen.add(pageUrl);
      const source = hostOf(pageUrl);
      results.push({
        url: pageUrl,
        title: title.slice(0, 300),
        snippet: `From ${source}`,
        source,
      });
    }
  }
  return results;
}
|
|
3687
|
+
|
|
3688
|
+
/**
 * Search Ecosia (no API) - scrape SERP HTML. Retries with headless browser if fetch returns 0 (e.g. Cloudflare).
 *
 * @param {string} query - Search terms (URL-encoded before use).
 * @param {number} numResults - Maximum number of results to return.
 * @returns {Promise<Array<{url: string, title: string, snippet: string, source: string}>>} May be empty.
 */
async function searchEcosia(query, numResults) {
  // Cloudflare interstitials never settle without JavaScript, so detect them
  // once and reuse for both the retry decision and the diagnostics.
  // Fixed: the previous pattern listed both `cf_chl_opt` and `cF_chl_opt` —
  // the /i flag already makes a single alternative cover every casing.
  const isCloudflareChallenge = (page) =>
    /Just a moment|Enable JavaScript and cookies|cf_chl_opt/i.test(page);

  const url = `https://www.ecosia.org/search?q=${encodeURIComponent(query)}`;
  const headers = getBrowserHeaders("https://www.ecosia.org");
  headers["Sec-Fetch-Site"] = "none"; // direct navigation, not cross-site

  const response = await fetchWithTimeout(
    url,
    { headers, useWebSearchProxy: true },
    12000,
  );
  let html = await response.text();
  // Pages under ~2KB are interstitials/errors, never a real SERP.
  if (html.length < 2000) return [];

  let results = parseEcosiaResults(html, numResults);

  // Skip browser retry when Cloudflare challenge is present (page never settles, would timeout)
  if (
    results.length === 0 &&
    process.env.WEBSEARCH_USE_BROWSER !== "0" &&
    !isCloudflareChallenge(html)
  ) {
    try {
      console.log("[WebSearch] Ecosia retrying with headless browser...");
      html = await fetchHtmlWithBrowser(url);
      results = parseEcosiaResults(html, numResults);
    } catch (e) {
      console.log("[WebSearch] Ecosia browser fetch failed:", e?.message || e);
    }
  }

  if (results.length === 0) {
    if (isCloudflareChallenge(html))
      console.log(
        "[WebSearch] Ecosia: Cloudflare challenge (page requires JavaScript; try VPN or browser)",
      );
    // Diagnostic counts of the SERP markers the parser looks for.
    const resultBlockCount = (
      html.match(/class="[^"]*(?:result|card|organic|abstract)[^"]*"/gi) || []
    ).length;
    const linkCount = (html.match(/<a[^>]*href="https?:\/\//gi) || []).length;
    console.log(
      "[WebSearch] Ecosia: 0 results | htmlLen=",
      html.length,
      "| resultBlocks=",
      resultBlockCount,
      "links=",
      linkCount,
    );
    logZeroResultHtml("Ecosia", html);
  } else {
    // Fixed: only log "found" on success, matching searchGoogle/searchBing.
    console.log("[WebSearch] Ecosia found:", results.length, "results");
  }
  return results;
}
|
|
3751
|
+
|
|
3752
|
+
/**
|
|
3753
|
+
* Fetch and extract text content from a URL
|
|
3754
|
+
* Enhanced to better handle structured data like tables and lists
|
|
3755
|
+
*/
|
|
3756
|
+
export async function fetchUrlContent(url, timeoutMs = 15000) {
|
|
3757
|
+
console.log("[FetchURL] Fetching:", url);
|
|
3758
|
+
|
|
3759
|
+
const response = await fetchWithTimeout(
|
|
3760
|
+
url,
|
|
3761
|
+
{
|
|
3762
|
+
headers: {
|
|
3763
|
+
"User-Agent":
|
|
3764
|
+
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
|
|
3765
|
+
Accept:
|
|
3766
|
+
"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
|
3767
|
+
"Accept-Language": "en-US,en;q=0.9",
|
|
3768
|
+
"Cache-Control": "no-cache",
|
|
3769
|
+
},
|
|
3770
|
+
},
|
|
3771
|
+
timeoutMs,
|
|
3772
|
+
);
|
|
3773
|
+
|
|
3774
|
+
const html = await response.text();
|
|
3775
|
+
|
|
3776
|
+
// Extract title
|
|
3777
|
+
const titleMatch = html.match(/<title[^>]*>([^<]*)<\/title>/i);
|
|
3778
|
+
const title = titleMatch ? decodeHtmlEntities(titleMatch[1].trim()) : "";
|
|
3779
|
+
|
|
3780
|
+
// Extract tables and convert to readable format
|
|
3781
|
+
let tables = [];
|
|
3782
|
+
const tablePattern = /<table[^>]*>([\s\S]*?)<\/table>/gi;
|
|
3783
|
+
let tableMatch;
|
|
3784
|
+
while ((tableMatch = tablePattern.exec(html)) !== null && tables.length < 5) {
|
|
3785
|
+
const tableHtml = tableMatch[1];
|
|
3786
|
+
const rows = [];
|
|
3787
|
+
|
|
3788
|
+
// Extract headers
|
|
3789
|
+
const headerPattern = /<th[^>]*>([\s\S]*?)<\/th>/gi;
|
|
3790
|
+
const headers = [];
|
|
3791
|
+
let headerMatch;
|
|
3792
|
+
while ((headerMatch = headerPattern.exec(tableHtml)) !== null) {
|
|
3793
|
+
headers.push(
|
|
3794
|
+
decodeHtmlEntities(headerMatch[1].replace(/<[^>]+>/g, "").trim()),
|
|
3795
|
+
);
|
|
3796
|
+
}
|
|
3797
|
+
if (headers.length > 0) {
|
|
3798
|
+
rows.push(headers.join(" | "));
|
|
3799
|
+
rows.push(headers.map(() => "---").join(" | "));
|
|
3800
|
+
}
|
|
3801
|
+
|
|
3802
|
+
// Extract rows
|
|
3803
|
+
const rowPattern = /<tr[^>]*>([\s\S]*?)<\/tr>/gi;
|
|
3804
|
+
let rowMatch;
|
|
3805
|
+
let rowCount = 0;
|
|
3806
|
+
while ((rowMatch = rowPattern.exec(tableHtml)) !== null && rowCount < 50) {
|
|
3807
|
+
const cellPattern = /<td[^>]*>([\s\S]*?)<\/td>/gi;
|
|
3808
|
+
const cells = [];
|
|
3809
|
+
let cellMatch;
|
|
3810
|
+
while ((cellMatch = cellPattern.exec(rowMatch[1])) !== null) {
|
|
3811
|
+
cells.push(
|
|
3812
|
+
decodeHtmlEntities(cellMatch[1].replace(/<[^>]+>/g, "").trim()),
|
|
3813
|
+
);
|
|
3814
|
+
}
|
|
3815
|
+
if (cells.length > 0) {
|
|
3816
|
+
rows.push(cells.join(" | "));
|
|
3817
|
+
rowCount++;
|
|
3818
|
+
}
|
|
3819
|
+
}
|
|
3820
|
+
|
|
3821
|
+
if (rows.length > 1) {
|
|
3822
|
+
tables.push(rows.join("\n"));
|
|
3823
|
+
}
|
|
3824
|
+
}
|
|
3825
|
+
|
|
3826
|
+
// Extract lists (ul/ol) and convert to readable format
|
|
3827
|
+
let lists = [];
|
|
3828
|
+
const listPattern = /<(?:ul|ol)[^>]*>([\s\S]*?)<\/(?:ul|ol)>/gi;
|
|
3829
|
+
let listMatch;
|
|
3830
|
+
while ((listMatch = listPattern.exec(html)) !== null && lists.length < 10) {
|
|
3831
|
+
const listHtml = listMatch[1];
|
|
3832
|
+
const items = [];
|
|
3833
|
+
const itemPattern = /<li[^>]*>([\s\S]*?)<\/li>/gi;
|
|
3834
|
+
let itemMatch;
|
|
3835
|
+
while (
|
|
3836
|
+
(itemMatch = itemPattern.exec(listHtml)) !== null &&
|
|
3837
|
+
items.length < 30
|
|
3838
|
+
) {
|
|
3839
|
+
const itemText = decodeHtmlEntities(
|
|
3840
|
+
itemMatch[1]
|
|
3841
|
+
.replace(/<[^>]+>/g, " ")
|
|
3842
|
+
.replace(/\s+/g, " ")
|
|
3843
|
+
.trim(),
|
|
3844
|
+
);
|
|
3845
|
+
if (itemText.length > 5 && itemText.length < 500) {
|
|
3846
|
+
items.push(`• ${itemText}`);
|
|
3847
|
+
}
|
|
3848
|
+
}
|
|
3849
|
+
if (items.length > 2) {
|
|
3850
|
+
lists.push(items.join("\n"));
|
|
3851
|
+
}
|
|
3852
|
+
}
|
|
3853
|
+
|
|
3854
|
+
// Convert main HTML to text (removing scripts, styles, nav, footer)
|
|
3855
|
+
let text = html
|
|
3856
|
+
.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, "")
|
|
3857
|
+
.replace(/<style[^>]*>[\s\S]*?<\/style>/gi, "")
|
|
3858
|
+
.replace(/<nav[^>]*>[\s\S]*?<\/nav>/gi, "")
|
|
3859
|
+
.replace(/<footer[^>]*>[\s\S]*?<\/footer>/gi, "")
|
|
3860
|
+
.replace(/<header[^>]*>[\s\S]*?<\/header>/gi, "")
|
|
3861
|
+
.replace(/<aside[^>]*>[\s\S]*?<\/aside>/gi, "")
|
|
3862
|
+
.replace(/<!--[\s\S]*?-->/g, "")
|
|
3863
|
+
// Preserve some structure
|
|
3864
|
+
.replace(/<\/h[1-6]>/gi, "\n\n")
|
|
3865
|
+
.replace(/<\/p>/gi, "\n")
|
|
3866
|
+
.replace(/<br\s*\/?>/gi, "\n")
|
|
3867
|
+
.replace(/<\/div>/gi, "\n")
|
|
3868
|
+
.replace(/<\/li>/gi, "\n")
|
|
3869
|
+
.replace(/<[^>]+>/g, " ")
|
|
3870
|
+
.replace(/ /g, " ")
|
|
3871
|
+
.replace(/&/g, "&")
|
|
3872
|
+
.replace(/</g, "<")
|
|
3873
|
+
.replace(/>/g, ">")
|
|
3874
|
+
.replace(/"/g, '"')
|
|
3875
|
+
.replace(/'/g, "'")
|
|
3876
|
+
.replace(/\n\s*\n\s*\n/g, "\n\n")
|
|
3877
|
+
.replace(/[ \t]+/g, " ")
|
|
3878
|
+
.trim();
|
|
3879
|
+
|
|
3880
|
+
// Limit main text but keep more for data-rich pages
|
|
3881
|
+
const maxTextLength = 12000;
|
|
3882
|
+
if (text.length > maxTextLength) {
|
|
3883
|
+
text = text.slice(0, maxTextLength) + "\n\n... (content truncated)";
|
|
3884
|
+
}
|
|
3885
|
+
|
|
3886
|
+
// Build the output
|
|
3887
|
+
let output = title ? `Title: ${title}\n\n` : "";
|
|
3888
|
+
|
|
3889
|
+
// Include structured data if found
|
|
3890
|
+
if (tables.length > 0) {
|
|
3891
|
+
output += `=== TABLES FOUND (${tables.length}) ===\n\n`;
|
|
3892
|
+
tables.forEach((table, i) => {
|
|
3893
|
+
output += `Table ${i + 1}:\n${table}\n\n`;
|
|
3894
|
+
});
|
|
3895
|
+
}
|
|
3896
|
+
|
|
3897
|
+
if (lists.length > 0 && lists.some((l) => l.split("\n").length > 3)) {
|
|
3898
|
+
output += `=== LISTS FOUND ===\n\n`;
|
|
3899
|
+
lists
|
|
3900
|
+
.filter((l) => l.split("\n").length > 3)
|
|
3901
|
+
.slice(0, 5)
|
|
3902
|
+
.forEach((list, i) => {
|
|
3903
|
+
output += `List ${i + 1}:\n${list}\n\n`;
|
|
3904
|
+
});
|
|
3905
|
+
}
|
|
3906
|
+
|
|
3907
|
+
output += `=== PAGE CONTENT ===\n\n${text}`;
|
|
3908
|
+
|
|
3909
|
+
console.log(
|
|
3910
|
+
"[FetchURL] Extracted",
|
|
3911
|
+
output.length,
|
|
3912
|
+
"chars,",
|
|
3913
|
+
tables.length,
|
|
3914
|
+
"tables,",
|
|
3915
|
+
lists.length,
|
|
3916
|
+
"lists",
|
|
3917
|
+
);
|
|
3918
|
+
return output;
|
|
3919
|
+
}
|
|
3920
|
+
|
|
3921
|
+
/**
 * Deep research - search and fetch multiple pages.
 *
 * Runs a web search for `query`, then fetches up to `numPages` of the
 * resulting URLs and collects their extracted content. Pages whose fetch
 * fails fall back to the search snippet; pages whose content is too short
 * (<= 100 chars) are skipped entirely.
 *
 * @param {string} query - Search query
 * @param {number} [numPages=5] - Maximum number of pages to include
 * @returns {Promise<{success: boolean, query: string, pages: Array, error?: string}>}
 */
export async function deepWebResearch(query, numPages = 5) {
  const results = { success: false, query, pages: [] };

  try {
    // Over-fetch a couple of extra hits so failed/short pages can be skipped.
    const hits = await performWebSearch(query, numPages + 2);
    if (hits.length === 0) {
      results.error = "No search results found";
      return results;
    }

    for (const hit of hits) {
      if (results.pages.length >= numPages) break;

      try {
        const content = await fetchUrlContent(hit.url, 10000);
        // Only keep pages with a meaningful amount of extracted text.
        if (content && content.length > 100) {
          results.pages.push({
            title: hit.title,
            url: hit.url,
            content: content.slice(0, 5000),
            excerpt: content.slice(0, 500),
          });
        }
      } catch (err) {
        // Fetch failed — keep the search-result snippet so the page still
        // contributes something to the research output.
        const fallback = hit.snippet || "";
        results.pages.push({
          title: hit.title,
          url: hit.url,
          content: fallback,
          excerpt: fallback,
        });
      }
    }

    results.success = results.pages.length > 0;
    return results;
  } catch (err) {
    console.error("[DeepResearch] Error:", err);
    return results;
  }
}
|
|
3964
|
+
|
|
3965
|
+
// ============================================
|
|
3966
|
+
// Helper Functions
|
|
3967
|
+
// ============================================
|
|
3968
|
+
|
|
3969
|
+
/**
 * Escape special regex characters in a string
 * @param {string} string - String to escape
 * @returns {string} - Escaped string safe for use in RegExp
 */
function escapeRegExp(string) {
  // Prefix every regex metacharacter with a backslash so the input is
  // matched literally when embedded in a RegExp.
  const metaChars = /[.*+?^${}()|[\]\\]/g;
  return string.replace(metaChars, (match) => `\\${match}`);
}
|
|
3977
|
+
|
|
3978
|
+
/**
 * Generate a simple unified diff-like output for display
 * @param {string} oldStr - Original text
 * @param {string} newStr - New text
 * @param {string} filePath - Path to the file being edited
 * @returns {string} - Diff-like output
 */
function generateSimpleDiff(oldStr, newStr, filePath) {
  // Cap how many lines of each side are shown, for readability.
  const LINE_LIMIT = 20;
  const removed = oldStr.split("\n");
  const added = newStr.split("\n");

  const parts = [
    `--- a/${filePath}`,
    `+++ b/${filePath}`,
    `@@ -1,${removed.length} +1,${added.length} @@`,
  ];

  // Old content first, each line prefixed with "-".
  for (const line of removed.slice(0, LINE_LIMIT)) {
    parts.push(`-${line}`);
  }
  if (removed.length > LINE_LIMIT) {
    parts.push(`... (${removed.length - LINE_LIMIT} more lines removed)`);
  }

  // Then new content, each line prefixed with "+".
  for (const line of added.slice(0, LINE_LIMIT)) {
    parts.push(`+${line}`);
  }
  if (added.length > LINE_LIMIT) {
    parts.push(`... (${added.length - LINE_LIMIT} more lines added)`);
  }

  // Every part is newline-terminated in the original's concatenation.
  return parts.join("\n") + "\n";
}
|
|
4015
|
+
|
|
4016
|
+
/**
 * Resolve path with home directory expansion and normalization.
 *
 * Only "~" by itself and "~/..." are expanded to the current user's home
 * directory. Previously any path starting with "~" was expanded, which
 * silently turned "~otheruser/file" into "<home>/otheruser/file"; other
 * users' homes cannot be looked up portably, so such paths are now
 * resolved literally instead.
 *
 * @param {string} path - Path to resolve (absolute, relative, or ~-prefixed)
 * @returns {string} - Normalized absolute path
 * @throws {Error} - If path is empty or not a string
 */
function resolvePath(path) {
  if (!path || typeof path !== "string") {
    throw new Error("Invalid path");
  }

  let resolved;
  if (path === "~" || path.startsWith("~/")) {
    // join() handles the separator between home and the remainder.
    resolved = join(homedir(), path.slice(1));
  } else {
    resolved = resolve(path);
  }

  // Normalize to remove . and .. components
  return normalize(resolved);
}
|
|
4036
|
+
|
|
4037
|
+
/**
 * Heuristic check for path-traversal attempts in a user-supplied path.
 *
 * Only relative inputs containing ".." that, when resolved against the
 * current working directory, land outside of it are flagged. Absolute and
 * "~"-prefixed inputs are never flagged here.
 *
 * NOTE(review): resolvedPath is accepted but never consulted — the check
 * relies solely on inputPath; confirm whether that is intentional.
 *
 * @param {string} inputPath - Original user-provided path
 * @param {string} resolvedPath - Resolved absolute path (currently unused)
 * @returns {boolean} - True if path traversal detected
 */
function hasPathTraversal(inputPath, resolvedPath) {
  // Without ".." there is nothing to escape with.
  if (!inputPath.includes("..")) {
    return false;
  }

  // Absolute and home-relative inputs are deliberately exempt here.
  if (inputPath.startsWith("/") || inputPath.startsWith("~")) {
    return false;
  }

  // If resolving against the cwd walks outside of it, the ".." components
  // escaped the expected directory.
  const baseDir = process.cwd();
  return !resolve(baseDir, inputPath).startsWith(baseDir);
}
|
|
4062
|
+
|
|
4063
|
+
/**
 * Check if a path is in the blocked list.
 *
 * Matching is case-insensitive substring matching against BLOCKED_PATHS.
 * If the path exists and is a symlink, its real target is checked too so a
 * link into a blocked location is also rejected.
 *
 * @param {string} path - Resolved absolute path
 * @returns {boolean} - True if path is blocked
 */
function isBlockedPath(path) {
  const matchesBlocked = (candidate) => {
    const lowered = candidate.toLowerCase();
    return BLOCKED_PATHS.some((blocked) =>
      lowered.includes(blocked.toLowerCase()),
    );
  };

  if (matchesBlocked(normalize(path))) {
    return true;
  }

  // Follow symlinks: a link that points at a blocked path is blocked too.
  try {
    if (existsSync(path) && lstatSync(path).isSymbolicLink()) {
      if (matchesBlocked(realpathSync(path))) {
        return true;
      }
    }
  } catch (err) {
    // If we can't stat, let the actual operation fail
  }

  return false;
}
|
|
4096
|
+
|
|
4097
|
+
/**
 * Check if path is allowed for reading.
 *
 * Enforces, in order: the global blocked-path list, the config's
 * `permissions.fileRead` switch, and (when configured) the
 * `permissions.allowedReadPaths` prefix allow-list.
 *
 * @param {string} path - Resolved absolute path
 * @param {object} config - Configuration with permissions
 * @returns {boolean} - True when the path passes all checks
 * @throws {Error} - If path is not allowed
 */
function checkReadPermission(path, config) {
  if (isBlockedPath(path)) {
    throw new Error(`Access denied: Cannot read sensitive path`);
  }

  if (config?.permissions?.fileRead === false) {
    throw new Error("File read access is disabled in configuration");
  }

  // When an allow-list is configured, the path must live under one entry.
  const allowedReadPaths = config?.permissions?.allowedReadPaths;
  if (allowedReadPaths) {
    const withinAllowed = allowedReadPaths.some((allowedPath) =>
      path.startsWith(resolvePath(allowedPath)),
    );
    if (!withinAllowed) {
      throw new Error(`Access denied: Path not in allowed read paths`);
    }
  }

  return true;
}
|
|
4128
|
+
|
|
4129
|
+
/**
 * Check if path is allowed for writing.
 *
 * Enforces, in order: the global blocked-path list, the config's
 * `permissions.fileWrite` switch, and (when configured) the
 * `permissions.allowedWritePaths` prefix allow-list.
 *
 * @param {string} path - Resolved absolute path
 * @param {object} config - Configuration with permissions
 * @returns {boolean} - True when the path passes all checks
 * @throws {Error} - If path is not allowed
 */
function checkWritePermission(path, config) {
  if (isBlockedPath(path)) {
    throw new Error(`Access denied: Cannot write to sensitive path`);
  }

  if (config?.permissions?.fileWrite === false) {
    throw new Error("File write access is disabled in configuration");
  }

  // When an allow-list is configured, the path must live under one entry.
  const allowedWritePaths = config?.permissions?.allowedWritePaths;
  if (allowedWritePaths) {
    const withinAllowed = allowedWritePaths.some((allowedPath) =>
      path.startsWith(resolvePath(allowedPath)),
    );
    if (!withinAllowed) {
      throw new Error(`Access denied: Path not in allowed write paths`);
    }
  }

  return true;
}
|
|
4160
|
+
|
|
4161
|
+
/**
 * Validate and sanitize a shell command.
 *
 * Rejects non-strings, commands matching any DANGEROUS_COMMAND_PATTERNS
 * entry, and commands containing null bytes. Shell substitution (`$(` or
 * backticks) is allowed but logged as a warning.
 *
 * @param {string} command - Command to validate
 * @returns {{ valid: boolean, reason?: string }}
 */
function validateCommand(command) {
  if (!command || typeof command !== "string") {
    return { valid: false, reason: "Invalid command" };
  }

  const hitsDangerousPattern = DANGEROUS_COMMAND_PATTERNS.some((pattern) =>
    pattern.test(command),
  );
  if (hitsDangerousPattern) {
    return { valid: false, reason: "Command matches dangerous pattern" };
  }

  // Null bytes are a classic command-injection vector.
  if (command.includes("\0")) {
    return { valid: false, reason: "Command contains null bytes" };
  }

  // Substitution is permitted for legitimate use but worth flagging.
  if (command.includes("$(") || command.includes("`")) {
    console.warn(
      "[validateCommand] Command contains shell substitution - use with caution",
    );
  }

  return { valid: true };
}
|
|
4193
|
+
|
|
4194
|
+
/**
 * Parse a single reversible shell command for undo on regeneration (Strategy 1).
 * Returns { op, path?, path_src?, path_dest?, cwd } or null if not reversible.
 * Paths are resolved against currentWorkingDirectory and must stay within it.
 *
 * Fix: previously only "|" and ";" were rejected, so chained commands
 * ("mkdir a && rm b"), background/redirection operators, and command
 * substitution slipped through and were captured as part of a path
 * argument, producing bogus undo entries. All shell metacharacters that
 * make a command non-simple are now rejected (conservatively returning
 * null, which just means "not undoable").
 */
function parseReversibleCommand(command) {
  if (!command || typeof command !== "string") return null;
  const trimmed = command.trim();

  // Reject anything that is not a single plain command: pipes, command
  // chaining (; & &&), redirection (< >), and substitution (` or $()).
  if (/[|;&<>`]/.test(trimmed) || trimmed.includes("$(")) return null;

  const cwd = currentWorkingDirectory;
  const resolveSafe = (arg) => {
    const resolved = resolve(cwd, arg);
    // Undo must never operate outside the working directory.
    if (!resolved.startsWith(cwd) && resolved !== cwd) return null;
    return resolved;
  };

  // mkdir [-p] path — undone by removing the created directory
  const mkdirMatch = trimmed.match(/^mkdir\s+(-p\s+)?(.+)$/);
  if (mkdirMatch) {
    const path = resolveSafe(mkdirMatch[2].trim());
    if (path == null) return null;
    return { op: "mkdir", path, cwd };
  }

  // touch path — undone by deleting the file
  const touchMatch = trimmed.match(/^touch\s+(.+)$/);
  if (touchMatch) {
    const path = resolveSafe(touchMatch[1].trim());
    if (path == null) return null;
    return { op: "touch", path, cwd };
  }

  // cp [-r] src dest — undone by deleting dest (cp leaves src untouched,
  // so only the destination needs to be recorded)
  const cpMatch = trimmed.match(/^cp\s+(-[rR]\s+)?(\S+)\s+(\S+)$/);
  if (cpMatch) {
    const pathDest = resolveSafe(cpMatch[3]);
    if (pathDest == null) return null;
    return { op: "cp", path_dest: pathDest, cwd };
  }

  // mv src dest — undone by moving dest back to src
  const mvMatch = trimmed.match(/^mv\s+(\S+)\s+(\S+)$/);
  if (mvMatch) {
    const pathSrc = resolveSafe(mvMatch[1]);
    const pathDest = resolveSafe(mvMatch[2]);
    if (pathSrc == null || pathDest == null) return null;
    return { op: "mv", path_src: pathSrc, path_dest: pathDest, cwd };
  }

  return null;
}
|
|
4246
|
+
|
|
4247
|
+
/**
 * Get timeout for a specific tool
 * @param {string} toolName - Name of the tool
 * @returns {number} - Timeout in milliseconds
 */
function getToolTimeout(toolName) {
  // ?? (not ||) so a per-tool timeout explicitly configured as 0 is
  // respected rather than silently replaced by the default.
  return TOOL_TIMEOUTS[toolName] ?? DEFAULT_TOOL_TIMEOUT;
}
|
|
4255
|
+
|
|
4256
|
+
/**
 * Tools that support snapshot callbacks for undo on regeneration
 */
// Consulted by executeTool to decide whether to forward a snapshot callback.
const SNAPSHOT_TOOLS = ["write_file", "edit_file"];
|
|
4260
|
+
|
|
4261
|
+
/**
 * Execute a tool by name with timeout and error handling
 * @param {string} name - Tool name
 * @param {object} args - Tool arguments
 * @param {object} config - Configuration
 * @param {function} snapshotFn - Optional callback for capturing file snapshots (for undo on regeneration)
 * @param {function} shellUndoFn - Optional callback for recording reversible shell commands (Strategy 1)
 * @param {string|null} toolCallId - Optional ID for this tool call (for snapshot/shell undo traceability)
 * @returns {Promise<string>} - Tool result or error message
 */
export async function executeTool(
  name,
  args,
  config,
  snapshotFn = null,
  shellUndoFn = null,
  toolCallId = null,
) {
  const tool = TOOLS[name];
  if (!tool) {
    return `Error: Unknown tool "${name}"`;
  }

  // Validate required parameters before doing any work.
  for (const [param, spec] of Object.entries(tool.parameters || {})) {
    if (spec.required && (args[param] === undefined || args[param] === null)) {
      return `Error: Missing required parameter "${param}" for tool "${name}"`;
    }
  }

  const timeout = getToolTimeout(name);

  // Only pass snapshotFn to tools that support it; wrap to pass toolCallId (Strategy 3)
  const shouldPassSnapshot = SNAPSHOT_TOOLS.includes(name) && snapshotFn;
  const wrappedSnapshotFn =
    shouldPassSnapshot && toolCallId
      ? (snapshot) => snapshotFn(snapshot, toolCallId)
      : snapshotFn;
  // execute_command gets shellUndoFn for reversible-command undo (Strategy 3: pass toolCallId)
  const shouldPassShellUndo = name === "execute_command" && shellUndoFn;
  const wrappedShellUndoFn =
    shouldPassShellUndo && toolCallId
      ? (entry) => shellUndoFn(entry, toolCallId)
      : shellUndoFn;

  const runTool = () => {
    if (shouldPassSnapshot) {
      return tool.execute(args, config, wrappedSnapshotFn);
    }
    if (shouldPassShellUndo) {
      return tool.execute(args, config, wrappedShellUndoFn);
    }
    return tool.execute(args, config);
  };

  // Race the tool against a timeout. NOTE: losing the race only stops us
  // waiting — it does not cancel the underlying tool, which may keep running.
  let timeoutId;
  try {
    const result = await Promise.race([
      runTool(),
      new Promise((_, reject) => {
        timeoutId = setTimeout(
          () =>
            reject(
              new Error(`Tool "${name}" timed out after ${timeout / 1000}s`),
            ),
          timeout,
        );
      }),
    ]);

    return result;
  } catch (err) {
    // Non-Error throws have no .message; stringify so the checks below
    // cannot themselves throw.
    const message = err?.message ?? String(err);

    // Handle permission errors specifically
    if (message.includes("Access denied")) {
      console.warn(`[executeTool] Permission denied for ${name}:`, message);
      return `Error: ${message}`;
    }

    // Handle timeout
    if (message.includes("timed out")) {
      console.warn(`[executeTool] Timeout for ${name}:`, message);
      return `Error: ${message}`;
    }

    // Generic error
    console.error(`[executeTool] Error executing ${name}:`, err);
    return `Error executing ${name}: ${message}`;
  } finally {
    // Fix: without this, the pending timer kept the event loop alive for up
    // to `timeout` ms after every successful tool call.
    clearTimeout(timeoutId);
  }
}
|
|
4348
|
+
|
|
4349
|
+
/**
 * Get tool descriptions for system prompt.
 *
 * One line per tool: "- name(param: desc, optionalParam?: desc): description".
 */
export function getToolDescriptions() {
  const lines = [];
  for (const [name, tool] of Object.entries(TOOLS)) {
    // Optional parameters are suffixed with "?".
    const paramList = Object.entries(tool.parameters || {}).map(
      ([p, spec]) => `${p}${spec.required ? "" : "?"}: ${spec.description}`,
    );
    lines.push(`- ${name}(${paramList.join(", ")}): ${tool.description}`);
  }
  return lines.join("\n");
}
|
|
4364
|
+
|
|
4365
|
+
// Aggregate default export mirroring the named exports, plus the
// working-directory helpers defined elsewhere in this module.
export default {
  TOOLS,
  executeTool,
  getToolDescriptions,
  getAgentWorkingDirectory,
  setAgentWorkingDirectory,
  resetAgentWorkingDirectory,
};
|