morpheus-cli 0.8.9 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/runtime/__tests__/keymaker.test.js +5 -2
- package/dist/runtime/apoc.js +6 -6
- package/dist/{devkit/registry.js → runtime/devkit-instrument.js} +5 -29
- package/dist/runtime/keymaker.js +5 -4
- package/dist/runtime/smiths/delegator.js +1 -1
- package/package.json +2 -1
- package/dist/devkit/adapters/shell.js +0 -80
- package/dist/devkit/index.js +0 -11
- package/dist/devkit/tools/browser.js +0 -825
- package/dist/devkit/tools/filesystem.js +0 -235
- package/dist/devkit/tools/git.js +0 -226
- package/dist/devkit/tools/network.js +0 -165
- package/dist/devkit/tools/packages.js +0 -73
- package/dist/devkit/tools/processes.js +0 -130
- package/dist/devkit/tools/shell.js +0 -106
- package/dist/devkit/tools/system.js +0 -132
- package/dist/devkit/types.js +0 -1
- package/dist/devkit/utils.js +0 -45
|
@@ -1,825 +0,0 @@
|
|
|
1
|
-
import { tool } from '@langchain/core/tools';
|
|
2
|
-
import { z } from 'zod';
|
|
3
|
-
import os from 'os';
|
|
4
|
-
import path from 'path';
|
|
5
|
-
import { truncateOutput } from '../utils.js';
|
|
6
|
-
import { registerToolFactory } from '../registry.js';
|
|
7
|
-
import { Readability } from '@mozilla/readability';
|
|
8
|
-
import { JSDOM } from 'jsdom';
|
|
9
|
-
// ─── Local path resolution (standalone Smith, no Morpheus PATHS) ────────────
|
|
10
|
-
// Smith's data directory: the SMITH_HOME environment variable when set,
// otherwise ".smith" under the current user's home directory.
const SMITH_HOME = process.env.SMITH_HOME ?? path.join(os.homedir(), '.smith');
// Directory handed to @puppeteer/browsers as the download cache.
const BROWSER_CACHE = path.join(SMITH_HOME, 'cache', 'browser');
// How long the shared browser may sit unused before it is closed (5 minutes).
const IDLE_TIMEOUT_MS = 300_000;
// ─── Module-level browser singleton ────────────────────────────────────────
// All four start empty and are populated lazily by acquireBrowser().
let installPromise = null; // in-flight ensureChromium() call, if any
let idleTimer = null; // handle of the pending idle-shutdown timeout
let browserInstance = null; // shared browser
let pageInstance = null; // shared page of that browser
|
|
18
|
-
// ─── Common User Agents (rotated to avoid detection) ───────────────────────
|
|
19
|
-
// Desktop Chrome 120 user-agent strings for Windows, macOS and Linux.
const USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
];
/**
 * Picks one entry of USER_AGENTS using Math.random().
 *
 * @returns {string} One of the user-agent strings listed above.
 */
function getRandomUserAgent() {
    const index = Math.floor(Math.random() * USER_AGENTS.length);
    return USER_AGENTS[index];
}
|
|
27
|
-
// ─── Retry helper with exponential backoff ──────────────────────────────────
|
|
28
|
-
/**
 * Runs an async operation and retries it with exponential backoff when it rejects.
 *
 * The wait before retry k (0-based) is `baseDelayMs * 2^k`; there is no wait after
 * the final failed attempt.
 *
 * @param {() => Promise<any>} fn - Operation to run; called once per attempt.
 * @param {number} [maxRetries=3] - Total number of attempts (not extra retries).
 *   Values below 1 (or NaN) are treated as 1 so `fn` always runs at least once.
 * @param {number} [baseDelayMs=1000] - Delay before the first retry, in milliseconds.
 * @returns {Promise<any>} Whatever `fn` resolves to on the first successful attempt.
 * @throws The error from the last attempt when every attempt fails.
 */
async function withRetry(fn, maxRetries = 3, baseDelayMs = 1000) {
    // Without this clamp a count of 0 would skip the loop entirely and `throw undefined`.
    const attempts = maxRetries >= 1 ? maxRetries : 1;
    let lastError;
    for (let attempt = 0; attempt < attempts; attempt++) {
        try {
            return await fn();
        }
        catch (err) {
            lastError = err;
            const hasAttemptsLeft = attempt < attempts - 1;
            if (hasAttemptsLeft) {
                const delay = baseDelayMs * 2 ** attempt;
                await new Promise((resolve) => setTimeout(resolve, delay));
            }
        }
    }
    throw lastError;
}
|
|
44
|
-
/**
 * Resolves the path of the Chrome "stable" executable inside BROWSER_CACHE,
 * installing that build first when the executable is not on disk yet.
 *
 * Progress and status messages are written to stdout during an install.
 *
 * @returns {Promise<string>} Path to the Chrome executable.
 */
async function ensureChromium() {
    const browsersModule = await import('@puppeteer/browsers');
    const { install, resolveBuildId, detectBrowserPlatform, computeExecutablePath } = browsersModule;
    const chrome = browsersModule.Browser.CHROME;
    const buildId = await resolveBuildId(chrome, detectBrowserPlatform(), 'stable');
    // Where this build's executable would live if it had already been installed.
    const expectedPath = computeExecutablePath({ browser: chrome, buildId, cacheDir: BROWSER_CACHE });
    const fsExtra = (await import('fs-extra')).default;
    const alreadyInstalled = await fsExtra.pathExists(expectedPath);
    if (alreadyInstalled) {
        return expectedPath;
    }
    // Not cached yet: download, reporting the percentage on a single rewritten line.
    const reportProgress = (downloaded, total) => {
        let pct = 0;
        if (total > 0) {
            pct = Math.round((downloaded / total) * 100);
        }
        process.stdout.write(`\r[Smith] Downloading Chromium: ${pct}% `);
    };
    process.stdout.write('[Smith] Installing Chromium for browser tools (first run, ~150MB)...\n');
    const { executablePath } = await install({
        browser: chrome,
        buildId,
        cacheDir: BROWSER_CACHE,
        downloadProgressCallback: reportProgress,
    });
    process.stdout.write('\n[Smith] Chromium installed successfully.\n');
    return executablePath;
}
|
|
76
|
-
// Launch currently in flight, shared so that concurrent callers await one
// Chromium process instead of each starting (and leaking) their own.
let launchPromise = null;
/**
 * Returns the shared browser and page, launching Chromium when there is no
 * connected browser, and re-arms the idle timer that closes both after
 * IDLE_TIMEOUT_MS without another call.
 *
 * Concurrency: both the Chromium install (installPromise) and the launch
 * (launchPromise) are shared between overlapping callers, so at most one
 * download and one launch run at a time.
 *
 * @returns {Promise<{browser: object, page: object}>} The singleton browser and its page.
 * @throws Propagates errors from ensureChromium(), launch() and newPage().
 */
async function acquireBrowser() {
    const { launch } = await import('puppeteer-core');
    const needsLaunch = !browserInstance || !browserInstance.connected;
    if (needsLaunch) {
        if (!launchPromise) {
            launchPromise = (async () => {
                if (!installPromise) {
                    installPromise = ensureChromium().finally(() => {
                        installPromise = null;
                    });
                }
                const executablePath = await installPromise;
                browserInstance = await launch({
                    executablePath,
                    headless: true,
                    args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-gpu'],
                });
                pageInstance = await browserInstance.newPage();
            })().finally(() => {
                // Clear the lock on success and on failure so a later call can retry.
                launchPromise = null;
            });
        }
        await launchPromise;
    }
    else if (!pageInstance || pageInstance.isClosed()) {
        pageInstance = await browserInstance.newPage();
    }
    // Reset idle timeout
    if (idleTimer) {
        clearTimeout(idleTimer);
    }
    idleTimer = setTimeout(async () => {
        // Detach the singletons synchronously, before any await: a caller arriving
        // mid-shutdown then launches a fresh browser instead of receiving one that
        // is closing, and this callback can no longer overwrite a newer idleTimer.
        const stalePage = pageInstance;
        const staleBrowser = browserInstance;
        pageInstance = null;
        browserInstance = null;
        idleTimer = null;
        try {
            await stalePage?.close();
        }
        catch { /* ignore: best-effort shutdown */ }
        try {
            await staleBrowser?.close();
        }
        catch { /* ignore: best-effort shutdown */ }
    }, IDLE_TIMEOUT_MS);
    return { browser: browserInstance, page: pageInstance };
}
|
|
121
|
-
// Best-effort cleanup on process exit
|
|
122
|
-
// On process exit, kill the browser's child process if one is still tracked.
// Any error raised while doing so is deliberately ignored.
process.on('exit', function killBrowserOnExit() {
    try {
        const child = browserInstance?.process();
        child?.kill();
    }
    catch { /* ignore */ }
});
|
|
128
|
-
// ─── Tool Definitions ───────────────────────────────────────────────────────
|
|
129
|
-
/**
 * browser_navigate tool: loads a URL in the shared browser page and returns its text.
 *
 * Steps: set a random user agent and extra headers, navigate (two attempts via
 * withRetry), optionally wait for a selector, then extract text. Unless
 * `extract_readable` is false, the page HTML is parsed with JSDOM + Readability;
 * when that fails or yields no text, the rendered text of <body> is used instead.
 *
 * Always resolves to a JSON string: `{ success: true, url, current_url, title, byline,
 * excerpt, text, html? }` or `{ success: false, url, error }`.
 */
const browserNavigateTool = tool(async ({ url, wait_until, timeout_ms, return_html, wait_for_selector, extract_readable }) => {
    try {
        const { page } = await acquireBrowser();
        const timeout = timeout_ms ?? 30_000;
        // Single fallback text source; yields '' instead of throwing when the document has no <body>.
        const readBodyText = () => page.evaluate(() => document.body?.innerText ?? '');
        // Set a realistic user agent
        await page.setUserAgent(getRandomUserAgent());
        // Set extra headers to appear more like a real browser
        await page.setExtraHTTPHeaders({
            'Accept-Language': 'pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        });
        await withRetry(async () => {
            await page.goto(url, {
                waitUntil: (wait_until ?? 'domcontentloaded'),
                timeout,
            });
        }, 2);
        // Wait for specific selector if requested
        if (wait_for_selector) {
            await page.waitForSelector(wait_for_selector, { timeout });
        }
        const title = await page.title();
        const htmlContent = await page.content();
        // Use Readability for cleaner content extraction (enabled unless explicitly set to false).
        let article = null;
        if (extract_readable !== false) {
            try {
                const dom = new JSDOM(htmlContent, { url });
                article = new Readability(dom.window.document).parse();
            }
            catch {
                // Parsing problems are non-fatal: fall through to the body-text fallback.
                article = null;
            }
        }
        const readableText = article?.textContent ?? '';
        // Readability can succeed yet produce no text; prefer the rendered body text in that case.
        const text = readableText.trim() ? readableText : await readBodyText();
        const result = {
            success: true,
            url,
            current_url: page.url(),
            title: article?.title || title,
            byline: article?.byline || null,
            excerpt: article?.excerpt || null,
            text: truncateOutput(text),
        };
        if (return_html) {
            result.html = truncateOutput(htmlContent);
        }
        return JSON.stringify(result);
    }
    catch (err) {
        return JSON.stringify({ success: false, url, error: err.message });
    }
}, {
    name: 'browser_navigate',
    description: 'Navigate to a URL in a real browser (executes JavaScript). Use for SPAs, JS-heavy pages, or sites requiring interaction. ' +
        'Automatically extracts clean readable content using Mozilla Readability. Returns page title, byline, excerpt, and text content.',
    schema: z.object({
        url: z.string().describe('Full URL to navigate to (must include https://)'),
        wait_until: z
            .enum(['load', 'domcontentloaded', 'networkidle0', 'networkidle2'])
            .optional()
            .describe('Wait condition. Default: domcontentloaded. Use networkidle0 for SPAs.'),
        timeout_ms: z.number().optional().describe('Navigation timeout in ms. Default: 30000'),
        return_html: z
            .boolean()
            .optional()
            .describe('Also return raw HTML in response. Default: false'),
        wait_for_selector: z
            .string()
            .optional()
            .describe('CSS selector to wait for before extracting content (useful for dynamic content)'),
        extract_readable: z
            .boolean()
            .optional()
            .describe('Use Readability to extract clean article content. Default: true'),
    }),
});
|
|
220
|
-
/**
 * browser_get_dom tool: returns a simplified JSON tree of the current page's DOM,
 * rooted at `selector` (or <body> when omitted).
 *
 * Each node carries `tag`, plus `id`, `class`, `text` (leaf elements only, capped at
 * 120 characters), `attrs` (a fixed allow-list, when `include_attributes` is not false)
 * and `children` (at most 40 per element, descending at most 6 levels below the root).
 *
 * Always resolves to a JSON string: `{ success: true, current_url, dom }` or
 * `{ success: false, error }`.
 */
const browserGetDomTool = tool(async ({ selector, include_attributes }) => {
    try {
        const { page } = await acquireBrowser();
        const includeAttrs = include_attributes ?? true;
        // The callback is handed to page.evaluate, so it is written to be self-contained:
        // everything it needs is declared inside it or passed through the argument object.
        const dom = await page.evaluate(({ sel, attrs }) => {
            const root = sel
                ? document.querySelector(sel)
                : document.body;
            if (!root)
                return null;
            const RELEVANT_ATTRS = [
                'href', 'src', 'type', 'name', 'value',
                'placeholder', 'action', 'id', 'role', 'aria-label',
            ];
            function serialize(el, depth) {
                const hasChildren = el.children.length > 0;
                const node = {
                    tag: el.tagName.toLowerCase(),
                };
                if (el.id)
                    node.id = el.id;
                // Read the attribute rather than el.className: on SVG elements className is an
                // SVGAnimatedString object, not a string, and would not serialize as a class list.
                const classAttr = el.getAttribute('class');
                if (classAttr)
                    node.class = classAttr;
                if (!hasChildren) {
                    const txt = el.textContent?.trim();
                    if (txt)
                        node.text = txt.slice(0, 120);
                }
                if (attrs && el.attributes.length > 0) {
                    const attrMap = {};
                    for (const attr of el.attributes) {
                        if (RELEVANT_ATTRS.includes(attr.name)) {
                            attrMap[attr.name] = attr.value;
                        }
                    }
                    if (Object.keys(attrMap).length)
                        node.attrs = attrMap;
                }
                // Bound the output size: limited depth and a cap on children per element.
                if (depth < 6 && hasChildren) {
                    node.children = Array.from(el.children)
                        .slice(0, 40)
                        .map((c) => serialize(c, depth + 1));
                }
                return node;
            }
            return serialize(root, 0);
        }, { sel: selector ?? null, attrs: includeAttrs });
        if (!dom) {
            return JSON.stringify({ success: false, error: `Element not found: ${selector}` });
        }
        return JSON.stringify({ success: true, current_url: page.url(), dom: truncateOutput(JSON.stringify(dom, null, 2)) });
    }
    catch (err) {
        return JSON.stringify({ success: false, error: err.message });
    }
}, {
    name: 'browser_get_dom',
    description: 'Get a simplified DOM tree of the current page or a specific element. ' +
        'ALWAYS call this BEFORE browser_click or browser_fill to inspect page structure and identify the correct CSS selectors. ' +
        'Never guess selectors — analyze the DOM first.',
    schema: z.object({
        selector: z
            .string()
            .optional()
            .describe('CSS selector to scope the DOM tree to. Omit to get the full body.'),
        include_attributes: z
            .boolean()
            .optional()
            .describe('Include relevant attributes (href, src, type, name, value, placeholder, role, aria-label). Default: true'),
    }),
});
|
|
291
|
-
/**
 * browser_click tool: clicks an element on the current page, located either by
 * visible text (`text`, via Puppeteer's `::-p-text(...)` selector) or by CSS
 * selector (`selector`). When both are given, `text` wins.
 *
 * Optionally sleeps `wait_after_ms` after the click, then reports the page's
 * URL and title. Always resolves to a JSON string:
 * `{ success: true, current_url, title }` or `{ success: false, error }`.
 */
const browserClickTool = tool(async ({ selector, text, timeout_ms, wait_after_ms }) => {
    // Validate before touching the browser: acquireBrowser() may launch (or even
    // download) Chromium, which is wasted work for a call that cannot succeed.
    if (!selector && !text) {
        return JSON.stringify({ success: false, error: 'Provide either selector or text' });
    }
    try {
        const { page } = await acquireBrowser();
        const clickTimeout = timeout_ms ?? 10_000;
        const target = text ? `::-p-text(${text})` : selector;
        await page.locator(target).setTimeout(clickTimeout).click();
        if (wait_after_ms) {
            // Give page transitions/animations time to settle before reading URL and title.
            await new Promise((resolve) => setTimeout(resolve, wait_after_ms));
        }
        return JSON.stringify({
            success: true,
            current_url: page.url(),
            title: await page.title(),
        });
    }
    catch (err) {
        return JSON.stringify({ success: false, error: err.message });
    }
}, {
    name: 'browser_click',
    description: 'Click an element on the current browser page by CSS selector or visible text. ' +
        'The page must already be loaded via browser_navigate. ' +
        'Always inspect the DOM with browser_get_dom first to find the correct selector.',
    schema: z.object({
        selector: z
            .string()
            .optional()
            .describe('CSS selector of the element to click (e.g. "button#submit", ".btn-login")'),
        text: z
            .string()
            .optional()
            .describe('Click element containing this visible text (alternative to selector)'),
        timeout_ms: z
            .number()
            .optional()
            .describe('Timeout to wait for the element in ms. Default: 10000'),
        wait_after_ms: z
            .number()
            .optional()
            .describe('Wait this many ms after clicking (for page transitions/animations). Default: 0'),
    }),
});
|
|
340
|
-
/**
 * browser_fill tool: fills the element matched by `selector` with `value` on the
 * shared page, optionally pressing Enter afterwards.
 *
 * Always resolves to a JSON string: `{ success: true, selector, filled: true }`
 * or `{ success: false, selector, error }`.
 */
const browserFillTool = tool(async ({ selector, value, press_enter, timeout_ms }) => {
    let outcome;
    try {
        const { page } = await acquireBrowser();
        const field = page.locator(selector).setTimeout(timeout_ms ?? 10_000);
        await field.fill(value);
        if (press_enter) {
            await page.keyboard.press('Enter');
        }
        outcome = { success: true, selector, filled: true };
    }
    catch (err) {
        outcome = { success: false, selector, error: err.message };
    }
    return JSON.stringify(outcome);
}, {
    name: 'browser_fill',
    description: 'Fill a form input or textarea field with a value. Clears any existing content first. ' +
        'Always inspect the DOM with browser_get_dom first to identify the correct CSS selector.',
    schema: z.object({
        selector: z.string().describe('CSS selector of the input/textarea element'),
        value: z.string().describe('Value to type into the field'),
        press_enter: z
            .boolean()
            .optional()
            .describe('Press Enter after filling (triggers form submit in many cases). Default: false'),
        timeout_ms: z
            .number()
            .optional()
            .describe('Timeout to find the element in ms. Default: 10000'),
    }),
});
|
|
369
|
-
/**
 * browser_search tool: web search through the DuckDuckGo Lite HTML endpoint,
 * fetched with `fetch` (no browser involved).
 *
 * Pipeline:
 *   1. Classify the query into an intent using keyword patterns (first match wins).
 *   2. Optionally append refinement terms for that intent.
 *   3. POST the query to DuckDuckGo Lite (three attempts via withRetry, 20 s timeout each).
 *   4. Extract links and snippets from the HTML with regular expressions.
 *   5. Keep one result per domain, score each result, sort by score, return the top N.
 *
 * Always resolves to a JSON string: `{ success: true, original_query, refined_query?,
 * intent, confidence, result_count, results }` or `{ success: false, error, ... }`.
 *
 * The schema also accepts `search_type`, which is not used by this implementation.
 */
const browserSearchTool = tool(async ({ query, num_results, language }) => {
    try {
        const max = Math.min(num_results ?? 10, 20);
        const year = new Date().getFullYear().toString();
        const lang = language ?? "pt";
        // "br" is an alias of "pt" in the region map below, so treat both as Portuguese everywhere.
        const isPortuguese = lang === "pt" || lang === "br";
        const qLower = query.toLowerCase();
        // Intent patterns, checked in order; the first matching pattern decides the intent.
        const intentPatterns = [
            // News patterns (PT/EN)
            ["news", /(hoje|ontem|último|resultado|placar|próximos|futebol|eleição|202\d|today|yesterday|latest|breaking|election)/i],
            // Official/Government patterns
            ["official", /(site oficial|gov\.|receita federal|ministério|official site|government)/i],
            // Documentation patterns
            ["documentation", /(api|sdk|npm|pypi|docs|documentação|documentation|reference|tutorial|example)/i],
            // Price patterns
            ["price", /(preço|valor|quanto custa|price|cost|pricing|buy)/i],
            // Academic patterns
            ["academic", /(research|paper|study|journal|artigo|pesquisa|científico|scientific)/i],
            // How-to patterns
            ["how-to", /(como|how to|tutorial|guia|guide|passo a passo|step by step)/i],
        ];
        const intent = intentPatterns.find(([, pattern]) => pattern.test(qLower))?.[0] ?? "general";
        // ─── Smart Query Refinement ──────────────────────────────────────────
        let refinedQuery = query;
        const refinements = [];
        switch (intent) {
            case "news":
                refinements.push(year);
                break;
            case "official":
                // Don't modify - let user's query stand
                break;
            case "documentation":
                // Only add if not already present
                if (!/docs|documentation|github/i.test(qLower)) {
                    refinements.push("documentation");
                }
                break;
            case "price":
                refinements.push(year);
                if (isPortuguese)
                    refinements.push("Brasil");
                break;
            case "academic":
                refinements.push("site:scholar.google.com OR site:arxiv.org OR site:researchgate.net");
                break;
            case "how-to":
                // Don't add noise, how-to queries are usually specific enough
                break;
        }
        if (refinements.length > 0) {
            refinedQuery = `${query} ${refinements.join(" ")}`;
        }
        // ─── Region Mapping ──────────────────────────────────────────────────
        const regionMap = {
            pt: "br-pt",
            br: "br-pt",
            en: "us-en",
            us: "us-en",
            uk: "uk-en",
            es: "es-es",
            fr: "fr-fr",
            de: "de-de",
        };
        const kl = regionMap[lang] ?? lang;
        // ─── Execute Search with Retry ───────────────────────────────────────
        const html = await withRetry(async () => {
            const body = new URLSearchParams({ q: refinedQuery, kl }).toString();
            const res = await fetch("https://lite.duckduckgo.com/lite/", {
                method: "POST",
                headers: {
                    "Content-Type": "application/x-www-form-urlencoded",
                    "User-Agent": getRandomUserAgent(),
                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                    "Accept-Language": isPortuguese ? "pt-BR,pt;q=0.9,en;q=0.8" : "en-US,en;q=0.9",
                },
                body,
                signal: AbortSignal.timeout(20000),
            });
            if (!res.ok) {
                // Throwing makes withRetry try again on non-2xx responses.
                throw new Error(`HTTP ${res.status}`);
            }
            return res.text();
        }, 3);
        // ─── Improved Parsing (handles both quote styles) ────────────────────
        // Match links with either single or double quotes
        const linkPattern = /href=["'](https?:\/\/[^"']+)["'][^>]*class=["']result-link["'][^>]*>([^<]+)<\/a>/gi;
        const snippetPattern = /class=["']result-snippet["'][^>]*>([\s\S]*?)<\/td>/gi;
        const links = [...html.matchAll(linkPattern)];
        const snippets = [...html.matchAll(snippetPattern)];
        if (!links.length) {
            // Try alternative pattern (attribute order reversed: class before href)
            const altLinkPattern = /<a[^>]+class=["']result-link["'][^>]+href=["'](https?:\/\/[^"']+)["'][^>]*>([^<]+)<\/a>/gi;
            const altLinks = [...html.matchAll(altLinkPattern)];
            if (!altLinks.length) {
                return JSON.stringify({
                    success: false,
                    query: refinedQuery,
                    error: "No results found. Try a different search term.",
                    hint: intent !== "general" ? `Detected intent: ${intent}. Try a more specific query.` : undefined,
                });
            }
            links.push(...altLinks);
        }
        // ─── Helper Functions ────────────────────────────────────────────────
        // Strips common tracking parameters; returns the input unchanged when it is not a valid URL.
        const normalizeUrl = (url) => {
            try {
                const u = new URL(url);
                // Remove tracking parameters
                ['utm_source', 'utm_medium', 'utm_campaign', 'ref', 'fbclid', 'gclid'].forEach(p => u.searchParams.delete(p));
                return u.toString();
            }
            catch {
                return url;
            }
        };
        // Hostname without a leading "www."; '' when the URL cannot be parsed.
        const getDomain = (url) => {
            try {
                return new URL(url).hostname.replace(/^www\./, "");
            }
            catch {
                return "";
            }
        };
        // ─── Enhanced Domain Scoring ─────────────────────────────────────────
        const domainScores = {
            // High authority
            "github.com": 8,
            "stackoverflow.com": 8,
            "wikipedia.org": 7,
            "docs.python.org": 8,
            "developer.mozilla.org": 8,
            "npmjs.com": 7,
            "pypi.org": 7,
            // News
            "bbc.com": 6,
            "reuters.com": 6,
            "cnn.com": 5,
            "globo.com": 5,
            "uol.com.br": 4,
            "g1.globo.com": 6,
            // Brazilian official
            "gov.br": 7,
            // Tech blogs
            "medium.com": 3,
            "dev.to": 4,
            "hashnode.dev": 3,
            // Academic
            "arxiv.org": 7,
            "scholar.google.com": 7,
            "researchgate.net": 6,
        };
        const penalizedPatterns = [
            /login|signin|signup/i,
            /assine|subscribe|paywall/i,
            /compre|buy now|add to cart/i,
            /pinterest\.com/i,
            /facebook\.com/i,
            /instagram\.com/i,
        ];
        // Authority score of the most specific table entry that equals the domain or is a
        // parent of it. Matching on label boundaries keeps look-alike hosts such as
        // "notgithub.com" from inheriting a score, and preferring the longest entry lets
        // "g1.globo.com" take its own score instead of the broader "globo.com" one.
        const domainAuthority = (domain) => {
            let bestKey = "";
            let bestScore = 0;
            for (const [known, value] of Object.entries(domainScores)) {
                const matches = domain === known || domain.endsWith(`.${known}`);
                if (matches && known.length > bestKey.length) {
                    bestKey = known;
                    bestScore = value;
                }
            }
            return bestScore;
        };
        // Query words used for title relevance; computed once because they do not depend on the result.
        const queryWords = qLower.split(/\s+/).filter(w => w.length > 2);
        // Heuristic relevance score for one { title, url, snippet } result; never negative.
        const scoreResult = (result) => {
            let score = 0;
            const domain = getDomain(result.url);
            // Domain-based scoring
            score += domainAuthority(domain);
            // Intent-based bonuses
            if (intent === "documentation") {
                if (/github|docs|reference|api/i.test(domain))
                    score += 4;
                if (/example|tutorial|guide/i.test(result.title))
                    score += 2;
            }
            if (intent === "news") {
                if (/(globo|uol|cnn|bbc|reuters|g1)/i.test(domain))
                    score += 4;
                if (result.snippet.includes(year))
                    score += 2;
            }
            if (intent === "official" && /gov\.|\.gov|official/i.test(domain)) {
                score += 5;
            }
            if (intent === "academic" && /arxiv|scholar|research/i.test(domain)) {
                score += 5;
            }
            if (intent === "how-to" && /tutorial|guide|how/i.test(result.title)) {
                score += 3;
            }
            // Title relevance
            const titleLower = result.title.toLowerCase();
            const matchedWords = queryWords.filter(w => titleLower.includes(w));
            score += Math.min(matchedWords.length * 1.5, 5);
            // Snippet quality
            if (result.snippet.length > 100)
                score += 1;
            if (result.snippet.length > 200)
                score += 1;
            // Penalties
            for (const pattern of penalizedPatterns) {
                if (pattern.test(result.url) || pattern.test(result.snippet)) {
                    score -= 4;
                }
            }
            return Math.max(0, score);
        };
        // ─── Process Results ─────────────────────────────────────────────────
        const domainSeen = new Set();
        const results = [];
        for (let i = 0; i < links.length; i++) {
            const rawUrl = links[i][1];
            if (rawUrl.includes("duckduckgo.com"))
                continue;
            const url = normalizeUrl(rawUrl);
            const domain = getDomain(url);
            // Skip if we already have this domain (dedupe)
            if (domainSeen.has(domain))
                continue;
            domainSeen.add(domain);
            const title = links[i][2].trim().replace(/\s+/g, " ");
            // Snippets are paired with links by position in the page.
            const snippet = snippets[i]
                ? snippets[i][1].replace(/<[^>]+>/g, "").replace(/\s+/g, " ").trim()
                : "";
            const result = { title, url, snippet };
            const score = scoreResult(result);
            results.push({ ...result, domain, score });
        }
        if (!results.length) {
            return JSON.stringify({
                success: false,
                query: refinedQuery,
                error: "No valid results after filtering",
            });
        }
        // Sort by score and take top results
        results.sort((a, b) => b.score - a.score);
        const topResults = results.slice(0, max);
        // Calculate confidence
        const avgScore = topResults.reduce((acc, r) => acc + r.score, 0) / topResults.length;
        const confidence = avgScore >= 6 ? "high" : avgScore >= 3 ? "medium" : "low";
        return JSON.stringify({
            success: true,
            original_query: query,
            refined_query: refinedQuery !== query ? refinedQuery : undefined,
            intent,
            confidence,
            result_count: topResults.length,
            results: topResults.map((r) => ({
                title: r.title,
                url: r.url,
                snippet: r.snippet,
                domain: r.domain,
                score: r.score,
            })),
        });
    }
    catch (err) {
        return JSON.stringify({
            success: false,
            error: err.message,
            hint: "Search failed. Try simplifying your query or check your internet connection."
        });
    }
}, {
    name: "browser_search",
    description: "Intelligent web search with automatic intent detection (news, documentation, how-to, academic, etc.), " +
        "smart query refinement, domain authority scoring, and confidence levels. Uses DuckDuckGo Lite for privacy. " +
        "Returns ranked results with relevance scores.",
    schema: z.object({
        query: z.string().describe("Search query. Be specific for better results."),
        num_results: z.number().int().min(1).max(20).optional().describe("Max results to return. Default: 10"),
        language: z.enum(["pt", "br", "en", "us", "uk", "es", "fr", "de"]).optional().describe("Search region/language. Default: pt"),
        // Accepted for forward compatibility; the handler above does not read it.
        search_type: z.enum(["web", "news"]).optional().describe("Type of search. Default: web (news not yet implemented)"),
    }),
});
|
|
662
|
-
/**
|
|
663
|
-
* Lightweight content fetcher - uses fetch + Readability instead of Puppeteer.
|
|
664
|
-
* Much faster for static pages, articles, documentation, etc.
|
|
665
|
-
*/
|
|
666
|
-
const browserFetchContentTool = tool(async ({ url, timeout_ms, include_links }) => {
    // Maximum number of links returned when include_links is set.
    const MAX_LINKS = 50;
    // Count whitespace-separated words, ignoring the empty tokens that
    // String.prototype.split produces for empty or whitespace-padded input
    // (''.split(/\s+/) has length 1, not 0).
    const countWords = (text) => text.split(/\s+/).filter(Boolean).length;
    try {
        // Each attempt gets its own AbortController so a timed-out attempt
        // does not poison the following one. The second argument (3) is
        // passed through to withRetry unchanged.
        const result = await withRetry(async () => {
            const controller = new AbortController();
            const timer = setTimeout(() => controller.abort(), timeout_ms ?? 30_000);
            try {
                const response = await fetch(url, {
                    signal: controller.signal,
                    headers: {
                        'User-Agent': getRandomUserAgent(),
                        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                        'Accept-Language': 'pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7',
                        'Accept-Encoding': 'gzip, deflate, br',
                        'Cache-Control': 'no-cache',
                    },
                });
                if (!response.ok) {
                    throw new Error(`HTTP ${response.status}: ${response.statusText}`);
                }
                const contentType = response.headers.get('content-type') || '';
                // Handle JSON responses directly: no DOM parsing needed.
                if (contentType.includes('application/json')) {
                    const json = await response.json();
                    return {
                        success: true,
                        url,
                        content_type: 'json',
                        data: json,
                    };
                }
                // The body is read inside the try so the abort timer also
                // covers a slow body download, not just the headers.
                const html = await response.text();
                return { html };
            }
            finally {
                clearTimeout(timer);
            }
        }, 3);
        // If it was JSON, return the payload as-is.
        if ('content_type' in result && result.content_type === 'json') {
            return JSON.stringify(result);
        }
        const { html } = result;
        // Parse with JSDOM; passing the URL lets relative hrefs be resolved.
        const dom = new JSDOM(html, { url });
        const document = dom.window.document;
        // Extract metadata from <title> and <meta> tags.
        const title = document.querySelector('title')?.textContent?.trim() || '';
        const description = document.querySelector('meta[name="description"]')?.getAttribute('content') ||
            document.querySelector('meta[property="og:description"]')?.getAttribute('content') || '';
        const author = document.querySelector('meta[name="author"]')?.getAttribute('content') || '';
        // Readability works on a clone, so the link extraction below still
        // sees the original document.
        const reader = new Readability(document.cloneNode(true));
        const article = reader.parse();
        // Extract links if requested.
        const links = [];
        if (include_links) {
            const seen = new Set();
            for (const anchor of document.querySelectorAll('a[href]')) {
                if (links.length >= MAX_LINKS) {
                    break;
                }
                const rawHref = anchor.getAttribute('href');
                const text = anchor.textContent?.trim();
                // Skip empty anchors and pure in-page fragment links.
                if (!rawHref || !text || rawHref.startsWith('#')) {
                    continue;
                }
                // anchor.href is the href resolved against the document URL,
                // so relative links ("/docs/page") are kept instead of dropped.
                const href = anchor.href;
                const isHttp = href.startsWith('http://') || href.startsWith('https://');
                if (!isHttp || seen.has(href)) {
                    continue;
                }
                seen.add(href);
                links.push({ text: text.slice(0, 100), href });
            }
        }
        // Use the same text for both `content` and `word_count`, so the count
        // is not 0 when Readability fails and the body text is used instead.
        // The count is taken before truncation.
        const mainText = article?.textContent || document.body?.textContent || '';
        const output = {
            success: true,
            url,
            title: article?.title || title,
            description,
            author: article?.byline || author,
            excerpt: article?.excerpt || description,
            content: truncateOutput(mainText),
            word_count: countWords(mainText),
        };
        if (include_links && links.length > 0) {
            output.links = links;
        }
        return JSON.stringify(output);
    }
    catch (err) {
        // Failures are reported in-band as JSON rather than thrown.
        return JSON.stringify({
            success: false,
            url,
            error: err instanceof Error ? err.message : String(err),
            hint: 'If this is a JavaScript-heavy site, try browser_navigate instead.'
        });
    }
}, {
    name: 'browser_fetch_content',
    description: 'Fast, lightweight content fetcher for static pages, articles, documentation, and APIs. ' +
        'Uses HTTP fetch + Readability (no browser needed). Much faster than browser_navigate. ' +
        'Use this for: documentation pages, blog posts, news articles, API endpoints. ' +
        'For JavaScript-heavy SPAs, use browser_navigate instead.',
    schema: z.object({
        url: z.string().describe('Full URL to fetch (must include https://)'),
        timeout_ms: z.number().optional().describe('Timeout in ms. Default: 30000'),
        include_links: z.boolean().optional().describe('Extract and return all links from the page. Default: false'),
    }),
});
|
|
769
|
-
/**
 * Screenshot tool - useful for visual verification and debugging.
 *
 * Captures either a single element (when `selector` is given) or the page
 * obtained from acquireBrowser(), and returns a JSON string containing the
 * base64-encoded image, its size in bytes, and the page URL and title.
 * Failures are reported in-band as `{ success: false, error }` JSON.
 */
const browserScreenshotTool = tool(async ({ selector, full_page }) => {
    try {
        const { page } = await acquireBrowser();
        let screenshot;
        if (selector) {
            const element = await page.$(selector);
            if (!element) {
                return JSON.stringify({ success: false, error: `Element not found: ${selector}` });
            }
            screenshot = await element.screenshot({ encoding: 'binary' });
        }
        else {
            screenshot = await page.screenshot({
                fullPage: full_page ?? false,
                encoding: 'binary'
            });
        }
        // NOTE(review): depending on the browser library version, the binary
        // screenshot may be a Node Buffer or a plain Uint8Array. Calling
        // toString('base64') on a Uint8Array ignores the argument and yields
        // comma-separated numbers, so normalise to a Buffer first.
        const bytes = Buffer.isBuffer(screenshot) ? screenshot : Buffer.from(screenshot);
        return JSON.stringify({
            success: true,
            current_url: page.url(),
            title: await page.title(),
            screenshot_base64: bytes.toString('base64'),
            size_bytes: bytes.length,
        });
    }
    catch (err) {
        return JSON.stringify({ success: false, error: err.message });
    }
}, {
    name: 'browser_screenshot',
    description: 'Take a screenshot of the current page or a specific element. ' +
        'Useful for visual verification and debugging. Returns base64-encoded PNG.',
    schema: z.object({
        selector: z.string().optional().describe('CSS selector of element to screenshot. Omit for full viewport.'),
        full_page: z.boolean().optional().describe('Capture full scrollable page. Default: false (viewport only)'),
    }),
});
|
|
810
|
-
// ─── Factory ────────────────────────────────────────────────────────────────
|
|
811
|
-
/**
 * Factory returning the browser tool set.
 *
 * When the SMITH_BROWSER_ENABLED environment variable is exactly the string
 * 'false', no tools are exposed and an empty array is returned.
 *
 * @param {unknown} _ctx - Context argument; not read by this factory.
 * @returns {Array} The browser tools, or [] when disabled.
 */
export function createBrowserTools(_ctx) {
    const isDisabled = process.env.SMITH_BROWSER_ENABLED === 'false';
    // The tool list is only evaluated when the feature is not disabled.
    return isDisabled
        ? []
        : [
            browserNavigateTool,
            browserGetDomTool,
            browserClickTool,
            browserFillTool,
            browserSearchTool,
            browserFetchContentTool,
            browserScreenshotTool,
        ];
}
|
|
825
|
-
// Module-level side effect: pass the factory to registerToolFactory (imported
// from ../registry.js) together with the string 'browser'.
// NOTE(review): presumably 'browser' is the category/key the registry files
// this factory under; confirm against registry.js.
registerToolFactory(createBrowserTools, 'browser');
|