skyloom 1.15.4 → 1.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. package/dist/cli/command_args.d.ts +74 -0
  2. package/dist/cli/command_args.d.ts.map +1 -0
  3. package/dist/cli/command_args.js +129 -0
  4. package/dist/cli/command_args.js.map +1 -0
  5. package/dist/cli/loom.d.ts +20 -0
  6. package/dist/cli/loom.d.ts.map +1 -1
  7. package/dist/cli/loom.js +202 -24
  8. package/dist/cli/loom.js.map +1 -1
  9. package/dist/cli/loom_chat.d.ts.map +1 -1
  10. package/dist/cli/loom_chat.js +39 -0
  11. package/dist/cli/loom_chat.js.map +1 -1
  12. package/dist/core/agent.js +2 -2
  13. package/dist/core/agent.js.map +1 -1
  14. package/dist/core/security.d.ts.map +1 -1
  15. package/dist/core/security.js +1 -0
  16. package/dist/core/security.js.map +1 -1
  17. package/dist/core/tool_router.d.ts.map +1 -1
  18. package/dist/core/tool_router.js +11 -3
  19. package/dist/core/tool_router.js.map +1 -1
  20. package/dist/tools/builtin.d.ts.map +1 -1
  21. package/dist/tools/builtin.js +38 -192
  22. package/dist/tools/builtin.js.map +1 -1
  23. package/dist/tools/websearch.d.ts +92 -0
  24. package/dist/tools/websearch.d.ts.map +1 -0
  25. package/dist/tools/websearch.js +343 -0
  26. package/dist/tools/websearch.js.map +1 -0
  27. package/dist/web/server.js +2 -9
  28. package/dist/web/server.js.map +1 -1
  29. package/dist/web/ui.d.ts.map +1 -1
  30. package/dist/web/ui.js +3 -2
  31. package/dist/web/ui.js.map +1 -1
  32. package/package.json +1 -1
  33. package/src/cli/command_args.ts +159 -0
  34. package/src/cli/loom.ts +155 -17
  35. package/src/cli/loom_chat.ts +33 -0
  36. package/src/core/agent.ts +2 -2
  37. package/src/core/security.ts +1 -0
  38. package/src/core/tool_router.ts +11 -3
  39. package/src/tools/builtin.ts +38 -190
  40. package/src/tools/websearch.ts +368 -0
  41. package/src/web/server.ts +2 -10
  42. package/src/web/ui.ts +3 -2
  43. package/tests/command_args.test.ts +115 -0
  44. package/tests/loom.test.ts +74 -0
  45. package/tests/tool_router.test.ts +15 -0
  46. package/tests/web.test.ts +7 -5
  47. package/tests/websearch.test.ts +190 -0
@@ -4,177 +4,18 @@
4
4
 
5
5
  import * as fs from 'fs';
6
6
  import * as path from 'path';
7
- import axios from 'axios';
8
7
  import type { ToolRegistry } from '../core/tool';
9
8
  import { getLogger } from '../core/logger';
10
9
  import { registerComputerTools } from './computer';
11
10
  import { registerExtraTools } from './extra';
12
11
  import { isPrivateIp, assertFetchAllowed, fenceRoot, fenceCheck } from './guards';
12
+ import { webSearch, formatSearchResults, readPage } from './websearch';
13
13
 
14
14
  // Re-exported so existing importers/tests keep resolving these from builtin.
15
15
  export { isPrivateIp, assertFetchAllowed, fenceRoot, fenceCheck };
16
16
 
17
17
  const log = getLogger('builtin-tools');
18
18
 
19
-
20
- /* ── Web search helpers ───────────────────────────────────────────────────
21
- Multi-engine fallback. DuckDuckGo's Instant Answer JSON API only returns
22
- "abstracts" and is blank for ~90% of real queries; HTML scraping is what
23
- actually works. In CN networks, DDG/Bing may be unreachable — Baidu/Sogou
24
- serve as fallbacks. Each parser is intentionally tolerant: HTML changes
25
- over time, so we extract loosely and let the engine list provide redundancy.
26
- ────────────────────────────────────────────────────────────────────────── */
27
- interface SearchResult { title: string; url: string; snippet: string }
28
-
29
- const SEARCH_UA = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36';
30
-
31
- const searchClient = axios.create({
32
- timeout: 15000,
33
- headers: {
34
- 'User-Agent': SEARCH_UA,
35
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
36
- 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
37
- },
38
- // Allow redirects (search engines use them)
39
- maxRedirects: 5,
40
- // Validate status (only 2xx is ok)
41
- validateStatus: (status) => status >= 200 && status < 300,
42
- });
43
-
44
- async function fetchHtml(url: string, timeoutMs = 15000, retries = 2): Promise<string> {
45
- let lastError: Error | null = null;
46
- for (let attempt = 0; attempt <= retries; attempt++) {
47
- try {
48
- const res = await searchClient.get(url, {
49
- timeout: timeoutMs,
50
- // Skip SSRF check for known search engines
51
- transitional: { clarifyTimeoutError: true },
52
- });
53
- return res.data;
54
- } catch (e: any) {
55
- lastError = e;
56
- // Don't retry on 4xx (client errors like 403/404)
57
- if (e.response && e.response.status >= 400 && e.response.status < 500) {
58
- throw new Error(`HTTP ${e.response.status}: ${e.response.statusText || 'Blocked'}`);
59
- }
60
- // Wait before retry (exponential backoff)
61
- if (attempt < retries) {
62
- await new Promise(r => setTimeout(r, 1000 * (attempt + 1)));
63
- }
64
- }
65
- }
66
- throw lastError || new Error('fetch failed');
67
- }
68
-
69
- function decodeHtmlEntities(s: string): string {
70
- return s
71
- .replace(/&amp;/g, '&').replace(/&lt;/g, '<').replace(/&gt;/g, '>')
72
- .replace(/&quot;/g, '"').replace(/&#39;/g, "'").replace(/&nbsp;/g, ' ')
73
- .replace(/&#(\d+);/g, (_, n) => String.fromCharCode(parseInt(n, 10)))
74
- .replace(/&#x([0-9a-f]+);/gi, (_, n) => String.fromCharCode(parseInt(n, 16)));
75
- }
76
-
77
- function stripTags(s: string): string {
78
- return decodeHtmlEntities(s.replace(/<[^>]+>/g, '')).replace(/\s+/g, ' ').trim();
79
- }
80
-
81
- function unwrapDdgRedirect(href: string): string {
82
- // DuckDuckGo HTML wraps results in /l/?uddg=<encoded-url>
83
- const m = href.match(/[?&]uddg=([^&]+)/);
84
- if (m) { try { return decodeURIComponent(m[1]); } catch { /* fall through */ } }
85
- if (href.startsWith('//')) return 'https:' + href;
86
- return href;
87
- }
88
-
89
- function unwrapBaiduRedirect(href: string): string {
90
- // Baidu uses opaque /link?url=... redirects; we can't resolve without another request.
91
- // Return as-is; consumer can still click through.
92
- return href;
93
- }
94
-
95
- async function searchDuckDuckGo(query: string, max: number): Promise<SearchResult[]> {
96
- const html = await fetchHtml(`https://html.duckduckgo.com/html/?q=${encodeURIComponent(query)}`);
97
- const results: SearchResult[] = [];
98
- const re = /<a[^>]+class="[^"]*result__a[^"]*"[^>]+href="([^"]+)"[^>]*>([\s\S]*?)<\/a>[\s\S]*?<a[^>]+class="[^"]*result__snippet[^"]*"[^>]*>([\s\S]*?)<\/a>/gi;
99
- let m: RegExpExecArray | null;
100
- while ((m = re.exec(html)) && results.length < max) {
101
- results.push({ url: unwrapDdgRedirect(m[1]), title: stripTags(m[2]), snippet: stripTags(m[3]) });
102
- }
103
- return results;
104
- }
105
-
106
- async function searchBing(query: string, max: number): Promise<SearchResult[]> {
107
- const html = await fetchHtml(`https://www.bing.com/search?q=${encodeURIComponent(query)}&setlang=zh-cn`);
108
- const results: SearchResult[] = [];
109
- const liRe = /<li class="b_algo"[\s\S]*?<\/li>/gi;
110
- const items = html.match(liRe) || [];
111
- for (const item of items) {
112
- if (results.length >= max) break;
113
- const a = item.match(/<h2[^>]*>\s*<a[^>]+href="([^"]+)"[^>]*>([\s\S]*?)<\/a>/i);
114
- if (!a) continue;
115
- const snipMatch =
116
- item.match(/<p class="b_lineclamp[^"]*"[^>]*>([\s\S]*?)<\/p>/i) ||
117
- item.match(/<div class="b_caption"[\s\S]*?<p[^>]*>([\s\S]*?)<\/p>/i) ||
118
- item.match(/<p[^>]*>([\s\S]*?)<\/p>/i);
119
- results.push({ url: a[1], title: stripTags(a[2]), snippet: snipMatch ? stripTags(snipMatch[1]) : '' });
120
- }
121
- return results;
122
- }
123
-
124
- async function searchBaidu(query: string, max: number): Promise<SearchResult[]> {
125
- const html = await fetchHtml(`https://www.baidu.com/s?wd=${encodeURIComponent(query)}`);
126
- const results: SearchResult[] = [];
127
- // Baidu nests divs aggressively; anchor on <h3> ... <a href>...</a> and look
128
- // for the nearest abstract block following.
129
- const re = /<h3[^>]*>[\s\S]{0,500}?<a[^>]+href="([^"]+)"[^>]*>([\s\S]*?)<\/a>/gi;
130
- let m: RegExpExecArray | null;
131
- while ((m = re.exec(html)) && results.length < max) {
132
- const url = unwrapBaiduRedirect(m[1]);
133
- const title = stripTags(m[2]);
134
- if (!title || !/^https?:\/\//.test(url)) continue;
135
- const after = html.slice(re.lastIndex, re.lastIndex + 4000);
136
- const snipMatch =
137
- after.match(/<span class="content-right[^"]*"[^>]*>([\s\S]*?)<\/span>/i) ||
138
- after.match(/<div class="c-abstract[^"]*"[^>]*>([\s\S]*?)<\/div>/i) ||
139
- after.match(/<span[^>]*content[^"]*"[^>]*>([\s\S]{20,400}?)<\/span>/i) ||
140
- after.match(/<p[^>]*>([\s\S]{20,400}?)<\/p>/i);
141
- results.push({ url, title, snippet: snipMatch ? stripTags(snipMatch[1]) : '' });
142
- }
143
- return results;
144
- }
145
-
146
- async function searchSogou(query: string, max: number): Promise<SearchResult[]> {
147
- const html = await fetchHtml(`https://www.sogou.com/web?query=${encodeURIComponent(query)}`);
148
- const results: SearchResult[] = [];
149
- const divRe = /<div[^>]+class="vrwrap"[\s\S]*?(?=<div[^>]+class="vrwrap"|$)/gi;
150
- const items = html.match(divRe) || [];
151
- for (const item of items) {
152
- if (results.length >= max) break;
153
- const a = item.match(/<h3[^>]*>[\s\S]*?<a[^>]+href="([^"]+)"[^>]*>([\s\S]*?)<\/a>/i);
154
- if (!a) continue;
155
- let url = a[1];
156
- if (url.startsWith('/link?')) url = 'https://www.sogou.com' + url;
157
- const snipMatch =
158
- item.match(/<div[^>]+class="(?:str_info|fz-mid|space-txt)[^"]*"[^>]*>([\s\S]*?)<\/div>/i) ||
159
- item.match(/<p[^>]*>([\s\S]{20,400}?)<\/p>/i);
160
- results.push({ url, title: stripTags(a[2]), snippet: snipMatch ? stripTags(snipMatch[1]) : '' });
161
- }
162
- return results;
163
- }
164
-
165
- async function runSearchEngine(engine: string, query: string, max: number): Promise<SearchResult[]> {
166
- let results: SearchResult[];
167
- switch (engine) {
168
- case 'duckduckgo': case 'ddg': results = await searchDuckDuckGo(query, max); break;
169
- case 'bing': results = await searchBing(query, max); break;
170
- case 'baidu': results = await searchBaidu(query, max); break;
171
- case 'sogou': results = await searchSogou(query, max); break;
172
- default: throw new Error(`unknown search engine: ${engine}`);
173
- }
174
- // Drop placeholder/JS-anchor entries from inline answer cards.
175
- return results.filter((r) => r.title && /^https?:\/\//i.test(r.url));
176
- }
177
-
178
19
  /**
179
20
  * Register all built-in tools into the given registry.
180
21
  */
@@ -383,43 +224,50 @@ export function registerBuiltinTools(registry: ToolRegistry): void {
383
224
 
384
225
  registry.register({
385
226
  name: 'web_search',
386
- description: 'Search the web for information. Returns search results with titles, URLs and snippets.',
227
+ description:
228
+ 'Search the live web and return titles, URLs, and snippets (plus a direct answer when available). ' +
229
+ 'USE THIS whenever the answer depends on current or real-time information — today\'s news and hot topics, ' +
230
+ 'recent events, latest releases/versions, prices, weather, scores, or anything that may have changed since your ' +
231
+ 'training cutoff. Do NOT answer such questions from memory and do NOT claim you cannot access the internet — ' +
232
+ 'search first, then answer with the findings and cite the source URLs. Follow up with read_url to read a result in full.',
387
233
  parameters: [
388
- { name: 'query', type: 'string', description: 'Search query', required: true },
389
- { name: 'engine', type: 'string', description: 'Optional engine: duckduckgo|bing|baidu|sogou. Default: auto (tries each until one returns results)', required: false },
234
+ { name: 'query', type: 'string', description: 'Search query. Be specific; include the year/date for time-sensitive queries.', required: true },
235
+ { name: 'engine', type: 'string', description: 'Optional provider: tavily|brave|serper|searxng|jina|duckduckgo|bing|baidu|sogou. Default: auto (uses a configured API key if present, else the keyless Jina endpoint, else scraping).', required: false },
390
236
  { name: 'max_results', type: 'number', description: 'Max results to return (default 8, capped at 20)', required: false },
391
237
  ],
392
238
  handler: async (params) => {
393
239
  const query = String(params.query || '').trim();
394
240
  if (!query) return 'Error: query is required';
395
- const max = Math.max(1, Math.min(20, Math.floor(Number(params.max_results) || 8)));
396
- const explicit = String(params.engine || '').trim().toLowerCase();
397
- const envEngine = String(process.env.SKYLOOM_SEARCH_ENGINE || '').trim().toLowerCase();
398
- const order = explicit
399
- ? [explicit]
400
- : envEngine
401
- ? [envEngine, 'duckduckgo', 'bing', 'baidu', 'sogou']
402
- : ['duckduckgo', 'bing', 'baidu', 'sogou'];
403
- const seen = new Set<string>();
404
- const tried: string[] = [];
405
- for (const eng of order) {
406
- if (seen.has(eng)) continue;
407
- seen.add(eng);
408
- tried.push(eng);
409
- try {
410
- const results = await runSearchEngine(eng, query, max);
411
- if (results && results.length > 0) {
412
- const head = `Search results (${eng}, ${results.length}):`;
413
- const body = results
414
- .map((r, i) => `${i + 1}. ${r.title}\n ${r.url}${r.snippet ? `\n ${r.snippet}` : ''}`)
415
- .join('\n');
416
- return `${head}\n${body}`;
417
- }
418
- } catch (e: any) {
419
- log.warn('web_search_engine_failed', { engine: eng, error: String(e?.message || e) });
420
- }
241
+ try {
242
+ const res = await webSearch(query, {
243
+ max: Number(params.max_results) || 8,
244
+ engine: String(params.engine || '').trim().toLowerCase() || undefined,
245
+ onProviderError: (provider, error) => log.warn('web_search_provider_failed', { provider, error }),
246
+ });
247
+ return formatSearchResults(res);
248
+ } catch (e: any) {
249
+ return `Error: ${String(e?.message || e)}`;
250
+ }
251
+ },
252
+ });
253
+
254
+ registry.register({
255
+ name: 'read_url',
256
+ description:
257
+ 'Fetch a web page as clean, readable text (markdown), with boilerplate (nav/ads) stripped. ' +
258
+ 'Use after web_search to read a result in full, or to read any known URL. Prefer this over http_get for articles/pages.',
259
+ parameters: [
260
+ { name: 'url', type: 'string', description: 'The http(s) URL to read', required: true },
261
+ { name: 'max_chars', type: 'number', description: 'Max characters to return (default 12000)', required: false },
262
+ ],
263
+ handler: async (params) => {
264
+ const url = String(params.url || '').trim();
265
+ if (!url) return 'Error: url is required';
266
+ try {
267
+ return await readPage(url, { maxChars: Number(params.max_chars) || 12000 });
268
+ } catch (e: any) {
269
+ return `Error reading page: ${String(e?.message || e)}`;
421
270
  }
422
- return `No search results found (tried: ${tried.join(', ')}). Set SKYLOOM_SEARCH_ENGINE to pin an engine, or try a different query.`;
423
271
  },
424
272
  });
425
273
 
@@ -0,0 +1,368 @@
1
+ /**
2
+ * 联网搜索 · Web search with a provider waterfall.
3
+ *
4
+ * Why this module exists: the old web_search scraped DuckDuckGo/Bing/Baidu/Sogou
5
+ * HTML. Scraping breaks constantly — engines change markup, block bot
6
+ * user-agents, throw CAPTCHAs, and rate-limit — so "search doesn't work" was the
7
+ * norm. This replaces it with a waterfall that prefers reliable JSON APIs and
8
+ * only falls back to scraping as a last resort:
9
+ *
10
+ * 1. Tavily (TAVILY_API_KEY) — purpose-built for LLM agents, returns an answer
11
+ * 2. Brave (BRAVE_API_KEY) — independent index, clean JSON
12
+ * 3. Serper (SERPER_API_KEY) — Google results as JSON
13
+ * 4. SearXNG (SEARXNG_URL) — self-hosted metasearch JSON
14
+ * 5. Jina (keyless) — s.jina.ai, free, LLM-optimized — works with NO setup
15
+ * 6. Scrape (last resort) — the legacy HTML scrapers
16
+ *
17
+ * The headline win: even with zero configuration, Jina's keyless endpoint gives
18
+ * results that actually return — no API key, no scraping fragility. Set any of
19
+ * the API keys above for enterprise-grade reliability and higher rate limits.
20
+ *
21
+ * The HTTP layer is injectable so the orchestration and every parser are
22
+ * unit-testable without a network.
23
+ */
24
+
25
+ import axios from 'axios';
26
+
27
/** A single search hit in the shape shared by every provider in this module. */
export interface SearchResult {
  /** Result title as reported by the provider. */
  title: string;
  /** Address of the result. */
  url: string;
  /** Short excerpt or description; providers fall back to an empty string. */
  snippet: string;
}
32
+
33
/** What a provider returns for one query. */
export interface SearchResponse {
  /** Which provider produced these results. */
  provider: string;
  /** The hits returned for the query. */
  results: SearchResult[];
  /** Direct answer / summary, when the provider offers one. */
  answer?: string;
}
38
+
39
/**
 * Minimal HTTP surface used by the search providers and the page reader.
 * Injectable so tests can run without a network.
 */
export interface WebHttp {
  /** GET `url` and resolve with the response body. */
  getJson(
    url: string,
    opts?: { headers?: Record<string, string>; timeoutMs?: number },
  ): Promise<any>;

  /** POST `body` to `url` and resolve with the response body. */
  postJson(
    url: string,
    body: any,
    opts?: { headers?: Record<string, string>; timeoutMs?: number },
  ): Promise<any>;

  /** GET `url` and resolve with the response body as text. */
  getText(
    url: string,
    opts?: { headers?: Record<string, string>; timeoutMs?: number },
  ): Promise<string>;
}
45
+
46
/** Browser-like User-Agent header value sent by the default HTTP client. */
const UA = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36';

/** Request timeout in milliseconds used when a caller passes no `timeoutMs`. */
const DEFAULT_TIMEOUT = 15_000;
48
+
49
/**
 * Default {@link WebHttp} implementation backed by axios.
 *
 * All three methods share the same policy: caller headers override the
 * built-in ones, the timeout falls back to `DEFAULT_TIMEOUT`, at most five
 * redirects are followed, and any status outside 200–299 rejects.
 */
export const defaultHttp: WebHttp = ((): WebHttp => {
  type RequestOpts = { headers?: Record<string, string>; timeoutMs?: number };

  const only2xx = (status: number): boolean => status >= 200 && status < 300;

  // Request settings common to every call; `baseHeaders` come first so that
  // caller-supplied headers win on conflict.
  const settings = (opts: RequestOpts | undefined, baseHeaders: Record<string, string>) => ({
    headers: { ...baseHeaders, ...(opts?.headers || {}) },
    timeout: opts?.timeoutMs ?? DEFAULT_TIMEOUT,
    maxRedirects: 5,
    validateStatus: only2xx,
  });

  return {
    async getJson(url, opts) {
      const { data } = await axios.get(
        url,
        settings(opts, { 'User-Agent': UA, Accept: 'application/json' }),
      );
      return data;
    },

    async postJson(url, body, opts) {
      const { data } = await axios.post(
        url,
        body,
        settings(opts, { 'User-Agent': UA, Accept: 'application/json', 'Content-Type': 'application/json' }),
      );
      return data;
    },

    async getText(url, opts) {
      const { data } = await axios.get(url, {
        ...settings(opts, {
          'User-Agent': UA,
          Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
          'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        }),
        responseType: 'text',
        // Identity transform: hand the body back exactly as received.
        transformResponse: [(raw) => raw],
      });
      return data as string;
    },
  };
})();
86
+
87
+ /* ── HTML helpers (shared by the scrape provider) ── */
88
/**
 * Decode the HTML character references that appear in scraped result pages.
 *
 * Recognizes the named references `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&nbsp;`
 * and numeric references in decimal (`&#39;`) or hexadecimal (`&#x27;`) form.
 *
 * Decoding happens in a single pass, so text produced by one replacement is
 * never scanned again: `&amp;lt;` yields the literal text `&lt;`, not `<`.
 * Numeric references go through `String.fromCodePoint`, so characters outside
 * the Basic Multilingual Plane (e.g. emoji) decode correctly; references above
 * U+10FFFF and unknown named references are left untouched.
 *
 * @param s Text that may contain character references.
 * @returns The text with recognized references replaced by their characters.
 */
export function decodeHtmlEntities(s: string): string {
  const named: Record<string, string> = { amp: '&', lt: '<', gt: '>', quot: '"', nbsp: ' ' };
  return s.replace(
    /&(?:(amp|lt|gt|quot|nbsp)|#(\d+)|#[xX]([0-9a-fA-F]+));/g,
    (whole: string, name?: string, dec?: string, hex?: string): string => {
      if (name !== undefined) return named[name] ?? whole;
      const codePoint = dec !== undefined ? parseInt(dec, 10) : parseInt(hex as string, 16);
      // fromCodePoint throws a RangeError above U+10FFFF, so keep such references verbatim.
      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : whole;
    },
  );
}
95
/**
 * Reduce an HTML fragment to plain text: drop tags, decode character
 * references, collapse whitespace runs to a single space, and trim the ends.
 */
export function stripTags(s: string): string {
  const withoutMarkup = s.replace(/<[^>]+>/g, '');
  const decoded = decodeHtmlEntities(withoutMarkup);
  return decoded.replace(/\s+/g, ' ').trim();
}
98
/**
 * Recover the destination from a DuckDuckGo result link.
 *
 * When the href carries a `uddg=` query parameter, its URL-decoded value is
 * returned. Otherwise a protocol-relative href (`//host/...`) gets an `https:`
 * prefix, and anything else is returned unchanged.
 */
function unwrapDdgRedirect(href: string): string {
  const wrapped = /[?&]uddg=([^&]+)/.exec(href);
  if (wrapped !== null) {
    try {
      return decodeURIComponent(wrapped[1]);
    } catch {
      // Malformed percent-encoding: fall back to the plain-href handling below.
    }
  }
  return href.startsWith('//') ? 'https:' + href : href;
}
104
+
105
/**
 * Normalize raw provider rows into at most `max` usable results.
 *
 * A row is kept only if it has a non-blank title and an http(s) URL that has
 * not been seen before. Fields are trimmed before validation and
 * de-duplication, so a URL with stray whitespace is neither rejected nor
 * counted as a different URL. Non-string fields (API payloads are untyped at
 * runtime) are treated as missing rather than throwing.
 *
 * @param results Rows as mapped from a provider payload.
 * @param max Maximum number of results to return.
 * @returns Trimmed, de-duplicated results in their original order.
 */
function clean(results: SearchResult[], max: number): SearchResult[] {
  const seen = new Set<string>();
  const out: SearchResult[] = [];
  for (const r of results) {
    if (out.length >= max) break;
    const title = typeof r?.title === 'string' ? r.title.trim() : '';
    const url = typeof r?.url === 'string' ? r.url.trim() : '';
    if (!title || !/^https?:\/\//i.test(url) || seen.has(url)) continue;
    seen.add(url);
    out.push({ title, url, snippet: typeof r.snippet === 'string' ? r.snippet.trim() : '' });
  }
  return out;
}
117
+
118
+ /* ════════════════════════════════════════════════════════════
119
+ API providers (preferred — reliable JSON)
120
+ ════════════════════════════════════════════════════════════ */
121
+
122
/**
 * Search via the Tavily API (POST to `https://api.tavily.com/search`).
 * The key is sent as a Bearer token; the request asks for a basic-depth search
 * with an answer included, and the answer is passed through when present.
 */
async function tavily(http: WebHttp, key: string, query: string, max: number): Promise<SearchResponse> {
  const payload = { query, max_results: max, search_depth: 'basic', include_answer: true };
  const auth = { headers: { Authorization: `Bearer ${key}` } };
  const data = await http.postJson('https://api.tavily.com/search', payload, auth);

  const toHit = (row: any): SearchResult => ({
    title: row.title || '',
    url: row.url || '',
    snippet: row.content || '',
  });
  const hits: SearchResult[] = (data?.results || []).map(toHit);

  return { provider: 'tavily', results: clean(hits, max), answer: data?.answer || undefined };
}
131
+
132
/**
 * Search via the Brave web-search endpoint.
 * The key is sent in the `X-Subscription-Token` header and hits are read from
 * `web.results` of the response.
 */
async function brave(http: WebHttp, key: string, query: string, max: number): Promise<SearchResponse> {
  const endpoint = `https://api.search.brave.com/res/v1/web/search?q=${encodeURIComponent(query)}&count=${max}`;
  const requestHeaders = { 'X-Subscription-Token': key, Accept: 'application/json' };
  const data = await http.getJson(endpoint, { headers: requestHeaders });

  const toHit = (row: any): SearchResult => ({
    title: row.title || '',
    url: row.url || '',
    snippet: row.description || '',
  });
  const hits: SearchResult[] = (data?.web?.results || []).map(toHit);

  return { provider: 'brave', results: clean(hits, max) };
}
142
+
143
/**
 * Search via Serper (POST to `https://google.serper.dev/search`).
 * The key is sent in the `X-API-KEY` header and hits are read from `organic`.
 * The answer is the first non-empty value among the answer box's `answer`,
 * its `snippet`, and the knowledge graph's `description`.
 */
async function serper(http: WebHttp, key: string, query: string, max: number): Promise<SearchResponse> {
  const data = await http.postJson(
    'https://google.serper.dev/search',
    { q: query, num: max },
    { headers: { 'X-API-KEY': key } },
  );

  const toHit = (row: any): SearchResult => ({
    title: row.title || '',
    url: row.link || '',
    snippet: row.snippet || '',
  });
  const hits: SearchResult[] = (data?.organic || []).map(toHit);

  const box = data?.answerBox;
  const answer = box?.answer || box?.snippet || data?.knowledgeGraph?.description || undefined;

  return { provider: 'serper', results: clean(hits, max), answer };
}
153
+
154
/**
 * Search a SearXNG instance through its JSON output (`format=json`).
 * Trailing slashes on `baseUrl` are removed before `/search` is appended; the
 * request always asks for `language=zh-CN`.
 */
async function searxng(http: WebHttp, baseUrl: string, query: string, max: number): Promise<SearchResponse> {
  const root = baseUrl.replace(/\/+$/, '');
  const endpoint = `${root}/search?q=${encodeURIComponent(query)}&format=json&language=zh-CN`;
  const data = await http.getJson(endpoint);

  const toHit = (row: any): SearchResult => ({
    title: row.title || '',
    url: row.url || '',
    snippet: row.content || '',
  });
  const hits: SearchResult[] = (data?.results || []).map(toHit);

  return { provider: 'searxng', results: clean(hits, max) };
}
164
+
165
/**
 * Search via Jina's `s.jina.ai` endpoint, which returns the SERP for a query.
 *
 * `X-Respond-With: no-content` skips fetching each page body (faster, fewer
 * tokens — only the listing is wanted). The endpoint works without a key
 * (shared rate pool); when `key` is given it is sent as a Bearer token, which
 * raises the limit.
 *
 * Rows are taken from `data.data` when that is an array, else from the payload
 * itself when it is an array, else there are no rows.
 */
async function jina(http: WebHttp, key: string | undefined, query: string, max: number): Promise<SearchResponse> {
  const headers: Record<string, string> = {
    Accept: 'application/json',
    'X-Respond-With': 'no-content',
    ...(key ? { Authorization: `Bearer ${key}` } : {}),
  };
  const data = await http.getJson(`https://s.jina.ai/?q=${encodeURIComponent(query)}`, { headers });

  let rows: any[] = [];
  if (Array.isArray(data?.data)) {
    rows = data.data;
  } else if (Array.isArray(data)) {
    rows = data;
  }

  const hits: SearchResult[] = rows.map((row: any) => ({
    title: row.title || '',
    url: row.url || '',
    snippet: row.description || row.content || row.snippet || '',
  }));

  return { provider: 'jina', results: clean(hits, max) };
}
178
+
179
+ /* ════════════════════════════════════════════════════════════
180
+ Scrape provider (last resort — fragile HTML parsing)
181
+ ════════════════════════════════════════════════════════════ */
182
+
183
/**
 * Scrape DuckDuckGo's HTML results page.
 * Each match pairs a `result__a` link (href + title) with the following
 * `result__snippet` link; hrefs are unwrapped with `unwrapDdgRedirect`.
 * Stops once `max` hits have been collected.
 */
async function scrapeDuckDuckGo(http: WebHttp, query: string, max: number): Promise<SearchResult[]> {
  const page = await http.getText(`https://html.duckduckgo.com/html/?q=${encodeURIComponent(query)}`);
  const resultRe = /<a[^>]+class="[^"]*result__a[^"]*"[^>]+href="([^"]+)"[^>]*>([\s\S]*?)<\/a>[\s\S]*?<a[^>]+class="[^"]*result__snippet[^"]*"[^>]*>([\s\S]*?)<\/a>/gi;

  const hits: SearchResult[] = [];
  for (const [, href, titleHtml, snippetHtml] of page.matchAll(resultRe)) {
    if (hits.length >= max) break;
    hits.push({
      url: unwrapDdgRedirect(href),
      title: stripTags(titleHtml),
      snippet: stripTags(snippetHtml),
    });
  }
  return hits;
}
193
/**
 * Scrape Bing's results page (requested with `setlang=zh-cn`).
 * Each `<li class="b_algo">` item contributes the first `<h2><a>` link as
 * URL/title; the snippet comes from a `b_lineclamp` paragraph, else the first
 * `<p>`, else it is empty. Items without a heading link are skipped.
 */
async function scrapeBing(http: WebHttp, query: string, max: number): Promise<SearchResult[]> {
  const page = await http.getText(`https://www.bing.com/search?q=${encodeURIComponent(query)}&setlang=zh-cn`);
  const blocks = page.match(/<li class="b_algo"[\s\S]*?<\/li>/gi) || [];

  const hits: SearchResult[] = [];
  for (const block of blocks) {
    if (hits.length >= max) break;

    const link = /<h2[^>]*>\s*<a[^>]+href="([^"]+)"[^>]*>([\s\S]*?)<\/a>/i.exec(block);
    if (link === null) continue;

    const para =
      /<p class="b_lineclamp[^"]*"[^>]*>([\s\S]*?)<\/p>/i.exec(block) ??
      /<p[^>]*>([\s\S]*?)<\/p>/i.exec(block);

    hits.push({
      url: link[1],
      title: stripTags(link[2]),
      snippet: para ? stripTags(para[1]) : '',
    });
  }
  return hits;
}
205
/**
 * Scrape Baidu's results page.
 * Anchors on each `<h3>` followed (within 500 characters) by a link. Entries
 * with an empty title or a non-http(s) href are skipped. The snippet is
 * searched for in the 4000 characters after the matched link: a
 * `content-right` span, else a `c-abstract` div, else a 20–400 character
 * `<p>`; otherwise it is empty.
 */
async function scrapeBaidu(http: WebHttp, query: string, max: number): Promise<SearchResult[]> {
  const page = await http.getText(`https://www.baidu.com/s?wd=${encodeURIComponent(query)}`);
  const headingRe = /<h3[^>]*>[\s\S]{0,500}?<a[^>]+href="([^"]+)"[^>]*>([\s\S]*?)<\/a>/gi;

  const hits: SearchResult[] = [];
  for (const found of page.matchAll(headingRe)) {
    if (hits.length >= max) break;

    const url = found[1];
    const title = stripTags(found[2]);
    if (!title || !/^https?:\/\//.test(url)) continue;

    // Look for the abstract in the window right after this match.
    const matchEnd = (found.index ?? 0) + found[0].length;
    const tail = page.slice(matchEnd, matchEnd + 4000);
    const abstract =
      tail.match(/<span class="content-right[^"]*"[^>]*>([\s\S]*?)<\/span>/i) ||
      tail.match(/<div class="c-abstract[^"]*"[^>]*>([\s\S]*?)<\/div>/i) ||
      tail.match(/<p[^>]*>([\s\S]{20,400}?)<\/p>/i);

    hits.push({ url, title, snippet: abstract ? stripTags(abstract[1]) : '' });
  }
  return hits;
}
221
/**
 * Scrape Sogou's results page.
 * The page is split into `vrwrap` blocks; each contributes its first
 * `<h3>…<a>` link. Hrefs starting with `/link?` are made absolute against
 * `https://www.sogou.com`. The snippet comes from a `str_info` / `fz-mid` /
 * `space-txt` div, else a 20–400 character `<p>`, else it is empty.
 */
async function scrapeSogou(http: WebHttp, query: string, max: number): Promise<SearchResult[]> {
  const page = await http.getText(`https://www.sogou.com/web?query=${encodeURIComponent(query)}`);
  const blocks = page.match(/<div[^>]+class="vrwrap"[\s\S]*?(?=<div[^>]+class="vrwrap"|$)/gi) || [];

  const hits: SearchResult[] = [];
  for (const block of blocks) {
    if (hits.length >= max) break;

    const link = /<h3[^>]*>[\s\S]*?<a[^>]+href="([^"]+)"[^>]*>([\s\S]*?)<\/a>/i.exec(block);
    if (link === null) continue;

    const href = link[1];
    const url = href.startsWith('/link?') ? 'https://www.sogou.com' + href : href;

    const summary =
      /<div[^>]+class="(?:str_info|fz-mid|space-txt)[^"]*"[^>]*>([\s\S]*?)<\/div>/i.exec(block) ??
      /<p[^>]*>([\s\S]{20,400}?)<\/p>/i.exec(block);

    hits.push({ url, title: stripTags(link[2]), snippet: summary ? stripTags(summary[1]) : '' });
  }
  return hits;
}
234
+
235
/** Engines the scrape provider supports, in the order the auto waterfall tries them. */
const SCRAPE_ENGINES = ['duckduckgo', 'bing', 'baidu', 'sogou'] as const;

/** Union of the engine ids listed in `SCRAPE_ENGINES`. */
type ScrapeEngine = (typeof SCRAPE_ENGINES)[number];
237
+
238
/**
 * Run the HTML scraper for `engine` and wrap its cleaned hits in a
 * `SearchResponse` whose provider is the engine id. Any engine other than
 * bing, baidu, or sogou is handled by the DuckDuckGo scraper.
 */
async function scrape(http: WebHttp, engine: ScrapeEngine, query: string, max: number): Promise<SearchResponse> {
  let run: (http: WebHttp, query: string, max: number) => Promise<SearchResult[]>;
  switch (engine) {
    case 'bing':
      run = scrapeBing;
      break;
    case 'baidu':
      run = scrapeBaidu;
      break;
    case 'sogou':
      run = scrapeSogou;
      break;
    default:
      run = scrapeDuckDuckGo;
  }
  const raw = await run(http, query, max);
  return { provider: engine, results: clean(raw, max) };
}
242
+
243
+ /* ════════════════════════════════════════════════════════════
244
+ Orchestration
245
+ ════════════════════════════════════════════════════════════ */
246
+
247
/** Environment-variable lookup table; a variable that is not set reads as `undefined`. */
export type EnvMap = Record<string, string | undefined>;
248
+
249
/** One entry in the search waterfall. */
interface Provider {
  /** Provider id; recorded in the `tried` list and passed to the error callback. */
  id: string;
  /** Run the provider; throws on failure so the waterfall can move on. */
  run(http: WebHttp, env: EnvMap, query: string, max: number): Promise<SearchResponse>;
}
254
+
255
/**
 * Resolve the ordered provider list for a given env and optional pinned engine.
 *
 * With a recognized pin, only that provider is returned — or an empty list
 * when the pinned provider needs a key / base URL that `env` does not supply.
 * `ddg` is accepted as an alias for `duckduckgo`. An unrecognized pin is
 * ignored and the automatic order is used.
 *
 * Automatic order: configured keyed providers (tavily, brave, serper,
 * searxng), then keyless Jina, then the scrape engines.
 *
 * @param env Environment lookup used to detect configured providers.
 * @param pinned Provider id to pin; matched after trimming and lower-casing.
 */
export function resolveProviders(env: EnvMap, pinned?: string): Provider[] {
  const pin = (pinned || '').trim().toLowerCase();
  const braveKey = env.BRAVE_API_KEY || env.BRAVE_SEARCH_API_KEY;

  // Keyed providers, in waterfall order; null when the credential is absent.
  const keyed = new Map<string, Provider | null>([
    ['tavily', env.TAVILY_API_KEY
      ? { id: 'tavily', run: (h, e, q, m) => tavily(h, e.TAVILY_API_KEY!, q, m) }
      : null],
    ['brave', braveKey
      ? { id: 'brave', run: (h, _e, q, m) => brave(h, braveKey!, q, m) }
      : null],
    ['serper', env.SERPER_API_KEY
      ? { id: 'serper', run: (h, e, q, m) => serper(h, e.SERPER_API_KEY!, q, m) }
      : null],
    ['searxng', env.SEARXNG_URL
      ? { id: 'searxng', run: (h, e, q, m) => searxng(h, e.SEARXNG_URL!, q, m) }
      : null],
  ]);
  const keyless: Provider = { id: 'jina', run: (h, e, q, m) => jina(h, e.JINA_API_KEY, q, m) };
  const scraper = (eng: ScrapeEngine): Provider => ({
    id: eng,
    run: (h, _e, q, m) => scrape(h, eng, q, m),
  });

  // Explicit pin (tool arg or SKYLOOM_SEARCH_ENGINE) — use only that provider.
  if (pin) {
    if (keyed.has(pin)) {
      const only = keyed.get(pin);
      return only ? [only] : [];
    }
    if (pin === 'jina') return [keyless];
    const engine = pin === 'ddg' ? 'duckduckgo' : pin;
    if ((SCRAPE_ENGINES as readonly string[]).includes(engine)) {
      return [scraper(engine as ScrapeEngine)];
    }
    // Unknown pin → fall through to auto.
  }

  // Auto waterfall: keyed providers first (best), then keyless Jina, then scrape.
  const configured = [...keyed.values()].filter((p): p is Provider => p !== null);
  return [...configured, keyless, ...SCRAPE_ENGINES.map((eng) => scraper(eng))];
}
290
+
291
/** Options accepted by `webSearch`. */
export interface WebSearchOptions {
  /** Maximum number of results; `webSearch` defaults this to 8 and clamps it to 1–20. */
  max?: number;
  /** Explicit provider pin from the tool arg; takes precedence over `SKYLOOM_SEARCH_ENGINE`. */
  engine?: string;
  /** Environment lookup; defaults to `process.env`. */
  env?: EnvMap;
  /** HTTP client; defaults to the axios-backed client. */
  http?: WebHttp;
  /** Called with the provider id and error message whenever a provider throws. */
  onProviderError?: (provider: string, error: string) => void;
}
298
+
299
/**
 * Run a web search through the provider waterfall.
 *
 * Providers are tried in the order given by `resolveProviders`. The first one
 * that yields at least one result or an answer wins. A provider that throws is
 * reported through `opts.onProviderError` and skipped.
 *
 * @param query Search query; surrounding whitespace is ignored.
 * @param opts See `WebSearchOptions`. `max` defaults to 8 and is clamped to
 *   1–20; a `NaN` value is treated as "not provided" instead of being passed
 *   on to providers.
 * @returns The winning provider's response plus `tried`, the ids of every
 *   provider attempted in order. When nothing produced output, `provider` is
 *   `'none'` and `results` is empty.
 * @throws Error if `query` is empty after trimming.
 */
export async function webSearch(query: string, opts: WebSearchOptions = {}): Promise<SearchResponse & { tried: string[] }> {
  const q = (query || '').trim();
  if (!q) throw new Error('query is required');

  const requested = opts.max ?? 8;
  // Math.floor/min/max all propagate NaN, so fall back to the default explicitly.
  const max = Math.max(1, Math.min(20, Math.floor(Number.isNaN(requested) ? 8 : requested)));
  const env = opts.env ?? (process.env as EnvMap);
  const http = opts.http ?? defaultHttp;
  const pinned = (opts.engine || env.SKYLOOM_SEARCH_ENGINE || '').trim();

  const providers = resolveProviders(env, pinned);
  if (pinned && providers.length === 0) {
    // A recognized pin whose key / base URL is missing resolves to no providers.
    // Surface that instead of returning an unexplained empty result.
    opts.onProviderError?.(pinned.toLowerCase(), 'provider is pinned but not configured (missing API key or URL)');
  }

  const tried: string[] = [];
  for (const provider of providers) {
    tried.push(provider.id);
    try {
      const res = await provider.run(http, env, q, max);
      if (res.results.length > 0 || res.answer) return { ...res, tried };
    } catch (e: any) {
      opts.onProviderError?.(provider.id, String(e?.message || e));
    }
  }
  return { provider: 'none', results: [], tried };
}
325
+
326
/**
 * Format a SearchResponse as compact text for an LLM tool result.
 *
 * - No results and no answer: a "No search results found" message that lists
 *   the providers tried (when known) and suggests configuring an API key.
 * - Answer present: an `Answer: …` line first.
 * - Results present: a `Search results (<provider>, <count>):` header followed
 *   by a numbered list of title, URL, and (if non-empty) snippet.
 *
 * When only an answer is available, the results header is omitted instead of
 * printing an empty `Search results (…, 0):` section.
 */
export function formatSearchResults(res: SearchResponse & { tried?: string[] }): string {
  const hasResults = res.results.length > 0;
  if (!hasResults && !res.answer) {
    const tried = res.tried?.length ? ` (tried: ${res.tried.join(', ')})` : '';
    return `No search results found${tried}. Try a simpler query, or set a search API key (TAVILY_API_KEY / BRAVE_API_KEY / SERPER_API_KEY) for more reliable results.`;
  }

  const parts: string[] = [];
  if (res.answer) parts.push(`Answer: ${res.answer}`);
  if (hasResults) {
    // Blank line between the answer and the listing.
    if (res.answer) parts.push('');
    parts.push(`Search results (${res.provider}, ${res.results.length}):`);
    parts.push(
      res.results
        .map((r, i) => `${i + 1}. ${r.title}\n ${r.url}${r.snippet ? `\n ${r.snippet}` : ''}`)
        .join('\n'),
    );
  }
  return parts.join('\n');
}
338
+
339
+ /* ════════════════════════════════════════════════════════════
340
+ Page reader — clean, LLM-ready content from a URL
341
+ ════════════════════════════════════════════════════════════ */
342
+
343
/**
 * Fetch a URL as clean, readable text.
 *
 * First tries Jina's `r.jina.ai` reader (sending `JINA_API_KEY` as a Bearer
 * token when set). If that request fails or returns only whitespace, falls
 * back to fetching the page directly and reducing its HTML to text. In the
 * fallback, comments and the contents of `<script>`, `<style>`, `<noscript>`
 * and `<template>` elements are removed before tags are stripped, so inline
 * code and CSS do not end up in the output.
 *
 * NOTE(review): neither request here is checked against the private-address
 * guards that builtin.ts imports from './guards' (`assertFetchAllowed`,
 * `isPrivateIp`). Confirm whether `read_url` should apply them before the
 * direct fetch.
 *
 * @param url The http(s) URL to read; surrounding whitespace is ignored.
 * @param opts `env` defaults to `process.env`, `http` to the axios-backed
 *   client, `maxChars` to 12000 (also used when the value is not a positive
 *   finite number).
 * @returns The page text, truncated to `maxChars` by `clip`.
 * @throws Error if `url` is not http(s); rethrows the direct fetch's error
 *   when both attempts fail.
 */
export async function readPage(url: string, opts: { env?: EnvMap; http?: WebHttp; maxChars?: number } = {}): Promise<string> {
  const env = opts.env ?? (process.env as EnvMap);
  const http = opts.http ?? defaultHttp;
  const requested = opts.maxChars ?? 12000;
  const maxChars = Number.isFinite(requested) && requested > 0 ? Math.floor(requested) : 12000;
  const target = (url || '').trim();
  if (!/^https?:\/\//i.test(target)) throw new Error('url must be http(s)');

  const headers: Record<string, string> = { Accept: 'text/plain' };
  if (env.JINA_API_KEY) headers.Authorization = `Bearer ${env.JINA_API_KEY}`;
  try {
    const text = await http.getText(`https://r.jina.ai/${target}`, { headers, timeoutMs: 20000 });
    if (text && text.trim()) return clip(text, maxChars);
  } catch { /* reader unreachable — fall through to raw fetch */ }

  const raw = await http.getText(target, { timeoutMs: 15000 });
  // Drop non-visible content first; stripTags only removes the tags themselves.
  const visible = raw
    .replace(/<!--[\s\S]*?-->/g, ' ')
    .replace(/<(script|style|noscript|template)\b[^>]*>[\s\S]*?<\/\1\s*>/gi, ' ');
  return clip(stripTags(visible), maxChars);
}
365
+
366
/**
 * Truncate `s` to at most `max` UTF-16 code units, appending a marker that
 * says how many were dropped. Strings that already fit are returned unchanged.
 *
 * The cut point is floored to an integer, never goes below zero, and is moved
 * back by one when it would fall between the two halves of a surrogate pair,
 * so the output never ends in a lone high surrogate.
 */
function clip(s: string, max: number): string {
  // Written as a negated comparison so a NaN limit leaves the string untouched.
  if (!(s.length > max)) return s;

  let cut = Math.max(0, Math.floor(max));
  if (cut > 0) {
    const last = s.charCodeAt(cut - 1);
    if (last >= 0xd800 && last <= 0xdbff) cut -= 1;
  }
  return s.slice(0, cut) + `\n...[truncated, ${s.length - cut} more chars]`;
}
package/src/web/server.ts CHANGED
@@ -55,7 +55,7 @@ export async function startWebServer(port: number = 7777): Promise<void> {
55
55
  try {
56
56
  if ((url.pathname === "/" || url.pathname === "/index.html") && req.method === "GET") serveUI(res);
57
57
  else if (url.pathname === "/favicon.svg" && req.method === "GET") serveFavicon(res);
58
- else if (url.pathname === "/favicon.ico" && req.method === "GET") redirectFavicon(res);
58
+ else if (url.pathname === "/favicon.ico" && req.method === "GET") serveFavicon(res);
59
59
  else if (url.pathname === "/api/chat" && req.method === "POST") await handleChat(req, res, ctx);
60
60
  else if (url.pathname === "/api/agents" && req.method === "GET") handleAgents(res, ctx);
61
61
  else if (url.pathname === "/api/status" && req.method === "GET") handleStatus(res, ctx);
@@ -122,15 +122,7 @@ function serveUI(res: ServerResponse): void {
122
122
  function serveFavicon(res: ServerResponse): void {
123
123
  res.writeHead(200, {
124
124
  "Content-Type": "image/svg+xml; charset=utf-8",
125
- "Cache-Control": "public, max-age=86400",
125
+ "Cache-Control": "no-cache, max-age=0",
126
126
  });
127
127
  res.end(SKYLOOM_FAVICON_SVG);
128
128
  }
129
-
130
- function redirectFavicon(res: ServerResponse): void {
131
- res.writeHead(302, {
132
- "Location": "/favicon.svg",
133
- "Cache-Control": "public, max-age=86400",
134
- });
135
- res.end();
136
- }