npm - skyloom - Versions diffs - 1.15.4 → 1.16.0 - Mend

skyloom 1.15.4 → 1.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47) hide show

package/dist/cli/command_args.d.ts +74 -0
package/dist/cli/command_args.d.ts.map +1 -0
package/dist/cli/command_args.js +129 -0
package/dist/cli/command_args.js.map +1 -0
package/dist/cli/loom.d.ts +20 -0
package/dist/cli/loom.d.ts.map +1 -1
package/dist/cli/loom.js +202 -24
package/dist/cli/loom.js.map +1 -1
package/dist/cli/loom_chat.d.ts.map +1 -1
package/dist/cli/loom_chat.js +39 -0
package/dist/cli/loom_chat.js.map +1 -1
package/dist/core/agent.js +2 -2
package/dist/core/agent.js.map +1 -1
package/dist/core/security.d.ts.map +1 -1
package/dist/core/security.js +1 -0
package/dist/core/security.js.map +1 -1
package/dist/core/tool_router.d.ts.map +1 -1
package/dist/core/tool_router.js +11 -3
package/dist/core/tool_router.js.map +1 -1
package/dist/tools/builtin.d.ts.map +1 -1
package/dist/tools/builtin.js +38 -192
package/dist/tools/builtin.js.map +1 -1
package/dist/tools/websearch.d.ts +92 -0
package/dist/tools/websearch.d.ts.map +1 -0
package/dist/tools/websearch.js +343 -0
package/dist/tools/websearch.js.map +1 -0
package/dist/web/server.js +2 -9
package/dist/web/server.js.map +1 -1
package/dist/web/ui.d.ts.map +1 -1
package/dist/web/ui.js +3 -2
package/dist/web/ui.js.map +1 -1
package/package.json +1 -1
package/src/cli/command_args.ts +159 -0
package/src/cli/loom.ts +155 -17
package/src/cli/loom_chat.ts +33 -0
package/src/core/agent.ts +2 -2
package/src/core/security.ts +1 -0
package/src/core/tool_router.ts +11 -3
package/src/tools/builtin.ts +38 -190
package/src/tools/websearch.ts +368 -0
package/src/web/server.ts +2 -10
package/src/web/ui.ts +3 -2
package/tests/command_args.test.ts +115 -0
package/tests/loom.test.ts +74 -0
package/tests/tool_router.test.ts +15 -0
package/tests/web.test.ts +7 -5
package/tests/websearch.test.ts +190 -0

package/src/tools/builtin.ts CHANGED Viewed

@@ -4,177 +4,18 @@
 import * as fs from 'fs';
 import * as path from 'path';
-import axios from 'axios';
 import type { ToolRegistry } from '../core/tool';
 import { getLogger } from '../core/logger';
 import { registerComputerTools } from './computer';
 import { registerExtraTools } from './extra';
 import { isPrivateIp, assertFetchAllowed, fenceRoot, fenceCheck } from './guards';
+import { webSearch, formatSearchResults, readPage } from './websearch';
 // Re-exported so existing importers/tests keep resolving these from builtin.
 export { isPrivateIp, assertFetchAllowed, fenceRoot, fenceCheck };
 const log = getLogger('builtin-tools');
-/* ── Web search helpers ───────────────────────────────────────────────────
-   Multi-engine fallback. DuckDuckGo's Instant Answer JSON API only returns
-   "abstracts" and is blank for ~90% of real queries; HTML scraping is what
-   actually works. In CN networks, DDG/Bing may be unreachable — Baidu/Sogou
-   serve as fallbacks. Each parser is intentionally tolerant: HTML changes
-   over time, so we extract loosely and let the engine list provide redundancy.
-   ────────────────────────────────────────────────────────────────────────── */
-interface SearchResult { title: string; url: string; snippet: string }
-const SEARCH_UA = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36';
-const searchClient = axios.create({
-  timeout: 15000,
-  headers: {
-    'User-Agent': SEARCH_UA,
-    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
-  },
-  // Allow redirects (search engines use them)
-  maxRedirects: 5,
-  // Validate status (only 2xx is ok)
-  validateStatus: (status) => status >= 200 && status < 300,
-});
-async function fetchHtml(url: string, timeoutMs = 15000, retries = 2): Promise<string> {
-  let lastError: Error | null = null;
-  for (let attempt = 0; attempt <= retries; attempt++) {
-    try {
-      const res = await searchClient.get(url, {
-        timeout: timeoutMs,
-        // Skip SSRF check for known search engines
-        transitional: { clarifyTimeoutError: true },
-      });
-      return res.data;
-    } catch (e: any) {
-      lastError = e;
-      // Don't retry on 4xx (client errors like 403/404)
-      if (e.response && e.response.status >= 400 && e.response.status < 500) {
-        throw new Error(`HTTP ${e.response.status}: ${e.response.statusText || 'Blocked'}`);
-      }
-      // Wait before retry (exponential backoff)
-      if (attempt < retries) {
-        await new Promise(r => setTimeout(r, 1000 * (attempt + 1)));
-      }
-    }
-  }
-  throw lastError || new Error('fetch failed');
-}
-function decodeHtmlEntities(s: string): string {
-  return s
-    .replace(/&amp;/g, '&').replace(/&lt;/g, '<').replace(/&gt;/g, '>')
-    .replace(/&quot;/g, '"').replace(/&#39;/g, "'").replace(/&nbsp;/g, ' ')
-    .replace(/&#(\d+);/g, (_, n) => String.fromCharCode(parseInt(n, 10)))
-    .replace(/&#x([0-9a-f]+);/gi, (_, n) => String.fromCharCode(parseInt(n, 16)));
-}
-function stripTags(s: string): string {
-  return decodeHtmlEntities(s.replace(/<[^>]+>/g, '')).replace(/\s+/g, ' ').trim();
-}
-function unwrapDdgRedirect(href: string): string {
-  // DuckDuckGo HTML wraps results in /l/?uddg=<encoded-url>
-  const m = href.match(/[?&]uddg=([^&]+)/);
-  if (m) { try { return decodeURIComponent(m[1]); } catch { /* fall through */ } }
-  if (href.startsWith('//')) return 'https:' + href;
-  return href;
-}
-function unwrapBaiduRedirect(href: string): string {
-  // Baidu uses opaque /link?url=... redirects; we can't resolve without another request.
-  // Return as-is; consumer can still click through.
-  return href;
-}
-async function searchDuckDuckGo(query: string, max: number): Promise<SearchResult[]> {
-  const html = await fetchHtml(`https://html.duckduckgo.com/html/?q=${encodeURIComponent(query)}`);
-  const results: SearchResult[] = [];
-  const re = /<a[^>]+class="[^"]*result__a[^"]*"[^>]+href="([^"]+)"[^>]*>([\s\S]*?)<\/a>[\s\S]*?<a[^>]+class="[^"]*result__snippet[^"]*"[^>]*>([\s\S]*?)<\/a>/gi;
-  let m: RegExpExecArray | null;
-  while ((m = re.exec(html)) && results.length < max) {
-    results.push({ url: unwrapDdgRedirect(m[1]), title: stripTags(m[2]), snippet: stripTags(m[3]) });
-  }
-  return results;
-}
-async function searchBing(query: string, max: number): Promise<SearchResult[]> {
-  const html = await fetchHtml(`https://www.bing.com/search?q=${encodeURIComponent(query)}&setlang=zh-cn`);
-  const results: SearchResult[] = [];
-  const liRe = /<li class="b_algo"[\s\S]*?<\/li>/gi;
-  const items = html.match(liRe) || [];
-  for (const item of items) {
-    if (results.length >= max) break;
-    const a = item.match(/<h2[^>]*>\s*<a[^>]+href="([^"]+)"[^>]*>([\s\S]*?)<\/a>/i);
-    if (!a) continue;
-    const snipMatch =
-      item.match(/<p class="b_lineclamp[^"]*"[^>]*>([\s\S]*?)<\/p>/i) ||
-      item.match(/<div class="b_caption"[\s\S]*?<p[^>]*>([\s\S]*?)<\/p>/i) ||
-      item.match(/<p[^>]*>([\s\S]*?)<\/p>/i);
-    results.push({ url: a[1], title: stripTags(a[2]), snippet: snipMatch ? stripTags(snipMatch[1]) : '' });
-  }
-  return results;
-}
-async function searchBaidu(query: string, max: number): Promise<SearchResult[]> {
-  const html = await fetchHtml(`https://www.baidu.com/s?wd=${encodeURIComponent(query)}`);
-  const results: SearchResult[] = [];
-  // Baidu nests divs aggressively; anchor on <h3> ... <a href>...</a> and look
-  // for the nearest abstract block following.
-  const re = /<h3[^>]*>[\s\S]{0,500}?<a[^>]+href="([^"]+)"[^>]*>([\s\S]*?)<\/a>/gi;
-  let m: RegExpExecArray | null;
-  while ((m = re.exec(html)) && results.length < max) {
-    const url = unwrapBaiduRedirect(m[1]);
-    const title = stripTags(m[2]);
-    if (!title || !/^https?:\/\//.test(url)) continue;
-    const after = html.slice(re.lastIndex, re.lastIndex + 4000);
-    const snipMatch =
-      after.match(/<span class="content-right[^"]*"[^>]*>([\s\S]*?)<\/span>/i) ||
-      after.match(/<div class="c-abstract[^"]*"[^>]*>([\s\S]*?)<\/div>/i) ||
-      after.match(/<span[^>]*content[^"]*"[^>]*>([\s\S]{20,400}?)<\/span>/i) ||
-      after.match(/<p[^>]*>([\s\S]{20,400}?)<\/p>/i);
-    results.push({ url, title, snippet: snipMatch ? stripTags(snipMatch[1]) : '' });
-  }
-  return results;
-}
-async function searchSogou(query: string, max: number): Promise<SearchResult[]> {
-  const html = await fetchHtml(`https://www.sogou.com/web?query=${encodeURIComponent(query)}`);
-  const results: SearchResult[] = [];
-  const divRe = /<div[^>]+class="vrwrap"[\s\S]*?(?=<div[^>]+class="vrwrap"|$)/gi;
-  const items = html.match(divRe) || [];
-  for (const item of items) {
-    if (results.length >= max) break;
-    const a = item.match(/<h3[^>]*>[\s\S]*?<a[^>]+href="([^"]+)"[^>]*>([\s\S]*?)<\/a>/i);
-    if (!a) continue;
-    let url = a[1];
-    if (url.startsWith('/link?')) url = 'https://www.sogou.com' + url;
-    const snipMatch =
-      item.match(/<div[^>]+class="(?:str_info|fz-mid|space-txt)[^"]*"[^>]*>([\s\S]*?)<\/div>/i) ||
-      item.match(/<p[^>]*>([\s\S]{20,400}?)<\/p>/i);
-    results.push({ url, title: stripTags(a[2]), snippet: snipMatch ? stripTags(snipMatch[1]) : '' });
-  }
-  return results;
-}
-async function runSearchEngine(engine: string, query: string, max: number): Promise<SearchResult[]> {
-  let results: SearchResult[];
-  switch (engine) {
-    case 'duckduckgo': case 'ddg': results = await searchDuckDuckGo(query, max); break;
-    case 'bing': results = await searchBing(query, max); break;
-    case 'baidu': results = await searchBaidu(query, max); break;
-    case 'sogou': results = await searchSogou(query, max); break;
-    default: throw new Error(`unknown search engine: ${engine}`);
-  }
-  // Drop placeholder/JS-anchor entries from inline answer cards.
-  return results.filter((r) => r.title && /^https?:\/\//i.test(r.url));
-}
 /**
  * Register all built-in tools into the given registry.
  */
@@ -383,43 +224,50 @@ export function registerBuiltinTools(registry: ToolRegistry): void {
   registry.register({
     name: 'web_search',
-    description: 'Search the web for information. Returns search results with titles, URLs and snippets.',
+    description:
+      'Search the live web and return titles, URLs, and snippets (plus a direct answer when available). ' +
+      'USE THIS whenever the answer depends on current or real-time information — today\'s news and hot topics, ' +
+      'recent events, latest releases/versions, prices, weather, scores, or anything that may have changed since your ' +
+      'training cutoff. Do NOT answer such questions from memory and do NOT claim you cannot access the internet — ' +
+      'search first, then answer with the findings and cite the source URLs. Follow up with read_url to read a result in full.',
     parameters: [
-      { name: 'query', type: 'string', description: 'Search query', required: true },
-      { name: 'engine', type: 'string', description: 'Optional engine: duckduckgo|bing|baidu|sogou. Default: auto (tries each until one returns results)', required: false },
+      { name: 'query', type: 'string', description: 'Search query. Be specific; include the year/date for time-sensitive queries.', required: true },
+      { name: 'engine', type: 'string', description: 'Optional provider: tavily|brave|serper|searxng|jina|duckduckgo|bing|baidu|sogou. Default: auto (uses a configured API key if present, else the keyless Jina endpoint, else scraping).', required: false },
       { name: 'max_results', type: 'number', description: 'Max results to return (default 8, capped at 20)', required: false },
     ],
     handler: async (params) => {
       const query = String(params.query || '').trim();
       if (!query) return 'Error: query is required';
-      const max = Math.max(1, Math.min(20, Math.floor(Number(params.max_results) || 8)));
-      const explicit = String(params.engine || '').trim().toLowerCase();
-      const envEngine = String(process.env.SKYLOOM_SEARCH_ENGINE || '').trim().toLowerCase();
-      const order = explicit
-        ? [explicit]
-        : envEngine
-        ? [envEngine, 'duckduckgo', 'bing', 'baidu', 'sogou']
-        : ['duckduckgo', 'bing', 'baidu', 'sogou'];
-      const seen = new Set<string>();
-      const tried: string[] = [];
-      for (const eng of order) {
-        if (seen.has(eng)) continue;
-        seen.add(eng);
-        tried.push(eng);
-        try {
-          const results = await runSearchEngine(eng, query, max);
-          if (results && results.length > 0) {
-            const head = `Search results (${eng}, ${results.length}):`;
-            const body = results
-              .map((r, i) => `${i + 1}. ${r.title}\n   ${r.url}${r.snippet ? `\n   ${r.snippet}` : ''}`)
-              .join('\n');
-            return `${head}\n${body}`;
-          }
-        } catch (e: any) {
-          log.warn('web_search_engine_failed', { engine: eng, error: String(e?.message || e) });
-        }
+      try {
+        const res = await webSearch(query, {
+          max: Number(params.max_results) || 8,
+          engine: String(params.engine || '').trim().toLowerCase() || undefined,
+          onProviderError: (provider, error) => log.warn('web_search_provider_failed', { provider, error }),
+        });
+        return formatSearchResults(res);
+      } catch (e: any) {
+        return `Error: ${String(e?.message || e)}`;
+      }
+    },
+  });
+  registry.register({
+    name: 'read_url',
+    description:
+      'Fetch a web page as clean, readable text (markdown), with boilerplate (nav/ads) stripped. ' +
+      'Use after web_search to read a result in full, or to read any known URL. Prefer this over http_get for articles/pages.',
+    parameters: [
+      { name: 'url', type: 'string', description: 'The http(s) URL to read', required: true },
+      { name: 'max_chars', type: 'number', description: 'Max characters to return (default 12000)', required: false },
+    ],
+    handler: async (params) => {
+      const url = String(params.url || '').trim();
+      if (!url) return 'Error: url is required';
+      try {
+        return await readPage(url, { maxChars: Number(params.max_chars) || 12000 });
+      } catch (e: any) {
+        return `Error reading page: ${String(e?.message || e)}`;
       }
-      return `No search results found (tried: ${tried.join(', ')}). Set SKYLOOM_SEARCH_ENGINE to pin an engine, or try a different query.`;
     },
   });

package/src/tools/websearch.ts ADDED Viewed

@@ -0,0 +1,368 @@
+/**
+ * 联网搜索 · Web search with a provider waterfall.
+ *
+ * Why this module exists: the old web_search scraped DuckDuckGo/Bing/Baidu/Sogou
+ * HTML. Scraping breaks constantly — engines change markup, block bot
+ * user-agents, throw CAPTCHAs, and rate-limit — so "search doesn't work" was the
+ * norm. This replaces it with a waterfall that prefers reliable JSON APIs and
+ * only falls back to scraping as a last resort:
+ *
+ *   1. Tavily   (TAVILY_API_KEY)   — purpose-built for LLM agents, returns an answer
+ *   2. Brave    (BRAVE_API_KEY)    — independent index, clean JSON
+ *   3. Serper   (SERPER_API_KEY)   — Google results as JSON
+ *   4. SearXNG  (SEARXNG_URL)      — self-hosted metasearch JSON
+ *   5. Jina     (keyless)          — s.jina.ai, free, LLM-optimized — works with NO setup
+ *   6. Scrape   (last resort)      — the legacy HTML scrapers
+ *
+ * The headline win: even with zero configuration, Jina's keyless endpoint gives
+ * results that actually return — no API key, no scraping fragility. Set any of
+ * the API keys above for enterprise-grade reliability and higher rate limits.
+ *
+ * The HTTP layer is injectable so the orchestration and every parser are
+ * unit-testable without a network.
+ */
+import axios from 'axios';
+/** A single search hit, in the common shape every provider's rows are mapped to. */
+export interface SearchResult {
+  title: string;    // result title
+  url: string;      // result link
+  snippet: string;  // short excerpt/description; providers fall back to '' when missing
+}
+/** What one provider call produced: the provider's id, its hits, and an optional summary. */
+export interface SearchResponse {
+  provider: string;        // which provider produced these results
+  results: SearchResult[];
+  answer?: string;         // direct answer / summary, when the provider offers one
+}
+/**
+ * Minimal HTTP surface — injectable for tests.
+ *
+ * Every method accepts optional extra request headers and a per-call timeout
+ * in milliseconds. `getJson`/`postJson` resolve to the untyped response
+ * payload; `getText` resolves to the response body as a string.
+ */
+export interface WebHttp {
+  getJson(url: string, opts?: { headers?: Record<string, string>; timeoutMs?: number }): Promise<any>;
+  postJson(url: string, body: any, opts?: { headers?: Record<string, string>; timeoutMs?: number }): Promise<any>;
+  getText(url: string, opts?: { headers?: Record<string, string>; timeoutMs?: number }): Promise<string>;
+}
+// Browser-like User-Agent string sent with every request made by defaultHttp.
+const UA = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36';
+// Request timeout in milliseconds, used when a call does not pass `timeoutMs`.
+const DEFAULT_TIMEOUT = 15000;
+/**
+ * Default HTTP client backed by axios.
+ *
+ * All three methods share one transport policy: the browser-like UA header,
+ * the caller's timeout (or DEFAULT_TIMEOUT), at most 5 redirects, and a
+ * status validator that accepts only 2xx. Caller-supplied headers are spread
+ * last, so they override the defaults.
+ */
+export const defaultHttp: WebHttp = {
+  async getJson(url, opts) {
+    const headers = { 'User-Agent': UA, Accept: 'application/json', ...(opts?.headers || {}) };
+    const { data } = await axios.get(url, {
+      headers,
+      timeout: opts?.timeoutMs ?? DEFAULT_TIMEOUT,
+      maxRedirects: 5,
+      validateStatus: (status) => status >= 200 && status < 300,
+    });
+    return data;
+  },
+  async postJson(url, body, opts) {
+    const headers = {
+      'User-Agent': UA,
+      Accept: 'application/json',
+      'Content-Type': 'application/json',
+      ...(opts?.headers || {}),
+    };
+    const { data } = await axios.post(url, body, {
+      headers,
+      timeout: opts?.timeoutMs ?? DEFAULT_TIMEOUT,
+      maxRedirects: 5,
+      validateStatus: (status) => status >= 200 && status < 300,
+    });
+    return data;
+  },
+  async getText(url, opts) {
+    const headers = {
+      'User-Agent': UA,
+      Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+      'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
+      ...(opts?.headers || {}),
+    };
+    const { data } = await axios.get(url, {
+      headers,
+      timeout: opts?.timeoutMs ?? DEFAULT_TIMEOUT,
+      maxRedirects: 5,
+      validateStatus: (status) => status >= 200 && status < 300,
+      responseType: 'text',
+      // Identity transform: hand back the body exactly as received.
+      transformResponse: [(raw) => raw],
+    });
+    return data as string;
+  },
+};
+/* ── HTML helpers (shared by the scrape provider) ── */
+/**
+ * Decode the HTML entities that show up in search-result markup:
+ * `&amp; &lt; &gt; &quot; &nbsp;` plus decimal (`&#39;`) and hexadecimal
+ * (`&#x27;`) character references. Any other entity is left untouched.
+ *
+ * Decoding is done in a single pass, so text is never unescaped twice
+ * (`&amp;lt;` yields the literal `&lt;`, not `<`). Numeric references are
+ * resolved as full code points, so characters above U+FFFF (e.g. emoji) are
+ * decoded correctly. References above U+10FFFF are kept verbatim.
+ *
+ * @param s - Text that may contain HTML entities.
+ * @returns The text with the supported entities replaced by their characters.
+ */
+export function decodeHtmlEntities(s: string): string {
+  const named: Record<string, string> = { amp: '&', lt: '<', gt: '>', quot: '"', nbsp: ' ' };
+  return s.replace(
+    /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|(amp|lt|gt|quot|nbsp));/g,
+    (whole: string, dec?: string, hex?: string, name?: string): string => {
+      if (name !== undefined) return named[name] ?? whole;
+      const codePoint = dec !== undefined ? parseInt(dec, 10) : parseInt(hex ?? '', 16);
+      // String.fromCodePoint throws a RangeError above U+10FFFF; keep such input as-is.
+      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : whole;
+    },
+  );
+}
+/**
+ * Reduce an HTML fragment to plain text.
+ *
+ * HTML comments and whole `<script>`/`<style>` elements are dropped first, so
+ * their contents do not leak into the text. Remaining tags are then removed,
+ * entities decoded, and runs of whitespace collapsed to single spaces.
+ *
+ * @param s - HTML fragment or full document.
+ * @returns Trimmed, single-spaced plain text.
+ */
+export function stripTags(s: string): string {
+  const visible = s
+    .replace(/<!--[\s\S]*?-->/g, '')
+    .replace(/<(script|style)\b[^>]*>[\s\S]*?<\/\1\s*>/gi, '');
+  return decodeHtmlEntities(visible.replace(/<[^>]+>/g, '')).replace(/\s+/g, ' ').trim();
+}
+/**
+ * Recover the target URL from a DuckDuckGo result link. When the href carries
+ * a `uddg=` query parameter, its URL-decoded value is returned; otherwise the
+ * href is returned, with `https:` prepended if it is protocol-relative.
+ */
+function unwrapDdgRedirect(href: string): string {
+  const wrapped = /[?&]uddg=([^&]+)/.exec(href);
+  if (wrapped !== null) {
+    try {
+      return decodeURIComponent(wrapped[1]);
+    } catch {
+      // Malformed percent-encoding: fall back to the href itself.
+    }
+  }
+  return href.startsWith('//') ? 'https:' + href : href;
+}
+/**
+ * Normalise raw provider hits into the list handed back to callers.
+ *
+ * Fields are trimmed before they are validated, so a whitespace-only title is
+ * rejected rather than emitted as an empty string, and de-duplication compares
+ * the same URL text that ends up in the output. Fields that are not strings
+ * (rows come from untyped JSON) count as missing instead of throwing.
+ *
+ * @param results - Hits as mapped from a provider response.
+ * @param max - Stop once this many hits have been kept.
+ * @returns Hits that have a title and an absolute http(s) URL, first occurrence
+ *          of each URL only, in their original order.
+ */
+function clean(results: SearchResult[], max: number): SearchResult[] {
+  const seen = new Set<string>();
+  const out: SearchResult[] = [];
+  for (const r of results) {
+    if (!r) continue;
+    const title = typeof r.title === 'string' ? r.title.trim() : '';
+    const url = typeof r.url === 'string' ? r.url.trim() : '';
+    if (!title || !/^https?:\/\//i.test(url) || seen.has(url)) continue;
+    seen.add(url);
+    const snippet = typeof r.snippet === 'string' ? r.snippet.trim() : '';
+    out.push({ title, url, snippet });
+    if (out.length >= max) break;
+  }
+  return out;
+}
+/* ════════════════════════════════════════════════════════════
+   API providers (preferred — reliable JSON)
+   ════════════════════════════════════════════════════════════ */
+/**
+ * Search via Tavily: POSTs the query with bearer-token auth and maps the
+ * response's `results` (title/url/content) and optional `answer`.
+ */
+async function tavily(http: WebHttp, key: string, query: string, max: number): Promise<SearchResponse> {
+  const payload = { query, max_results: max, search_depth: 'basic', include_answer: true };
+  const requestOpts = { headers: { Authorization: `Bearer ${key}` } };
+  const data = await http.postJson('https://api.tavily.com/search', payload, requestOpts);
+  const rows = data?.results || [];
+  const hits = rows.map((row: any) => ({
+    title: row.title || '',
+    url: row.url || '',
+    snippet: row.content || '',
+  }));
+  return { provider: 'tavily', results: clean(hits, max), answer: data?.answer || undefined };
+}
+/**
+ * Search via the Brave web-search endpoint: GET with the subscription token
+ * header, mapping `web.results` (title/url/description).
+ */
+async function brave(http: WebHttp, key: string, query: string, max: number): Promise<SearchResponse> {
+  const endpoint = `https://api.search.brave.com/res/v1/web/search?q=${encodeURIComponent(query)}&count=${max}`;
+  const requestOpts = { headers: { 'X-Subscription-Token': key, Accept: 'application/json' } };
+  const data = await http.getJson(endpoint, requestOpts);
+  const rows = data?.web?.results || [];
+  const hits = rows.map((row: any) => ({
+    title: row.title || '',
+    url: row.url || '',
+    snippet: row.description || '',
+  }));
+  return { provider: 'brave', results: clean(hits, max) };
+}
+/**
+ * Search via Serper: POSTs the query with the API-key header and maps the
+ * `organic` rows (title/link/snippet). The answer is the first non-empty of
+ * the answer box's answer, the answer box's snippet, and the knowledge-graph
+ * description.
+ */
+async function serper(http: WebHttp, key: string, query: string, max: number): Promise<SearchResponse> {
+  const payload = { q: query, num: max };
+  const requestOpts = { headers: { 'X-API-KEY': key } };
+  const data = await http.postJson('https://google.serper.dev/search', payload, requestOpts);
+  const rows = data?.organic || [];
+  const hits = rows.map((row: any) => ({
+    title: row.title || '',
+    url: row.link || '',
+    snippet: row.snippet || '',
+  }));
+  const box = data?.answerBox;
+  const answer = box?.answer || box?.snippet || data?.knowledgeGraph?.description || undefined;
+  return { provider: 'serper', results: clean(hits, max), answer };
+}
+/**
+ * Search via a SearXNG instance's JSON endpoint. Trailing slashes on the base
+ * URL are dropped before `/search` is appended; the request asks for
+ * `format=json` and `language=zh-CN`.
+ */
+async function searxng(http: WebHttp, baseUrl: string, query: string, max: number): Promise<SearchResponse> {
+  const root = baseUrl.replace(/\/+$/, '');
+  const endpoint = `${root}/search?q=${encodeURIComponent(query)}&format=json&language=zh-CN`;
+  const data = await http.getJson(endpoint);
+  const rows = data?.results || [];
+  const hits = rows.map((row: any) => ({
+    title: row.title || '',
+    url: row.url || '',
+    snippet: row.content || '',
+  }));
+  return { provider: 'searxng', results: clean(hits, max) };
+}
+/**
+ * Search via s.jina.ai, which returns the SERP for a query.
+ *
+ * `X-Respond-With: no-content` skips fetching each page body (faster, fewer
+ * tokens — only the listing is wanted). The endpoint works without a key
+ * (shared rate pool); when a key is given it is sent as a bearer token, which
+ * raises the limit. Rows are read from `data.data`, or from the payload itself
+ * when that is an array.
+ */
+async function jina(http: WebHttp, key: string | undefined, query: string, max: number): Promise<SearchResponse> {
+  const headers: Record<string, string> = {
+    Accept: 'application/json',
+    'X-Respond-With': 'no-content',
+    ...(key ? { Authorization: `Bearer ${key}` } : {}),
+  };
+  const data = await http.getJson(`https://s.jina.ai/?q=${encodeURIComponent(query)}`, { headers });
+  let rows: any[] = [];
+  if (Array.isArray(data?.data)) {
+    rows = data.data;
+  } else if (Array.isArray(data)) {
+    rows = data;
+  }
+  const hits = rows.map((row: any) => ({
+    title: row.title || '',
+    url: row.url || '',
+    snippet: row.description || row.content || row.snippet || '',
+  }));
+  return { provider: 'jina', results: clean(hits, max) };
+}
+/* ════════════════════════════════════════════════════════════
+   Scrape provider (last resort — fragile HTML parsing)
+   ════════════════════════════════════════════════════════════ */
+/**
+ * Scrape DuckDuckGo's HTML results page.
+ *
+ * The pattern pairs each `result__a` title link with the `result__snippet`
+ * link that follows it; the href is passed through unwrapDdgRedirect and the
+ * title/snippet through stripTags. Collection stops once `max` hits are found.
+ */
+async function scrapeDuckDuckGo(http: WebHttp, query: string, max: number): Promise<SearchResult[]> {
+  const html = await http.getText(`https://html.duckduckgo.com/html/?q=${encodeURIComponent(query)}`);
+  const out: SearchResult[] = [];
+  const re = /<a[^>]+class="[^"]*result__a[^"]*"[^>]+href="([^"]+)"[^>]*>([\s\S]*?)<\/a>[\s\S]*?<a[^>]+class="[^"]*result__snippet[^"]*"[^>]*>([\s\S]*?)<\/a>/gi;
+  let m: RegExpExecArray | null;
+  while ((m = re.exec(html)) && out.length < max) {
+    out.push({ url: unwrapDdgRedirect(m[1]), title: stripTags(m[2]), snippet: stripTags(m[3]) });
+  }
+  return out;
+}
+/**
+ * Scrape Bing's results page.
+ *
+ * Each `<li class="b_algo">` item contributes one hit: the link inside its
+ * `<h2>` gives the URL and title, and the snippet comes from the
+ * `b_lineclamp` paragraph or, failing that, the first `<p>`. Items without a
+ * heading link are skipped.
+ */
+async function scrapeBing(http: WebHttp, query: string, max: number): Promise<SearchResult[]> {
+  const html = await http.getText(`https://www.bing.com/search?q=${encodeURIComponent(query)}&setlang=zh-cn`);
+  const out: SearchResult[] = [];
+  for (const item of html.match(/<li class="b_algo"[\s\S]*?<\/li>/gi) || []) {
+    if (out.length >= max) break;
+    const a = item.match(/<h2[^>]*>\s*<a[^>]+href="([^"]+)"[^>]*>([\s\S]*?)<\/a>/i);
+    if (!a) continue;
+    const snip = item.match(/<p class="b_lineclamp[^"]*"[^>]*>([\s\S]*?)<\/p>/i) || item.match(/<p[^>]*>([\s\S]*?)<\/p>/i);
+    out.push({ url: a[1], title: stripTags(a[2]), snippet: snip ? stripTags(snip[1]) : '' });
+  }
+  return out;
+}
+/**
+ * Scrape Baidu's results page.
+ *
+ * Hits are anchored on an `<h3>` followed (within 500 characters) by a link.
+ * Entries with an empty title or a non-http(s) href are skipped. The snippet
+ * is looked for in the 4000 characters after the link, trying the
+ * `content-right` span, then the `c-abstract` div, then a 20–400 character
+ * `<p>`.
+ */
+async function scrapeBaidu(http: WebHttp, query: string, max: number): Promise<SearchResult[]> {
+  const html = await http.getText(`https://www.baidu.com/s?wd=${encodeURIComponent(query)}`);
+  const out: SearchResult[] = [];
+  const re = /<h3[^>]*>[\s\S]{0,500}?<a[^>]+href="([^"]+)"[^>]*>([\s\S]*?)<\/a>/gi;
+  let m: RegExpExecArray | null;
+  while ((m = re.exec(html)) && out.length < max) {
+    const url = m[1]; const title = stripTags(m[2]);
+    if (!title || !/^https?:\/\//.test(url)) continue;
+    // re.lastIndex is the end of the current match, so this window starts right after the link.
+    const after = html.slice(re.lastIndex, re.lastIndex + 4000);
+    const snip = after.match(/<span class="content-right[^"]*"[^>]*>([\s\S]*?)<\/span>/i)
+      || after.match(/<div class="c-abstract[^"]*"[^>]*>([\s\S]*?)<\/div>/i)
+      || after.match(/<p[^>]*>([\s\S]{20,400}?)<\/p>/i);
+    out.push({ url, title, snippet: snip ? stripTags(snip[1]) : '' });
+  }
+  return out;
+}
+/**
+ * Scrape Sogou's results page.
+ *
+ * The page is split into `vrwrap` blocks; each block with an `<h3>` link
+ * yields one hit. Hrefs starting with `/link?` are prefixed with the Sogou
+ * origin. The snippet comes from a `str_info`/`fz-mid`/`space-txt` div or,
+ * failing that, a 20–400 character `<p>`.
+ */
+async function scrapeSogou(http: WebHttp, query: string, max: number): Promise<SearchResult[]> {
+  const html = await http.getText(`https://www.sogou.com/web?query=${encodeURIComponent(query)}`);
+  const out: SearchResult[] = [];
+  for (const item of html.match(/<div[^>]+class="vrwrap"[\s\S]*?(?=<div[^>]+class="vrwrap"|$)/gi) || []) {
+    if (out.length >= max) break;
+    const a = item.match(/<h3[^>]*>[\s\S]*?<a[^>]+href="([^"]+)"[^>]*>([\s\S]*?)<\/a>/i);
+    if (!a) continue;
+    let url = a[1]; if (url.startsWith('/link?')) url = 'https://www.sogou.com' + url;
+    const snip = item.match(/<div[^>]+class="(?:str_info|fz-mid|space-txt)[^"]*"[^>]*>([\s\S]*?)<\/div>/i) || item.match(/<p[^>]*>([\s\S]{20,400}?)<\/p>/i);
+    out.push({ url, title: stripTags(a[2]), snippet: snip ? stripTags(snip[1]) : '' });
+  }
+  return out;
+}
+// Engines that have an HTML scraper, in the order the auto waterfall tries them.
+const SCRAPE_ENGINES = ['duckduckgo', 'bing', 'baidu', 'sogou'] as const;
+// Union of the literal engine names above.
+type ScrapeEngine = typeof SCRAPE_ENGINES[number];
+/**
+ * Run the HTML scraper for `engine` and wrap its cleaned hits in a
+ * SearchResponse whose provider is the engine name.
+ */
+async function scrape(http: WebHttp, engine: ScrapeEngine, query: string, max: number): Promise<SearchResponse> {
+  let scraper: (http: WebHttp, query: string, max: number) => Promise<SearchResult[]>;
+  switch (engine) {
+    case 'bing':
+      scraper = scrapeBing;
+      break;
+    case 'baidu':
+      scraper = scrapeBaidu;
+      break;
+    case 'sogou':
+      scraper = scrapeSogou;
+      break;
+    default:
+      scraper = scrapeDuckDuckGo;
+  }
+  const hits = await scraper(http, query, max);
+  return { provider: engine, results: clean(hits, max) };
+}
+/* ════════════════════════════════════════════════════════════
+   Orchestration
+   ════════════════════════════════════════════════════════════ */
+/** Environment-variable lookup (name → value, or undefined when unset). */
+export type EnvMap = Record<string, string | undefined>;
+/** One entry in the search waterfall. */
+interface Provider {
+  id: string; // name recorded in `tried` and passed to onProviderError
+  /** Run the provider; throws on failure so the waterfall can move on. */
+  run(http: WebHttp, env: EnvMap, query: string, max: number): Promise<SearchResponse>;
+}
+/**
+ * Resolve the ordered provider list for a given env + optional pinned engine.
+ *
+ * Keyed providers (tavily, brave, serper, searxng) exist only when their
+ * credential/URL is present in `env`. That value is captured when the list is
+ * built, so the providers need no non-null assertions at run time.
+ *
+ * @param env - Environment lookup holding API keys / the SearXNG URL.
+ * @param pinned - Optional engine name (case-insensitive). `ddg` is accepted
+ *                 as an alias for `duckduckgo`.
+ * @returns With a recognised pin: only that provider, or `[]` when it is a
+ *          keyed provider that is not configured. With no pin or an unknown
+ *          one: configured keyed providers, then Jina, then every scraper.
+ */
+export function resolveProviders(env: EnvMap, pinned?: string): Provider[] {
+  const pin = (pinned || '').trim().toLowerCase();
+  const tavilyKey = env.TAVILY_API_KEY;
+  const braveKey = env.BRAVE_API_KEY || env.BRAVE_SEARCH_API_KEY;
+  const serperKey = env.SERPER_API_KEY;
+  const searxngUrl = env.SEARXNG_URL;
+  const tavilyP: Provider | null = tavilyKey
+    ? { id: 'tavily', run: (h, _e, q, m) => tavily(h, tavilyKey, q, m) } : null;
+  const braveP: Provider | null = braveKey
+    ? { id: 'brave', run: (h, _e, q, m) => brave(h, braveKey, q, m) } : null;
+  const serperP: Provider | null = serperKey
+    ? { id: 'serper', run: (h, _e, q, m) => serper(h, serperKey, q, m) } : null;
+  const searxngP: Provider | null = searxngUrl
+    ? { id: 'searxng', run: (h, _e, q, m) => searxng(h, searxngUrl, q, m) } : null;
+  // Jina works without a key, so it always exists; the optional key is read at run time.
+  const jinaP: Provider = { id: 'jina', run: (h, e, q, m) => jina(h, e.JINA_API_KEY, q, m) };
+  const scrapeP = (eng: ScrapeEngine): Provider => ({ id: eng, run: (h, _e, q, m) => scrape(h, eng, q, m) });
+  // Explicit pin (tool arg or SKYLOOM_SEARCH_ENGINE) — use only that provider.
+  if (pin) {
+    const keyed = new Map<string, Provider | null>([
+      ['tavily', tavilyP], ['brave', braveP], ['serper', serperP], ['searxng', searxngP],
+    ]);
+    const keyedPin = keyed.get(pin);
+    // `undefined` means "not a keyed provider name"; `null` means "keyed but not configured".
+    if (keyedPin !== undefined) return keyedPin ? [keyedPin] : [];
+    if (pin === 'jina') return [jinaP];
+    if (pin === 'ddg' || pin === 'duckduckgo') return [scrapeP('duckduckgo')];
+    const scrapePin = SCRAPE_ENGINES.find((eng) => eng === pin);
+    if (scrapePin) return [scrapeP(scrapePin)];
+    // Unknown pin → fall through to auto.
+  }
+  // Auto waterfall: keyed providers first (best), then keyless Jina, then scrape.
+  const keyedInOrder = [tavilyP, braveP, serperP, searxngP].filter((c): c is Provider => c !== null);
+  return [...keyedInOrder, jinaP, ...SCRAPE_ENGINES.map((eng) => scrapeP(eng))];
+}
+/** Options accepted by webSearch; every field is optional. */
+export interface WebSearchOptions {
+  max?: number;              // desired result count; webSearch clamps it to 1..20 (default 8)
+  engine?: string;           // explicit pin from the tool arg
+  env?: EnvMap;              // defaults to process.env
+  http?: WebHttp;           // defaults to axios-backed client
+  onProviderError?: (provider: string, error: string) => void; // called for each provider that throws
+}
+/**
+ * Run a web search through the provider waterfall. Returns the first provider
+ * that yields results (or a direct answer), or a response with an empty result
+ * set + the list of providers that were tried.
+ *
+ * @param query - Search text; surrounding whitespace is ignored.
+ * @param opts - See WebSearchOptions. `max` is floored and clamped to 1..20;
+ *               a missing or NaN value falls back to 8.
+ * @returns The winning provider's response plus `tried`, the ids of every
+ *          provider attempted in order. When none succeeds, `provider` is
+ *          `'none'` and `results` is empty.
+ * @throws Error when `query` is empty after trimming.
+ */
+export async function webSearch(query: string, opts: WebSearchOptions = {}): Promise<SearchResponse & { tried: string[] }> {
+  const q = (query || '').trim();
+  if (!q) throw new Error('query is required');
+  // A NaN max would poison every comparison and request built from it, so fall back to the default.
+  const requested = Number(opts.max ?? 8);
+  const max = Math.max(1, Math.min(20, Math.floor(Number.isNaN(requested) ? 8 : requested)));
+  const env = opts.env ?? (process.env as EnvMap);
+  const http = opts.http ?? defaultHttp;
+  // The tool argument wins over the SKYLOOM_SEARCH_ENGINE environment pin.
+  const pinned = (opts.engine || env.SKYLOOM_SEARCH_ENGINE || '').trim();
+  const providers = resolveProviders(env, pinned);
+  if (providers.length === 0 && pinned) {
+    // Nothing to run for this pin: report it instead of failing silently.
+    opts.onProviderError?.(pinned.toLowerCase(), 'provider is not configured (missing API key or URL)');
+  }
+  const tried: string[] = [];
+  for (const provider of providers) {
+    tried.push(provider.id);
+    try {
+      const res = await provider.run(http, env, q, max);
+      if (res.results.length > 0 || res.answer) return { ...res, tried };
+    } catch (e: any) {
+      // A failing provider is reported, then the waterfall moves on to the next one.
+      opts.onProviderError?.(provider.id, String(e?.message || e));
+    }
+  }
+  return { provider: 'none', results: [], tried };
+}
+/**
+ * Format a SearchResponse as compact text for an LLM tool result.
+ *
+ * With neither results nor an answer, returns a "no results" hint that lists
+ * the providers tried (when known). Otherwise returns an optional `Answer:`
+ * line, a header naming the provider and hit count, and one numbered entry per
+ * hit (title, URL, and the snippet when there is one).
+ */
+export function formatSearchResults(res: SearchResponse & { tried?: string[] }): string {
+  const hasResults = res.results.length > 0;
+  if (!hasResults && !res.answer) {
+    let tried = '';
+    if (res.tried?.length) tried = ` (tried: ${res.tried.join(', ')})`;
+    return `No search results found${tried}. Try a simpler query, or set a search API key (TAVILY_API_KEY / BRAVE_API_KEY / SERPER_API_KEY) for more reliable results.`;
+  }
+  const listing = res.results
+    .map((r, i) => {
+      const snippetLine = r.snippet ? `\n   ${r.snippet}` : '';
+      return `${i + 1}. ${r.title}\n   ${r.url}${snippetLine}`;
+    })
+    .join('\n');
+  const header = `Search results (${res.provider}, ${res.results.length}):`;
+  const answerBlock = res.answer ? [`Answer: ${res.answer}\n`] : [];
+  return [...answerBlock, header, listing].join('\n');
+}
+/* ════════════════════════════════════════════════════════════
+   Page reader — clean, LLM-ready content from a URL
+   ════════════════════════════════════════════════════════════ */
+/**
+ * Fetch a URL as clean, readable text. Uses Jina's r.jina.ai reader (strips
+ * nav/ads, returns markdown) when reachable, falling back to a raw fetch whose
+ * HTML is reduced to text with stripTags. This is what makes "read the top
+ * news article" actually usable — raw HTML is mostly boilerplate.
+ *
+ * @param url - Absolute http(s) URL to read.
+ * @param opts - `env` (defaults to process.env; `JINA_API_KEY` is sent as a
+ *               bearer token when set), `http` (defaults to defaultHttp), and
+ *               `maxChars` (default 12000; a missing, non-finite or
+ *               non-positive value falls back to the default).
+ * @returns The page text, clipped to `maxChars` with a truncation note.
+ * @throws Error when `url` is not http(s), or when the raw fallback fetch fails.
+ */
+export async function readPage(url: string, opts: { env?: EnvMap; http?: WebHttp; maxChars?: number } = {}): Promise<string> {
+  const env = opts.env ?? (process.env as EnvMap);
+  const http = opts.http ?? defaultHttp;
+  // Sanitise the limit: NaN would disable clipping and a negative value would slice from the end.
+  const requested = Number(opts.maxChars ?? 12000);
+  const maxChars = Number.isFinite(requested) && requested >= 1 ? Math.floor(requested) : 12000;
+  if (!/^https?:\/\//i.test(url)) throw new Error('url must be http(s)');
+  const headers: Record<string, string> = { Accept: 'text/plain' };
+  if (env.JINA_API_KEY) headers.Authorization = `Bearer ${env.JINA_API_KEY}`;
+  try {
+    const text = await http.getText(`https://r.jina.ai/${url}`, { headers, timeoutMs: 20000 });
+    if (text && text.trim()) return clip(text, maxChars);
+  } catch { /* fall through to raw fetch */ }
+  // NOTE(review): this fallback requests `url` directly from this process and the
+  // only check above is the http(s) scheme — confirm whether a private-address
+  // (SSRF) guard should run before it.
+  const raw = await http.getText(url, { timeoutMs: 15000 });
+  return clip(stripTags(raw), maxChars);
+}
+/**
+ * Cut `s` down to `max` characters. When anything is removed, a note stating
+ * how many characters were dropped is appended on a new line.
+ */
+function clip(s: string, max: number): string {
+  if (s.length > max) {
+    const dropped = s.length - max;
+    return s.slice(0, max) + `\n...[truncated, ${dropped} more chars]`;
+  }
+  return s;
+}

package/src/web/server.ts CHANGED Viewed

@@ -55,7 +55,7 @@ export async function startWebServer(port: number = 7777): Promise<void> {
     try {
       if ((url.pathname === "/" || url.pathname === "/index.html") && req.method === "GET") serveUI(res);
       else if (url.pathname === "/favicon.svg" && req.method === "GET") serveFavicon(res);
-      else if (url.pathname === "/favicon.ico" && req.method === "GET") redirectFavicon(res);
+      else if (url.pathname === "/favicon.ico" && req.method === "GET") serveFavicon(res);
       else if (url.pathname === "/api/chat" && req.method === "POST") await handleChat(req, res, ctx);
       else if (url.pathname === "/api/agents" && req.method === "GET") handleAgents(res, ctx);
       else if (url.pathname === "/api/status" && req.method === "GET") handleStatus(res, ctx);
@@ -122,15 +122,7 @@ function serveUI(res: ServerResponse): void {
 function serveFavicon(res: ServerResponse): void {
   res.writeHead(200, {
     "Content-Type": "image/svg+xml; charset=utf-8",
-    "Cache-Control": "public, max-age=86400",
+    "Cache-Control": "no-cache, max-age=0",
   });
   res.end(SKYLOOM_FAVICON_SVG);
 }
-function redirectFavicon(res: ServerResponse): void {
-  res.writeHead(302, {
-    "Location": "/favicon.svg",
-    "Cache-Control": "public, max-age=86400",
-  });
-  res.end();
-}