agent-search-mcp 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,110 @@
1
/**
 * Score and rank merged search results.
 *
 * Each result is shallow-copied and annotated with:
 *   - `confidence`: value returned by calculateWeightedConfidence
 *   - `score`: value returned by calculateScore
 * The copies are ordered by confidence (descending); ties are broken by
 * score (descending).
 *
 * @param {Array<object>} results - Results carrying at least a `url`; the
 *   remaining fields are read by the scoring helpers.
 * @param {string} query - Raw user query; tokenized here.
 * @param {Object<string, number>} [weights={}] - Weight lookup keyed by engine id.
 * @param {{get: function(string): number}} [frequencies] - Optional lookup from
 *   normalized URL to an occurrence count; missing entries count as 1.
 * @returns {Array<object>} New array of annotated copies, best first.
 */
export function scoreAndRank(results, query, weights = {}, frequencies) {
  // Scripts written without spaces pack a lot of meaning into two characters,
  // so two-character tokens are kept when they contain such characters.
  const denseScript = /[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Hangul}]/u;

  // Split on anything that is not a letter, digit or underscore in ANY script.
  // An ASCII-only /\W+/ treats every CJK or accented character as a separator,
  // which leaves non-English queries with no usable tokens at all.
  const tokens = String(query ?? '')
    .toLowerCase()
    .split(/[^\p{L}\p{N}_]+/u)
    .filter((t) => t.length >= 3 || (t.length >= 2 && denseScript.test(t)));

  // Normalization ceiling handed to calculateWeightedConfidence: the highest
  // configured weight (at least 0.5) times the token count (at least 1).
  const maxWeightSum = Math.max(...Object.values(weights), 0.5) * Math.max(tokens.length, 1);

  return results
    .map((r) => {
      const normalizedUrl = normalizeUrl(r.url);
      const freq = frequencies?.get(normalizedUrl) || 1;
      return {
        ...r,
        confidence: calculateWeightedConfidence(r, weights, maxWeightSum),
        score: calculateScore(r, tokens, weights, freq),
      };
    })
    .sort((a, b) => {
      // 1. Primary: confidence (weighted quality signal)
      if (b.confidence !== a.confidence) {
        return b.confidence - a.confidence;
      }
      // 2. Secondary: score
      return b.score - a.score;
    });
}
27
/**
 * Derive a confidence value, capped at 1, from the weights of the engines
 * that returned a result.
 *
 * With no engine list the result's `source` weight (0.5 when unknown) is
 * halved and returned. Otherwise the summed engine weights are divided by
 * `maxWeightSum * engineCount`, capped at 1, and a per-engine bonus of 0.1
 * (at most 0.3) is added before the final cap.
 *
 * @param {object} result - Result with optional `engines` array and `source`.
 * @param {Object<string, number>} weights - Weight lookup keyed by engine id.
 * @param {number} maxWeightSum - Normalization ceiling supplied by the caller.
 * @returns {number} Confidence value.
 */
function calculateWeightedConfidence(result, weights, maxWeightSum) {
  const contributing = result.engines || [];
  const engineCount = contributing.length;

  if (engineCount === 0) {
    // Nothing but the source label to go on: use half of its weight.
    const fallbackWeight = weights[result.source] || 0.5;
    return fallbackWeight * 0.5;
  }

  // Unknown engines contribute the neutral weight 0.5.
  let total = 0;
  for (const engineId of contributing) {
    total += weights[engineId] || 0.5;
  }

  const ratio = Math.min(total / (maxWeightSum * engineCount), 1.0);
  // Agreement between engines still counts, with diminishing returns.
  const agreementBonus = Math.min(engineCount * 0.1, 0.3);
  return Math.min(ratio + agreementBonus, 1.0);
}
52
/**
 * Reduce a URL to a lowercase `hostname + pathname` key (one trailing slash
 * removed; scheme, query string and fragment dropped).
 *
 * Input that cannot be parsed as an absolute URL is returned lowercased
 * as-is. `null`/`undefined` yield an empty string instead of throwing from
 * inside the fallback path.
 *
 * @param {string} url - URL to normalize.
 * @returns {string} Normalized key.
 */
function normalizeUrl(url) {
  try {
    const u = new URL(url);
    return `${u.hostname}${u.pathname.replace(/\/$/, '')}`.toLowerCase();
  } catch {
    // `new URL` rejects relative or malformed input; fall back to the raw text.
    return String(url ?? '').toLowerCase();
  }
}
61
/**
 * Token-based relevance score, capped at 1.
 *
 * Bucket (by where any query token appears):
 *   - title and snippet: 0.4
 *   - title only:        0.3
 *   - snippet only:      0.2
 *   - neither:           0
 * Additive boosts: +0.15 when the URL contains "wikipedia.org", +0.05 when it
 * contains "github.com".
 *
 * Final value: (0.1 + bucket + frequencyBonus) * maxEngineWeight, where
 * frequencyBonus = min(frequency * 0.1, 0.3) and maxEngineWeight is the
 * largest weight among the result's engines (0.5 for unknown engines).
 *
 * @param {object} result - Result with `title`, `url`, optional `snippet`,
 *   `engines` and `source`.
 * @param {string[]} tokens - Lowercased query tokens.
 * @param {Object<string, number>} weights - Weight lookup keyed by engine id.
 * @param {number} frequency - Occurrence count for this result.
 * @returns {number} Score; 0.3 when there are no tokens.
 */
function calculateScore(result, tokens, weights, frequency) {
  if (tokens.length === 0) {
    return 0.3;
  }

  // Providers may hand over null/undefined fields; treat them as empty text.
  const titleLower = (result.title || '').toLowerCase();
  const bodyLower = (result.snippet || '').toLowerCase();
  const url = result.url || '';

  const hasTitle = tokens.some((t) => titleLower.includes(t));
  const hasBody = tokens.some((t) => bodyLower.includes(t));

  // Bucket classification
  let bucketScore = 0;
  if (hasTitle && hasBody) {
    bucketScore = 0.4;
  } else if (hasTitle) {
    bucketScore = 0.3;
  } else if (hasBody) {
    bucketScore = 0.2;
  }

  // Site boosts (plain substring checks on the URL)
  if (url.includes('wikipedia.org')) {
    bucketScore += 0.15;
  }
  if (url.includes('github.com')) {
    bucketScore += 0.05;
  }

  // Frequency bonus, capped at 0.3
  const freqBonus = Math.min(frequency * 0.1, 0.3);

  // An empty engine list must fall back to the source as well: Math.max()
  // with no arguments is -Infinity and would poison the score.
  const engineIds = result.engines?.length ? result.engines : [result.source];
  const maxWeight = Math.max(...engineIds.map((e) => weights[e] || 0.5));

  const score = (0.1 + bucketScore + freqBonus) * maxWeight;
  return Math.min(score, 1.0);
}
package/dist/cli.js ADDED
@@ -0,0 +1,169 @@
1
+ import { searchWithFallback } from './tools/free-search.js';
2
+ import { createHttpServer } from './infrastructure/http.js';
3
+ import { loadConfig } from './infrastructure/config.js';
4
const VALID_COMMANDS = ['search', 'extract', 'serve'];
const VALID_ENGINES = ['duckduckgo', 'sogou', 'bing', 'baidu', 'brave', 'tavily', 'exa'];

/**
 * Parse a process-style argv array into a CLI options object.
 *
 * The first two entries (runtime and script path) are skipped. An empty
 * argument list or `--help` yields `{ command, help: true }`; `--version`
 * yields `{ command, version: true }`. Otherwise the first argument selects
 * the command when it is one of VALID_COMMANDS; anything else is treated as
 * the query of an implicit `search`.
 *
 * Option handling:
 *   - `--count <n>`   kept only when it is an integer >= 1, capped at 50
 *                     (the range advertised by the help text)
 *   - `--engines <l>` comma-separated; entries are trimmed, lowercased,
 *                     de-duplicated and filtered against VALID_ENGINES
 *   - `--port <n>`    kept only when it is an integer in 1..65535
 *   - `--json`        sets `json: true`
 *   - `--proxy <url>` stored verbatim
 * The first bare argument becomes `query` (search) or `url` (extract);
 * further bare arguments are ignored.
 *
 * @param {string[]} argv - Full argv including the two leading entries.
 * @returns {object} Parsed options; keys are present only when supplied and valid.
 */
export function parseArgs(argv) {
  const args = argv.slice(2); // skip node and script path
  const result = { command: 'search' };

  if (args.length === 0 || args.includes('--help')) {
    result.help = true;
    return result;
  }
  if (args.includes('--version')) {
    result.version = true;
    return result;
  }

  let i = 0;
  if (VALID_COMMANDS.includes(args[0])) {
    result.command = args[0];
    i = 1;
  }

  for (; i < args.length; i++) {
    const arg = args[i];
    const hasValue = Boolean(args[i + 1]);

    if (arg === '--count' && hasValue) {
      const count = Number.parseInt(args[++i], 10);
      // Reject NaN / zero / negatives so callers fall back to their default.
      if (Number.isInteger(count) && count >= 1) {
        result.count = Math.min(count, 50);
      }
    } else if (arg === '--engines' && hasValue) {
      const requested = args[++i].split(',').map((e) => e.trim().toLowerCase());
      result.engines = [...new Set(requested)].filter((e) => VALID_ENGINES.includes(e));
    } else if (arg === '--port' && hasValue) {
      const port = Number.parseInt(args[++i], 10);
      if (Number.isInteger(port) && port >= 1 && port <= 65535) {
        result.port = port;
      }
    } else if (arg === '--json') {
      result.json = true;
    } else if (arg === '--proxy' && hasValue) {
      result.proxy = args[++i];
    } else if (!arg.startsWith('--')) {
      // Positional arg: only the first one is used.
      if (result.command === 'search' && !result.query) {
        result.query = arg;
      } else if (result.command === 'extract' && !result.url) {
        result.url = arg;
      }
    }
  }
  return result;
}
59
+ function showHelp() {
60
+ console.log(`
61
+ free-agent-search-mcp CLI v2.1.0
62
+
63
+ Usage:
64
+ fasm search <query> [options] Search the web
65
+ fasm extract <url> [options] Extract page content
66
+ fasm serve [options] Start HTTP server
67
+ fasm --help Show this help
68
+ fasm --version Show version
69
+
70
+ Search Options:
71
+ --count <n> Number of results (1-50, default: 10)
72
+ --engines <list> Comma-separated engines (duckduckgo,sogou,bing,baidu,brave,tavily,exa)
73
+ --json Output as JSON
74
+ --proxy <url> HTTP proxy URL (e.g., http://127.0.0.1:7890)
75
+
76
+ Extract Options:
77
+ --json Output as JSON
78
+ --proxy <url> HTTP proxy URL
79
+
80
+ Serve Options:
81
+ --port <n> HTTP port (default: 3000)
82
+
83
+ Examples:
84
+ fasm search "TypeScript MCP server"
85
+ fasm search "query" --count 5 --engines bing,baidu
86
+ fasm extract "https://example.com" --json
87
+ fasm serve --port 8080
88
+ fasm search "query" --proxy http://127.0.0.1:7890
89
+ `);
90
+ }
91
/**
 * CLI entry point: parse process.argv and run the requested command.
 *
 *   - help / version: print and exit with status 0
 *   - search:  call searchWithFallback and print JSON or a plain listing
 *   - extract: fetch the page through r.jina.ai and print its text
 *   - serve:   start the HTTP server from loadConfig(), with `--port` taking
 *              precedence over the configured port
 *
 * Missing required arguments print an error and exit with status 1.
 *
 * @returns {Promise<void>}
 * @throws {Error} When the extract request returns a non-2xx status; other
 *   failures from the underlying calls propagate unchanged.
 */
async function main() {
  const args = parseArgs(process.argv);

  if (args.help) {
    showHelp();
    process.exit(0);
  }
  if (args.version) {
    console.log('free-agent-search-mcp v2.1.0');
    process.exit(0);
  }

  // Expose the proxy through the conventional environment variables.
  // NOTE(review): whether the fetch-based engines honor these variables
  // depends on how the runtime's fetch is configured - confirm.
  if (args.proxy) {
    process.env.HTTP_PROXY = args.proxy;
    process.env.HTTPS_PROXY = args.proxy;
  }

  if (args.command === 'search') {
    if (!args.query) {
      console.error('Error: search command requires a query');
      process.exit(1);
    }
    // An `--engines` list whose entries were all rejected arrives as an empty
    // array; treat it like "not specified" rather than searching no engines.
    const engines = args.engines?.length ? args.engines : ['duckduckgo', 'sogou'];
    const results = await searchWithFallback({
      query: args.query,
      count: args.count || 10,
      engines,
    });
    if (args.json) {
      console.log(JSON.stringify(results, null, 2));
    } else {
      console.log(`\nSearch: "${results.query}"`);
      console.log(`Engines: ${results.engines.join(', ')}`);
      console.log(`Results: ${results.meta.total}\n`);
      for (const r of results.results) {
        console.log(` ${r.title}`);
        console.log(` ${r.url}`);
        console.log(` ${r.snippet}`);
        console.log();
      }
    }
  } else if (args.command === 'extract') {
    if (!args.url) {
      console.error('Error: extract command requires a URL');
      process.exit(1);
    }
    const res = await fetch(`https://r.jina.ai/${args.url}`, {
      headers: { 'Accept': 'text/markdown' },
      signal: AbortSignal.timeout(10000),
    });
    // Without this check an error page would be printed as if it were the
    // extracted content.
    if (!res.ok) {
      throw new Error(`Extract failed: HTTP ${res.status}`);
    }
    const content = await res.text();
    if (args.json) {
      console.log(JSON.stringify({ url: args.url, content }, null, 2));
    } else {
      console.log(content);
    }
  } else if (args.command === 'serve') {
    const config = loadConfig();
    const port = args.port || config.port;
    const server = createHttpServer({
      port,
      enableCors: config.enableCors,
      corsOrigin: config.corsOrigin,
    });
    await server.listen();
    console.log(`Server running on http://localhost:${port}`);
    console.log('Press Ctrl+C to stop');
  }
}
161
/**
 * Decide whether a script path (process.argv[1]) points at this CLI file.
 * Backslashes are normalized first so Windows paths such as
 * `C:\pkg\dist\cli.js` are recognized as well.
 *
 * @param {string|undefined} scriptPath - Path of the executed script.
 * @returns {boolean} True when the path ends in `/cli.js` or `/cli.ts`.
 */
function isCliEntryPath(scriptPath) {
  if (!scriptPath) {
    return false;
  }
  const normalized = scriptPath.replace(/\\/g, '/');
  return normalized.endsWith('/cli.js') || normalized.endsWith('/cli.ts');
}

// Run main only when executed directly (not when imported)
const isMainModule = isCliEntryPath(process.argv[1]);
if (isMainModule) {
  main().catch((error) => {
    console.error('Error:', error?.message ?? error);
    process.exit(1);
  });
}
@@ -0,0 +1,56 @@
1
+ import { decodeHTMLTags } from '../infrastructure/html-utils.js';
2
/** Static descriptor for the Baidu engine. */
export const baiduProvider = {
  id: 'baidu',
  name: 'Baidu',
  isFree: true,
  languages: ['zh'],
};

/**
 * Fetch a Baidu results page and hand its HTML to parseBaiduResults.
 * Never throws: HTTP errors, timeouts (10 s) and any other failure are
 * logged with console.error and produce an empty array.
 *
 * @param {string} query - Search text.
 * @param {number} [limit=10] - Requested result count (sent as `rn`).
 * @returns {Promise<Array<object>>} Parsed results, or [] on failure.
 */
export async function searchBaidu(query, limit = 10) {
  try {
    const requestUrl = `https://www.baidu.com/s?wd=${encodeURIComponent(query)}&rn=${limit}`;
    const browserHeaders = {
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
      'Accept': 'text/html,application/xhtml+xml',
      'Accept-Language': 'zh-CN,zh;q=0.9',
    };
    const response = await fetch(requestUrl, {
      headers: browserHeaders,
      signal: AbortSignal.timeout(10000),
    });
    if (response.ok) {
      const page = await response.text();
      return parseBaiduResults(page, limit);
    }
    console.error(`Baidu: HTTP ${response.status}`);
    return [];
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    const timedOut = reason.includes('timeout');
    if (timedOut) {
      console.error('Baidu: Search timed out');
    } else {
      console.error('Baidu search failed:', reason.slice(0, 200));
    }
    return [];
  }
}
37
/**
 * Extract results from Baidu HTML using an `<h3><a href>` pattern.
 *
 * Links whose URL contains "baidu.com" are skipped. `&amp;` in the href is
 * decoded, because attribute values are HTML-escaped and the raw text would
 * otherwise yield a URL with broken query parameters. Snippets are not
 * extracted and are always empty.
 *
 * @param {string} html - Results page markup.
 * @param {number} [limit=Infinity] - Maximum number of results to return.
 * @returns {Array<{title: string, url: string, snippet: string, source: string, engines: string[]}>}
 */
function parseBaiduResults(html, limit = Infinity) {
  const results = [];
  // Simple h3 > a pattern; the anchor must directly follow the heading tag.
  const simpleRegex = /<h3[^>]*><a[^>]*href="([^"]+)"[^>]*>([\s\S]*?)<\/a><\/h3>/g;
  let match;
  while ((match = simpleRegex.exec(html)) && results.length < limit) {
    const url = match[1].replace(/&amp;/g, '&');
    const title = decodeHTMLTags(match[2]);
    // NOTE(review): this also drops any result link routed through a
    // baidu.com address - confirm that is intended.
    if (url && title && !url.includes('baidu.com')) {
      results.push({
        title,
        url,
        snippet: '',
        source: 'baidu',
        engines: ['baidu'],
      });
    }
  }
  return results;
}
@@ -0,0 +1,58 @@
1
+ import { decodeHTMLTags } from '../infrastructure/html-utils.js';
2
/** Static descriptor for the Bing engine. */
export const bingProvider = {
  id: 'bing',
  name: 'Bing',
  isFree: true,
  languages: ['en', 'zh'],
};

/**
 * Fetch a Bing results page and hand its HTML to parseBingResults.
 * Never throws: HTTP errors, timeouts (10 s) and any other failure are
 * logged with console.error and produce an empty array.
 *
 * @param {string} query - Search text.
 * @param {number} [limit=10] - Requested result count (sent as `count`).
 * @returns {Promise<Array<object>>} Parsed results, or [] on failure.
 */
export async function searchBing(query, limit = 10) {
  try {
    const requestUrl = `https://www.bing.com/search?q=${encodeURIComponent(query)}&count=${limit}`;
    const browserHeaders = {
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
      'Accept': 'text/html,application/xhtml+xml',
      'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8',
    };
    const response = await fetch(requestUrl, {
      headers: browserHeaders,
      signal: AbortSignal.timeout(10000),
    });
    if (response.ok) {
      const page = await response.text();
      return parseBingResults(page, limit);
    }
    console.error(`Bing: HTTP ${response.status}`);
    return [];
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    const timedOut = reason.includes('timeout');
    if (timedOut) {
      console.error('Bing: Search timed out');
    } else {
      console.error('Bing search failed:', reason.slice(0, 200));
    }
    return [];
  }
}
37
/**
 * Extract results from Bing HTML.
 *
 * Expected shape per result:
 *   <li class="b_algo"> ... <h2><a href="URL">TITLE</a></h2> ... <p>SNIPPET</p>
 * `&amp;` in the href is decoded, because attribute values are HTML-escaped
 * and the raw text would otherwise yield a URL with broken query parameters.
 *
 * @param {string} html - Results page markup.
 * @param {number} [limit=Infinity] - Maximum number of results to return.
 * @returns {Array<{title: string, url: string, snippet: string, source: string, engines: string[]}>}
 */
function parseBingResults(html, limit = Infinity) {
  const results = [];
  const resultRegex = /<li class="b_algo">[\s\S]*?<h2><a href="([^"]+)"[^>]*>([\s\S]*?)<\/a><\/h2>[\s\S]*?<p[^>]*>([\s\S]*?)<\/p>/g;
  let match;
  while ((match = resultRegex.exec(html)) && results.length < limit) {
    const url = match[1].replace(/&amp;/g, '&');
    const title = decodeHTMLTags(match[2]);
    const snippet = decodeHTMLTags(match[3]);
    if (url && title) {
      results.push({
        title,
        url,
        snippet: snippet || '',
        source: 'bing',
        engines: ['bing'],
      });
    }
  }
  return results;
}
@@ -0,0 +1,33 @@
1
/**
 * Brave Search API provider. Requires the BRAVE_API_KEY environment variable.
 */
export class BraveProvider {
  id = 'brave';
  name = 'Brave Search';
  isFree = false;
  languages = ['en', 'zh'];

  /**
   * Query the Brave web search endpoint.
   *
   * @param {string} query - Search text.
   * @param {number} [count=10] - Desired result count. Clamped to 1..20: the
   *   web search endpoint documents 20 as the maximum, while callers in this
   *   package may ask for more.
   * @returns {Promise<Array<{title: string, url: string, snippet: string, source: string, engines: string[]}>>}
   *   Mapped results; [] when no API key is configured.
   * @throws {Error} When the API answers with a non-2xx status, or when the
   *   request fails or times out (5 s).
   */
  async search(query, count = 10) {
    const apiKey = process.env.BRAVE_API_KEY;
    if (!apiKey) {
      return [];
    }

    // Guard against undefined / NaN / out-of-range counts, which would
    // otherwise be sent verbatim (e.g. "undefined").
    const requested = Number.parseInt(count, 10);
    const safeCount = Number.isNaN(requested) ? 10 : Math.min(Math.max(requested, 1), 20);

    const url = new URL('https://api.search.brave.com/res/v1/web/search');
    url.searchParams.set('q', query);
    url.searchParams.set('count', String(safeCount));

    const res = await fetch(url.toString(), {
      headers: {
        'Accept': 'application/json',
        'Accept-Encoding': 'gzip',
        'X-Subscription-Token': apiKey,
      },
      signal: AbortSignal.timeout(5000),
    });
    if (!res.ok) {
      throw new Error(`Brave returned ${res.status}`);
    }

    const data = await res.json();
    return (data.web?.results || []).map((r) => ({
      title: r.title || '',
      url: r.url || '',
      snippet: r.description || '',
      source: 'brave',
      engines: ['brave'],
    }));
  }
}

/** Shared provider instance. */
export const braveProvider = new BraveProvider();
@@ -0,0 +1,47 @@
1
+ import { execFileSync } from 'child_process';
2
+ import { resolve } from 'path';
3
+ import { fileURLToPath } from 'url';
4
const __dirname = fileURLToPath(new URL('.', import.meta.url));
const SCRIPT_PATH = resolve(__dirname, '../../scripts/ddg-search.py');

/** Static descriptor for the DuckDuckGo engine. */
export const duckduckgoProvider = {
  id: 'duckduckgo',
  name: 'DuckDuckGo',
  isFree: true,
  languages: ['en'],
};

/**
 * Search DuckDuckGo by running the bundled Python helper script and parsing
 * the JSON it prints on stdout.
 *
 * Never throws: every failure is logged with console.error and yields [].
 * The script is run with execFileSync (arguments are passed as an array, so
 * the query is never shell-interpolated).
 *
 * TODO(review): execFileSync blocks the event loop for up to 15 s per call;
 * consider an asynchronous spawn if this runs inside the HTTP server.
 *
 * @param {string} query - Search text.
 * @param {number} [limit=10] - Result count passed to the script.
 * @returns {Promise<Array<{title: string, url: string, snippet: string, source: string, engines: string[]}>>}
 */
export async function searchDuckDuckGo(query, limit = 10) {
  try {
    const output = execFileSync('/usr/bin/python3', [SCRIPT_PATH, query, String(limit)], {
      timeout: 15000, // 15s timeout
      encoding: 'utf-8',
      stdio: ['pipe', 'pipe', 'pipe'],
    });
    const results = JSON.parse(output.trim());
    if (!Array.isArray(results)) {
      console.error('DDG search failed:', 'unexpected script output');
      return [];
    }
    return results.map((r) => ({
      title: r.title || '',
      url: r.url || '',
      snippet: r.snippet || '',
      source: r.source || 'duckduckgo',
      engines: ['duckduckgo'],
    }));
  } catch (error) {
    const msg = error instanceof Error ? error.message : String(error);
    // Classify by error code. The failure message always contains the
    // interpreter path, so matching on "python3" in the text would label
    // every failure (timeouts, script errors) as "Python3 not found".
    const code = error?.code;
    if (code === 'ENOENT') {
      console.error('DDG: Python3 not found, skipping');
    } else if (code === 'ETIMEDOUT' || msg.includes('timeout')) {
      console.error('DDG: Search timed out');
    } else {
      console.error('DDG search failed:', msg.slice(0, 200));
    }
    return [];
  }
}
@@ -0,0 +1,46 @@
1
/** Static descriptor for the Exa engine. */
export const exaProvider = {
  id: 'exa',
  name: 'Exa Search',
  isFree: false,
  languages: ['en', 'zh'],
};

/**
 * Query the Exa search API.
 *
 * Never throws: a missing key, HTTP error, timeout (15 s) or malformed
 * response is logged and yields [].
 *
 * @param {object} options
 * @param {string} options.query - Search text.
 * @param {number} [options.count=10] - Number of results to request.
 * @param {string} [options.apiKey] - Exa API key; without it no request is made.
 * @returns {Promise<Array<{title: string, url: string, snippet: string, source: string, engines: string[]}>>}
 *   `snippet` is the first highlight, else the first 200 characters of the
 *   text, else ''. `source` is "Exa" plus the author in parentheses when
 *   one is present.
 */
export async function searchExa(options) {
  const { query, count = 10, apiKey } = options;
  if (!apiKey) {
    console.warn('Exa: No API key provided');
    return [];
  }
  try {
    const response = await fetch('https://api.exa.ai/search', {
      method: 'POST',
      headers: {
        'x-api-key': apiKey,
        'Content-Type': 'application/json',
      },
      body: JSON.stringify({
        query,
        numResults: count,
        contents: {
          highlights: true,
        },
      }),
      signal: AbortSignal.timeout(15000),
    });
    if (!response.ok) {
      console.error(`Exa: HTTP ${response.status}`);
      return [];
    }
    const data = await response.json();
    const items = Array.isArray(data?.results) ? data.results : [];
    return items.map((result) => ({
      // Default to '' like the other providers do, so downstream code can
      // call string methods on these fields without null checks.
      title: result.title || '',
      url: result.url || '',
      snippet: result.highlights?.[0] || result.text?.substring(0, 200) || '',
      source: `Exa${result.author ? ` (${result.author})` : ''}`,
      engines: ['exa'],
    }));
  } catch (error) {
    console.error('Exa search failed:', error);
    return [];
  }
}
@@ -0,0 +1,25 @@
1
+ export { searchDuckDuckGo, duckduckgoProvider } from './duckduckgo.js';
2
+ export { searchSogou, sogouProvider } from './sogou.js';
3
+ export { searchBing, bingProvider } from './bing.js';
4
+ export { searchBaidu, baiduProvider } from './baidu.js';
5
+ export { braveProvider } from './brave.js';
6
+ export { tavilyProvider } from './tavily.js';
7
+ export { searchExa, exaProvider } from './exa.js';
8
/**
 * Registry of every engine with its metadata, keyed by engine id.
 * Entries flagged `isFree: false` (Brave, Tavily, Exa) require API keys.
 */
export const engines = {
  duckduckgo: { id: 'duckduckgo', name: 'DuckDuckGo', isFree: true, languages: ['en'] },
  sogou: { id: 'sogou', name: 'Sogou Search', isFree: true, languages: ['zh'] },
  bing: { id: 'bing', name: 'Bing', isFree: true, languages: ['en', 'zh'] },
  baidu: { id: 'baidu', name: 'Baidu', isFree: true, languages: ['zh'] },
  brave: { id: 'brave', name: 'Brave Search', isFree: false, languages: ['en', 'zh'] },
  tavily: { id: 'tavily', name: 'Tavily Search', isFree: false, languages: ['en', 'zh'] },
  exa: { id: 'exa', name: 'Exa Search', isFree: false, languages: ['en', 'zh'] },
};

/** Ids of the engines usable without API keys, in registry order. */
export const freeEngines = Object.values(engines)
  .filter((engine) => engine.isFree)
  .map((engine) => engine.id);

/** Ids of the engines that require API keys, in registry order. */
export const paidEngines = Object.values(engines)
  .filter((engine) => !engine.isFree)
  .map((engine) => engine.id);
@@ -0,0 +1,132 @@
1
const SOGOU_SEARCH_URL = 'https://www.sogou.com/web';
const USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';

/**
 * Parse Sogou search results HTML using regular expressions.
 *
 * Primary pass: for every result container (class containing `vrwrap`/`vr5`,
 * class containing the word `rb`, or an id containing `result`) take the
 * first link inside an h2/h3 as title and URL, then look for a snippet and a
 * source label. Fallback pass (only when the primary pass found nothing):
 * take the first link of any kind inside each `vrwrap` container.
 *
 * Hrefs are resolved against the Sogou search URL. When the link carries an
 * absolute http(s) target in a `url`, `u` (or, in the primary pass, `link`)
 * query parameter, that target is used instead. `&amp;` in the href is
 * decoded first; attribute values are HTML-escaped, and without decoding the
 * parameter after the first `&amp;` is not found under its real name.
 *
 * @param {string} html - Results page markup.
 * @returns {Array<{title: string, url: string, snippet: string, source: string, engines: string[]}>}
 *   Results in page order, de-duplicated by URL.
 */
function parseSogouHtml(html) {
  const results = [];
  const seenUrls = new Set();
  const stripTags = (text) => text.replace(/<[^>]+>/g, '').trim();
  const decodeHref = (href) => href.trim().replace(/&amp;/g, '&');

  const blockRegex = /<div[^>]*(?:class="[^"]*vr(?:wrap|5)[^"]*"|class="[^"]*\brb\b[^"]*"|id="[^"]*result[^"]*")[^>]*>([\s\S]*?)<\/div>\s*<\/div>/gis;
  let blockMatch;
  while ((blockMatch = blockRegex.exec(html)) !== null) {
    const block = blockMatch[1];

    // Title link: first anchor inside an h2/h3.
    const titleLinkRegex = /<h[23][^>]*>.*?<a[^>]*href="([^"]*)"[^>]*>([\s\S]*?)<\/a>/i;
    const titleMatch = block.match(titleLinkRegex);
    if (!titleMatch) {
      continue;
    }
    const rawUrl = decodeHref(titleMatch[1] || '');
    const title = stripTags(titleMatch[2] || '');
    if (!title || !rawUrl) {
      continue;
    }

    // Resolve the actual URL (Sogou wraps URLs in redirects).
    let url = rawUrl;
    try {
      const parsed = new URL(rawUrl, SOGOU_SEARCH_URL);
      const target = parsed.searchParams.get('url') || parsed.searchParams.get('u') || parsed.searchParams.get('link');
      if (target && /^https?:\/\//i.test(target)) {
        url = target;
      } else {
        url = parsed.toString();
      }
    } catch {
      // keep rawUrl
    }
    if (seenUrls.has(url)) {
      continue;
    }
    seenUrls.add(url);

    // Snippet: try the most specific pattern first.
    const descMatch = block.match(/<p[^>]*class="[^"]*str_info[^"]*"[^>]*>([\s\S]*?)<\/p>/i)
      || block.match(/<div[^>]*class="[^"]*str_info[^"]*"[^>]*>([\s\S]*?)<\/div>/i)
      || block.match(/class="[^"]*(?:str_info|ft|text-layout)[^"]*"[^>]*>([\s\S]*?)<\//i);
    const snippet = stripTags(descMatch?.[1] || '');

    // Source label: <cite>, then a class-based guess, then the hostname.
    const srcMatch = block.match(/<cite[^>]*>([\s\S]*?)<\/cite>/i)
      || block.match(/class="[^"]*(?:citeurl|g|url)[^"]*"[^>]*>([\s\S]*?)<\//i);
    let source = stripTags(srcMatch?.[1] || '');
    if (!source) {
      try {
        source = new URL(url).hostname;
      } catch {
        source = '';
      }
    }
    results.push({ title, url, snippet, source, engines: ['sogou'] });
  }

  // Fallback: broader extraction for different page layouts.
  if (results.length === 0) {
    const altBlockRegex = /<div[^>]*class="[^"]*vrwrap[^"]*"[^>]*>([\s\S]*?)<\/div>\s*<\/div>/gis;
    let altBlockMatch;
    while ((altBlockMatch = altBlockRegex.exec(html)) !== null) {
      const block = altBlockMatch[1];
      const aMatch = block.match(/<a[^>]*href="([^"]*)"[^>]*>([\s\S]*?)<\/a>/i);
      if (!aMatch) {
        continue;
      }
      const rawUrl = decodeHref(aMatch[1] || '');
      const title = stripTags(aMatch[2] || '');
      if (!title || !rawUrl || seenUrls.has(rawUrl)) {
        continue;
      }
      seenUrls.add(rawUrl);

      let url = rawUrl;
      try {
        const parsed = new URL(rawUrl, SOGOU_SEARCH_URL);
        const target = parsed.searchParams.get('url') || parsed.searchParams.get('u');
        if (target && /^https?:\/\//i.test(target)) {
          url = target;
        }
      } catch { /* keep rawUrl */ }

      const descMatch = block.match(/(?:str_info|ft)[^>]*>([\s\S]*?)<\//i);
      const snippet = stripTags(descMatch?.[1] || '');

      let source = '';
      try {
        source = new URL(url).hostname;
      } catch { /* ignore */ }
      results.push({ title, url, snippet, source, engines: ['sogou'] });
    }
  }
  return results;
}
95
/** Static descriptor for the Sogou engine. */
export const sogouProvider = {
  id: 'sogou',
  name: 'Sogou Search',
  isFree: true,
  languages: ['zh'],
};

/**
 * Fetch a Sogou results page and parse it with parseSogouHtml.
 *
 * Never throws: HTTP errors, timeouts, anti-bot challenge pages and any
 * other failure are logged and produce an empty array.
 *
 * @param {string} query - Search text.
 * @param {number} [limit=10] - Maximum number of results to return.
 * @returns {Promise<Array<object>>} At most `limit` parsed results.
 */
export async function searchSogou(query, limit = 10) {
  try {
    const url = new URL(SOGOU_SEARCH_URL);
    url.searchParams.set('query', query);
    url.searchParams.set('ie', 'utf8');
    const response = await fetch(url.toString(), {
      method: 'GET',
      headers: {
        'User-Agent': USER_AGENT,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'Referer': 'https://www.sogou.com/',
      },
      redirect: 'follow',
      // Bound the request like the other HTML engines do; without a signal a
      // stalled connection would keep the whole search waiting.
      signal: AbortSignal.timeout(10000),
    });
    if (!response.ok) {
      throw new Error(`Sogou returned status ${response.status}`);
    }
    const html = await response.text();
    // Check for anti-bot page
    if (html.toLowerCase().includes('antispider') || html.includes('请输入验证码') || html.includes('访问过于频繁')) {
      console.warn('Sogou returned an anti-bot challenge page');
      return [];
    }
    const results = parseSogouHtml(html);
    return results.slice(0, limit);
  } catch (error) {
    console.error('Sogou search failed:', error instanceof Error ? error.message : String(error));
    return [];
  }
}