agent-search-mcp 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +80 -0
- package/LICENSE +207 -0
- package/README.md +480 -0
- package/dist/aggregation/dedup.js +102 -0
- package/dist/aggregation/format.js +60 -0
- package/dist/aggregation/index.js +3 -0
- package/dist/aggregation/scorer.js +110 -0
- package/dist/cli.js +169 -0
- package/dist/engines/baidu.js +56 -0
- package/dist/engines/bing.js +58 -0
- package/dist/engines/brave.js +33 -0
- package/dist/engines/duckduckgo.js +47 -0
- package/dist/engines/exa.js +46 -0
- package/dist/engines/index.js +25 -0
- package/dist/engines/sogou.js +132 -0
- package/dist/engines/tavily.js +33 -0
- package/dist/index.js +46 -0
- package/dist/infrastructure/cache.js +24 -0
- package/dist/infrastructure/config.js +18 -0
- package/dist/infrastructure/health.js +86 -0
- package/dist/infrastructure/html-utils.js +10 -0
- package/dist/infrastructure/http.js +66 -0
- package/dist/infrastructure/index.js +9 -0
- package/dist/infrastructure/logger.js +9 -0
- package/dist/infrastructure/rate-limiter.js +12 -0
- package/dist/infrastructure/security.js +158 -0
- package/dist/infrastructure/url-validator.js +33 -0
- package/dist/tools/capabilities.js +35 -0
- package/dist/tools/fetch-tools.js +200 -0
- package/dist/tools/free-extract.js +43 -0
- package/dist/tools/free-search-advanced.js +40 -0
- package/dist/tools/free-search.js +380 -0
- package/dist/tools/health.js +9 -0
- package/dist/types.js +1 -0
- package/package.json +68 -0
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
/**
 * Scores search results against a query and returns them ranked.
 *
 * Every result is shallow-copied and annotated with:
 *  - `confidence`: value from calculateWeightedConfidence (engine-weight signal)
 *  - `score`: value from calculateScore (token-match / frequency signal)
 * The returned array is ordered by confidence descending, ties broken by
 * score descending. The input array is not mutated.
 *
 * @param {Array<object>} results - Results carrying at least a `url`.
 * @param {string} query - Raw user query; a missing query is treated as empty.
 * @param {Object<string, number>} [weights={}] - Per-engine weights keyed by engine id.
 * @param {Map<string, number>} [frequencies] - Optional map from normalized URL to occurrence count.
 * @returns {Array<object>} New array of annotated results, best first.
 */
export function scoreAndRank(results, query, weights = {}, frequencies) {
    // Split on anything that is not a Unicode letter, mark, digit or underscore.
    // The ASCII-only /\W+/ dropped every non-Latin character, so a Chinese
    // query produced no tokens at all. ASCII tokens keep the 3-character
    // minimum; tokens containing non-ASCII characters are kept at any length
    // because a single CJK word is often one or two characters long.
    const tokens = String(query ?? '')
        .toLowerCase()
        .split(/[^\p{L}\p{M}\p{N}_]+/u)
        .filter(t => t.length >= 3 || /[^\x00-\x7F]/.test(t));
    // Normalizer handed to calculateWeightedConfidence.
    // NOTE(review): it scales with the token count, which makes confidence
    // depend on query length - kept as-is, confirm this is intended.
    const maxWeightSum = Math.max(...Object.values(weights), 0.5) * Math.max(tokens.length, 1);
    return results
        .map(r => {
            const normalizedUrl = normalizeUrl(r.url);
            // Unknown URLs count as having been seen once.
            const freq = frequencies?.get(normalizedUrl) || 1;
            return {
                ...r,
                confidence: calculateWeightedConfidence(r, weights, maxWeightSum),
                score: calculateScore(r, tokens, weights, freq),
            };
        })
        .sort((a, b) => {
            // 1. Primary: confidence (weighted quality signal)
            if (b.confidence !== a.confidence)
                return b.confidence - a.confidence;
            // 2. Secondary: score
            return b.score - a.score;
        });
}
|
|
27
|
+
/**
 * Derives a confidence value (capped at 1.0) from the weights of the engines
 * that returned a result.
 *
 * With no engine list, half of the `result.source` weight is used (0.5 when
 * the source has no configured weight). Otherwise the summed engine weights
 * are divided by `maxWeightSum * engineCount`, capped at 1, and a bonus of
 * 0.1 per engine (at most 0.3) is added before the final cap.
 *
 * @param {object} result - Result with optional `engines` array and `source`.
 * @param {Object<string, number>} weights - Per-engine weights; missing or zero entries count as 0.5.
 * @param {number} maxWeightSum - Normalizer supplied by the caller.
 * @returns {number} Confidence value, at most 1.0.
 */
function calculateWeightedConfidence(result, weights, maxWeightSum) {
    const reportingEngines = result.engines || [];
    const engineCount = reportingEngines.length;
    if (engineCount === 0) {
        // No engine info: fall back to the source weight, halved because the
        // origin of the result is less certain.
        return (weights[result.source] || 0.5) * 0.5;
    }
    // Add up the weight of every engine that returned this result.
    let total = 0;
    for (const engineId of reportingEngines) {
        total += weights[engineId] || 0.5;
    }
    const ratio = Math.min(total / (maxWeightSum * engineCount), 1.0);
    // More engines still help, with diminishing returns.
    const bonus = Math.min(engineCount * 0.1, 0.3);
    return Math.min(ratio + bonus, 1.0);
}
|
|
52
|
+
/**
 * Reduces a URL to a lowercase "hostname + path" key.
 *
 * Scheme, port, query string and fragment are discarded and one trailing
 * slash is removed from the path. Input that does not parse as a URL is
 * returned lowercased as-is.
 *
 * @param {string} url - Absolute URL, or any string.
 * @returns {string} Lowercased comparison key.
 */
function normalizeUrl(url) {
    let parsed;
    try {
        parsed = new URL(url);
    }
    catch {
        // Not a parseable URL: use the raw text as the key.
        return url.toLowerCase();
    }
    const path = parsed.pathname.endsWith('/')
        ? parsed.pathname.slice(0, -1)
        : parsed.pathname;
    return (parsed.hostname + path).toLowerCase();
}
|
|
61
|
+
/**
 * Token-based scoring inspired by ddgs SimpleFilterRanker.
 *
 * Bucket (by where query tokens appear as substrings):
 *  - title and snippet: 0.4
 *  - title only:        0.3
 *  - snippet only:      0.2
 *  - neither:           0
 * Additive boosts: +0.15 when the URL contains "wikipedia.org",
 * +0.05 when it contains "github.com".
 *
 * Final score = (0.1 + bucket + frequency bonus) * highest engine weight,
 * capped at 1.0. The frequency bonus is 0.1 per occurrence, capped at 0.3.
 *
 * @param {object} result - Result with `title`, `snippet`, `url`, and `engines` and/or `source`.
 * @param {string[]} tokens - Lowercased query tokens.
 * @param {Object<string, number>} weights - Per-engine weights; missing or zero entries count as 0.5.
 * @param {number} frequency - Number of times the URL was seen.
 * @returns {number} Score, at most 1.0; a flat 0.3 when there are no tokens.
 */
function calculateScore(result, tokens, weights, frequency) {
    if (tokens.length === 0)
        return 0.3;
    // Engines may hand back null/undefined fields; treat them as empty text
    // instead of throwing on `.toLowerCase()` / `.includes()`.
    const titleLower = (result.title || '').toLowerCase();
    const bodyLower = (result.snippet || '').toLowerCase();
    const url = result.url || '';
    const hasTitle = tokens.some(t => titleLower.includes(t));
    const hasBody = tokens.some(t => bodyLower.includes(t));
    // Bucket classification
    let bucketScore = 0;
    if (hasTitle && hasBody) {
        bucketScore = 0.4; // Both match
    }
    else if (hasTitle) {
        bucketScore = 0.3; // Title only
    }
    else if (hasBody) {
        bucketScore = 0.2; // Body only
    }
    // Wikipedia boost
    if (url.includes('wikipedia.org')) {
        bucketScore += 0.15;
    }
    // GitHub boost (high quality for code queries)
    if (url.includes('github.com')) {
        bucketScore += 0.05;
    }
    // Frequency bonus (from ddgs: more engines = more trustworthy)
    const freqBonus = Math.min(frequency * 0.1, 0.3); // Cap at 0.3
    // Engine weight. An empty `engines` array must also fall back to the
    // source, otherwise Math.max() of nothing is -Infinity and so is the score.
    const engineIds = result.engines?.length ? result.engines : [result.source];
    const maxWeight = Math.max(...engineIds.map(e => weights[e] || 0.5));
    // Final score: base + bucket + frequency, then apply weight
    let score = 0.1 + bucketScore + freqBonus;
    score *= maxWeight;
    return Math.min(score, 1.0);
}
|
package/dist/cli.js
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
import { searchWithFallback } from './tools/free-search.js';
|
|
2
|
+
import { createHttpServer } from './infrastructure/http.js';
|
|
3
|
+
import { loadConfig } from './infrastructure/config.js';
|
|
4
|
+
const VALID_COMMANDS = ['search', 'extract', 'serve'];
|
|
5
|
+
const VALID_ENGINES = ['duckduckgo', 'sogou', 'bing', 'baidu', 'brave', 'tavily', 'exa'];
|
|
6
|
+
/**
 * Parses a process-style argv array into CLI options.
 *
 * The first argument may be a command (`search`, `extract`, `serve`);
 * otherwise the command defaults to `search` and the first positional
 * argument is the query. `--help` and `--version` anywhere short-circuit
 * everything else, as does an empty argument list (treated as help).
 *
 * Validation:
 *  - `--count` keeps only integers >= 1 and caps them at 50 (the documented
 *    range); anything else is ignored so the caller's default applies.
 *  - `--port` keeps only integers in 1..65535.
 *  - `--engines` entries are trimmed, lower-cased, de-duplicated and filtered
 *    against VALID_ENGINES; if nothing valid remains the option is left unset
 *    so the caller's default engines are used instead of an empty list.
 *
 * @param {string[]} argv - Full argv, including the node binary and script path.
 * @returns {{command: string, help?: boolean, version?: boolean, query?: string,
 *   url?: string, count?: number, engines?: string[], port?: number,
 *   json?: boolean, proxy?: string}} Parsed options.
 */
export function parseArgs(argv) {
    const args = argv.slice(2); // skip node and script path
    const result = { command: 'search' };
    if (args.length === 0 || args.includes('--help')) {
        result.help = true;
        return result;
    }
    if (args.includes('--version')) {
        result.version = true;
        return result;
    }
    let i = 0;
    // First arg is either a command or (for the default search) the query.
    if (VALID_COMMANDS.includes(args[0])) {
        result.command = args[0];
        i = 1;
    }
    for (; i < args.length; i++) {
        const arg = args[i];
        const value = args[i + 1];
        if (arg === '--count' && value) {
            i++;
            const count = Number.parseInt(value, 10);
            if (Number.isInteger(count) && count >= 1) {
                result.count = Math.min(count, 50);
            }
        }
        else if (arg === '--engines' && value) {
            i++;
            const requested = value.split(',').map((e) => e.trim().toLowerCase());
            const valid = [...new Set(requested)].filter((e) => VALID_ENGINES.includes(e));
            if (valid.length > 0) {
                result.engines = valid;
            }
        }
        else if (arg === '--port' && value) {
            i++;
            const port = Number.parseInt(value, 10);
            if (Number.isInteger(port) && port >= 1 && port <= 65535) {
                result.port = port;
            }
        }
        else if (arg === '--json') {
            result.json = true;
        }
        else if (arg === '--proxy' && value) {
            i++;
            result.proxy = value;
        }
        else if (!arg.startsWith('--')) {
            // Positional arg: only the first one per command is used.
            if (result.command === 'search' && !result.query) {
                result.query = arg;
            }
            else if (result.command === 'extract' && !result.url) {
                result.url = arg;
            }
        }
    }
    return result;
}
|
|
59
|
+
/**
 * Prints the CLI usage text to stdout.
 *
 * The text is emitted as a single console.log call and starts and ends with
 * an empty line.
 */
function showHelp() {
    const helpLines = [
        '',
        'free-agent-search-mcp CLI v2.1.0',
        '',
        'Usage:',
        'fasm search <query> [options] Search the web',
        'fasm extract <url> [options] Extract page content',
        'fasm serve [options] Start HTTP server',
        'fasm --help Show this help',
        'fasm --version Show version',
        '',
        'Search Options:',
        '--count <n> Number of results (1-50, default: 10)',
        '--engines <list> Comma-separated engines (duckduckgo,sogou,bing,baidu,brave,tavily,exa)',
        '--json Output as JSON',
        '--proxy <url> HTTP proxy URL (e.g., http://127.0.0.1:7890)',
        '',
        'Extract Options:',
        '--json Output as JSON',
        '--proxy <url> HTTP proxy URL',
        '',
        'Serve Options:',
        '--port <n> HTTP port (default: 3000)',
        '',
        'Examples:',
        'fasm search "TypeScript MCP server"',
        'fasm search "query" --count 5 --engines bing,baidu',
        'fasm extract "https://example.com" --json',
        'fasm serve --port 8080',
        'fasm search "query" --proxy http://127.0.0.1:7890',
        '',
    ];
    console.log(helpLines.join('\n'));
}
|
|
91
|
+
/**
 * CLI entry point: parses process.argv and runs the requested command.
 *
 *  - `--help` / `--version`: print and exit with code 0.
 *  - `search`: runs searchWithFallback and prints JSON or a plain listing.
 *  - `extract`: fetches the page through r.jina.ai and prints its content.
 *  - `serve`: starts the HTTP server from createHttpServer.
 *
 * Missing required arguments print an error and exit with code 1.
 *
 * @returns {Promise<void>}
 * @throws {Error} When the extract request returns a non-2xx status; other
 *   failures from the search, fetch or server calls propagate to the caller.
 */
async function main() {
    const args = parseArgs(process.argv);
    if (args.help) {
        showHelp();
        process.exit(0);
    }
    if (args.version) {
        console.log('free-agent-search-mcp v2.1.0');
        process.exit(0);
    }
    // Set proxy if provided.
    // NOTE(review): this only exports the environment variables; whether the
    // HTTP client used by the engines honours them is not visible here.
    if (args.proxy) {
        process.env.HTTP_PROXY = args.proxy;
        process.env.HTTPS_PROXY = args.proxy;
    }
    if (args.command === 'search') {
        if (!args.query) {
            console.error('Error: search command requires a query');
            process.exit(1);
        }
        const results = await searchWithFallback({
            query: args.query,
            count: args.count || 10,
            engines: args.engines || ['duckduckgo', 'sogou'],
        });
        if (args.json) {
            console.log(JSON.stringify(results, null, 2));
        }
        else {
            console.log(`\nSearch: "${results.query}"`);
            console.log(`Engines: ${results.engines.join(', ')}`);
            console.log(`Results: ${results.meta.total}\n`);
            for (const r of results.results) {
                console.log(` ${r.title}`);
                console.log(` ${r.url}`);
                console.log(` ${r.snippet}`);
                console.log();
            }
        }
    }
    else if (args.command === 'extract') {
        if (!args.url) {
            console.error('Error: extract command requires a URL');
            process.exit(1);
        }
        const res = await fetch(`https://r.jina.ai/${args.url}`, {
            headers: { 'Accept': 'text/markdown' },
            signal: AbortSignal.timeout(10000),
        });
        // Without this check an HTTP error page would be printed as if it
        // were the extracted content and the process would exit successfully.
        if (!res.ok) {
            throw new Error(`Extract failed: HTTP ${res.status}`);
        }
        const content = await res.text();
        if (args.json) {
            console.log(JSON.stringify({ url: args.url, content }, null, 2));
        }
        else {
            console.log(content);
        }
    }
    else if (args.command === 'serve') {
        const config = loadConfig();
        const port = args.port || config.port;
        const server = createHttpServer({
            port,
            enableCors: config.enableCors,
            corsOrigin: config.corsOrigin,
        });
        await server.listen();
        console.log(`Server running on http://localhost:${port}`);
        console.log('Press Ctrl+C to stop');
    }
}
|
|
161
|
+
// Run main only when executed directly (not when imported).
// The script path is matched with either path separator so the check also
// works on Windows, where process.argv[1] uses backslashes.
const isMainModule = Boolean(process.argv[1]) && /[\\/]cli\.(?:js|ts)$/.test(process.argv[1]);
if (isMainModule) {
    main().catch((error) => {
        // A rejection is not guaranteed to be an Error instance.
        console.error('Error:', error instanceof Error ? error.message : String(error));
        process.exit(1);
    });
}
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import { decodeHTMLTags } from '../infrastructure/html-utils.js';
|
|
2
|
+
/** Static metadata describing the Baidu engine (free, Chinese-language). */
export const baiduProvider = { id: 'baidu', name: 'Baidu', isFree: true, languages: ['zh'] };
|
|
8
|
+
/**
 * Queries Baidu's HTML results page and returns the parsed results.
 *
 * Best-effort: HTTP errors, timeouts (10 s) and any other failure are logged
 * to stderr and produce an empty array instead of throwing.
 *
 * @param {string} query - Search terms.
 * @param {number} [limit=10] - Maximum number of results to request and return.
 * @returns {Promise<Array<object>>} Parsed results, or [] on failure.
 */
export async function searchBaidu(query, limit = 10) {
    try {
        const requestUrl = `https://www.baidu.com/s?wd=${encodeURIComponent(query)}&rn=${limit}`;
        const browserHeaders = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml',
            'Accept-Language': 'zh-CN,zh;q=0.9',
        };
        const res = await fetch(requestUrl, {
            headers: browserHeaders,
            signal: AbortSignal.timeout(10000),
        });
        if (res.ok) {
            const html = await res.text();
            return parseBaiduResults(html, limit);
        }
        console.error(`Baidu: HTTP ${res.status}`);
        return [];
    }
    catch (error) {
        const msg = error instanceof Error ? error.message : String(error);
        const timedOut = msg.includes('timeout');
        if (timedOut) {
            console.error('Baidu: Search timed out');
        }
        else {
            // Keep log lines bounded.
            console.error('Baidu search failed:', msg.slice(0, 200));
        }
        return [];
    }
}
|
|
37
|
+
/**
 * Extracts results from a Baidu results page by matching `<h3><a href>` pairs.
 *
 * Links whose URL contains "baidu.com" are skipped, and snippets are always
 * empty because only the heading is parsed.
 * NOTE(review): if Baidu serves organic results as baidu.com redirect links,
 * this filter would drop them - confirm against live markup.
 *
 * @param {string} html - Raw results page.
 * @param {number} limit - Maximum number of results to return.
 * @returns {Array<object>} Results tagged with source/engine "baidu".
 */
function parseBaiduResults(html, limit) {
    const collected = [];
    const headingLinks = html.matchAll(/<h3[^>]*><a[^>]*href="([^"]+)"[^>]*>([\s\S]*?)<\/a><\/h3>/g);
    for (const [, url, rawTitle] of headingLinks) {
        if (collected.length >= limit) {
            break;
        }
        const title = decodeHTMLTags(rawTitle);
        if (!url || !title || url.includes('baidu.com')) {
            continue;
        }
        collected.push({
            title,
            url,
            snippet: '',
            source: 'baidu',
            engines: ['baidu'],
        });
    }
    return collected;
}
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
import { decodeHTMLTags } from '../infrastructure/html-utils.js';
|
|
2
|
+
/** Static metadata describing the Bing engine (free, English and Chinese). */
export const bingProvider = { id: 'bing', name: 'Bing', isFree: true, languages: ['en', 'zh'] };
|
|
8
|
+
/**
 * Queries Bing's HTML results page and returns the parsed results.
 *
 * Best-effort: HTTP errors, timeouts (10 s) and any other failure are logged
 * to stderr and produce an empty array instead of throwing.
 *
 * @param {string} query - Search terms.
 * @param {number} [limit=10] - Maximum number of results to request and return.
 * @returns {Promise<Array<object>>} Parsed results, or [] on failure.
 */
export async function searchBing(query, limit = 10) {
    try {
        const requestUrl = `https://www.bing.com/search?q=${encodeURIComponent(query)}&count=${limit}`;
        const browserHeaders = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml',
            'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8',
        };
        const res = await fetch(requestUrl, {
            headers: browserHeaders,
            signal: AbortSignal.timeout(10000),
        });
        if (res.ok) {
            const html = await res.text();
            return parseBingResults(html, limit);
        }
        console.error(`Bing: HTTP ${res.status}`);
        return [];
    }
    catch (error) {
        const msg = error instanceof Error ? error.message : String(error);
        const timedOut = msg.includes('timeout');
        if (timedOut) {
            console.error('Bing: Search timed out');
        }
        else {
            // Keep log lines bounded.
            console.error('Bing search failed:', msg.slice(0, 200));
        }
        return [];
    }
}
|
|
37
|
+
/**
 * Extracts results from a Bing results page.
 *
 * Expected shape per result:
 *   <li class="b_algo"> ... <h2><a href="URL">TITLE</a></h2> ... <p>SNIPPET</p>
 * Entries lacking a URL or a non-empty decoded title are skipped.
 *
 * @param {string} html - Raw results page.
 * @param {number} limit - Maximum number of results to return.
 * @returns {Array<object>} Results tagged with source/engine "bing".
 */
function parseBingResults(html, limit) {
    const collected = [];
    const entries = html.matchAll(/<li class="b_algo">[\s\S]*?<h2><a href="([^"]+)"[^>]*>([\s\S]*?)<\/a><\/h2>[\s\S]*?<p[^>]*>([\s\S]*?)<\/p>/g);
    for (const [, url, rawTitle, rawSnippet] of entries) {
        if (collected.length >= limit) {
            break;
        }
        const title = decodeHTMLTags(rawTitle);
        const snippet = decodeHTMLTags(rawSnippet);
        if (!url || !title) {
            continue;
        }
        collected.push({
            title,
            url,
            snippet: snippet || '',
            source: 'bing',
            engines: ['bing'],
        });
    }
    return collected;
}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
/**
 * Brave Search API provider (paid engine; reads BRAVE_API_KEY from the
 * environment on every call).
 */
export class BraveProvider {
    id = 'brave';
    name = 'Brave Search';
    isFree = false;
    languages = ['en', 'zh'];
    /**
     * Runs a web search against the Brave Search API.
     *
     * @param {string} query - Search terms.
     * @param {number} [count=10] - Number of results to request. Defaults to
     *   10 like the other engines; previously an omitted value was sent to
     *   the API as the literal string "undefined".
     * @returns {Promise<Array<object>>} Results tagged with source/engine
     *   "brave"; an empty array when no API key is configured.
     * @throws {Error} When the API answers with a non-2xx status. Network
     *   and timeout (5 s) errors from fetch propagate as well.
     */
    async search(query, count = 10) {
        const apiKey = process.env.BRAVE_API_KEY;
        if (!apiKey)
            return [];
        const url = new URL('https://api.search.brave.com/res/v1/web/search');
        url.searchParams.set('q', query);
        url.searchParams.set('count', String(count));
        const res = await fetch(url.toString(), {
            headers: {
                'Accept': 'application/json',
                'Accept-Encoding': 'gzip',
                'X-Subscription-Token': apiKey,
            },
            signal: AbortSignal.timeout(5000),
        });
        if (!res.ok)
            throw new Error(`Brave returned ${res.status}`);
        const data = await res.json();
        // Missing fields are normalised to empty strings.
        return (data.web?.results || []).map((r) => ({
            title: r.title || '',
            url: r.url || '',
            snippet: r.description || '',
            source: 'brave',
            engines: ['brave'],
        }));
    }
}
/** Shared provider instance. */
export const braveProvider = new BraveProvider();
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
import { execFileSync } from 'child_process';
|
|
2
|
+
import { resolve } from 'path';
|
|
3
|
+
import { fileURLToPath } from 'url';
|
|
4
|
+
const __dirname = fileURLToPath(new URL('.', import.meta.url));
|
|
5
|
+
const SCRIPT_PATH = resolve(__dirname, '../../scripts/ddg-search.py');
|
|
6
|
+
/** Static metadata describing the DuckDuckGo engine (free, English). */
export const duckduckgoProvider = { id: 'duckduckgo', name: 'DuckDuckGo', isFree: true, languages: ['en'] };
|
|
12
|
+
/**
 * Searches DuckDuckGo by running the bundled ddgs Python helper script.
 *
 * The script is invoked as `/usr/bin/python3 SCRIPT_PATH <query> <limit>` and
 * is expected to print a JSON array of `{title, url, snippet, source}`.
 * Best-effort: any failure is logged to stderr and yields an empty array.
 *
 * NOTE(review): execFileSync blocks the event loop for up to the 15 s
 * timeout even though this function is async; kept as-is here.
 *
 * @param {string} query - Search terms, passed as a process argument (no shell).
 * @param {number} [limit=10] - Maximum number of results requested from the script.
 * @returns {Promise<Array<object>>} Results tagged with engine "duckduckgo", or [].
 */
export async function searchDuckDuckGo(query, limit = 10) {
    try {
        // Use execFileSync to avoid shell injection (query passed as argument, not shell-interpolated)
        const output = execFileSync('/usr/bin/python3', [SCRIPT_PATH, query, String(limit)], {
            timeout: 15000, // 15s timeout
            encoding: 'utf-8',
            stdio: ['pipe', 'pipe', 'pipe'],
        });
        const results = JSON.parse(output.trim());
        if (!Array.isArray(results)) {
            console.error('DDG search failed:', 'unexpected output from helper script');
            return [];
        }
        return results.map((r) => ({
            title: r.title || '',
            url: r.url || '',
            snippet: r.snippet || '',
            source: r.source || 'duckduckgo',
            engines: ['duckduckgo'],
        }));
    }
    catch (error) {
        const msg = error instanceof Error ? error.message : String(error);
        // Classify by error code, not by message text: the message always
        // contains the command path ("/usr/bin/python3"), so substring checks
        // reported every failure as "Python3 not found".
        const code = error?.code;
        if (code === 'ENOENT') {
            console.error('DDG: Python3 not found, skipping');
        }
        else if (code === 'ETIMEDOUT') {
            console.error('DDG: Search timed out');
        }
        else {
            console.error('DDG search failed:', msg.slice(0, 200));
        }
        return [];
    }
}
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
/** Static metadata describing the Exa engine (paid, English and Chinese). */
export const exaProvider = { id: 'exa', name: 'Exa Search', isFree: false, languages: ['en', 'zh'] };
|
|
7
|
+
/**
 * Searches via the Exa API.
 *
 * Best-effort: a missing API key, HTTP errors, timeouts (15 s) and any other
 * failure are logged and produce an empty array instead of throwing.
 *
 * @param {{query: string, count?: number, apiKey?: string}} options
 *   `count` defaults to 10; without `apiKey` no request is made.
 * @returns {Promise<Array<object>>} Results tagged with engine "exa". `source`
 *   is "Exa", or "Exa (<author>)" when the API reports an author.
 */
export async function searchExa(options) {
    const { query, count = 10, apiKey } = options;
    if (!apiKey) {
        console.warn('Exa: No API key provided');
        return [];
    }
    try {
        const response = await fetch('https://api.exa.ai/search', {
            method: 'POST',
            headers: {
                'x-api-key': apiKey,
                'Content-Type': 'application/json',
            },
            body: JSON.stringify({
                query,
                numResults: count,
                contents: {
                    highlights: true,
                },
            }),
            signal: AbortSignal.timeout(15000),
        });
        if (!response.ok) {
            console.error(`Exa: HTTP ${response.status}`);
            return [];
        }
        const data = await response.json();
        // A payload without a `results` array is an empty result set, not an error.
        const items = Array.isArray(data?.results) ? data.results : [];
        return items.map((result) => ({
            // Normalise missing fields to '' like the other engines do, so
            // downstream string handling never sees null.
            title: result.title || '',
            url: result.url || '',
            snippet: result.highlights?.[0] || result.text?.substring(0, 200) || '',
            source: `Exa${result.author ? ` (${result.author})` : ''}`,
            engines: ['exa'],
        }));
    }
    catch (error) {
        console.error('Exa search failed:', error);
        return [];
    }
}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
export { searchDuckDuckGo, duckduckgoProvider } from './duckduckgo.js';
|
|
2
|
+
export { searchSogou, sogouProvider } from './sogou.js';
|
|
3
|
+
export { searchBing, bingProvider } from './bing.js';
|
|
4
|
+
export { searchBaidu, baiduProvider } from './baidu.js';
|
|
5
|
+
export { braveProvider } from './brave.js';
|
|
6
|
+
export { tavilyProvider } from './tavily.js';
|
|
7
|
+
export { searchExa, exaProvider } from './exa.js';
|
|
8
|
+
/** Builds one engine metadata record. */
const defineEngine = (id, name, isFree, languages) => ({ id, name, isFree, languages });
/**
 * Metadata for every registered engine, keyed by engine id.
 * Free engines: DuckDuckGo, Sogou, Bing, Baidu.
 * Paid engines (require API keys): Brave, Tavily, Exa.
 */
export const engines = {
    duckduckgo: defineEngine('duckduckgo', 'DuckDuckGo', true, ['en']),
    sogou: defineEngine('sogou', 'Sogou Search', true, ['zh']),
    bing: defineEngine('bing', 'Bing', true, ['en', 'zh']),
    baidu: defineEngine('baidu', 'Baidu', true, ['zh']),
    brave: defineEngine('brave', 'Brave Search', false, ['en', 'zh']),
    tavily: defineEngine('tavily', 'Tavily Search', false, ['en', 'zh']),
    exa: defineEngine('exa', 'Exa Search', false, ['en', 'zh']),
};
/** Ids of the engines that need no API key, in registration order. */
export const freeEngines = Object.values(engines)
    .filter((engine) => engine.isFree)
    .map((engine) => engine.id);
/** Ids of the engines that require an API key, in registration order. */
export const paidEngines = Object.values(engines)
    .filter((engine) => !engine.isFree)
    .map((engine) => engine.id);
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
const SOGOU_SEARCH_URL = 'https://www.sogou.com/web';
|
|
2
|
+
const USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';
|
|
3
|
+
/**
 * Extracts result entries from a Sogou results page using regular expressions.
 *
 * Primary pass: every block whose wrapper has a `vrwrap`/`vr5`/`rb` class or
 * a `result` id must contain an <h2>/<h3> link. Redirect links are unwrapped
 * through their `url`, `u` or `link` query parameter when that holds an
 * http(s) URL; otherwise the link is resolved against the Sogou base URL.
 * Entries are de-duplicated on the final URL.
 *
 * Fallback pass (only when the primary pass found nothing): the first <a> of
 * each `vrwrap` block is taken, de-duplicated on the raw href, and unwrapped
 * through `url`/`u` only.
 *
 * @param {string} html - Raw results page.
 * @returns {Array<{title: string, url: string, snippet: string, source: string, engines: string[]}>}
 */
function parseSogouHtml(html) {
    // Drops markup from a fragment; undefined/empty input yields ''.
    const stripTags = (fragment) => fragment?.replace(/<[^>]+>/g, '').trim() || '';
    // Hostname of a link, or '' when it does not parse as an absolute URL.
    const hostnameOf = (link) => {
        try {
            return new URL(link).hostname;
        }
        catch {
            return '';
        }
    };
    const collected = [];
    const seenUrls = new Set();
    // Sogou typically has: <div class="vrwrap"> or <div class="rb"> containing the results
    const primaryBlocks = html.matchAll(/<div[^>]*(?:class="[^"]*vr(?:wrap|5)[^"]*"|class="[^"]*\brb\b[^"]*"|id="[^"]*result[^"]*")[^>]*>([\s\S]*?)<\/div>\s*<\/div>/gis);
    for (const [, block] of primaryBlocks) {
        // Title link: an h3 or h2 containing an anchor.
        const heading = block.match(/<h[23][^>]*>.*?<a[^>]*href="([^"]*)"[^>]*>([\s\S]*?)<\/a>/i);
        if (!heading) {
            continue;
        }
        const rawUrl = heading[1]?.trim() || '';
        const title = stripTags(heading[2]);
        if (!title || !rawUrl) {
            continue;
        }
        // Resolve the actual URL (Sogou wraps URLs in redirects).
        let url = rawUrl;
        try {
            const parsed = new URL(rawUrl, SOGOU_SEARCH_URL);
            const params = parsed.searchParams;
            const target = params.get('url') || params.get('u') || params.get('link');
            url = target && /^https?:\/\//i.test(target) ? target : parsed.toString();
        }
        catch {
            // keep rawUrl
        }
        if (seenUrls.has(url)) {
            continue;
        }
        seenUrls.add(url);
        const description = block.match(/<p[^>]*class="[^"]*str_info[^"]*"[^>]*>([\s\S]*?)<\/p>/i)
            || block.match(/<div[^>]*class="[^"]*str_info[^"]*"[^>]*>([\s\S]*?)<\/div>/i)
            || block.match(/class="[^"]*(?:str_info|ft|text-layout)[^"]*"[^>]*>([\s\S]*?)<\//i);
        const citation = block.match(/<cite[^>]*>([\s\S]*?)<\/cite>/i)
            || block.match(/class="[^"]*(?:citeurl|g|url)[^"]*"[^>]*>([\s\S]*?)<\//i);
        collected.push({
            title,
            url,
            snippet: stripTags(description?.[1]),
            // Prefer the cited source text; fall back to the link's hostname.
            source: stripTags(citation?.[1]) || hostnameOf(url),
            engines: ['sogou'],
        });
    }
    if (collected.length > 0) {
        return collected;
    }
    // Fallback: broader extraction for different page layouts.
    const fallbackBlocks = html.matchAll(/<div[^>]*class="[^"]*vrwrap[^"]*"[^>]*>([\s\S]*?)<\/div>\s*<\/div>/gis);
    for (const [, block] of fallbackBlocks) {
        const anchor = block.match(/<a[^>]*href="([^"]*)"[^>]*>([\s\S]*?)<\/a>/i);
        if (!anchor) {
            continue;
        }
        const rawUrl = anchor[1]?.trim() || '';
        const title = stripTags(anchor[2]);
        if (!title || !rawUrl || seenUrls.has(rawUrl)) {
            continue;
        }
        seenUrls.add(rawUrl);
        let url = rawUrl;
        try {
            const params = new URL(rawUrl, SOGOU_SEARCH_URL).searchParams;
            const target = params.get('url') || params.get('u');
            if (target && /^https?:\/\//i.test(target)) {
                url = target;
            }
        }
        catch { /* keep rawUrl */ }
        const description = block.match(/(?:str_info|ft)[^>]*>([\s\S]*?)<\//i);
        collected.push({
            title,
            url,
            snippet: stripTags(description?.[1]),
            source: hostnameOf(url),
            engines: ['sogou'],
        });
    }
    return collected;
}
|
|
95
|
+
/** Static metadata describing the Sogou engine (free, Chinese-language). */
export const sogouProvider = { id: 'sogou', name: 'Sogou Search', isFree: true, languages: ['zh'] };
|
|
101
|
+
/**
 * Queries Sogou's HTML results page and returns the parsed results.
 *
 * Best-effort: HTTP errors, timeouts (10 s), anti-bot challenge pages and
 * any other failure are logged and produce an empty array instead of throwing.
 *
 * @param {string} query - Search terms.
 * @param {number} [limit=10] - Maximum number of results to return.
 * @returns {Promise<Array<object>>} At most `limit` parsed results, or [].
 */
export async function searchSogou(query, limit = 10) {
    try {
        const url = new URL(SOGOU_SEARCH_URL);
        url.searchParams.set('query', query);
        url.searchParams.set('ie', 'utf8');
        const response = await fetch(url.toString(), {
            method: 'GET',
            headers: {
                'User-Agent': USER_AGENT,
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
                'Referer': 'https://www.sogou.com/',
            },
            redirect: 'follow',
            // Same 10 s budget as the other scraped engines; without a signal
            // a stalled connection would keep the whole search pending.
            signal: AbortSignal.timeout(10000),
        });
        if (!response.ok) {
            throw new Error(`Sogou returned status ${response.status}`);
        }
        const html = await response.text();
        // Check for anti-bot page
        if (html.toLowerCase().includes('antispider') || html.includes('请输入验证码') || html.includes('访问过于频繁')) {
            console.warn('Sogou returned an anti-bot challenge page');
            return [];
        }
        const results = parseSogouHtml(html);
        return results.slice(0, limit);
    }
    catch (error) {
        console.error('Sogou search failed:', error instanceof Error ? error.message : String(error));
        return [];
    }
}
|