agent-search-mcp 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +80 -0
- package/LICENSE +207 -0
- package/README.md +480 -0
- package/dist/aggregation/dedup.js +102 -0
- package/dist/aggregation/format.js +60 -0
- package/dist/aggregation/index.js +3 -0
- package/dist/aggregation/scorer.js +110 -0
- package/dist/cli.js +169 -0
- package/dist/engines/baidu.js +56 -0
- package/dist/engines/bing.js +58 -0
- package/dist/engines/brave.js +33 -0
- package/dist/engines/duckduckgo.js +47 -0
- package/dist/engines/exa.js +46 -0
- package/dist/engines/index.js +25 -0
- package/dist/engines/sogou.js +132 -0
- package/dist/engines/tavily.js +33 -0
- package/dist/index.js +46 -0
- package/dist/infrastructure/cache.js +24 -0
- package/dist/infrastructure/config.js +18 -0
- package/dist/infrastructure/health.js +86 -0
- package/dist/infrastructure/html-utils.js +10 -0
- package/dist/infrastructure/http.js +66 -0
- package/dist/infrastructure/index.js +9 -0
- package/dist/infrastructure/logger.js +9 -0
- package/dist/infrastructure/rate-limiter.js +12 -0
- package/dist/infrastructure/security.js +158 -0
- package/dist/infrastructure/url-validator.js +33 -0
- package/dist/tools/capabilities.js +35 -0
- package/dist/tools/fetch-tools.js +200 -0
- package/dist/tools/free-extract.js +43 -0
- package/dist/tools/free-search-advanced.js +40 -0
- package/dist/tools/free-search.js +380 -0
- package/dist/tools/health.js +9 -0
- package/dist/types.js +1 -0
- package/package.json +68 -0
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
import { z } from 'zod';
|
|
2
|
+
import { logger } from '../infrastructure/index.js';
|
|
3
|
+
/**
 * Fetch the README of a GitHub repository from raw.githubusercontent.com.
 *
 * Each common README filename is tried on the `main` branch and then on the
 * `master` branch; the first candidate that answers with a 2xx status wins.
 *
 * @param {string} url - GitHub repository URL (e.g. https://github.com/owner/repo).
 * @returns {Promise<string>} A `# owner/repo` heading followed by the README text.
 * @throws {Error} 'Invalid GitHub URL' when owner/repo cannot be parsed from `url`;
 *   'README not found' when no candidate could be fetched.
 */
export async function fetchGithubReadme(url) {
    try {
        // Segments stop at `/`, `?` or `#` so a query string or fragment never
        // ends up inside the repository name.
        const githubMatch = url.match(/github\.com\/([^\/?#]+)\/([^\/?#]+)/);
        if (!githubMatch) {
            throw new Error('Invalid GitHub URL');
        }
        const [, owner, repo] = githubMatch;
        const cleanRepo = repo.replace(/\.git$/, '');
        const branches = ['main', 'master'];
        const readmeFiles = ['README.md', 'readme.md', 'Readme.md', 'README.MD', 'README'];
        for (const filename of readmeFiles) {
            for (const branch of branches) {
                const rawUrl = `https://raw.githubusercontent.com/${owner}/${cleanRepo}/${branch}/${filename}`;
                try {
                    const response = await fetch(rawUrl, {
                        signal: AbortSignal.timeout(10000),
                    });
                    if (response.ok) {
                        const content = await response.text();
                        return `# ${owner}/${cleanRepo}\n\n${content}`;
                    }
                    // A non-2xx answer (typically 404) falls through to the next
                    // branch/filename instead of ending the search.
                }
                catch {
                    // Best effort: a network error or timeout on one candidate
                    // must not prevent the remaining candidates from being tried.
                }
            }
        }
        throw new Error('README not found');
    }
    catch (error) {
        logger.error({ err: error instanceof Error ? error.message : String(error) }, 'Failed to fetch GitHub README');
        throw error;
    }
}
|
|
52
|
+
/**
 * Fetch a CSDN blog article and reduce its HTML to plain text.
 *
 * The URL must be http(s) on `csdn.net` or one of its subdomains; anything
 * else is rejected before a request is made, mirroring the URL checks done by
 * the GitHub and Juejin fetchers in this module.
 *
 * Extraction is regex based: the first `<article>` element is used, falling
 * back to the first `<div>` whose class contains "article". The `<div>`
 * fallback stops at the first `</div>`, so nested markup may be truncated.
 *
 * @param {string} url - CSDN article URL.
 * @returns {Promise<string>} Article text with tags removed and common entities decoded.
 * @throws {Error} 'Invalid CSDN URL', `HTTP <status>` or 'Article content not found'.
 */
export async function fetchCsdnArticle(url) {
    try {
        let target;
        try {
            target = new URL(url);
        }
        catch {
            throw new Error('Invalid CSDN URL');
        }
        const host = target.hostname.toLowerCase();
        const isCsdnHost = host === 'csdn.net' || host.endsWith('.csdn.net');
        const isHttp = target.protocol === 'http:' || target.protocol === 'https:';
        if (!isCsdnHost || !isHttp) {
            throw new Error('Invalid CSDN URL');
        }
        const response = await fetch(url, {
            headers: {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            },
            signal: AbortSignal.timeout(10000),
        });
        if (!response.ok) {
            throw new Error(`HTTP ${response.status}`);
        }
        const html = await response.text();
        // Simple extraction: find article content between common markers.
        const articleMatch = html.match(/<article[^>]*>([\s\S]*?)<\/article>/i) ||
            html.match(/<div[^>]*class="[^"]*article[^"]*"[^>]*>([\s\S]*?)<\/div>/i);
        if (!articleMatch) {
            throw new Error('Article content not found');
        }
        // Basic HTML to text conversion. Entities are decoded only after the
        // tags are gone so a decoded `<` is never mistaken for markup, and
        // `&amp;` is decoded last to avoid double-decoding (`&amp;lt;`).
        return articleMatch[1]
            .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
            .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
            .replace(/<[^>]+>/g, '\n')
            .replace(/&nbsp;/g, ' ')
            .replace(/&lt;/g, '<')
            .replace(/&gt;/g, '>')
            .replace(/&quot;/g, '"')
            .replace(/&#39;/g, "'")
            .replace(/&amp;/g, '&')
            .replace(/\n{3,}/g, '\n\n')
            .trim();
    }
    catch (error) {
        logger.error({ err: error instanceof Error ? error.message : String(error) }, 'Failed to fetch CSDN article');
        throw error;
    }
}
|
|
87
|
+
/**
 * Fetch a Juejin article through the Juejin content API.
 *
 * The numeric article id is taken from the `post/<digits>` part of `url` and
 * sent to `api.juejin.cn/content_api/v1/article/detail`.
 * NOTE(review): the request is a plain GET with the id in the query string —
 * confirm the endpoint accepts that.
 *
 * @param {string} url - Juejin article URL containing `post/<id>`.
 * @returns {Promise<string>} A `# <title>` heading followed by the article body.
 * @throws {Error} 'Invalid Juejin URL', `HTTP <status>`, or the API error message.
 */
export async function fetchJuejinArticle(url) {
    try {
        const [, articleId] = url.match(/post\/(\d+)/) ?? [];
        if (articleId === undefined) {
            throw new Error('Invalid Juejin URL');
        }
        const response = await fetch(`https://api.juejin.cn/content_api/v1/article/detail?article_id=${articleId}`, { signal: AbortSignal.timeout(10000) });
        if (!response.ok) {
            throw new Error(`HTTP ${response.status}`);
        }
        const payload = await response.json();
        // A non-zero err_no or a missing data object both count as failure.
        if (payload.err_no !== 0 || !payload.data) {
            throw new Error(payload.err_msg || 'Failed to fetch article');
        }
        const info = payload.data.article_info;
        // Markdown is preferred; the alternative `content` field is the fallback.
        const body = info?.markdown_content || info?.content || '';
        const title = info?.title || 'Juejin Article';
        return `# ${title}\n\n${body}`;
    }
    catch (error) {
        const detail = error instanceof Error ? error.message : String(error);
        logger.error({ err: detail }, 'Failed to fetch Juejin article');
        throw error;
    }
}
|
|
118
|
+
/**
 * Build an MCP tool handler around a `(url) => Promise<string>` fetcher.
 *
 * On success the fetched text is returned as a single text content item; on
 * failure the handler resolves (it never rejects) with an `isError` result
 * whose text is `<failureLabel>: <error message>`.
 *
 * @param {(url: string) => Promise<string>} fetcher - Function that fetches the content.
 * @param {string} failureLabel - Prefix for the error text.
 * @returns {(args: { url: string }) => Promise<object>} Tool handler.
 */
function makeFetchToolHandler(fetcher, failureLabel) {
    return async ({ url }) => {
        try {
            const text = await fetcher(url);
            return { content: [{ type: 'text', text }] };
        }
        catch (error) {
            const reason = error instanceof Error ? error.message : 'Unknown error';
            return {
                content: [{ type: 'text', text: `${failureLabel}: ${reason}` }],
                isError: true,
            };
        }
    };
}
/**
 * Register the site-specific fetch tools on an MCP server:
 * `fetch_github_readme`, `fetch_csdn_article` and `fetch_juejin_article`.
 * Each tool takes a single `url` argument validated as a URL string.
 *
 * @param {object} server - Server exposing `tool(name, description, schema, handler)`.
 */
export function setupFetchTools(server) {
    // GitHub README tool
    server.tool('fetch_github_readme', 'Fetch README content from a GitHub repository', {
        url: z.string().url('Must be a valid URL').describe('GitHub repository URL (e.g., https://github.com/owner/repo)'),
    }, makeFetchToolHandler(fetchGithubReadme, 'Failed to fetch GitHub README'));
    // CSDN article tool
    server.tool('fetch_csdn_article', 'Fetch content from a CSDN blog article', {
        url: z.string().url('Must be a valid URL').describe('CSDN article URL'),
    }, makeFetchToolHandler(fetchCsdnArticle, 'Failed to fetch CSDN article'));
    // Juejin article tool
    server.tool('fetch_juejin_article', 'Fetch content from a Juejin article', {
        url: z.string().url('Must be a valid URL').describe('Juejin article URL'),
    }, makeFetchToolHandler(fetchJuejinArticle, 'Failed to fetch Juejin article'));
}
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import { z } from 'zod';
|
|
2
|
+
import { validateUrl } from '../infrastructure/url-validator.js';
|
|
3
|
+
/**
 * Register the `free_extract` tool, which returns the content of a page as
 * markdown by requesting `https://r.jina.ai/<url>` (Jina Reader).
 *
 * The URL is checked with `validateUrl` before any request is made. The
 * handler never rejects: validation failures, non-2xx answers and thrown
 * errors are all reported as `isError` results.
 *
 * @param {object} server - Server exposing `tool(name, description, schema, handler)`.
 */
export function registerFreeExtract(server) {
    server.tool('free_extract', `Extract full content from a URL. Returns clean markdown.

Best for: Reading a specific page found in search results.
Not recommended for: Bulk extraction — use search first.`, {
        url: z.string().describe('URL to extract'),
        // Must be a positive integer: a negative or fractional value would make
        // `slice(0, max_length)` cut from the end or at an unexpected offset.
        max_length: z.number().int().min(1).optional().default(5000).describe('Max characters to return'),
    }, async ({ url, max_length }) => {
        // SSRF protection
        const validation = validateUrl(url);
        if (!validation.valid) {
            return {
                content: [{ type: 'text', text: `Error: ${validation.error}` }],
                isError: true,
            };
        }
        try {
            // Use Jina Reader
            const res = await fetch(`https://r.jina.ai/${url}`, {
                headers: { 'Accept': 'text/markdown' },
                signal: AbortSignal.timeout(10000),
            });
            if (!res.ok) {
                return {
                    content: [{ type: 'text', text: `Error: HTTP ${res.status}` }],
                    isError: true,
                };
            }
            const content = await res.text();
            return {
                content: [{ type: 'text', text: content.slice(0, max_length) }],
            };
        }
        catch (error) {
            return {
                content: [{ type: 'text', text: `Error: ${error instanceof Error ? error.message : String(error)}` }],
                isError: true,
            };
        }
    });
}
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
import { z } from 'zod';
|
|
2
|
+
import { searchWithFallback } from './free-search.js';
|
|
3
|
+
/**
 * Register the `free_search_advanced` tool: a search with confidence, language
 * and domain filters that delegates to `searchWithFallback` using a fixed list
 * of engines.
 *
 * The handler never rejects; a failed search is reported as an `isError`
 * result with the text `Search failed: <message>`.
 *
 * @param {object} server - Server exposing `tool(name, description, schema, handler)`.
 */
export function registerFreeSearchAdvanced(server) {
    server.tool('free_search_advanced', `Advanced search with filters and quality control.

Best for: Date ranges, domain filtering, high-confidence only, Chinese content.
Not recommended for: Simple queries — use free_search instead.`, {
        query: z.string().describe('Search query'),
        // Enforce the range the description promises.
        count: z.number().int().min(1).max(20).optional().default(5).describe('Number of results (1-20)'),
        min_confidence: z.number().min(1).max(3).optional().default(1)
            .describe('Only return results verified by N+ sources'),
        time_range: z.enum(['day', 'week', 'month', 'year']).optional()
            .describe('Filter by recency'),
        language: z.enum(['auto', 'en', 'zh']).optional().default('auto')
            .describe('Language preference'),
        include_domains: z.array(z.string()).optional()
            .describe('Only search these domains'),
        exclude_domains: z.array(z.string()).optional()
            .describe('Exclude these domains'),
    }, async (input) => {
        try {
            const results = await searchWithFallback({
                query: input.query,
                count: input.count,
                engines: ['duckduckgo', 'sogou', 'bing', 'baidu', 'brave', 'tavily'],
                minConfidence: input.min_confidence,
                language: input.language,
                // Forwarded rather than silently dropped.
                // NOTE(review): confirm that searchWithFallback actually applies
                // `timeRange`; until it does, the filter has no effect.
                timeRange: input.time_range,
                includeDomains: input.include_domains,
                excludeDomains: input.exclude_domains,
            });
            return { content: [{ type: 'text', text: JSON.stringify(results, null, 2) }] };
        }
        catch (error) {
            return {
                content: [{ type: 'text', text: `Search failed: ${error instanceof Error ? error.message : 'Unknown error'}` }],
                isError: true,
            };
        }
    });
}
|
|
@@ -0,0 +1,380 @@
|
|
|
1
|
+
import { z } from 'zod';
|
|
2
|
+
import { searchDuckDuckGo } from '../engines/duckduckgo.js';
|
|
3
|
+
import { searchSogou } from '../engines/sogou.js';
|
|
4
|
+
import { searchBing } from '../engines/bing.js';
|
|
5
|
+
import { searchBaidu } from '../engines/baidu.js';
|
|
6
|
+
import { BraveProvider } from '../engines/brave.js';
|
|
7
|
+
import { TavilyProvider } from '../engines/tavily.js';
|
|
8
|
+
import { searchExa } from '../engines/exa.js';
|
|
9
|
+
import { dedupByUrl, dedupByTitle, filterLowQuality, scoreAndRank, formatResults } from '../aggregation/index.js';
|
|
10
|
+
import { SearchCache, logger, HealthTracker, RateLimiter } from '../infrastructure/index.js';
|
|
11
|
+
// Engines queried in phase 1.
const FREE_ENGINES = ['duckduckgo', 'sogou', 'bing', 'baidu'];
// Engines queried in phase 2; availability is decided by hasApiKey().
const PAID_ENGINES = ['brave', 'tavily', 'exa'];
// Every engine name this module knows about: free engines first, then paid.
const SUPPORTED_ENGINES = [...FREE_ENGINES, ...PAID_ENGINES];
// Per-engine trust weights handed to scoreAndRank (higher = more trusted).
const ENGINE_WEIGHTS = {
    duckduckgo: 0.85,
    sogou: 0.8,
    bing: 0.9,
    baidu: 0.75,
    brave: 0.95,
    tavily: 0.9,
    exa: 0.92,
};
// Module-level infrastructure instances shared by every search in this module.
const cache = new SearchCache();
const healthTracker = new HealthTracker();
const rateLimiter = new RateLimiter();
// ─── Engine provider mapping (from ddgs pattern) ──────────────────────────
// Maps each engine to the backend that serves it. Every engine is its own
// provider except DuckDuckGo, which is recorded as using Bing, so provider
// dedup can avoid sending the same query to one backend twice.
const PROVIDER_MAP = {
    ...Object.fromEntries(SUPPORTED_ENGINES.map((name) => [name, name])),
    duckduckgo: 'bing',
};
|
|
39
|
+
/**
 * Drop engines whose backing provider has already been seen.
 *
 * The provider of an engine is looked up in PROVIDER_MAP; an engine missing
 * from the map counts as its own provider. The first engine for each provider
 * is kept and the input order is preserved.
 *
 * @param {string[]} engines - Requested engine names.
 * @returns {string[]} Engines with at most one entry per provider.
 */
function getUniqueProviders(engines) {
    const claimed = new Set();
    return [...engines].filter((name) => {
        const backend = PROVIDER_MAP[name] || name;
        if (claimed.has(backend)) {
            return false;
        }
        claimed.add(backend);
        return true;
    });
}
|
|
55
|
+
// One entry per supported engine; each entry issues the actual search call.
const ENGINE_RUNNERS = {
    duckduckgo: (query, limit) => searchDuckDuckGo(query, limit),
    sogou: (query, limit) => searchSogou(query, limit),
    bing: (query, limit) => searchBing(query, limit),
    baidu: (query, limit) => searchBaidu(query, limit),
    brave: (query, limit) => new BraveProvider().search(query, limit),
    tavily: (query, limit) => new TavilyProvider().search(query, limit),
    exa: (query, limit) => searchExa({ query, count: limit, apiKey: process.env.EXA_API_KEY }),
};
/**
 * Query one engine, guarded by the health tracker and the rate limiter, and
 * retry retryable failures with exponential backoff.
 *
 * This function does not propagate search errors: an unhealthy engine, an
 * unknown engine name or an exhausted/non-retryable failure all yield `[]`.
 *
 * @param {string} engine - Engine name.
 * @param {string} query - Search query.
 * @param {number} limit - Number of results to ask the engine for.
 * @param {number} [maxRetries=2] - Retries allowed after the first attempt.
 * @returns {Promise<Array>} The engine's results, or an empty array.
 */
async function searchEngine(engine, query, limit, maxRetries = 2) {
    if (!healthTracker.isHealthy(engine)) {
        logger.warn({ engine }, 'Skipping unhealthy provider');
        return [];
    }
    // The rate-limit slot is taken once, before the first attempt.
    await rateLimiter.waitForSlot(engine);
    const isKnown = typeof engine === 'string' && Object.hasOwn(ENGINE_RUNNERS, engine);
    if (!isKnown) {
        return [];
    }
    const run = ENGINE_RUNNERS[engine];
    for (let attempt = 0; attempt <= maxRetries; attempt += 1) {
        const startedAt = Date.now();
        try {
            const results = await run(query, limit);
            const latency = Date.now() - startedAt;
            healthTracker.recordSuccess(engine, latency);
            logger.info({ engine, latency, count: results.length, attempt }, 'Search completed');
            return results;
        }
        catch (err) {
            const failure = err instanceof Error ? err : new Error(String(err));
            const latency = Date.now() - startedAt;
            const retryable = isRetryableError(failure);
            if (!retryable || attempt >= maxRetries) {
                // Only the final outcome counts against the engine's health.
                healthTracker.recordFailure(engine);
                logger.error({ engine, latency, attempt, err: failure.message }, 'Search failed');
                return [];
            }
            // Backoff doubles from 500ms per attempt and is capped at 5000ms.
            const delay = Math.min(500 * 2 ** attempt, 5000);
            logger.warn({ engine, attempt, delay, err: failure.message }, 'Retryable error, retrying...');
            await new Promise((resolve) => setTimeout(resolve, delay));
        }
    }
    // Reached only when the loop body never runs (e.g. negative maxRetries).
    return [];
}
|
|
122
|
+
/**
 * Decide whether a failed search is worth retrying (network problem, timeout
 * or HTTP 5xx other than 501).
 *
 * Besides `err.message`, the error's `name` and `code` and the same fields of
 * `err.cause` are inspected. Node's fetch reports network failures as
 * `TypeError: fetch failed` with the underlying error (e.g. ECONNRESET) on
 * `cause`, and timeouts/aborts are identified by the error name, so looking at
 * the message alone would miss them.
 *
 * @param {Error} err - The error raised by an engine call.
 * @returns {boolean} True when a retry may succeed.
 */
function isRetryableError(err) {
    const cause = err.cause;
    const text = [err.name, err.message, err.code, cause?.name, cause?.message, cause?.code]
        .filter((part) => typeof part === 'string')
        .join(' ')
        .toLowerCase();
    // Network failures and timeouts/aborts.
    const transientMarkers = [
        'econnreset', 'econnrefused', 'etimedout', 'network', 'fetch failed',
        'timeout', 'abort',
    ];
    if (transientMarkers.some((marker) => text.includes(marker))) {
        return true;
    }
    // HTTP 5xx errors, except 501 Not Implemented which a retry cannot fix.
    return text.includes('http 5') && !text.includes('http 501');
}
|
|
142
|
+
/**
 * Tell whether an engine can be used with the current environment.
 *
 * Brave, Tavily and Exa need a non-empty BRAVE_API_KEY, TAVILY_API_KEY or
 * EXA_API_KEY respectively; every other engine name is reported as available.
 *
 * @param {string} engine - Engine name.
 * @returns {boolean} False only for a key-based engine whose key is unset or empty.
 */
function hasApiKey(engine) {
    const keyVariables = {
        brave: 'BRAVE_API_KEY',
        tavily: 'TAVILY_API_KEY',
        exa: 'EXA_API_KEY',
    };
    if (typeof engine !== 'string' || !Object.hasOwn(keyVariables, engine)) {
        return true; // no key required for this engine
    }
    return Boolean(process.env[keyVariables[engine]]);
}
|
|
157
|
+
// ─── Request collapsing ───────────────────────────────────────────────
// In-flight searches by collapse key, so identical concurrent calls share one
// execution instead of each hitting the engines.
const pendingRequests = new Map();
/**
 * Build the key that identifies equivalent search requests.
 *
 * Everything that can change the response is part of the key: query, count,
 * engines, minimum confidence, language and both domain filters. Lists are
 * sorted so their order does not matter, and the parts are JSON-encoded so a
 * separator character inside the query cannot make two different requests
 * produce the same key.
 *
 * @param {object} options - Search options as passed to searchWithFallback.
 * @returns {string} Collapse key.
 */
function makeCollapseKey(options) {
    const { query, count = 10, engines = [], minConfidence = 1, language, includeDomains, excludeDomains, } = options;
    const sorted = (list) => [...(list ?? [])].sort();
    return JSON.stringify([
        query,
        count,
        sorted(engines),
        minConfidence,
        language ?? null,
        sorted(includeDomains),
        sorted(excludeDomains),
    ]);
}
|
|
168
|
+
// ─── Core search logic (fused patterns from ddgs) ──────────────────────
|
|
169
|
+
/**
 * Public search entry point with request collapsing.
 *
 * When a request with the same collapse key is already in flight, its promise
 * is returned instead of starting a second search. Otherwise the search is
 * started via executeSearch, tracked in pendingRequests, and untracked again
 * once it settles (successfully or not).
 *
 * @param {object} options - Search options (query, count, engines, filters).
 * @returns {Promise<object>} The search response; rejects if the search fails.
 */
export async function searchWithFallback(options) {
    const collapseKey = makeCollapseKey(options);
    // Check if the same request is already in flight.
    const inFlight = pendingRequests.get(collapseKey);
    if (inFlight) {
        logger.info({ query: options.query }, 'Request collapsing: reusing pending request');
        return inFlight;
    }
    // The cleanup is part of the promise that is stored and returned. Hanging
    // `.finally()` off to the side would create a second promise that rejects
    // with nobody listening whenever the search fails, which Node reports as
    // an unhandled rejection.
    const searchPromise = executeSearch(options).finally(() => {
        pendingRequests.delete(collapseKey);
    });
    pendingRequests.set(collapseKey, searchPromise);
    return searchPromise;
}
|
|
195
|
+
/**
 * Report whether `hostname` is `domain` itself or one of its subdomains.
 * Matching is done on label boundaries, so `x.com` matches `x.com` and
 * `api.x.com` but not `netflix.com`.
 */
function hostMatchesDomain(hostname, domain) {
    const wanted = String(domain).trim().toLowerCase().replace(/^\.+/, '');
    if (wanted === '') {
        return false;
    }
    const host = hostname.toLowerCase();
    return host === wanted || host.endsWith(`.${wanted}`);
}
/**
 * Keep (`keepMatches` true) or drop (`keepMatches` false) the results whose
 * hostname belongs to one of `domains`. A result with an unparsable URL is
 * dropped by an include filter and kept by an exclude filter.
 */
function filterByDomains(results, domains, keepMatches) {
    return results.filter((result) => {
        let hostname;
        try {
            hostname = new URL(result.url).hostname;
        }
        catch {
            return !keepMatches;
        }
        const matched = domains.some((domain) => hostMatchesDomain(hostname, domain));
        return keepMatches ? matched : !matched;
    });
}
/**
 * Fold the outcomes of `Promise.allSettled` into the running result list and
 * failure list. `settled[i]` belongs to `engines[i]`, which lets a rejected
 * entry be attributed to the engine that caused it.
 */
function collectSettled(engines, settled, allResults, failures) {
    settled.forEach((outcome, index) => {
        if (outcome.status === 'fulfilled') {
            allResults.push(...outcome.value);
        }
        else {
            failures.push({
                engine: engines[index],
                message: outcome.reason?.message || 'Unknown error',
            });
        }
    });
}
/**
 * Execute the actual search logic (internal).
 *
 * Steps: cache lookup, provider dedup, batched free-engine search with early
 * exit, paid-engine fallback when fewer than `count` results were collected,
 * aggregation (quality filter, URL/title dedup, scoring), post-search filters,
 * formatting, and a background cache write.
 *
 * `options.language` is accepted by callers but is not used here.
 *
 * @param {object} options
 * @param {string} options.query - Search query.
 * @param {number} [options.count=10] - Desired number of results.
 * @param {string[]} [options.engines=['duckduckgo','sogou']] - Requested engines.
 * @param {number} [options.minConfidence=1] - Minimum `confidence` a result must have.
 * @param {string[]} [options.includeDomains] - Keep only results on these domains.
 * @param {string[]} [options.excludeDomains] - Drop results on these domains.
 * @returns {Promise<object>} `{ query, engines, ...formatted, partialFailures? }`.
 */
async function executeSearch(options) {
    const { query, count = 10, engines: userEngines = ['duckduckgo', 'sogou'], minConfidence = 1, includeDomains, excludeDomains, } = options;
    const hasIncludes = includeDomains?.length > 0;
    const hasExcludes = excludeDomains?.length > 0;
    // Check cache first. The cached value is the response after filtering, so
    // filtered requests get their own entry instead of sharing the unfiltered one.
    // NOTE(review): assumes SearchCache treats keys as opaque strings — confirm.
    const baseKey = cache.makeKey(query, count, userEngines);
    const isFiltered = minConfidence > 1 || hasIncludes || hasExcludes;
    const cacheKey = isFiltered
        ? `${baseKey}|${JSON.stringify([minConfidence, includeDomains ?? [], excludeDomains ?? []])}`
        : baseKey;
    const cached = cache.get(cacheKey);
    if (cached) {
        logger.info({ query, count, engines: userEngines }, 'Cache hit');
        return cached;
    }
    logger.info({ query, count, engines: userEngines }, 'Starting search');
    // ── Step 1: Provider dedup (from ddgs) ──────────────────────────────
    // Only search each provider once (e.g., DDG and Bing both use Bing backend)
    const uniqueEngines = getUniqueProviders(userEngines);
    logger.info({ engines: uniqueEngines }, 'After provider dedup');
    // ── Step 2: Determine which engines to search ───────────────────────
    // Phase 1: requested free engines first, then the remaining free engines.
    const freeToSearch = uniqueEngines.filter((e) => FREE_ENGINES.includes(e));
    const otherFree = FREE_ENGINES.filter((e) => !uniqueEngines.includes(e));
    const phase1Engines = [...freeToSearch, ...otherFree];
    // ── Step 3: Batch concurrency + early exit (from ddgs) ──────────────
    // Adaptive batch size based on count and engine count
    const batchSize = Math.max(2, Math.min(phase1Engines.length, Math.ceil(count / 10) + 1));
    const allResults = [];
    const failures = [];
    logger.info({ engines: phase1Engines }, 'Phase 1: free engines (batch)');
    for (let i = 0; i < phase1Engines.length; i += batchSize) {
        const batch = phase1Engines.slice(i, i + batchSize);
        const settled = await Promise.allSettled(batch.map((engine) => searchEngine(engine, query, count)));
        collectSettled(batch, settled, allResults, failures);
        // Early exit: stop if we have enough results
        if (allResults.length >= count * 1.5) {
            logger.info({ count: allResults.length }, 'Early exit: enough results');
            break;
        }
    }
    logger.info({ count: allResults.length }, 'Phase 1 results');
    // ── Step 4: Fallback to paid engines if not enough ───────────────────
    if (allResults.length < count) {
        const paidToSearch = uniqueEngines.filter((e) => PAID_ENGINES.includes(e) && hasApiKey(e));
        if (paidToSearch.length > 0) {
            const remaining = Math.max(count - allResults.length, 1);
            logger.info({ engines: paidToSearch, remaining }, 'Phase 2: paid engines');
            const settled = await Promise.allSettled(paidToSearch.map((engine) => searchEngine(engine, query, remaining)));
            collectSettled(paidToSearch, settled, allResults, failures);
            logger.info({ got: allResults.length }, 'Phase 2 results');
        }
        else {
            logger.info('Phase 2: no paid engines available');
        }
    }
    // ── Step 5: Aggregation layer (fused from ddgs + our patterns) ──────
    // 5a. Filter low-quality results (from ddgs)
    const filtered = filterLowQuality(allResults);
    // 5b. URL dedup with frequency counting
    const { results: urlDeduped, frequencies } = dedupByUrl(filtered);
    // 5c. Title dedup
    const titleDeduped = dedupByTitle(urlDeduped);
    // 5d. Score and rank with frequency bonus
    let scored = scoreAndRank(titleDeduped, query, ENGINE_WEIGHTS, frequencies);
    // ── Step 6: Post-search filters ─────────────────────────────────────
    if (minConfidence > 1) {
        scored = scored.filter((r) => r.confidence >= minConfidence);
    }
    if (hasIncludes) {
        scored = filterByDomains(scored, includeDomains, true);
    }
    if (hasExcludes) {
        scored = filterByDomains(scored, excludeDomains, false);
    }
    // ── Step 7: Format output with security processing ──────────────────
    const formatted = formatResults(scored);
    const response = {
        query,
        engines: userEngines,
        ...formatted,
        ...(failures.length > 0
            ? { partialFailures: failures }
            : {}),
    };
    // ── Step 8: Async cache write (from ddgs) ───────────────────────────
    // Don't block the response - write cache in background
    setImmediate(() => {
        try {
            cache.set(cacheKey, response);
            // Reads `meta.total` from the formatted output; if it is missing the
            // resulting error is caught below and logged as a cache write failure.
            logger.info({ total: response.meta.total }, 'Search complete');
        }
        catch (err) {
            logger.error({ err }, 'Cache write failed');
        }
    });
    return response;
}
|
|
336
|
+
// ─── Tool registration ──────────────────────────────────────────────────
|
|
337
|
+
// Export the health tracker instance so index.ts can use the same singleton
|
|
338
|
+
export { healthTracker };
|
|
339
|
+
/**
 * Register the `free_search` tool on an MCP server.
 *
 * The tool takes a query, a result limit (1-50, default 10) and a list of
 * engines (default DuckDuckGo + Sogou), runs searchWithFallback, and returns
 * the response as pretty-printed JSON. The handler never rejects: a failure is
 * logged and reported as an `isError` result.
 *
 * @param {object} server - Server exposing `tool(name, description, schema, handler)`.
 */
export function setupFreeSearchTool(server) {
    const description = [
        'Search the web with automatic fallback between free and paid engines.',
        'Phase 1: DuckDuckGo + Sogou + Bing + Baidu (free, no key required).',
        'Phase 2: Brave + Tavily + Exa (paid, requires BRAVE_API_KEY / TAVILY_API_KEY / EXA_API_KEY env vars).',
        'All results are deduplicated, scored, and ranked.',
        'Results include security metadata to protect against prompt injection.',
    ].join(' ');
    const inputSchema = {
        query: z.string().min(1, 'Search query must not be empty'),
        limit: z.number().int().min(1).max(50).default(10).describe('Number of results to return (1-50)'),
        engines: z.array(z.enum(['duckduckgo', 'sogou', 'bing', 'baidu', 'brave', 'tavily', 'exa']))
            .min(1)
            .default(['duckduckgo', 'sogou'])
            .describe('Search engines to use (default: all free engines)'),
    };
    const asTextResult = (text, extra = {}) => ({ content: [{ type: 'text', text }], ...extra });
    const runFreeSearch = async ({ query, limit = 10, engines: userEngines }) => {
        try {
            const results = await searchWithFallback({ query, count: limit, engines: userEngines });
            // Serialization stays inside the try so a failure there is also
            // reported as a tool error.
            return asTextResult(JSON.stringify(results, null, 2));
        }
        catch (error) {
            const isErrorObject = error instanceof Error;
            logger.error({ err: isErrorObject ? error.message : String(error) }, 'Search tool execution failed');
            return asTextResult(`Search failed: ${isErrorObject ? error.message : 'Unknown error'}`, { isError: true });
        }
    };
    server.tool('free_search', description, inputSchema, runFreeSearch);
}
|
package/dist/types.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|