@elizaos/plugin-research 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +400 -0
- package/dist/index.cjs +9366 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.js +9284 -0
- package/dist/index.js.map +1 -0
- package/package.json +80 -0
- package/src/__tests__/action-chaining.test.ts +532 -0
- package/src/__tests__/actions.test.ts +118 -0
- package/src/__tests__/cache-rate-limiter.test.ts +303 -0
- package/src/__tests__/content-extractors.test.ts +26 -0
- package/src/__tests__/deepresearch-bench-integration.test.ts +520 -0
- package/src/__tests__/deepresearch-bench-simplified.e2e.test.ts +290 -0
- package/src/__tests__/deepresearch-bench.e2e.test.ts +376 -0
- package/src/__tests__/e2e.test.ts +1870 -0
- package/src/__tests__/multi-benchmark-runner.ts +427 -0
- package/src/__tests__/providers.test.ts +156 -0
- package/src/__tests__/real-world.e2e.test.ts +788 -0
- package/src/__tests__/research-scenarios.test.ts +755 -0
- package/src/__tests__/research.e2e.test.ts +704 -0
- package/src/__tests__/research.test.ts +174 -0
- package/src/__tests__/search-providers.test.ts +174 -0
- package/src/__tests__/single-benchmark-runner.ts +735 -0
- package/src/__tests__/test-search-providers.ts +171 -0
- package/src/__tests__/verify-apis.test.ts +82 -0
- package/src/actions.ts +1677 -0
- package/src/benchmark/deepresearch-benchmark.ts +369 -0
- package/src/evaluation/research-evaluator.ts +444 -0
- package/src/examples/api-integration.md +498 -0
- package/src/examples/browserbase-integration.md +132 -0
- package/src/examples/debug-research-query.ts +162 -0
- package/src/examples/defi-code-scenarios.md +536 -0
- package/src/examples/defi-implementation-guide.md +454 -0
- package/src/examples/eliza-research-example.ts +142 -0
- package/src/examples/fix-renewable-energy-research.ts +209 -0
- package/src/examples/research-scenarios.md +408 -0
- package/src/examples/run-complete-renewable-research.ts +303 -0
- package/src/examples/run-deep-research.ts +352 -0
- package/src/examples/run-logged-research.ts +304 -0
- package/src/examples/run-real-research.ts +151 -0
- package/src/examples/save-research-output.ts +133 -0
- package/src/examples/test-file-logging.ts +199 -0
- package/src/examples/test-real-research.ts +67 -0
- package/src/examples/test-renewable-energy-research.ts +229 -0
- package/src/index.ts +28 -0
- package/src/integrations/cache.ts +128 -0
- package/src/integrations/content-extractors/firecrawl.ts +314 -0
- package/src/integrations/content-extractors/pdf-extractor.ts +350 -0
- package/src/integrations/content-extractors/playwright.ts +420 -0
- package/src/integrations/factory.ts +419 -0
- package/src/integrations/index.ts +18 -0
- package/src/integrations/rate-limiter.ts +181 -0
- package/src/integrations/search-providers/academic.ts +290 -0
- package/src/integrations/search-providers/exa.ts +205 -0
- package/src/integrations/search-providers/npm.ts +330 -0
- package/src/integrations/search-providers/pypi.ts +211 -0
- package/src/integrations/search-providers/serpapi.ts +277 -0
- package/src/integrations/search-providers/serper.ts +358 -0
- package/src/integrations/search-providers/stagehand-google.ts +87 -0
- package/src/integrations/search-providers/tavily.ts +187 -0
- package/src/processing/relevance-analyzer.ts +353 -0
- package/src/processing/research-logger.ts +450 -0
- package/src/processing/result-processor.ts +372 -0
- package/src/prompts/research-prompts.ts +419 -0
- package/src/providers/cacheProvider.ts +164 -0
- package/src/providers.ts +173 -0
- package/src/service.ts +2588 -0
- package/src/services/swe-bench.ts +286 -0
- package/src/strategies/research-strategies.ts +790 -0
- package/src/types/pdf-parse.d.ts +34 -0
- package/src/types.ts +551 -0
- package/src/verification/claim-verifier.ts +443 -0
|
@@ -0,0 +1,419 @@
|
|
|
1
|
+
import { IAgentRuntime, elizaLogger } from '@elizaos/core';
|
|
2
|
+
import { SearchProvider, ContentExtractor } from './rate-limiter';
|
|
3
|
+
import { TavilySearchProvider } from './search-providers/tavily';
|
|
4
|
+
import { SerperSearchProvider } from './search-providers/serper';
|
|
5
|
+
import { AcademicSearchProvider } from './search-providers/academic';
|
|
6
|
+
import { FirecrawlContentExtractor, FirecrawlConfig } from './content-extractors/firecrawl';
|
|
7
|
+
import { PlaywrightContentExtractor } from './content-extractors/playwright';
|
|
8
|
+
import { CachedSearchProvider } from './cache';
|
|
9
|
+
import { RateLimitedSearchProvider } from './rate-limiter';
|
|
10
|
+
import { ExaSearchProvider } from './search-providers/exa';
|
|
11
|
+
import { SerpAPISearchProvider } from './search-providers/serpapi';
|
|
12
|
+
import { StagehandGoogleSearchProvider } from './search-providers/stagehand-google';
|
|
13
|
+
import { PyPISearchProvider } from './search-providers/pypi';
|
|
14
|
+
import { NPMSearchProvider } from './search-providers/npm';
|
|
15
|
+
|
|
16
|
+
export type { SearchProvider, ContentExtractor };
|
|
17
|
+
|
|
18
|
+
// Wrapper to make FirecrawlContentExtractor compatible with ContentExtractor interface
|
|
19
|
+
/**
 * Adapts FirecrawlContentExtractor to the ContentExtractor interface.
 *
 * The underlying extractor may resolve to a falsy value; this adapter
 * substitutes an empty result so callers always receive an object.
 */
class FirecrawlWrapper implements ContentExtractor {
  private extractor: FirecrawlContentExtractor;

  /**
   * @param apiKey Firecrawl API key, forwarded as the extractor's config.
   */
  constructor(apiKey: string) {
    const firecrawlConfig: FirecrawlConfig = { apiKey };
    this.extractor = new FirecrawlContentExtractor(firecrawlConfig);
  }

  /**
   * Extracts content for `url` via Firecrawl.
   *
   * @returns The extractor's result, or `{ content: '', metadata: {} }`
   *          when the extractor yields nothing.
   */
  async extractContent(url: string): Promise<{ content: string; metadata?: any }> {
    const extracted = await this.extractor.extractContent(url);
    return extracted ? extracted : { content: '', metadata: {} };
  }
}
|
|
35
|
+
|
|
36
|
+
// Wrapper to make PlaywrightContentExtractor compatible with ContentExtractor interface
|
|
37
|
+
/**
 * Adapts PlaywrightContentExtractor to the ContentExtractor interface.
 *
 * A falsy extraction result is replaced with an empty result object so
 * callers never have to null-check.
 */
class PlaywrightWrapper implements ContentExtractor {
  private extractor: PlaywrightContentExtractor;

  constructor() {
    this.extractor = new PlaywrightContentExtractor();
  }

  /**
   * Extracts content for `url` via Playwright.
   *
   * @returns The extractor's result, or `{ content: '', metadata: {} }`
   *          when the extractor yields nothing.
   */
  async extractContent(url: string): Promise<{ content: string; metadata?: any }> {
    const extracted = await this.extractor.extractContent(url);
    return extracted ? extracted : { content: '', metadata: {} };
  }
}
|
|
52
|
+
|
|
53
|
+
// Wrapper for PyPI search provider
|
|
54
|
+
/**
 * Exposes PyPISearchProvider through the SearchProvider interface under
 * the provider name `pypi`.
 */
class PyPISearchWrapper implements SearchProvider {
  private provider: PyPISearchProvider;
  name = 'pypi';

  constructor() {
    this.provider = new PyPISearchProvider();
  }

  /**
   * Forwards the query to the wrapped PyPI provider unchanged.
   *
   * @param query Search text.
   * @param maxResults Optional result cap, passed through as-is.
   */
  async search(query: string, maxResults?: number): Promise<any[]> {
    const hits = await this.provider.search(query, maxResults);
    return hits;
  }
}
|
|
66
|
+
|
|
67
|
+
// Wrapper for NPM search provider
|
|
68
|
+
/**
 * Exposes NPMSearchProvider through the SearchProvider interface under
 * the provider name `npm`.
 */
class NPMSearchWrapper implements SearchProvider {
  private provider: NPMSearchProvider;
  name = 'npm';

  constructor() {
    this.provider = new NPMSearchProvider();
  }

  /**
   * Forwards the query to the wrapped NPM provider unchanged.
   *
   * @param query Search text.
   * @param maxResults Optional result cap, passed through as-is.
   */
  async search(query: string, maxResults?: number): Promise<any[]> {
    const hits = await this.provider.search(query, maxResults);
    return hits;
  }
}
|
|
80
|
+
|
|
81
|
+
// Wrapper for GitHub search provider (uses existing GitHub plugin)
|
|
82
|
+
/**
 * Search provider backed by the runtime's `github` service.
 *
 * Repositories are searched first (at most 5 per call); if the requested
 * limit is not yet reached, issues are searched as well (again at most 5).
 * Any failure is logged and reported as an empty result list rather than
 * thrown, so a missing or broken GitHub service never aborts a research run.
 */
class GitHubSearchWrapper implements SearchProvider {
  name = 'github';

  constructor(private runtime: IAgentRuntime) {}

  /**
   * Scales a raw count into the [0, 1] score range.
   * Missing or non-numeric counts score 0 instead of producing NaN.
   */
  private static normalizeScore(value: unknown, scale: number): number {
    if (typeof value !== 'number' || !Number.isFinite(value) || value <= 0) {
      return 0;
    }
    return Math.min(1.0, value / scale);
  }

  /** Converts one repository item from the GitHub service into a search result. */
  private static toRepositoryResult(repo: any): any {
    return {
      title: repo.full_name,
      url: repo.html_url,
      snippet: repo.description || 'No description',
      content: `${repo.description || ''}\nStars: ${repo.stargazers_count} | Language: ${repo.language || 'N/A'}`,
      score: GitHubSearchWrapper.normalizeScore(repo.stargazers_count, 10000), // Normalize by star count
      provider: 'github',
      metadata: {
        type: 'repository',
        language: repo.language,
        stars: repo.stargazers_count,
        forks: repo.forks_count,
        openIssues: repo.open_issues_count,
        updatedAt: repo.updated_at,
        owner: repo.owner?.login,
      },
    };
  }

  /** Converts one issue item from the GitHub service into a search result. */
  private static toIssueResult(issue: any): any {
    return {
      title: issue.title,
      url: issue.html_url,
      snippet: issue.body ? issue.body.substring(0, 200) + '...' : 'No description',
      content: `${issue.title}\n${issue.body || ''}`,
      // Clamped like the repository score so a heavily discussed issue
      // cannot outrank every other result with a score above 1.
      score: GitHubSearchWrapper.normalizeScore(issue.comments, 50), // Normalize by comment count
      provider: 'github',
      metadata: {
        type: 'issue',
        state: issue.state,
        comments: issue.comments,
        author: issue.user?.login,
        createdAt: issue.created_at,
        updatedAt: issue.updated_at,
        number: issue.number,
      },
    };
  }

  /**
   * Searches GitHub repositories and, if room remains, issues.
   *
   * @param query Search text.
   * @param maxResults Maximum number of results; missing, zero, or negative
   *                   values fall back to 10.
   * @returns Up to `maxResults` results, or `[]` when the service is
   *          unavailable or the search fails.
   */
  async search(query: string, maxResults?: number): Promise<any[]> {
    try {
      const githubService = this.runtime.getService('github');
      if (!githubService) {
        elizaLogger.warn('GitHub service not available');
        return [];
      }

      // The service's search methods are not part of the core Service type.
      const github = githubService as any;
      const limit = maxResults && maxResults > 0 ? maxResults : 10;
      const results: any[] = [];

      // Search repositories
      const repos = await github.searchRepositories(query, {
        sort: 'stars',
        per_page: Math.min(limit, 5),
      });

      if (repos?.items) {
        results.push(...repos.items.map((repo: any) => GitHubSearchWrapper.toRepositoryResult(repo)));
      }

      // Search issues if we have room for more results
      if (results.length < limit) {
        const issues = await github.searchIssues(`${query} is:issue`, {
          sort: 'updated',
          per_page: Math.min(limit - results.length, 5),
        });

        if (issues?.items) {
          results.push(...issues.items.map((issue: any) => GitHubSearchWrapper.toIssueResult(issue)));
        }
      }

      return results.slice(0, limit);
    } catch (error) {
      elizaLogger.error('GitHub search error:', error);
      return [];
    }
  }
}
|
|
159
|
+
|
|
160
|
+
// StagehandContentExtractor - uses browserbase/stagehand for extraction
|
|
161
|
+
/**
 * Content extractor that drives the runtime's `stagehand` service.
 *
 * It navigates a Stagehand session to the URL and asks Stagehand to extract
 * the article fields. Every failure path (service missing, no session,
 * thrown error) yields an empty-content result instead of throwing.
 */
class StagehandContentExtractor implements ContentExtractor {
  constructor(private runtime: IAgentRuntime) {}

  /**
   * Extracts the main content of `url` through Stagehand.
   *
   * @returns The extracted content, title, and author/publishDate/description
   *          metadata; or an empty-content result when extraction is not possible.
   */
  async extractContent(url: string): Promise<{ content: string; title?: string; metadata?: any }> {
    // Fresh object on every call so callers may mutate what they receive.
    const nothingExtracted = () => ({ content: '', title: undefined, metadata: undefined });

    try {
      const stagehandService = this.runtime.getService('stagehand');
      if (!stagehandService) {
        return nothingExtracted();
      }

      // Cast to any to access custom methods
      const stagehand = stagehandService as any;

      // Reuse the current session if there is one, otherwise open a new one.
      let session = await stagehand.getCurrentSession?.();
      if (!session) {
        session = await stagehand.createSession?.(`extract-${Date.now()}`);
      }
      if (!session) {
        return nothingExtracted();
      }

      const { page } = session;
      await page.goto(url);
      await page.waitForLoadState('domcontentloaded');

      // Extract main content using AI
      const extracted = await session.stagehand.extract({
        instruction: 'Extract the main article content, title, and any important metadata. Exclude navigation, ads, and sidebars.',
        schema: {
          title: 'string',
          content: 'string',
          author: 'string?',
          publishDate: 'string?',
          description: 'string?'
        }
      });

      const metadata = {
        author: extracted.author,
        publishDate: extracted.publishDate,
        description: extracted.description
      };
      return {
        content: extracted.content || '',
        title: extracted.title,
        metadata
      };
    } catch (error) {
      elizaLogger.error('Stagehand content extraction error:', error);
      return nothingExtracted();
    }
  }
}
|
|
210
|
+
|
|
211
|
+
/**
 * Creates the search provider for the requested source type.
 *
 * - `tavily`, `serper`, `exa`, `serpapi`: require `<TYPE>_API_KEY`; without it
 *   a `<type>-mock` provider that always returns `[]` is returned.
 * - `academic`, `pypi`, `npm`, `github`: need no key from this function.
 * - `web` or any unknown type: the first of TAVILY, EXA, SERPAPI, SERPER that
 *   has an API key configured is used; if none is configured, a `mock-web`
 *   provider is returned whose single result explains the missing setup.
 *
 * @param type Provider type identifier (case-sensitive for the switch).
 * @param runtime Agent runtime; only `getSetting` is used here, and the
 *                runtime is handed to the GitHub wrapper.
 * @returns A provider; this function never returns null.
 */
export function createSearchProvider(type: string, runtime: any): SearchProvider {
  const apiKey = runtime.getSetting(`${type.toUpperCase()}_API_KEY`);

  // Stand-in used when a keyed provider was requested but its key is missing.
  const emptyProvider = (name: string): SearchProvider => ({
    name,
    search: async () => []
  });

  switch (type) {
    case 'tavily':
      if (!apiKey) {
        elizaLogger.info('Tavily API key not found, search features will be limited');
        return emptyProvider('tavily-mock');
      }
      return new TavilySearchProvider({ apiKey });

    case 'serper':
      if (!apiKey) {
        elizaLogger.info('Serper API key not found, search features will be limited');
        return emptyProvider('serper-mock');
      }
      return new SerperSearchProvider({ apiKey });

    case 'exa':
      if (!apiKey) {
        elizaLogger.info('Exa API key not found, search features will be limited');
        return emptyProvider('exa-mock');
      }
      return new ExaSearchProvider({ apiKey });

    case 'serpapi':
      if (!apiKey) {
        elizaLogger.info('SerpAPI key not found, search features will be limited');
        return emptyProvider('serpapi-mock');
      }
      return new SerpAPISearchProvider({ apiKey });

    case 'academic':
      // Pass a config object, matching how createAcademicSearchProvider in
      // this file constructs the provider, instead of the runtime itself.
      // NOTE(review): assumes the constructor takes a config object; confirm
      // against search-providers/academic.ts.
      return new AcademicSearchProvider({
        semanticScholarApiKey: runtime.getSetting('SEMANTIC_SCHOLAR_API_KEY'),
      });

    case 'pypi':
      elizaLogger.info('Using PyPI search provider');
      return new PyPISearchWrapper();

    case 'npm':
      elizaLogger.info('Using NPM search provider');
      return new NPMSearchWrapper();

    case 'github':
      elizaLogger.info('Using GitHub search provider');
      return new GitHubSearchWrapper(runtime);

    case 'web':
    default: {
      // Try different providers in order of preference
      const webProviders: Array<[string, (key: string) => SearchProvider]> = [
        ['TAVILY', (key) => new TavilySearchProvider({ apiKey: key })],
        ['EXA', (key) => new ExaSearchProvider({ apiKey: key })],
        ['SERPAPI', (key) => new SerpAPISearchProvider({ apiKey: key })],
        ['SERPER', (key) => new SerperSearchProvider({ apiKey: key })],
      ];

      for (const [provider, build] of webProviders) {
        const key = runtime.getSetting(`${provider}_API_KEY`);
        if (key) {
          elizaLogger.info(`Using ${provider} as web search provider`);
          return build(key);
        }
      }

      elizaLogger.info('No web search provider configured, using mock provider');
      return {
        name: 'mock-web',
        search: async (query: string) => {
          elizaLogger.warn(`[Mock Web Provider] Attempted to search for: "${query}" but no API keys are configured`);
          elizaLogger.warn('[Mock Web Provider] Please configure at least one of: TAVILY_API_KEY, EXA_API_KEY, SERPAPI_API_KEY, or SERPER_API_KEY');
          // Return a single result explaining the issue
          return [{
            title: 'Search Provider Not Configured',
            url: 'https://example.com/configuration-needed',
            snippet: `Unable to search for "${query}" - No search API keys are configured. Please add TAVILY_API_KEY, EXA_API_KEY, SERPAPI_API_KEY, or SERPER_API_KEY to your environment.`,
            content: `Search attempted for: "${query}"\n\nNo search provider API keys were found. To enable web search, please configure one of the following environment variables:\n- TAVILY_API_KEY\n- EXA_API_KEY\n- SERPAPI_API_KEY\n- SERPER_API_KEY`,
            score: 0,
            provider: 'mock-web',
            metadata: {
              error: true,
              query: query,
              message: 'No search provider configured'
            }
          }];
        }
      };
    }
  }
}
|
|
316
|
+
|
|
317
|
+
// Enhanced content extractor with multiple fallback strategies
|
|
318
|
+
/**
 * Content extractor that tries several strategies in priority order and
 * returns the first result with more than 100 characters of trimmed content.
 *
 * Priority: Stagehand (only if the runtime has a `stagehand` service),
 * Firecrawl (only if `FIRECRAWL_API_KEY` is set), then Playwright, which is
 * always registered as the last resort.
 */
class RobustContentExtractor implements ContentExtractor {
  private extractors: ContentExtractor[] = [];

  /**
   * Builds the priority list of extractors from what the runtime offers.
   *
   * @param runtime Agent runtime used for service and setting lookups.
   */
  constructor(runtime: IAgentRuntime) {
    // 1. Stagehand/Browserbase (highest priority - AI-powered)
    try {
      const stagehandService = runtime.getService('stagehand');
      if (stagehandService) {
        this.extractors.push(new StagehandContentExtractor(runtime));
        elizaLogger.info('Added Stagehand content extractor');
      }
    } catch (e) {
      // Best effort: a failing lookup only means Stagehand is skipped, but
      // leave a trace so a misconfigured service can be diagnosed.
      const reason = e instanceof Error ? e.message : String(e);
      elizaLogger.debug(`Stagehand service lookup failed, skipping Stagehand extractor: ${reason}`);
    }

    // 2. Firecrawl (high priority - commercial service)
    const firecrawlKey = runtime.getSetting('FIRECRAWL_API_KEY');
    if (firecrawlKey) {
      this.extractors.push(new FirecrawlWrapper(firecrawlKey));
      elizaLogger.info('Added Firecrawl content extractor');
    }

    // 3. Playwright (fallback - may get blocked)
    this.extractors.push(new PlaywrightWrapper());
    elizaLogger.info('Added Playwright content extractor as fallback');
  }

  /**
   * Runs the extractors in order until one yields sufficient content.
   *
   * @param url Page to extract.
   * @returns The first acceptable result, with `extractorUsed`,
   *          `extractionTime`, `contentLength` and `url` merged into its
   *          metadata; or, when every extractor fails, a placeholder result
   *          whose metadata has `extractionFailed: true` and the collected
   *          error messages. This method does not throw for extractor errors.
   */
  async extractContent(url: string): Promise<{ content: string; title?: string; metadata?: any }> {
    const errors: Error[] = [];

    for (const extractor of this.extractors) {
      // Prefer an explicit name when an extractor declares one; class names
      // are the fallback and may be mangled by minifying bundlers.
      const extractorName = extractor.name || extractor.constructor.name;

      try {
        elizaLogger.debug(`Attempting content extraction with ${extractorName} for: ${url}`);
        const result = await extractor.extractContent(url);

        // Validate result quality
        if (result && result.content && result.content.trim().length > 100) {
          elizaLogger.info(`Successfully extracted content with ${extractorName} (${result.content.length} chars)`);

          // Add extraction metadata
          result.metadata = {
            ...result.metadata,
            extractorUsed: extractorName,
            extractionTime: Date.now(),
            contentLength: result.content.length,
            url: url
          };

          return result;
        }

        elizaLogger.warn(`${extractorName} returned insufficient content (${result?.content?.length || 0} chars)`);
      } catch (error) {
        const errorMsg = error instanceof Error ? error.message : String(error);
        elizaLogger.warn(`${extractorName} extraction failed for ${url}: ${errorMsg}`);
        errors.push(error instanceof Error ? error : new Error(errorMsg));
      }
    }

    // All extractors failed
    const errorMessages = errors.map(e => e.message);
    elizaLogger.error(`All content extractors failed for ${url}. Errors:`, errorMessages);

    // Return minimal result with error info
    return {
      content: `Failed to extract content from ${url}. Content may be behind paywall, require authentication, or have anti-bot protection.`,
      title: undefined,
      metadata: {
        extractionFailed: true,
        url: url,
        errors: errors.map(e => e.message),
        extractionTime: Date.now()
      }
    };
  }
}
|
|
398
|
+
|
|
399
|
+
/**
 * Creates the content extractor used for fetching page content.
 *
 * The current implementation always returns a RobustContentExtractor; the
 * `null` in the return type is never produced by this function.
 *
 * @param runtime Agent runtime handed to the extractor's constructor.
 */
export function createContentExtractor(runtime: IAgentRuntime): ContentExtractor | null {
  const extractor: ContentExtractor = new RobustContentExtractor(runtime);
  return extractor;
}
|
|
402
|
+
|
|
403
|
+
/**
 * Creates the academic search provider wrapped with rate limiting
 * (100 tokens per minute) and caching.
 *
 * @param runtime Agent runtime; `SEMANTIC_SCHOLAR_API_KEY` is read from its
 *                settings and passed to the provider.
 * @returns The cached, rate-limited academic provider.
 */
export function createAcademicSearchProvider(runtime: IAgentRuntime): SearchProvider {
  const apiKey = runtime.getSetting('SEMANTIC_SCHOLAR_API_KEY');
  elizaLogger.info('Using Academic search provider (Semantic Scholar, arXiv, CrossRef)');

  const academic = new AcademicSearchProvider({
    semanticScholarApiKey: apiKey,
    timeout: 30000,
  });

  // Wrap with rate limiting and caching
  return new CachedSearchProvider(
    new RateLimitedSearchProvider(academic, {
      tokensPerInterval: 100, // Academic sources allow more requests
      interval: 'minute'
    })
  );
}
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
// Search Providers
|
|
2
|
+
export * from './search-providers/tavily';
|
|
3
|
+
export * from './search-providers/serper';
|
|
4
|
+
export * from './search-providers/stagehand-google';
|
|
5
|
+
|
|
6
|
+
// Content Extractors
|
|
7
|
+
export * from './content-extractors/firecrawl';
|
|
8
|
+
export * from './content-extractors/playwright';
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
// Rate Limiting
|
|
12
|
+
export * from './rate-limiter';
|
|
13
|
+
|
|
14
|
+
// Caching
|
|
15
|
+
export * from './cache';
|
|
16
|
+
|
|
17
|
+
// Factory for creating providers
|
|
18
|
+
export * from './factory';
|
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
import { elizaLogger } from '@elizaos/core';
|
|
2
|
+
import { SearchResult } from '../types';
|
|
3
|
+
|
|
4
|
+
// Simple built-in rate limiter implementation
|
|
5
|
+
/**
 * Simple built-in token-bucket rate limiter.
 *
 * The bucket starts full with `tokensPerInterval` tokens and earns one token
 * every `interval / tokensPerInterval` milliseconds, never exceeding its
 * initial capacity. Refill is computed lazily from `Date.now()` whenever the
 * bucket is queried; no timers run while the limiter is idle.
 */
class RateLimiter {
  private tokens: number;
  private maxTokens: number;
  /** Milliseconds needed to earn a single token. */
  private refillRate: number;
  /** Timestamp (ms) up to which elapsed time has been converted into tokens. */
  private lastRefill: number;

  /**
   * @param config.tokensPerInterval Bucket capacity and tokens earned per interval.
   * @param config.interval Length of the refill interval.
   * @param config.fireImmediately Accepted for compatibility; not read by this class.
   */
  constructor(config: {
    tokensPerInterval: number;
    interval: 'second' | 'minute' | 'hour' | 'day';
    fireImmediately?: boolean;
  }) {
    this.maxTokens = config.tokensPerInterval;
    this.tokens = config.tokensPerInterval;
    this.lastRefill = Date.now();

    // Convert interval to milliseconds
    const intervalMs = {
      second: 1000,
      minute: 60000,
      hour: 3600000,
      day: 86400000
    };

    this.refillRate = intervalMs[config.interval] / config.tokensPerInterval;
  }

  /** Credits the tokens earned since the last refill, capped at capacity. */
  private refillTokens(): void {
    const now = Date.now();
    const timePassed = now - this.lastRefill;
    const tokensToAdd = Math.floor(timePassed / this.refillRate);

    if (!(tokensToAdd > 0)) {
      return;
    }

    const refilled = this.tokens + tokensToAdd;
    if (refilled >= this.maxTokens) {
      // Bucket is full: surplus time earns nothing, so restart the clock.
      this.tokens = this.maxTokens;
      this.lastRefill = now;
    } else {
      this.tokens = refilled;
      // Advance only by the time actually converted into tokens, keeping the
      // fractional remainder so the long-run rate matches the configured one.
      this.lastRefill += tokensToAdd * this.refillRate;
    }
  }

  /**
   * Takes `count` tokens if they are available right now.
   *
   * @returns `true` when the tokens were taken, `false` when the bucket holds
   *          fewer than `count` tokens (nothing is taken in that case).
   */
  async tryRemoveTokens(count: number): Promise<boolean> {
    this.refillTokens();

    if (this.tokens >= count) {
      this.tokens -= count;
      return true;
    }

    return false;
  }

  /**
   * Takes `count` tokens, waiting until they become available.
   *
   * @throws RangeError (as a rejection) when `count` exceeds the bucket
   *         capacity, since such a request could never be satisfied and
   *         would otherwise wait forever.
   */
  async removeTokens(count: number): Promise<void> {
    if (!(count <= this.maxTokens)) {
      throw new RangeError(
        `Cannot remove ${count} tokens from a bucket with capacity ${this.maxTokens}`
      );
    }

    while (!(await this.tryRemoveTokens(count))) {
      // Wait for tokens to become available
      await new Promise(resolve => setTimeout(resolve, this.refillRate));
    }
  }

  /** Returns the number of tokens currently in the bucket, after refilling. */
  async getTokensRemaining(): Promise<number> {
    this.refillTokens();
    return this.tokens;
  }
}
|
|
65
|
+
|
|
66
|
+
/**
 * Contract implemented by every search backend and by the wrappers that
 * decorate one (for example RateLimitedProvider in this file).
 */
export interface SearchProvider {
  /** Optional label; RateLimitedProvider falls back to 'Unknown' when it is absent. */
  name?: string;

  /**
   * Runs a search.
   *
   * @param query Search text.
   * @param maxResults Optional cap on the number of results.
   * @returns The result items; their shape is provider-specific.
   */
  search(query: string, maxResults?: number): Promise<any[]>;
}
|
|
70
|
+
|
|
71
|
+
/**
 * Contract implemented by every page-content extraction backend.
 */
export interface ContentExtractor {
  /** Optional label identifying the extractor. */
  name?: string;

  /**
   * Fetches and extracts the content of a page.
   *
   * @param url Address of the page to extract.
   * @returns The extracted text, plus an optional title and
   *          extractor-specific metadata.
   */
  extractContent(url: string): Promise<{ content: string; title?: string; metadata?: any }>;
}
|
|
75
|
+
|
|
76
|
+
/**
 * Decorates a SearchProvider so that every search consumes one token from
 * a RateLimiter; when no token is available the call waits for one.
 */
export class RateLimitedProvider implements SearchProvider {
  private limiter: RateLimiter;
  public readonly name: string;

  /**
   * @param provider Provider to decorate; its `name` (or 'Unknown') becomes
   *                 part of this wrapper's name.
   * @param config Limiter settings, forwarded unchanged to RateLimiter.
   */
  constructor(
    private provider: SearchProvider,
    config: {
      tokensPerInterval: number;
      interval: 'second' | 'minute' | 'hour' | 'day';
      fireImmediately?: boolean;
    }
  ) {
    const innerName = provider.name || 'Unknown';
    this.name = `RateLimited(${innerName})`;
    this.limiter = new RateLimiter(config);
  }

  /**
   * Acquires one token, waiting if necessary, then delegates the search.
   *
   * @param query Search text.
   * @param maxResults Optional result cap, passed through as-is.
   */
  async search(query: string, maxResults?: number): Promise<any[]> {
    const acquired = await this.limiter.tryRemoveTokens(1);

    if (acquired) {
      return this.provider.search(query, maxResults);
    }

    // No token right now: log once, then block until one is available.
    elizaLogger.warn(`[${this.name}] Rate limit reached, waiting...`);
    await this.limiter.removeTokens(1);
    return this.provider.search(query, maxResults);
  }
}
|
|
102
|
+
|
|
103
|
+
// Alias for backwards compatibility
|
|
104
|
+
export const RateLimitedSearchProvider = RateLimitedProvider;
// Type-side counterpart of the alias, so the legacy name also works in type
// positions (e.g. `let p: RateLimitedSearchProvider`), as a class name would.
export type RateLimitedSearchProvider = RateLimitedProvider;
|
|
105
|
+
|
|
106
|
+
/**
 * Options accepted by createRateLimitedProvider.
 *
 * Only one of the request limits is turned into a limiter; when none is
 * set, createRateLimitedProvider defaults to 60 requests per minute.
 */
export interface RateLimiterConfig {
  /** Allowed requests per minute. */
  requestsPerMinute?: number;
  /** Allowed requests per hour. */
  requestsPerHour?: number;
  /** Allowed requests per day. */
  requestsPerDay?: number;
  /** Declared for callers; createRateLimitedProvider does not read it. */
  burstSize?: number;
}
|
|
112
|
+
|
|
113
|
+
// Convenience function to create rate-limited providers
|
|
114
|
+
/**
 * Wraps a provider in a RateLimitedProvider built from a RateLimiterConfig.
 *
 * When several limits are configured, the one with the lowest sustained
 * request rate (requests per millisecond) is applied, since a single token
 * bucket can enforce only one limit. Ties keep the shorter interval. When
 * no limit is configured the default is 60 requests per minute.
 * `burstSize` is not used.
 *
 * @param provider Provider to wrap.
 * @param config Optional limits; unset or zero limits are ignored.
 * @returns The rate-limited wrapper.
 */
export function createRateLimitedProvider(
  provider: SearchProvider,
  config: RateLimiterConfig = {}
): SearchProvider {
  type Candidate = {
    tokensPerInterval: number;
    interval: 'minute' | 'hour' | 'day';
    windowMs: number;
  };

  const candidates: Candidate[] = [];
  if (config.requestsPerMinute) {
    candidates.push({ tokensPerInterval: config.requestsPerMinute, interval: 'minute', windowMs: 60000 });
  }
  if (config.requestsPerHour) {
    candidates.push({ tokensPerInterval: config.requestsPerHour, interval: 'hour', windowMs: 3600000 });
  }
  if (config.requestsPerDay) {
    candidates.push({ tokensPerInterval: config.requestsPerDay, interval: 'day', windowMs: 86400000 });
  }

  if (candidates.length === 0) {
    // Default to 60 requests per minute
    return new RateLimitedProvider(provider, {
      tokensPerInterval: 60,
      interval: 'minute',
    });
  }

  // Use the most restrictive limit: the lowest sustained request rate.
  const strictest = candidates.reduce((best, next) =>
    next.tokensPerInterval / next.windowMs < best.tokensPerInterval / best.windowMs ? next : best
  );

  return new RateLimitedProvider(provider, {
    tokensPerInterval: strictest.tokensPerInterval,
    interval: strictest.interval,
  });
}
|
|
142
|
+
|
|
143
|
+
// Advanced rate limiter with adaptive behavior
|
|
144
|
+
/**
 * Rate-limited provider that additionally reacts to rate-limit errors
 * raised by the wrapped provider: it backs off exponentially (capped at
 * 60 seconds) and retries the search once.
 *
 * An error counts as a rate-limit error when its message contains
 * 'rate limit' or '429'. Any other error is rethrown untouched, and so is
 * an error raised by the single retry.
 */
export class AdaptiveRateLimiter extends RateLimitedProvider {
  private errorCount: number = 0;
  private successCount: number = 0;
  private lastRateLimitError: number = 0;

  /**
   * Decides whether a thrown value signals rate limiting.
   * Safe for any thrown value, including null and non-Error objects.
   */
  private static isRateLimitError(error: unknown): boolean {
    if (typeof error !== 'object' || error === null) {
      return false;
    }
    const message = (error as { message?: unknown }).message;
    if (typeof message !== 'string') {
      return false;
    }
    return message.includes('rate limit') || message.includes('429');
  }

  /**
   * Searches through the rate limiter, retrying once after a backoff when
   * the provider reports a rate-limit error.
   *
   * @param query Search text.
   * @param maxResults Optional result cap, passed through as-is.
   * @throws Whatever the provider throws for non-rate-limit errors, or for
   *         a failed retry.
   */
  async search(query: string, maxResults?: number): Promise<SearchResult[]> {
    try {
      const results = await super.search(query, maxResults);
      this.successCount++;
      this.errorCount = Math.max(0, this.errorCount - 1); // Gradually reduce error count
      return results;
    } catch (error: unknown) {
      if (!AdaptiveRateLimiter.isRateLimitError(error)) {
        throw error;
      }

      this.errorCount++;
      this.lastRateLimitError = Date.now();

      // Exponential backoff based on error count
      const backoffMs = Math.min(60000, 1000 * Math.pow(2, this.errorCount));
      elizaLogger.warn(`[AdaptiveRateLimiter] Backing off for ${backoffMs}ms after ${this.errorCount} rate limit errors`);

      await new Promise(resolve => setTimeout(resolve, backoffMs));

      // Retry once after backoff
      const retried = await super.search(query, maxResults);
      // Count the recovered call, but leave errorCount alone so that
      // repeated rate limiting keeps lengthening the backoff.
      this.successCount++;
      return retried;
    }
  }

  /**
   * Returns the counters kept by this limiter.
   * `timeSinceLastError` is null until a rate-limit error has been seen.
   */
  getStats() {
    return {
      errorCount: this.errorCount,
      successCount: this.successCount,
      lastRateLimitError: this.lastRateLimitError,
      timeSinceLastError: this.lastRateLimitError ? Date.now() - this.lastRateLimitError : null,
    };
  }
}
|