@elizaos/plugin-research 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. package/README.md +400 -0
  2. package/dist/index.cjs +9366 -0
  3. package/dist/index.cjs.map +1 -0
  4. package/dist/index.js +9284 -0
  5. package/dist/index.js.map +1 -0
  6. package/package.json +80 -0
  7. package/src/__tests__/action-chaining.test.ts +532 -0
  8. package/src/__tests__/actions.test.ts +118 -0
  9. package/src/__tests__/cache-rate-limiter.test.ts +303 -0
  10. package/src/__tests__/content-extractors.test.ts +26 -0
  11. package/src/__tests__/deepresearch-bench-integration.test.ts +520 -0
  12. package/src/__tests__/deepresearch-bench-simplified.e2e.test.ts +290 -0
  13. package/src/__tests__/deepresearch-bench.e2e.test.ts +376 -0
  14. package/src/__tests__/e2e.test.ts +1870 -0
  15. package/src/__tests__/multi-benchmark-runner.ts +427 -0
  16. package/src/__tests__/providers.test.ts +156 -0
  17. package/src/__tests__/real-world.e2e.test.ts +788 -0
  18. package/src/__tests__/research-scenarios.test.ts +755 -0
  19. package/src/__tests__/research.e2e.test.ts +704 -0
  20. package/src/__tests__/research.test.ts +174 -0
  21. package/src/__tests__/search-providers.test.ts +174 -0
  22. package/src/__tests__/single-benchmark-runner.ts +735 -0
  23. package/src/__tests__/test-search-providers.ts +171 -0
  24. package/src/__tests__/verify-apis.test.ts +82 -0
  25. package/src/actions.ts +1677 -0
  26. package/src/benchmark/deepresearch-benchmark.ts +369 -0
  27. package/src/evaluation/research-evaluator.ts +444 -0
  28. package/src/examples/api-integration.md +498 -0
  29. package/src/examples/browserbase-integration.md +132 -0
  30. package/src/examples/debug-research-query.ts +162 -0
  31. package/src/examples/defi-code-scenarios.md +536 -0
  32. package/src/examples/defi-implementation-guide.md +454 -0
  33. package/src/examples/eliza-research-example.ts +142 -0
  34. package/src/examples/fix-renewable-energy-research.ts +209 -0
  35. package/src/examples/research-scenarios.md +408 -0
  36. package/src/examples/run-complete-renewable-research.ts +303 -0
  37. package/src/examples/run-deep-research.ts +352 -0
  38. package/src/examples/run-logged-research.ts +304 -0
  39. package/src/examples/run-real-research.ts +151 -0
  40. package/src/examples/save-research-output.ts +133 -0
  41. package/src/examples/test-file-logging.ts +199 -0
  42. package/src/examples/test-real-research.ts +67 -0
  43. package/src/examples/test-renewable-energy-research.ts +229 -0
  44. package/src/index.ts +28 -0
  45. package/src/integrations/cache.ts +128 -0
  46. package/src/integrations/content-extractors/firecrawl.ts +314 -0
  47. package/src/integrations/content-extractors/pdf-extractor.ts +350 -0
  48. package/src/integrations/content-extractors/playwright.ts +420 -0
  49. package/src/integrations/factory.ts +419 -0
  50. package/src/integrations/index.ts +18 -0
  51. package/src/integrations/rate-limiter.ts +181 -0
  52. package/src/integrations/search-providers/academic.ts +290 -0
  53. package/src/integrations/search-providers/exa.ts +205 -0
  54. package/src/integrations/search-providers/npm.ts +330 -0
  55. package/src/integrations/search-providers/pypi.ts +211 -0
  56. package/src/integrations/search-providers/serpapi.ts +277 -0
  57. package/src/integrations/search-providers/serper.ts +358 -0
  58. package/src/integrations/search-providers/stagehand-google.ts +87 -0
  59. package/src/integrations/search-providers/tavily.ts +187 -0
  60. package/src/processing/relevance-analyzer.ts +353 -0
  61. package/src/processing/research-logger.ts +450 -0
  62. package/src/processing/result-processor.ts +372 -0
  63. package/src/prompts/research-prompts.ts +419 -0
  64. package/src/providers/cacheProvider.ts +164 -0
  65. package/src/providers.ts +173 -0
  66. package/src/service.ts +2588 -0
  67. package/src/services/swe-bench.ts +286 -0
  68. package/src/strategies/research-strategies.ts +790 -0
  69. package/src/types/pdf-parse.d.ts +34 -0
  70. package/src/types.ts +551 -0
  71. package/src/verification/claim-verifier.ts +443 -0
@@ -0,0 +1,420 @@
1
+ import { chromium, Browser, Page, BrowserContext } from 'playwright';
2
+ import { elizaLogger } from '@elizaos/core';
3
+ import * as cheerio from 'cheerio';
4
+ import { ExtractedContent } from './firecrawl';
5
+
6
/**
 * Options accepted by `PlaywrightContentExtractor`.
 *
 * Every field is optional; the extractor's constructor fills in the defaults
 * noted on each member.
 */
export interface PlaywrightConfig {
  /** Launch Chromium without a visible window. Extractor default: `true`. */
  headless?: boolean;

  /** Navigation timeout in milliseconds passed to `page.goto`. Extractor default: `30000`. */
  timeout?: number;

  /** Lifecycle event that `page.goto` waits for. Extractor default: `'networkidle'`. */
  waitUntil?: 'load' | 'domcontentloaded' | 'networkidle';

  /** User-Agent for the browser context; a generic desktop UA is used when unset. */
  userAgent?: string;

  /** Viewport size of the browser context. Extractor default: 1920x1080. */
  viewport?: {
    width: number;
    height: number;
  };

  /**
   * Playwright resource types whose requests are aborted, e.g.
   * `['image', 'media', 'font']` (which is also the extractor default).
   */
  blockResources?: string[];

  /** Number of additional attempts after a failed extraction. Extractor default: `3`. */
  maxRetries?: number;

  /** Whether pages may execute JavaScript. Extractor default: `true`. */
  enableJavaScript?: boolean;

  /**
   * Extractor default: `false`.
   * NOTE(review): this flag is not read anywhere by the extractor in this file.
   */
  enableCookies?: boolean;
}
17
+
18
/**
 * Extracts readable content (plain text, a markdown-like rendering, raw HTML,
 * metadata, links and images) from web pages using a Chromium instance driven
 * by Playwright.
 *
 * One browser and one browser context are created lazily on first use and
 * shared by all subsequent calls; each call opens and closes its own page.
 * Call {@link close} when finished to release the browser.
 */
export class PlaywrightContentExtractor {
  /** Selectors probed, in priority order, to locate a page's main content. */
  private static readonly CONTENT_SELECTORS: readonly string[] = [
    'main',
    'article',
    '[role="main"]',
    '#content',
    '.content',
    '.post',
    '.entry-content',
  ];

  private browser: Browser | null = null;
  private context: BrowserContext | null = null;
  private readonly config: PlaywrightConfig;

  /**
   * @param config Overrides for the defaults listed below; omitted fields keep
   *   their default value.
   */
  constructor(config: PlaywrightConfig = {}) {
    this.config = {
      headless: true,
      timeout: 30000,
      waitUntil: 'networkidle',
      viewport: { width: 1920, height: 1080 },
      blockResources: ['image', 'media', 'font'],
      maxRetries: 3,
      enableJavaScript: true,
      enableCookies: false,
      ...config,
    };
  }

  /**
   * Launches the browser and creates the shared context if that has not
   * already happened. Safe to call repeatedly.
   *
   * State is only committed once both the browser and the context exist, so a
   * failure part-way through leaves the extractor uninitialized (and the
   * half-launched browser closed) and the next call starts from scratch.
   *
   * @throws Whatever Playwright throws while launching or creating the context.
   */
  async initialize(): Promise<void> {
    if (this.browser && this.context) {
      return;
    }

    // Defensive: never keep a browser around without a usable context.
    if (this.browser || this.context) {
      await this.close().catch(() => {});
    }

    elizaLogger.info('[Playwright] Initializing browser');

    let browser: Browser | null = null;
    try {
      browser = await chromium.launch({
        headless: this.config.headless,
        // NOTE: '--no-sandbox' / '--disable-setuid-sandbox' turn off Chromium's
        // sandbox (commonly required inside containers).
        args: [
          '--no-sandbox',
          '--disable-setuid-sandbox',
          '--disable-dev-shm-usage',
          '--disable-accelerated-2d-canvas',
          '--no-first-run',
          '--no-zygote',
          '--disable-gpu',
        ],
      });

      const context = await browser.newContext({
        userAgent:
          this.config.userAgent || 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        viewport: this.config.viewport,
        javaScriptEnabled: this.config.enableJavaScript,
        bypassCSP: true,
        ignoreHTTPSErrors: true,
      });

      // Block unnecessary resources to speed up loading.
      const blocked = this.config.blockResources ?? [];
      if (blocked.length > 0) {
        await context.route('**/*', (route) => {
          const settled = blocked.includes(route.request().resourceType())
            ? route.abort()
            : route.continue();
          // abort()/continue() reject if the page or context has already gone
          // away; that is harmless here and must not surface as an unhandled
          // rejection.
          settled.catch(() => {});
        });
      }

      this.browser = browser;
      this.context = context;
    } catch (error) {
      if (browser) {
        await browser.close().catch(() => {});
      }
      throw error;
    }
  }

  /**
   * Closes the shared context and browser, if any. The extractor can be used
   * again afterwards; it will re-initialize on demand.
   *
   * The browser is closed even when closing the context fails; the failure is
   * still propagated to the caller.
   */
  async close(): Promise<void> {
    const context = this.context;
    const browser = this.browser;
    this.context = null;
    this.browser = null;

    try {
      if (context) {
        await context.close();
      }
    } finally {
      if (browser) {
        await browser.close();
      }
    }
  }

  /**
   * Loads `url` and extracts its content, retrying on failure.
   *
   * After a failed attempt the page is closed, the method sleeps for
   * `1000 * (attempt + 1)` ms and tries again, until `maxRetries` retries have
   * been used. `maxRetries: 0` disables retrying.
   *
   * @param url Page to load.
   * @param retryCount Number of retries already consumed (normally left at 0).
   * @returns The extracted content, or `null` when every attempt failed.
   */
  async extractContent(url: string, retryCount: number = 0): Promise<ExtractedContent | null> {
    const maxRetries = this.config.maxRetries ?? 3;

    for (let attempt = retryCount; ; attempt++) {
      const startTime = Date.now();
      let page: Page | null = null;

      try {
        await this.initialize();

        if (!this.context) {
          throw new Error('Browser context not initialized');
        }

        elizaLogger.info(`[Playwright] Extracting content from: ${url}`);

        page = await this.context.newPage();

        // Set additional headers to appear more like a real browser.
        await page.setExtraHTTPHeaders({
          'Accept-Language': 'en-US,en;q=0.9',
          'Accept-Encoding': 'gzip, deflate, br',
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        });

        await page.goto(url, {
          waitUntil: this.config.waitUntil,
          timeout: this.config.timeout,
        });

        await this.waitForContent(page);

        const content = await this.extractPageContent(page);

        const duration = Date.now() - startTime;
        elizaLogger.info(
          `[Playwright] Content extracted in ${duration}ms (${content.content.length} characters)`
        );

        return content;
      } catch (error) {
        const duration = Date.now() - startTime;
        elizaLogger.error(`[Playwright] Extraction error after ${duration}ms:`, error);
      } finally {
        // Release the page before any back-off so failed attempts do not pile
        // up open pages.
        if (page) {
          await page.close().catch(() => {});
        }
      }

      if (attempt >= maxRetries) {
        return null;
      }

      elizaLogger.info(`[Playwright] Retrying extraction (attempt ${attempt + 1})`);
      await new Promise((resolve) => setTimeout(resolve, 1000 * (attempt + 1)));
    }
  }

  /**
   * Best-effort wait for the page's main content: waits (at most 5 s in total)
   * for any of the known content selectors, pauses briefly, then scrolls to
   * the bottom and back to the top to trigger lazy loading.
   *
   * Never throws; failures are logged as a warning.
   */
  private async waitForContent(page: Page): Promise<void> {
    try {
      // Probe all selectors concurrently so a page matching none of them costs
      // one timeout rather than one timeout per selector.
      const probes = PlaywrightContentExtractor.CONTENT_SELECTORS.map((selector) =>
        page.waitForSelector(selector, { timeout: 5000 }).then(
          () => true,
          () => false
        )
      );
      await new Promise<void>((resolve) => {
        let remaining = probes.length;
        for (const probe of probes) {
          void probe.then((found) => {
            remaining -= 1;
            if (found || remaining === 0) {
              resolve();
            }
          });
        }
      });

      // Additional wait for dynamic content.
      await page.waitForTimeout(1000);

      // Scroll to load lazy-loaded content.
      await page.evaluate(() => {
        window.scrollTo(0, document.body.scrollHeight);
      });

      await page.waitForTimeout(500);

      // Scroll back to top.
      await page.evaluate(() => {
        window.scrollTo(0, 0);
      });
    } catch {
      elizaLogger.warn('[Playwright] Could not wait for specific content selectors');
    }
  }

  /**
   * Builds the {@link ExtractedContent} for an already-loaded page.
   *
   * Order matters: the HTML snapshot is taken first, then the text extraction
   * step removes script/style/navigation-like elements from the live DOM, so
   * the metadata, link and image queries that follow run against the pruned
   * document.
   */
  private async extractPageContent(page: Page): Promise<ExtractedContent> {
    const html = await page.content();

    // The callback runs in the browser, so the selector list is passed in as
    // an argument rather than captured from this scope.
    const textContent = await page.evaluate(
      (selectors: string[]) => {
        document.querySelectorAll('script, style, noscript').forEach((el) => el.remove());
        document
          .querySelectorAll('nav, footer, aside, .sidebar, .advertisement, .ad')
          .forEach((el) => el.remove());

        // Use the first candidate that actually has text; an empty placeholder
        // container must not hide content found by a later selector.
        for (const selector of selectors) {
          const element = document.querySelector<HTMLElement>(selector);
          const text = element ? element.innerText || element.textContent || '' : '';
          if (text.trim()) {
            return text;
          }
        }

        return '';
      },
      [...PlaywrightContentExtractor.CONTENT_SELECTORS, 'body']
    );

    const metadata = await page.evaluate(() => {
      const getMetaContent = (name: string): string | undefined => {
        const meta = document.querySelector(`meta[name="${name}"], meta[property="${name}"]`);
        return meta?.getAttribute('content') || undefined;
      };

      return {
        title: document.title,
        description: getMetaContent('description') || getMetaContent('og:description'),
        author: getMetaContent('author'),
        publishedTime: getMetaContent('article:published_time'),
        modifiedTime: getMetaContent('article:modified_time'),
        language: document.documentElement.lang || getMetaContent('language'),
        ogTitle: getMetaContent('og:title'),
        ogDescription: getMetaContent('og:description'),
        ogImage: getMetaContent('og:image'),
        ogUrl: getMetaContent('og:url'),
        canonical: document.querySelector('link[rel="canonical"]')?.getAttribute('href'),
        keywords: getMetaContent('keywords'),
      };
    });

    const links = await page.evaluate(() =>
      Array.from(document.querySelectorAll<HTMLAnchorElement>('a[href]'))
        .filter((anchor) => {
          // Test the raw attribute: `anchor.href` is already resolved to an
          // absolute URL, so it never starts with '#'.
          const raw = (anchor.getAttribute('href') ?? '').trim();
          return raw !== '' && !raw.startsWith('#') && !raw.toLowerCase().startsWith('javascript:');
        })
        .map((anchor) => anchor.href)
        // SVG <a> elements expose a non-string href; keep only real URLs.
        .filter((href): href is string => typeof href === 'string' && href !== '')
    );

    const images = await page.evaluate(() =>
      Array.from(document.querySelectorAll<HTMLImageElement>('img[src]'))
        .map((img) => img.src)
        .filter((src) => src && !src.includes('data:image'))
    );

    // Build a markdown-like rendering from the HTML snapshot using cheerio.
    // NOTE: output is grouped by element type (headings, paragraphs, lists,
    // code, blockquotes), not emitted in document order.
    const $ = cheerio.load(html);
    $('script, style, nav, footer, aside, .sidebar, .advertisement, .ad').remove();

    let markdown = '';

    $('h1, h2, h3, h4, h5, h6').each((_, elem) => {
      const level = Number(elem.tagName.charAt(1));
      const text = $(elem).text().trim();
      if (text) {
        markdown += '\n' + '#'.repeat(level) + ' ' + text + '\n\n';
      }
    });

    $('p').each((_, elem) => {
      const text = $(elem).text().trim();
      if (text) {
        markdown += text + '\n\n';
      }
    });

    $('ul, ol').each((_, elem) => {
      // Direct children only: nested lists are visited by the outer `.each`
      // themselves, and including their items here would skew the numbering.
      $(elem)
        .children('li')
        .each((index, li) => {
          const text = $(li).text().trim();
          if (text) {
            const bullet = elem.tagName === 'ol' ? `${index + 1}.` : '-';
            markdown += `${bullet} ${text}\n`;
          }
        });
      markdown += '\n';
    });

    $('pre, code').each((_, elem) => {
      const isBlock = elem.tagName === 'pre';
      // <code> inside <pre> is already covered by the enclosing block.
      if (!isBlock && $(elem).closest('pre').length > 0) {
        return;
      }
      const text = $(elem).text().trim();
      if (text) {
        markdown += isBlock ? '```\n' + text + '\n```\n\n' : '`' + text + '`\n';
      }
    });

    $('blockquote').each((_, elem) => {
      const text = $(elem).text().trim();
      if (text) {
        markdown += '> ' + text.replace(/\n/g, '\n> ') + '\n\n';
      }
    });

    const trimmedText = textContent.trim();

    return {
      content: trimmedText,
      markdown: markdown.trim() || trimmedText,
      html: html,
      metadata: metadata,
      links: [...new Set(links)], // Remove duplicates
      images: [...new Set(images)], // Remove duplicates
    };
  }

  /**
   * Extracts several URLs one after another (sequentially, to avoid
   * overwhelming the shared browser).
   *
   * @returns A map from each input URL to its content, or `null` on failure.
   */
  async extractBatch(urls: string[]): Promise<Map<string, ExtractedContent | null>> {
    elizaLogger.info(`[Playwright] Extracting content from ${urls.length} URLs`);

    const results = new Map<string, ExtractedContent | null>();

    for (const url of urls) {
      try {
        results.set(url, await this.extractContent(url));
      } catch (error) {
        elizaLogger.error(`[Playwright] Failed to extract ${url}:`, error);
        results.set(url, null);
      }
    }

    return results;
  }

  /**
   * Saves a full-page screenshot of `url` to `outputPath`.
   *
   * @returns `true` on success, `false` if anything failed (the error is logged).
   */
  async screenshot(url: string, outputPath: string): Promise<boolean> {
    return this.capture(url, outputPath, 'Screenshot', (page) =>
      page.screenshot({
        path: outputPath,
        fullPage: true,
      })
    );
  }

  /**
   * Saves `url` as an A4 PDF (with backgrounds) to `outputPath`.
   *
   * @returns `true` on success, `false` if anything failed (the error is logged).
   */
  async pdf(url: string, outputPath: string): Promise<boolean> {
    return this.capture(url, outputPath, 'PDF', (page) =>
      page.pdf({
        path: outputPath,
        format: 'A4',
        printBackground: true,
      })
    );
  }

  /**
   * Shared page lifecycle for {@link screenshot} and {@link pdf}: opens a
   * page, navigates with `waitUntil: 'networkidle'`, runs `render`, and always
   * closes the page afterwards.
   */
  private async capture(
    url: string,
    outputPath: string,
    label: 'Screenshot' | 'PDF',
    render: (page: Page) => Promise<unknown>
  ): Promise<boolean> {
    let page: Page | null = null;

    try {
      await this.initialize();

      if (!this.context) {
        throw new Error('Browser context not initialized');
      }

      page = await this.context.newPage();
      await page.goto(url, {
        waitUntil: 'networkidle',
        timeout: this.config.timeout,
      });

      await render(page);

      elizaLogger.info(`[Playwright] ${label} saved to ${outputPath}`);
      return true;
    } catch (error) {
      elizaLogger.error(`[Playwright] ${label} error:`, error);
      return false;
    } finally {
      if (page) {
        await page.close().catch(() => {});
      }
    }
  }
}