@elizaos/plugin-research 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. package/README.md +400 -0
  2. package/dist/index.cjs +9366 -0
  3. package/dist/index.cjs.map +1 -0
  4. package/dist/index.js +9284 -0
  5. package/dist/index.js.map +1 -0
  6. package/package.json +80 -0
  7. package/src/__tests__/action-chaining.test.ts +532 -0
  8. package/src/__tests__/actions.test.ts +118 -0
  9. package/src/__tests__/cache-rate-limiter.test.ts +303 -0
  10. package/src/__tests__/content-extractors.test.ts +26 -0
  11. package/src/__tests__/deepresearch-bench-integration.test.ts +520 -0
  12. package/src/__tests__/deepresearch-bench-simplified.e2e.test.ts +290 -0
  13. package/src/__tests__/deepresearch-bench.e2e.test.ts +376 -0
  14. package/src/__tests__/e2e.test.ts +1870 -0
  15. package/src/__tests__/multi-benchmark-runner.ts +427 -0
  16. package/src/__tests__/providers.test.ts +156 -0
  17. package/src/__tests__/real-world.e2e.test.ts +788 -0
  18. package/src/__tests__/research-scenarios.test.ts +755 -0
  19. package/src/__tests__/research.e2e.test.ts +704 -0
  20. package/src/__tests__/research.test.ts +174 -0
  21. package/src/__tests__/search-providers.test.ts +174 -0
  22. package/src/__tests__/single-benchmark-runner.ts +735 -0
  23. package/src/__tests__/test-search-providers.ts +171 -0
  24. package/src/__tests__/verify-apis.test.ts +82 -0
  25. package/src/actions.ts +1677 -0
  26. package/src/benchmark/deepresearch-benchmark.ts +369 -0
  27. package/src/evaluation/research-evaluator.ts +444 -0
  28. package/src/examples/api-integration.md +498 -0
  29. package/src/examples/browserbase-integration.md +132 -0
  30. package/src/examples/debug-research-query.ts +162 -0
  31. package/src/examples/defi-code-scenarios.md +536 -0
  32. package/src/examples/defi-implementation-guide.md +454 -0
  33. package/src/examples/eliza-research-example.ts +142 -0
  34. package/src/examples/fix-renewable-energy-research.ts +209 -0
  35. package/src/examples/research-scenarios.md +408 -0
  36. package/src/examples/run-complete-renewable-research.ts +303 -0
  37. package/src/examples/run-deep-research.ts +352 -0
  38. package/src/examples/run-logged-research.ts +304 -0
  39. package/src/examples/run-real-research.ts +151 -0
  40. package/src/examples/save-research-output.ts +133 -0
  41. package/src/examples/test-file-logging.ts +199 -0
  42. package/src/examples/test-real-research.ts +67 -0
  43. package/src/examples/test-renewable-energy-research.ts +229 -0
  44. package/src/index.ts +28 -0
  45. package/src/integrations/cache.ts +128 -0
  46. package/src/integrations/content-extractors/firecrawl.ts +314 -0
  47. package/src/integrations/content-extractors/pdf-extractor.ts +350 -0
  48. package/src/integrations/content-extractors/playwright.ts +420 -0
  49. package/src/integrations/factory.ts +419 -0
  50. package/src/integrations/index.ts +18 -0
  51. package/src/integrations/rate-limiter.ts +181 -0
  52. package/src/integrations/search-providers/academic.ts +290 -0
  53. package/src/integrations/search-providers/exa.ts +205 -0
  54. package/src/integrations/search-providers/npm.ts +330 -0
  55. package/src/integrations/search-providers/pypi.ts +211 -0
  56. package/src/integrations/search-providers/serpapi.ts +277 -0
  57. package/src/integrations/search-providers/serper.ts +358 -0
  58. package/src/integrations/search-providers/stagehand-google.ts +87 -0
  59. package/src/integrations/search-providers/tavily.ts +187 -0
  60. package/src/processing/relevance-analyzer.ts +353 -0
  61. package/src/processing/research-logger.ts +450 -0
  62. package/src/processing/result-processor.ts +372 -0
  63. package/src/prompts/research-prompts.ts +419 -0
  64. package/src/providers/cacheProvider.ts +164 -0
  65. package/src/providers.ts +173 -0
  66. package/src/service.ts +2588 -0
  67. package/src/services/swe-bench.ts +286 -0
  68. package/src/strategies/research-strategies.ts +790 -0
  69. package/src/types/pdf-parse.d.ts +34 -0
  70. package/src/types.ts +551 -0
  71. package/src/verification/claim-verifier.ts +443 -0
@@ -0,0 +1,314 @@
1
+ import axios, { AxiosError } from 'axios';
2
+ import { elizaLogger } from '@elizaos/core';
3
+ import { z } from 'zod';
4
+
5
+ // Firecrawl API response schema
6
+ const FirecrawlResponseSchema = z.object({
7
+ success: z.boolean(),
8
+ data: z.object({
9
+ content: z.string().optional(),
10
+ markdown: z.string().optional(),
11
+ html: z.string().optional(),
12
+ metadata: z.object({
13
+ title: z.string().optional(),
14
+ description: z.string().optional(),
15
+ language: z.string().optional(),
16
+ keywords: z.string().optional(),
17
+ robots: z.string().optional(),
18
+ ogTitle: z.string().optional(),
19
+ ogDescription: z.string().optional(),
20
+ ogImage: z.string().optional(),
21
+ ogUrl: z.string().optional(),
22
+ canonical: z.string().optional(),
23
+ }).optional(),
24
+ links: z.array(z.string()).optional(),
25
+ images: z.array(z.string()).optional(),
26
+ }).optional(),
27
+ error: z.string().optional(),
28
+ });
29
+
30
/**
 * Configuration for the Firecrawl content extractor.
 *
 * Only `apiKey` is required; the other fields fall back to the defaults
 * applied in the FirecrawlContentExtractor constructor (timeout 60000 ms,
 * markdown/metadata/links on, html/screenshot off, waitFor 0).
 */
export interface FirecrawlConfig {
  /** Firecrawl API key (required; constructor throws without it). */
  apiKey: string;
  /** API base URL; defaults to 'https://api.firecrawl.dev/v0'. */
  baseUrl?: string;
  /** Per-request timeout in milliseconds; defaults to 60000. */
  timeout?: number;
  /** Keep raw HTML in extraction results; defaults to false. */
  includeHtml?: boolean;
  /** Request markdown output from the API; defaults to true. */
  includeMarkdown?: boolean;
  /** Request page metadata; defaults to true. */
  includeMetadata?: boolean;
  /** Request discovered links; defaults to true. */
  includeLinks?: boolean;
  /** Milliseconds the API should wait before scraping; defaults to 0. */
  waitFor?: number;
  /** Request a page screenshot; defaults to false. */
  screenshot?: boolean;
}
41
+
42
/**
 * Normalized content extracted from a web page or document.
 *
 * `content` always holds the primary text (markdown is preferred over
 * plain text when both are available); the remaining fields are filled
 * only when the source provides them.
 */
export interface ExtractedContent {
  /** Primary textual content (markdown preferred, plain text fallback). */
  content: string;
  /** Markdown rendition, when requested and returned by the extractor. */
  markdown?: string;
  /** Raw HTML, only populated when `includeHtml` is enabled. */
  html?: string;
  /** Page metadata; open-ended beyond the known keys. */
  metadata?: {
    title?: string;
    description?: string;
    language?: string;
    ogImage?: string;
    [key: string]: any;
  };
  /** Hyperlinks discovered on the page. */
  links?: string[];
  /** Image URLs discovered on the page. */
  images?: string[];
}
56
+
57
+ export class FirecrawlContentExtractor {
58
+ private readonly apiKey: string;
59
+ private readonly baseUrl: string;
60
+ private readonly config: FirecrawlConfig;
61
+
62
+ constructor(config: FirecrawlConfig) {
63
+ if (!config.apiKey) {
64
+ throw new Error('Firecrawl API key is required');
65
+ }
66
+ this.apiKey = config.apiKey;
67
+ this.baseUrl = config.baseUrl || 'https://api.firecrawl.dev/v0';
68
+ this.config = {
69
+ timeout: 60000, // 60 seconds default
70
+ includeHtml: false,
71
+ includeMarkdown: true,
72
+ includeMetadata: true,
73
+ includeLinks: true,
74
+ waitFor: 0,
75
+ screenshot: false,
76
+ ...config,
77
+ };
78
+ }
79
+
80
+ async extractContent(url: string): Promise<ExtractedContent | null> {
81
+ const startTime = Date.now();
82
+
83
+ try {
84
+ elizaLogger.info(`[Firecrawl] Extracting content from: ${url}`);
85
+
86
+ const response = await axios.post(
87
+ `${this.baseUrl}/scrape`,
88
+ {
89
+ url,
90
+ formats: this.getFormats(),
91
+ waitFor: this.config.waitFor,
92
+ screenshot: this.config.screenshot,
93
+ includeTags: ['main', 'article', 'section', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'code', 'pre'],
94
+ excludeTags: ['nav', 'footer', 'aside', 'script', 'style'],
95
+ },
96
+ {
97
+ headers: {
98
+ 'Authorization': `Bearer ${this.apiKey}`,
99
+ 'Content-Type': 'application/json',
100
+ },
101
+ timeout: this.config.timeout,
102
+ }
103
+ );
104
+
105
+ // Validate response
106
+ const validatedData = FirecrawlResponseSchema.parse(response.data);
107
+
108
+ if (!validatedData.success || !validatedData.data) {
109
+ elizaLogger.error(`[Firecrawl] Failed to extract content: ${validatedData.error}`);
110
+ return null;
111
+ }
112
+
113
+ const data = validatedData.data;
114
+ const content = data.markdown || data.content || '';
115
+
116
+ if (!content) {
117
+ elizaLogger.warn(`[Firecrawl] No content extracted from ${url}`);
118
+ return null;
119
+ }
120
+
121
+ const duration = Date.now() - startTime;
122
+ elizaLogger.info(`[Firecrawl] Content extracted in ${duration}ms (${content.length} characters)`);
123
+
124
+ return {
125
+ content,
126
+ markdown: data.markdown,
127
+ html: this.config.includeHtml ? data.html : undefined,
128
+ metadata: data.metadata,
129
+ links: data.links,
130
+ images: data.images,
131
+ };
132
+ } catch (error) {
133
+ const duration = Date.now() - startTime;
134
+
135
+ if (axios.isAxiosError(error)) {
136
+ const axiosError = error as AxiosError;
137
+
138
+ // Handle specific error cases
139
+ if (axiosError.response?.status === 401) {
140
+ elizaLogger.error('[Firecrawl] Invalid API key');
141
+ throw new Error('Invalid Firecrawl API key');
142
+ } else if (axiosError.response?.status === 429) {
143
+ elizaLogger.error('[Firecrawl] Rate limit exceeded');
144
+ throw new Error('Firecrawl rate limit exceeded');
145
+ } else if (axiosError.response?.status === 402) {
146
+ elizaLogger.error('[Firecrawl] Payment required - check your plan');
147
+ throw new Error('Firecrawl payment required');
148
+ } else if (axiosError.code === 'ECONNABORTED') {
149
+ elizaLogger.error(`[Firecrawl] Request timeout after ${duration}ms`);
150
+ throw new Error('Firecrawl extraction timeout');
151
+ }
152
+
153
+ elizaLogger.error(`[Firecrawl] API error: ${axiosError.message}`, {
154
+ status: axiosError.response?.status,
155
+ data: axiosError.response?.data,
156
+ });
157
+ } else if (error instanceof z.ZodError) {
158
+ elizaLogger.error('[Firecrawl] Invalid response format:', error.issues);
159
+ } else {
160
+ elizaLogger.error('[Firecrawl] Unknown error:', error);
161
+ }
162
+
163
+ return null;
164
+ }
165
+ }
166
+
167
+ async extractBatch(urls: string[]): Promise<Map<string, ExtractedContent | null>> {
168
+ elizaLogger.info(`[Firecrawl] Extracting content from ${urls.length} URLs`);
169
+
170
+ const results = new Map<string, ExtractedContent | null>();
171
+
172
+ // Process in batches to avoid rate limits
173
+ const batchSize = 5;
174
+ for (let i = 0; i < urls.length; i += batchSize) {
175
+ const batch = urls.slice(i, i + batchSize);
176
+ const batchPromises = batch.map(url =>
177
+ this.extractContent(url)
178
+ .then(content => ({ url, content }))
179
+ .catch(error => {
180
+ elizaLogger.error(`[Firecrawl] Failed to extract ${url}:`, error);
181
+ return { url, content: null };
182
+ })
183
+ );
184
+
185
+ const batchResults = await Promise.all(batchPromises);
186
+ batchResults.forEach(({ url, content }) => {
187
+ results.set(url, content);
188
+ });
189
+
190
+ // Add delay between batches to respect rate limits
191
+ if (i + batchSize < urls.length) {
192
+ await new Promise(resolve => setTimeout(resolve, 1000));
193
+ }
194
+ }
195
+
196
+ return results;
197
+ }
198
+
199
+ async crawlSite(startUrl: string, maxPages: number = 50): Promise<Map<string, ExtractedContent | null>> {
200
+ try {
201
+ elizaLogger.info(`[Firecrawl] Starting site crawl from: ${startUrl}`);
202
+
203
+ const response = await axios.post(
204
+ `${this.baseUrl}/crawl`,
205
+ {
206
+ url: startUrl,
207
+ crawlerOptions: {
208
+ maxCrawledLinks: maxPages,
209
+ includes: [], // Add patterns to include
210
+ excludes: ['*/tag/*', '*/category/*', '*/page/*'], // Common pagination patterns
211
+ },
212
+ pageOptions: {
213
+ includeMarkdown: true,
214
+ includeHtml: false,
215
+ },
216
+ },
217
+ {
218
+ headers: {
219
+ 'Authorization': `Bearer ${this.apiKey}`,
220
+ 'Content-Type': 'application/json',
221
+ },
222
+ timeout: (this.config.timeout || 60000) * 2, // Longer timeout for crawls
223
+ }
224
+ );
225
+
226
+ const jobId = response.data?.jobId;
227
+
228
+ if (!jobId) {
229
+ elizaLogger.error('[Firecrawl] No job ID returned from crawl request');
230
+ return new Map();
231
+ }
232
+
233
+ // Poll for completion
234
+ return await this.pollCrawlJob(jobId, maxPages);
235
+ } catch (error) {
236
+ elizaLogger.error('[Firecrawl] Site crawl error:', error);
237
+ return new Map();
238
+ }
239
+ }
240
+
241
+ private async pollCrawlJob(jobId: string, maxPages: number): Promise<Map<string, ExtractedContent | null>> {
242
+ const maxAttempts = 60; // 5 minutes max
243
+ const pollInterval = 5000; // 5 seconds
244
+
245
+ for (let attempt = 0; attempt < maxAttempts; attempt++) {
246
+ try {
247
+ const response = await axios.get(
248
+ `${this.baseUrl}/crawl/status/${jobId}`,
249
+ {
250
+ headers: {
251
+ 'Authorization': `Bearer ${this.apiKey}`,
252
+ },
253
+ }
254
+ );
255
+
256
+ const { status, data } = response.data;
257
+
258
+ if (status === 'completed' && data) {
259
+ const results = new Map<string, ExtractedContent | null>();
260
+
261
+ data.forEach((page: any) => {
262
+ if (page.markdown || page.content) {
263
+ results.set(page.url, {
264
+ content: page.markdown || page.content,
265
+ markdown: page.markdown,
266
+ metadata: page.metadata,
267
+ });
268
+ }
269
+ });
270
+
271
+ elizaLogger.info(`[Firecrawl] Crawl completed: ${results.size} pages extracted`);
272
+ return results;
273
+ } else if (status === 'failed') {
274
+ elizaLogger.error('[Firecrawl] Crawl job failed');
275
+ return new Map();
276
+ }
277
+
278
+ // Still processing
279
+ await new Promise(resolve => setTimeout(resolve, pollInterval));
280
+ } catch (error) {
281
+ elizaLogger.error('[Firecrawl] Error polling crawl job:', error);
282
+ return new Map();
283
+ }
284
+ }
285
+
286
+ elizaLogger.error('[Firecrawl] Crawl job timeout');
287
+ return new Map();
288
+ }
289
+
290
+ private getFormats(): string[] {
291
+ const formats: string[] = ['text'];
292
+ if (this.config.includeMarkdown) formats.push('markdown');
293
+ if (this.config.includeHtml) formats.push('html');
294
+ return formats;
295
+ }
296
+
297
+ // Get current API usage
298
+ async getUsage(): Promise<{ used: number; limit: number; remaining: number } | null> {
299
+ try {
300
+ const response = await axios.get(`${this.baseUrl}/usage`, {
301
+ headers: { 'Authorization': `Bearer ${this.apiKey}` },
302
+ });
303
+
304
+ return {
305
+ used: response.data.used || 0,
306
+ limit: response.data.limit || 0,
307
+ remaining: response.data.remaining || 0,
308
+ };
309
+ } catch (error) {
310
+ elizaLogger.warn('[Firecrawl] Could not fetch usage data');
311
+ return null;
312
+ }
313
+ }
314
+ }
@@ -0,0 +1,350 @@
1
+ import { elizaLogger } from '@elizaos/core';
2
+ import axios from 'axios';
3
+ import { ExtractedContent } from './firecrawl';
4
+
5
+ // Dynamic import to avoid pdf-parse test code execution
6
+ let pdfParse: any;
7
+ const loadPdfParse = async () => {
8
+ if (!pdfParse) {
9
+ try {
10
+ pdfParse = (await import('pdf-parse')).default;
11
+ } catch (error) {
12
+ elizaLogger.warn('[PDFExtractor] pdf-parse not available, PDF extraction disabled');
13
+ }
14
+ }
15
+ return pdfParse;
16
+ };
17
+
18
/**
 * Document-level metadata pulled from the PDF info dictionary
 * (populated by PDFExtractor.extractMetadata).
 */
export interface PDFMetadata {
  title?: string;
  /** Author names; pdf-parse exposes a single Author string, wrapped in an array here. */
  author?: string[];
  subject?: string;
  /** Keywords split on ',' or ';' from the raw Keywords string. */
  keywords?: string[];
  creator?: string;
  producer?: string;
  creationDate?: Date;
  modificationDate?: Date;
  // NOTE(review): never populated by extractMetadata in this file — page
  // count is written to ExtractedContent.metadata.pageCount instead.
  pages?: number;
}
29
+
30
/**
 * Sections heuristically recognized in an academic paper's plain text.
 * All fields are optional: only the sections matched by the regex
 * heuristics in PDFExtractor.extractAcademicStructure are populated.
 */
export interface AcademicPaperStructure {
  abstract?: string;
  introduction?: string;
  methodology?: string;
  results?: string;
  discussion?: string;
  conclusion?: string;
  references?: Reference[];
  // NOTE(review): figures/tables are declared but never populated by the
  // extractor in this file — confirm before relying on them.
  figures?: Figure[];
  tables?: Table[];
}
41
+
42
/** A single bibliographic reference parsed from a paper's references section. */
export interface Reference {
  /** Synthetic unique id (timestamp plus random suffix). */
  id: string;
  /** Raw reference text as found in the document. */
  text: string;
  /** Author names, best-effort parse of the leading author list. */
  authors?: string[];
  title?: string;
  /** Publication year, when a 4-digit year was found. */
  year?: number;
  journal?: string;
  /** DOI identifier, when a doi: or doi.org link was found. */
  doi?: string;
  url?: string;
}
52
+
53
/**
 * A figure within a paper.
 * NOTE(review): declared for the AcademicPaperStructure contract but never
 * populated by the extraction code in this file.
 */
export interface Figure {
  id: string;
  caption: string;
  pageNumber: number;
  /** Passages that mention the figure — presumably; not populated here. */
  mentioned: string[];
}
59
+
60
/**
 * A table within a paper.
 * NOTE(review): declared for the AcademicPaperStructure contract but never
 * populated by the extraction code in this file.
 */
export interface Table {
  id: string;
  caption: string;
  headers?: string[];
  rows?: string[][];
  pageNumber: number;
}
67
+
68
+ export class PDFExtractor {
69
+ private readonly maxFileSize = 50 * 1024 * 1024; // 50MB limit
70
+
71
+ async extractFromURL(url: string): Promise<ExtractedContent | null> {
72
+ try {
73
+ elizaLogger.info(`[PDFExtractor] Downloading PDF from: ${url}`);
74
+
75
+ // Download PDF
76
+ const response = await axios.get(url, {
77
+ responseType: 'arraybuffer',
78
+ timeout: 30000,
79
+ maxContentLength: this.maxFileSize,
80
+ headers: {
81
+ 'User-Agent': 'Mozilla/5.0 (compatible; ElizaOS/1.0)',
82
+ },
83
+ });
84
+
85
+ const buffer = Buffer.from(response.data);
86
+ return await this.extractFromBuffer(buffer, url);
87
+ } catch (error) {
88
+ elizaLogger.error('[PDFExtractor] Failed to download PDF:', error);
89
+ return null;
90
+ }
91
+ }
92
+
93
+ async extractFromBuffer(buffer: Buffer, sourceUrl?: string): Promise<ExtractedContent | null> {
94
+ try {
95
+ elizaLogger.info('[PDFExtractor] Parsing PDF buffer');
96
+
97
+ const parser = await loadPdfParse();
98
+ if (!parser) {
99
+ elizaLogger.warn('[PDFExtractor] PDF parser not available');
100
+ return null;
101
+ }
102
+
103
+ const data = await parser(buffer);
104
+
105
+ // Extract metadata
106
+ const metadata = this.extractMetadata(data.info);
107
+
108
+ // Extract text content
109
+ const text = data.text;
110
+
111
+ // Extract academic structure
112
+ const structure = this.extractAcademicStructure(text);
113
+
114
+ // Extract references
115
+ const references = this.extractReferences(text);
116
+
117
+ // Format as markdown
118
+ const markdown = this.formatAsMarkdown(structure, metadata, references);
119
+
120
+ return {
121
+ content: text,
122
+ markdown,
123
+ metadata: {
124
+ ...metadata,
125
+ pageCount: data.numpages,
126
+ textLength: text.length,
127
+ sourceUrl,
128
+ },
129
+ };
130
+ } catch (error) {
131
+ elizaLogger.error('[PDFExtractor] Failed to parse PDF:', error);
132
+ return null;
133
+ }
134
+ }
135
+
136
+ private extractMetadata(info: any): PDFMetadata {
137
+ return {
138
+ title: info.Title,
139
+ author: info.Author ? [info.Author] : undefined,
140
+ subject: info.Subject,
141
+ keywords: info.Keywords ? info.Keywords.split(/[,;]/).map((k: string) => k.trim()) : undefined,
142
+ creator: info.Creator,
143
+ producer: info.Producer,
144
+ creationDate: info.CreationDate ? new Date(info.CreationDate) : undefined,
145
+ modificationDate: info.ModDate ? new Date(info.ModDate) : undefined,
146
+ };
147
+ }
148
+
149
+ private extractAcademicStructure(text: string): AcademicPaperStructure {
150
+ const structure: AcademicPaperStructure = {};
151
+
152
+ // Extract abstract
153
+ const abstractMatch = text.match(/abstract[:\s]*\n([\s\S]*?)(?=\n\s*(?:introduction|keywords|1\.|i\.))/i);
154
+ if (abstractMatch) {
155
+ structure.abstract = this.cleanText(abstractMatch[1]);
156
+ }
157
+
158
+ // Extract introduction
159
+ const introMatch = text.match(/(?:1\.|i\.|\n)\s*introduction[:\s]*\n([\s\S]*?)(?=\n\s*(?:2\.|ii\.|method|related))/i);
160
+ if (introMatch) {
161
+ structure.introduction = this.cleanText(introMatch[1]);
162
+ }
163
+
164
+ // Extract methodology
165
+ const methodMatch = text.match(/(?:method|methodology|approach)[:\s]*\n([\s\S]*?)(?=\n\s*(?:\d\.|results|experiment))/i);
166
+ if (methodMatch) {
167
+ structure.methodology = this.cleanText(methodMatch[1]);
168
+ }
169
+
170
+ // Extract results
171
+ const resultsMatch = text.match(/(?:results|findings|experiments)[:\s]*\n([\s\S]*?)(?=\n\s*(?:discussion|conclusion|\d\.))/i);
172
+ if (resultsMatch) {
173
+ structure.results = this.cleanText(resultsMatch[1]);
174
+ }
175
+
176
+ // Extract discussion
177
+ const discussionMatch = text.match(/discussion[:\s]*\n([\s\S]*?)(?=\n\s*(?:conclusion|acknowledgment|references))/i);
178
+ if (discussionMatch) {
179
+ structure.discussion = this.cleanText(discussionMatch[1]);
180
+ }
181
+
182
+ // Extract conclusion
183
+ const conclusionMatch = text.match(/conclusion[:\s]*\n([\s\S]*?)(?=\n\s*(?:references|acknowledgment|appendix))/i);
184
+ if (conclusionMatch) {
185
+ structure.conclusion = this.cleanText(conclusionMatch[1]);
186
+ }
187
+
188
+ return structure;
189
+ }
190
+
191
+ private extractReferences(text: string): Reference[] {
192
+ const references: Reference[] = [];
193
+
194
+ // Find references section
195
+ const refMatch = text.match(/(?:references|bibliography)[:\s]*\n([\s\S]*?)(?=\n\s*(?:appendix|$))/i);
196
+ if (!refMatch) return references;
197
+
198
+ const refText = refMatch[1];
199
+
200
+ // Split into individual references (numbered or bulleted)
201
+ const refLines = refText.split(/\n(?:\[\d+\]|\d+\.|\•)/);
202
+
203
+ for (const line of refLines) {
204
+ if (line.trim().length < 20) continue; // Skip short lines
205
+
206
+ const ref = this.parseReference(line.trim());
207
+ if (ref) references.push(ref);
208
+ }
209
+
210
+ return references;
211
+ }
212
+
213
+ private parseReference(text: string): Reference | null {
214
+ const ref: Reference = {
215
+ id: `ref-${Date.now()}-${Math.random().toString(36).substr(2, 9)}`,
216
+ text: text,
217
+ };
218
+
219
+ // Extract DOI
220
+ const doiMatch = text.match(/(?:doi:|https?:\/\/doi\.org\/)(10\.\d{4,}\/[-._;()\/:a-zA-Z0-9]+)/i);
221
+ if (doiMatch) {
222
+ ref.doi = doiMatch[1];
223
+ }
224
+
225
+ // Extract year (look for 4-digit year in parentheses or after comma)
226
+ const yearMatch = text.match(/\((\d{4})\)|,\s*(\d{4})/);
227
+ if (yearMatch) {
228
+ ref.year = parseInt(yearMatch[1] || yearMatch[2]);
229
+ }
230
+
231
+ // Extract authors (before year or first period)
232
+ const authorsMatch = text.match(/^([^.(]+?)(?:\s*\(\d{4}\)|\.)/);
233
+ if (authorsMatch) {
234
+ ref.authors = authorsMatch[1].split(/,|&|and/).map(a => a.trim());
235
+ }
236
+
237
+ // Extract title (usually in quotes or after year)
238
+ const titleMatch = text.match(/"([^"]+)"|['"]([^'"]+)['"]|\d{4}\)\s*\.?\s*([^.]+)\./);
239
+ if (titleMatch) {
240
+ ref.title = titleMatch[1] || titleMatch[2] || titleMatch[3];
241
+ }
242
+
243
+ // Extract journal (often in italics or after "In")
244
+ const journalMatch = text.match(/(?:In\s+|[.,]\s*)([A-Z][^,.(]+(?:Journal|Conference|Proceedings|Review)[^,.]*)/);
245
+ if (journalMatch) {
246
+ ref.journal = journalMatch[1].trim();
247
+ }
248
+
249
+ return ref;
250
+ }
251
+
252
+ private cleanText(text: string): string {
253
+ return text
254
+ .replace(/\s+/g, ' ')
255
+ .replace(/\n{3,}/g, '\n\n')
256
+ .trim();
257
+ }
258
+
259
+ private formatAsMarkdown(
260
+ structure: AcademicPaperStructure,
261
+ metadata: PDFMetadata,
262
+ references: Reference[]
263
+ ): string {
264
+ const sections: string[] = [];
265
+
266
+ // Title and metadata
267
+ if (metadata.title) {
268
+ sections.push(`# ${metadata.title}\n`);
269
+ }
270
+
271
+ if (metadata.author?.length) {
272
+ sections.push(`**Authors:** ${metadata.author.join(', ')}\n`);
273
+ }
274
+
275
+ // Abstract
276
+ if (structure.abstract) {
277
+ sections.push(`## Abstract\n\n${structure.abstract}\n`);
278
+ }
279
+
280
+ // Main sections
281
+ if (structure.introduction) {
282
+ sections.push(`## Introduction\n\n${structure.introduction}\n`);
283
+ }
284
+
285
+ if (structure.methodology) {
286
+ sections.push(`## Methodology\n\n${structure.methodology}\n`);
287
+ }
288
+
289
+ if (structure.results) {
290
+ sections.push(`## Results\n\n${structure.results}\n`);
291
+ }
292
+
293
+ if (structure.discussion) {
294
+ sections.push(`## Discussion\n\n${structure.discussion}\n`);
295
+ }
296
+
297
+ if (structure.conclusion) {
298
+ sections.push(`## Conclusion\n\n${structure.conclusion}\n`);
299
+ }
300
+
301
+ // References
302
+ if (references.length > 0) {
303
+ sections.push(`## References\n`);
304
+ for (const ref of references) {
305
+ const parts = [];
306
+ if (ref.authors?.length) parts.push(ref.authors.join(', '));
307
+ if (ref.year) parts.push(`(${ref.year})`);
308
+ if (ref.title) parts.push(`"${ref.title}"`);
309
+ if (ref.journal) parts.push(ref.journal);
310
+ if (ref.doi) parts.push(`DOI: ${ref.doi}`);
311
+
312
+ sections.push(`- ${parts.join('. ')}\n`);
313
+ }
314
+ }
315
+
316
+ return sections.join('\n');
317
+ }
318
+
319
+ // Extract specific sections for targeted analysis
320
+ async extractSection(buffer: Buffer, sectionName: string): Promise<string | null> {
321
+ try {
322
+ const parser = await loadPdfParse();
323
+ if (!parser) {
324
+ elizaLogger.warn('[PDFExtractor] PDF parser not available');
325
+ return null;
326
+ }
327
+
328
+ const data = await parser(buffer);
329
+ const text = data.text;
330
+
331
+ const sectionRegex = new RegExp(
332
+ `${sectionName}[:\\s]*\\n([\\s\\S]*?)(?=\\n\\s*(?:\\d+\\.|[A-Z][^\\n]*:|References|$))`,
333
+ 'i'
334
+ );
335
+
336
+ const match = text.match(sectionRegex);
337
+ return match ? this.cleanText(match[1]) : null;
338
+ } catch (error) {
339
+ elizaLogger.error(`[PDFExtractor] Failed to extract section ${sectionName}:`, error);
340
+ return null;
341
+ }
342
+ }
343
+
344
+ // Check if URL points to a PDF
345
+ static isPDFUrl(url: string): boolean {
346
+ return url.toLowerCase().endsWith('.pdf') ||
347
+ url.includes('pdf') ||
348
+ url.includes('arxiv.org/pdf/');
349
+ }
350
+ }