@elizaos/plugin-research 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +400 -0
- package/dist/index.cjs +9366 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.js +9284 -0
- package/dist/index.js.map +1 -0
- package/package.json +80 -0
- package/src/__tests__/action-chaining.test.ts +532 -0
- package/src/__tests__/actions.test.ts +118 -0
- package/src/__tests__/cache-rate-limiter.test.ts +303 -0
- package/src/__tests__/content-extractors.test.ts +26 -0
- package/src/__tests__/deepresearch-bench-integration.test.ts +520 -0
- package/src/__tests__/deepresearch-bench-simplified.e2e.test.ts +290 -0
- package/src/__tests__/deepresearch-bench.e2e.test.ts +376 -0
- package/src/__tests__/e2e.test.ts +1870 -0
- package/src/__tests__/multi-benchmark-runner.ts +427 -0
- package/src/__tests__/providers.test.ts +156 -0
- package/src/__tests__/real-world.e2e.test.ts +788 -0
- package/src/__tests__/research-scenarios.test.ts +755 -0
- package/src/__tests__/research.e2e.test.ts +704 -0
- package/src/__tests__/research.test.ts +174 -0
- package/src/__tests__/search-providers.test.ts +174 -0
- package/src/__tests__/single-benchmark-runner.ts +735 -0
- package/src/__tests__/test-search-providers.ts +171 -0
- package/src/__tests__/verify-apis.test.ts +82 -0
- package/src/actions.ts +1677 -0
- package/src/benchmark/deepresearch-benchmark.ts +369 -0
- package/src/evaluation/research-evaluator.ts +444 -0
- package/src/examples/api-integration.md +498 -0
- package/src/examples/browserbase-integration.md +132 -0
- package/src/examples/debug-research-query.ts +162 -0
- package/src/examples/defi-code-scenarios.md +536 -0
- package/src/examples/defi-implementation-guide.md +454 -0
- package/src/examples/eliza-research-example.ts +142 -0
- package/src/examples/fix-renewable-energy-research.ts +209 -0
- package/src/examples/research-scenarios.md +408 -0
- package/src/examples/run-complete-renewable-research.ts +303 -0
- package/src/examples/run-deep-research.ts +352 -0
- package/src/examples/run-logged-research.ts +304 -0
- package/src/examples/run-real-research.ts +151 -0
- package/src/examples/save-research-output.ts +133 -0
- package/src/examples/test-file-logging.ts +199 -0
- package/src/examples/test-real-research.ts +67 -0
- package/src/examples/test-renewable-energy-research.ts +229 -0
- package/src/index.ts +28 -0
- package/src/integrations/cache.ts +128 -0
- package/src/integrations/content-extractors/firecrawl.ts +314 -0
- package/src/integrations/content-extractors/pdf-extractor.ts +350 -0
- package/src/integrations/content-extractors/playwright.ts +420 -0
- package/src/integrations/factory.ts +419 -0
- package/src/integrations/index.ts +18 -0
- package/src/integrations/rate-limiter.ts +181 -0
- package/src/integrations/search-providers/academic.ts +290 -0
- package/src/integrations/search-providers/exa.ts +205 -0
- package/src/integrations/search-providers/npm.ts +330 -0
- package/src/integrations/search-providers/pypi.ts +211 -0
- package/src/integrations/search-providers/serpapi.ts +277 -0
- package/src/integrations/search-providers/serper.ts +358 -0
- package/src/integrations/search-providers/stagehand-google.ts +87 -0
- package/src/integrations/search-providers/tavily.ts +187 -0
- package/src/processing/relevance-analyzer.ts +353 -0
- package/src/processing/research-logger.ts +450 -0
- package/src/processing/result-processor.ts +372 -0
- package/src/prompts/research-prompts.ts +419 -0
- package/src/providers/cacheProvider.ts +164 -0
- package/src/providers.ts +173 -0
- package/src/service.ts +2588 -0
- package/src/services/swe-bench.ts +286 -0
- package/src/strategies/research-strategies.ts +790 -0
- package/src/types/pdf-parse.d.ts +34 -0
- package/src/types.ts +551 -0
- package/src/verification/claim-verifier.ts +443 -0
|
@@ -0,0 +1,314 @@
|
|
|
1
|
+
import axios, { AxiosError } from 'axios';
|
|
2
|
+
import { elizaLogger } from '@elizaos/core';
|
|
3
|
+
import { z } from 'zod';
|
|
4
|
+
|
|
5
|
+
// Firecrawl API response schema.
// Runtime (zod) validation of the /scrape endpoint payload. Every field under
// `data` is optional because the API only returns the formats that were
// requested, and `data` itself is absent on failure (success: false + error).
const FirecrawlResponseSchema = z.object({
  success: z.boolean(),
  data: z.object({
    content: z.string().optional(),   // plain-text extraction
    markdown: z.string().optional(),  // markdown rendering (preferred downstream)
    html: z.string().optional(),      // raw/cleaned HTML, only when requested
    // Page metadata scraped from <head> / OpenGraph tags.
    metadata: z.object({
      title: z.string().optional(),
      description: z.string().optional(),
      language: z.string().optional(),
      keywords: z.string().optional(),
      robots: z.string().optional(),
      ogTitle: z.string().optional(),
      ogDescription: z.string().optional(),
      ogImage: z.string().optional(),
      ogUrl: z.string().optional(),
      canonical: z.string().optional(),
    }).optional(),
    links: z.array(z.string()).optional(),   // outbound links found on the page
    images: z.array(z.string()).optional(),  // image URLs found on the page
  }).optional(),
  error: z.string().optional(),  // populated when success === false
});
|
|
29
|
+
|
|
30
|
+
/**
 * Configuration for {@link FirecrawlContentExtractor}.
 * All fields except `apiKey` are optional; see the constructor for defaults.
 */
export interface FirecrawlConfig {
  /** Firecrawl API key (required; constructor throws without it). */
  apiKey: string;
  /** API base URL; defaults to the hosted v0 endpoint. */
  baseUrl?: string;
  /** Per-request timeout in milliseconds (default 60000). */
  timeout?: number;
  /** Include raw HTML in results (default false). */
  includeHtml?: boolean;
  /** Request markdown format (default true). */
  includeMarkdown?: boolean;
  /** Include page metadata (default true). */
  includeMetadata?: boolean;
  /** Include discovered links (default true). */
  includeLinks?: boolean;
  /** Milliseconds Firecrawl should wait for the page to settle (default 0). */
  waitFor?: number;
  /** Request a screenshot of the page (default false). */
  screenshot?: boolean;
}
|
|
41
|
+
|
|
42
|
+
/**
 * Normalized result of a content extraction (Firecrawl, PDF, etc.).
 * `content` is always present; the remaining fields depend on the source
 * and on which formats were requested.
 */
export interface ExtractedContent {
  /** Primary text body (markdown when available, otherwise plain text). */
  content: string;
  /** Markdown rendering, when the extractor produced one. */
  markdown?: string;
  /** Raw HTML, only when explicitly requested via config. */
  html?: string;
  /** Source metadata; open-ended because different extractors attach extra keys. */
  metadata?: {
    title?: string;
    description?: string;
    language?: string;
    ogImage?: string;
    [key: string]: any;
  };
  /** Outbound links discovered in the document. */
  links?: string[];
  /** Image URLs discovered in the document. */
  images?: string[];
}
|
|
56
|
+
|
|
57
|
+
/**
 * Content extractor backed by the Firecrawl HTTP API.
 *
 * Supports single-page scraping, batched multi-URL extraction, and
 * asynchronous site crawls (submit job, then poll for completion).
 * Single-page extraction returns null on most failures but THROWS on
 * auth/quota/timeout errors (401/429/402/ECONNABORTED) so callers can react.
 */
export class FirecrawlContentExtractor {
  private readonly apiKey: string;
  private readonly baseUrl: string;
  private readonly config: FirecrawlConfig;

  /**
   * @param config - Must contain a non-empty `apiKey`; other fields override
   *                 the defaults applied below.
   * @throws Error when `apiKey` is missing.
   */
  constructor(config: FirecrawlConfig) {
    if (!config.apiKey) {
      throw new Error('Firecrawl API key is required');
    }
    this.apiKey = config.apiKey;
    this.baseUrl = config.baseUrl || 'https://api.firecrawl.dev/v0';
    // Defaults first, then caller overrides via spread.
    this.config = {
      timeout: 60000, // 60 seconds default
      includeHtml: false,
      includeMarkdown: true,
      includeMetadata: true,
      includeLinks: true,
      waitFor: 0,
      screenshot: false,
      ...config,
    };
  }

  /**
   * Scrape a single URL via POST /scrape.
   *
   * @param url - Page to extract.
   * @returns Extracted content, or null when the API reports failure,
   *          returns no content, or an unclassified error occurs.
   * @throws Error on 401 (bad key), 429 (rate limit), 402 (quota), or timeout.
   */
  async extractContent(url: string): Promise<ExtractedContent | null> {
    const startTime = Date.now();

    try {
      elizaLogger.info(`[Firecrawl] Extracting content from: ${url}`);

      const response = await axios.post(
        `${this.baseUrl}/scrape`,
        {
          url,
          formats: this.getFormats(),
          waitFor: this.config.waitFor,
          screenshot: this.config.screenshot,
          // Restrict extraction to content-bearing tags and drop chrome.
          includeTags: ['main', 'article', 'section', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'code', 'pre'],
          excludeTags: ['nav', 'footer', 'aside', 'script', 'style'],
        },
        {
          headers: {
            'Authorization': `Bearer ${this.apiKey}`,
            'Content-Type': 'application/json',
          },
          timeout: this.config.timeout,
        }
      );

      // Validate response shape at runtime before trusting any field.
      const validatedData = FirecrawlResponseSchema.parse(response.data);

      if (!validatedData.success || !validatedData.data) {
        elizaLogger.error(`[Firecrawl] Failed to extract content: ${validatedData.error}`);
        return null;
      }

      const data = validatedData.data;
      // Prefer markdown; fall back to plain text.
      const content = data.markdown || data.content || '';

      if (!content) {
        elizaLogger.warn(`[Firecrawl] No content extracted from ${url}`);
        return null;
      }

      const duration = Date.now() - startTime;
      elizaLogger.info(`[Firecrawl] Content extracted in ${duration}ms (${content.length} characters)`);

      // NOTE(review): html is gated on includeHtml, but metadata/links are
      // returned regardless of includeMetadata/includeLinks — confirm whether
      // those flags were intended to filter here.
      return {
        content,
        markdown: data.markdown,
        html: this.config.includeHtml ? data.html : undefined,
        metadata: data.metadata,
        links: data.links,
        images: data.images,
      };
    } catch (error) {
      const duration = Date.now() - startTime;

      if (axios.isAxiosError(error)) {
        const axiosError = error as AxiosError;

        // Handle specific error cases — these rethrow so callers can
        // distinguish auth/quota problems from ordinary extraction misses.
        if (axiosError.response?.status === 401) {
          elizaLogger.error('[Firecrawl] Invalid API key');
          throw new Error('Invalid Firecrawl API key');
        } else if (axiosError.response?.status === 429) {
          elizaLogger.error('[Firecrawl] Rate limit exceeded');
          throw new Error('Firecrawl rate limit exceeded');
        } else if (axiosError.response?.status === 402) {
          elizaLogger.error('[Firecrawl] Payment required - check your plan');
          throw new Error('Firecrawl payment required');
        } else if (axiosError.code === 'ECONNABORTED') {
          elizaLogger.error(`[Firecrawl] Request timeout after ${duration}ms`);
          throw new Error('Firecrawl extraction timeout');
        }

        // Any other HTTP error: log and fall through to return null.
        elizaLogger.error(`[Firecrawl] API error: ${axiosError.message}`, {
          status: axiosError.response?.status,
          data: axiosError.response?.data,
        });
      } else if (error instanceof z.ZodError) {
        elizaLogger.error('[Firecrawl] Invalid response format:', error.issues);
      } else {
        elizaLogger.error('[Firecrawl] Unknown error:', error);
      }

      return null;
    }
  }

  /**
   * Extract many URLs, 5 at a time with a 1s pause between batches to stay
   * under rate limits. Per-URL failures (including the throws from
   * extractContent) are caught and recorded as null entries.
   *
   * @returns Map of url -> extracted content (or null for that URL).
   */
  async extractBatch(urls: string[]): Promise<Map<string, ExtractedContent | null>> {
    elizaLogger.info(`[Firecrawl] Extracting content from ${urls.length} URLs`);

    const results = new Map<string, ExtractedContent | null>();

    // Process in batches to avoid rate limits
    const batchSize = 5;
    for (let i = 0; i < urls.length; i += batchSize) {
      const batch = urls.slice(i, i + batchSize);
      const batchPromises = batch.map(url =>
        this.extractContent(url)
          .then(content => ({ url, content }))
          .catch(error => {
            elizaLogger.error(`[Firecrawl] Failed to extract ${url}:`, error);
            return { url, content: null };
          })
      );

      const batchResults = await Promise.all(batchPromises);
      batchResults.forEach(({ url, content }) => {
        results.set(url, content);
      });

      // Add delay between batches to respect rate limits
      if (i + batchSize < urls.length) {
        await new Promise(resolve => setTimeout(resolve, 1000));
      }
    }

    return results;
  }

  /**
   * Kick off a site crawl via POST /crawl and poll the resulting job until
   * it completes, fails, or times out.
   *
   * @param startUrl - Crawl entry point.
   * @param maxPages - Server-side crawl limit (maxCrawledLinks), default 50.
   * @returns Map of url -> content; empty map on any failure.
   */
  async crawlSite(startUrl: string, maxPages: number = 50): Promise<Map<string, ExtractedContent | null>> {
    try {
      elizaLogger.info(`[Firecrawl] Starting site crawl from: ${startUrl}`);

      const response = await axios.post(
        `${this.baseUrl}/crawl`,
        {
          url: startUrl,
          crawlerOptions: {
            maxCrawledLinks: maxPages,
            includes: [], // Add patterns to include
            excludes: ['*/tag/*', '*/category/*', '*/page/*'], // Common pagination patterns
          },
          pageOptions: {
            includeMarkdown: true,
            includeHtml: false,
          },
        },
        {
          headers: {
            'Authorization': `Bearer ${this.apiKey}`,
            'Content-Type': 'application/json',
          },
          timeout: (this.config.timeout || 60000) * 2, // Longer timeout for crawls
        }
      );

      // Crawls are asynchronous: the API returns a job id to poll.
      const jobId = response.data?.jobId;

      if (!jobId) {
        elizaLogger.error('[Firecrawl] No job ID returned from crawl request');
        return new Map();
      }

      // Poll for completion
      return await this.pollCrawlJob(jobId, maxPages);
    } catch (error) {
      elizaLogger.error('[Firecrawl] Site crawl error:', error);
      return new Map();
    }
  }

  /**
   * Poll GET /crawl/status/:jobId every 5s for up to 5 minutes.
   * Returns an empty map on failure, polling error, or timeout.
   *
   * NOTE(review): `maxPages` is accepted but never used here — the page cap
   * is enforced server-side via crawlerOptions.maxCrawledLinks; confirm
   * whether a client-side cap was also intended.
   */
  private async pollCrawlJob(jobId: string, maxPages: number): Promise<Map<string, ExtractedContent | null>> {
    const maxAttempts = 60; // 5 minutes max
    const pollInterval = 5000; // 5 seconds

    for (let attempt = 0; attempt < maxAttempts; attempt++) {
      try {
        const response = await axios.get(
          `${this.baseUrl}/crawl/status/${jobId}`,
          {
            headers: {
              'Authorization': `Bearer ${this.apiKey}`,
            },
          }
        );

        const { status, data } = response.data;

        if (status === 'completed' && data) {
          const results = new Map<string, ExtractedContent | null>();

          // Keep only pages that actually produced text.
          data.forEach((page: any) => {
            if (page.markdown || page.content) {
              results.set(page.url, {
                content: page.markdown || page.content,
                markdown: page.markdown,
                metadata: page.metadata,
              });
            }
          });

          elizaLogger.info(`[Firecrawl] Crawl completed: ${results.size} pages extracted`);
          return results;
        } else if (status === 'failed') {
          elizaLogger.error('[Firecrawl] Crawl job failed');
          return new Map();
        }

        // Still processing
        await new Promise(resolve => setTimeout(resolve, pollInterval));
      } catch (error) {
        // A single polling error aborts the whole wait (no retry).
        elizaLogger.error('[Firecrawl] Error polling crawl job:', error);
        return new Map();
      }
    }

    elizaLogger.error('[Firecrawl] Crawl job timeout');
    return new Map();
  }

  /** Build the `formats` list for /scrape from config flags; 'text' is always requested. */
  private getFormats(): string[] {
    const formats: string[] = ['text'];
    if (this.config.includeMarkdown) formats.push('markdown');
    if (this.config.includeHtml) formats.push('html');
    return formats;
  }

  // Get current API usage
  /**
   * Fetch credit usage from GET /usage.
   * @returns Usage counters (missing fields default to 0), or null on any error.
   */
  async getUsage(): Promise<{ used: number; limit: number; remaining: number } | null> {
    try {
      const response = await axios.get(`${this.baseUrl}/usage`, {
        headers: { 'Authorization': `Bearer ${this.apiKey}` },
      });

      return {
        used: response.data.used || 0,
        limit: response.data.limit || 0,
        remaining: response.data.remaining || 0,
      };
    } catch (error) {
      elizaLogger.warn('[Firecrawl] Could not fetch usage data');
      return null;
    }
  }
}
|
|
@@ -0,0 +1,350 @@
|
|
|
1
|
+
import { elizaLogger } from '@elizaos/core';
|
|
2
|
+
import axios from 'axios';
|
|
3
|
+
import { ExtractedContent } from './firecrawl';
|
|
4
|
+
|
|
5
|
+
// Dynamic import to avoid pdf-parse test code execution
|
|
6
|
+
let pdfParse: any;
|
|
7
|
+
const loadPdfParse = async () => {
|
|
8
|
+
if (!pdfParse) {
|
|
9
|
+
try {
|
|
10
|
+
pdfParse = (await import('pdf-parse')).default;
|
|
11
|
+
} catch (error) {
|
|
12
|
+
elizaLogger.warn('[PDFExtractor] pdf-parse not available, PDF extraction disabled');
|
|
13
|
+
}
|
|
14
|
+
}
|
|
15
|
+
return pdfParse;
|
|
16
|
+
};
|
|
17
|
+
|
|
18
|
+
/**
 * Document-level metadata recovered from a PDF's info dictionary
 * (mapped from pdf-parse's `info` object).
 */
export interface PDFMetadata {
  /** Document title (PDF /Title). */
  title?: string;
  /** Author names; the PDF stores a single string, wrapped in an array here. */
  author?: string[];
  /** Document subject (PDF /Subject). */
  subject?: string;
  /** Keywords split from the PDF's comma/semicolon-separated string. */
  keywords?: string[];
  /** Application that created the original document (PDF /Creator). */
  creator?: string;
  /** Application that produced the PDF (PDF /Producer). */
  producer?: string;
  /** Creation timestamp (PDF /CreationDate). */
  creationDate?: Date;
  /** Last-modification timestamp (PDF /ModDate). */
  modificationDate?: Date;
  /** Total page count. */
  pages?: number;
}
|
|
29
|
+
|
|
30
|
+
/**
 * Sections of an academic paper recovered heuristically from its plain text.
 * All fields are optional: any section the regex heuristics fail to locate
 * is simply absent.
 */
export interface AcademicPaperStructure {
  abstract?: string;
  introduction?: string;
  methodology?: string;
  results?: string;
  discussion?: string;
  conclusion?: string;
  /** Parsed bibliography entries. */
  references?: Reference[];
  /** Figure captions and mentions (not currently populated by PDFExtractor). */
  figures?: Figure[];
  /** Table captions/contents (not currently populated by PDFExtractor). */
  tables?: Table[];
}
|
|
41
|
+
|
|
42
|
+
/**
 * A single bibliography entry parsed from a paper's references section.
 * Only `id` and the raw `text` are guaranteed; the structured fields are
 * best-effort regex extractions.
 */
export interface Reference {
  /** Synthetic unique identifier generated at parse time. */
  id: string;
  /** The raw reference text as it appeared in the document. */
  text: string;
  authors?: string[];
  title?: string;
  /** Publication year (4-digit). */
  year?: number;
  /** Journal / conference / proceedings name. */
  journal?: string;
  /** DOI without the resolver prefix (e.g. "10.1234/abcd"). */
  doi?: string;
  url?: string;
}
|
|
52
|
+
|
|
53
|
+
/** A figure referenced in a paper: caption, location, and where it is mentioned. */
export interface Figure {
  id: string;
  caption: string;
  /** 1-based page the figure appears on. */
  pageNumber: number;
  /** Text snippets elsewhere in the paper that mention this figure. */
  mentioned: string[];
}
|
|
59
|
+
|
|
60
|
+
/** A table extracted from a paper; headers/rows are optional because
 *  layout recovery from PDF text is unreliable. */
export interface Table {
  id: string;
  caption: string;
  headers?: string[];
  /** Row-major cell contents. */
  rows?: string[][];
  /** 1-based page the table appears on. */
  pageNumber: number;
}
|
|
67
|
+
|
|
68
|
+
export class PDFExtractor {
|
|
69
|
+
private readonly maxFileSize = 50 * 1024 * 1024; // 50MB limit
|
|
70
|
+
|
|
71
|
+
async extractFromURL(url: string): Promise<ExtractedContent | null> {
|
|
72
|
+
try {
|
|
73
|
+
elizaLogger.info(`[PDFExtractor] Downloading PDF from: ${url}`);
|
|
74
|
+
|
|
75
|
+
// Download PDF
|
|
76
|
+
const response = await axios.get(url, {
|
|
77
|
+
responseType: 'arraybuffer',
|
|
78
|
+
timeout: 30000,
|
|
79
|
+
maxContentLength: this.maxFileSize,
|
|
80
|
+
headers: {
|
|
81
|
+
'User-Agent': 'Mozilla/5.0 (compatible; ElizaOS/1.0)',
|
|
82
|
+
},
|
|
83
|
+
});
|
|
84
|
+
|
|
85
|
+
const buffer = Buffer.from(response.data);
|
|
86
|
+
return await this.extractFromBuffer(buffer, url);
|
|
87
|
+
} catch (error) {
|
|
88
|
+
elizaLogger.error('[PDFExtractor] Failed to download PDF:', error);
|
|
89
|
+
return null;
|
|
90
|
+
}
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
async extractFromBuffer(buffer: Buffer, sourceUrl?: string): Promise<ExtractedContent | null> {
|
|
94
|
+
try {
|
|
95
|
+
elizaLogger.info('[PDFExtractor] Parsing PDF buffer');
|
|
96
|
+
|
|
97
|
+
const parser = await loadPdfParse();
|
|
98
|
+
if (!parser) {
|
|
99
|
+
elizaLogger.warn('[PDFExtractor] PDF parser not available');
|
|
100
|
+
return null;
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
const data = await parser(buffer);
|
|
104
|
+
|
|
105
|
+
// Extract metadata
|
|
106
|
+
const metadata = this.extractMetadata(data.info);
|
|
107
|
+
|
|
108
|
+
// Extract text content
|
|
109
|
+
const text = data.text;
|
|
110
|
+
|
|
111
|
+
// Extract academic structure
|
|
112
|
+
const structure = this.extractAcademicStructure(text);
|
|
113
|
+
|
|
114
|
+
// Extract references
|
|
115
|
+
const references = this.extractReferences(text);
|
|
116
|
+
|
|
117
|
+
// Format as markdown
|
|
118
|
+
const markdown = this.formatAsMarkdown(structure, metadata, references);
|
|
119
|
+
|
|
120
|
+
return {
|
|
121
|
+
content: text,
|
|
122
|
+
markdown,
|
|
123
|
+
metadata: {
|
|
124
|
+
...metadata,
|
|
125
|
+
pageCount: data.numpages,
|
|
126
|
+
textLength: text.length,
|
|
127
|
+
sourceUrl,
|
|
128
|
+
},
|
|
129
|
+
};
|
|
130
|
+
} catch (error) {
|
|
131
|
+
elizaLogger.error('[PDFExtractor] Failed to parse PDF:', error);
|
|
132
|
+
return null;
|
|
133
|
+
}
|
|
134
|
+
}
|
|
135
|
+
|
|
136
|
+
private extractMetadata(info: any): PDFMetadata {
|
|
137
|
+
return {
|
|
138
|
+
title: info.Title,
|
|
139
|
+
author: info.Author ? [info.Author] : undefined,
|
|
140
|
+
subject: info.Subject,
|
|
141
|
+
keywords: info.Keywords ? info.Keywords.split(/[,;]/).map((k: string) => k.trim()) : undefined,
|
|
142
|
+
creator: info.Creator,
|
|
143
|
+
producer: info.Producer,
|
|
144
|
+
creationDate: info.CreationDate ? new Date(info.CreationDate) : undefined,
|
|
145
|
+
modificationDate: info.ModDate ? new Date(info.ModDate) : undefined,
|
|
146
|
+
};
|
|
147
|
+
}
|
|
148
|
+
|
|
149
|
+
private extractAcademicStructure(text: string): AcademicPaperStructure {
|
|
150
|
+
const structure: AcademicPaperStructure = {};
|
|
151
|
+
|
|
152
|
+
// Extract abstract
|
|
153
|
+
const abstractMatch = text.match(/abstract[:\s]*\n([\s\S]*?)(?=\n\s*(?:introduction|keywords|1\.|i\.))/i);
|
|
154
|
+
if (abstractMatch) {
|
|
155
|
+
structure.abstract = this.cleanText(abstractMatch[1]);
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
// Extract introduction
|
|
159
|
+
const introMatch = text.match(/(?:1\.|i\.|\n)\s*introduction[:\s]*\n([\s\S]*?)(?=\n\s*(?:2\.|ii\.|method|related))/i);
|
|
160
|
+
if (introMatch) {
|
|
161
|
+
structure.introduction = this.cleanText(introMatch[1]);
|
|
162
|
+
}
|
|
163
|
+
|
|
164
|
+
// Extract methodology
|
|
165
|
+
const methodMatch = text.match(/(?:method|methodology|approach)[:\s]*\n([\s\S]*?)(?=\n\s*(?:\d\.|results|experiment))/i);
|
|
166
|
+
if (methodMatch) {
|
|
167
|
+
structure.methodology = this.cleanText(methodMatch[1]);
|
|
168
|
+
}
|
|
169
|
+
|
|
170
|
+
// Extract results
|
|
171
|
+
const resultsMatch = text.match(/(?:results|findings|experiments)[:\s]*\n([\s\S]*?)(?=\n\s*(?:discussion|conclusion|\d\.))/i);
|
|
172
|
+
if (resultsMatch) {
|
|
173
|
+
structure.results = this.cleanText(resultsMatch[1]);
|
|
174
|
+
}
|
|
175
|
+
|
|
176
|
+
// Extract discussion
|
|
177
|
+
const discussionMatch = text.match(/discussion[:\s]*\n([\s\S]*?)(?=\n\s*(?:conclusion|acknowledgment|references))/i);
|
|
178
|
+
if (discussionMatch) {
|
|
179
|
+
structure.discussion = this.cleanText(discussionMatch[1]);
|
|
180
|
+
}
|
|
181
|
+
|
|
182
|
+
// Extract conclusion
|
|
183
|
+
const conclusionMatch = text.match(/conclusion[:\s]*\n([\s\S]*?)(?=\n\s*(?:references|acknowledgment|appendix))/i);
|
|
184
|
+
if (conclusionMatch) {
|
|
185
|
+
structure.conclusion = this.cleanText(conclusionMatch[1]);
|
|
186
|
+
}
|
|
187
|
+
|
|
188
|
+
return structure;
|
|
189
|
+
}
|
|
190
|
+
|
|
191
|
+
private extractReferences(text: string): Reference[] {
|
|
192
|
+
const references: Reference[] = [];
|
|
193
|
+
|
|
194
|
+
// Find references section
|
|
195
|
+
const refMatch = text.match(/(?:references|bibliography)[:\s]*\n([\s\S]*?)(?=\n\s*(?:appendix|$))/i);
|
|
196
|
+
if (!refMatch) return references;
|
|
197
|
+
|
|
198
|
+
const refText = refMatch[1];
|
|
199
|
+
|
|
200
|
+
// Split into individual references (numbered or bulleted)
|
|
201
|
+
const refLines = refText.split(/\n(?:\[\d+\]|\d+\.|\•)/);
|
|
202
|
+
|
|
203
|
+
for (const line of refLines) {
|
|
204
|
+
if (line.trim().length < 20) continue; // Skip short lines
|
|
205
|
+
|
|
206
|
+
const ref = this.parseReference(line.trim());
|
|
207
|
+
if (ref) references.push(ref);
|
|
208
|
+
}
|
|
209
|
+
|
|
210
|
+
return references;
|
|
211
|
+
}
|
|
212
|
+
|
|
213
|
+
private parseReference(text: string): Reference | null {
|
|
214
|
+
const ref: Reference = {
|
|
215
|
+
id: `ref-${Date.now()}-${Math.random().toString(36).substr(2, 9)}`,
|
|
216
|
+
text: text,
|
|
217
|
+
};
|
|
218
|
+
|
|
219
|
+
// Extract DOI
|
|
220
|
+
const doiMatch = text.match(/(?:doi:|https?:\/\/doi\.org\/)(10\.\d{4,}\/[-._;()\/:a-zA-Z0-9]+)/i);
|
|
221
|
+
if (doiMatch) {
|
|
222
|
+
ref.doi = doiMatch[1];
|
|
223
|
+
}
|
|
224
|
+
|
|
225
|
+
// Extract year (look for 4-digit year in parentheses or after comma)
|
|
226
|
+
const yearMatch = text.match(/\((\d{4})\)|,\s*(\d{4})/);
|
|
227
|
+
if (yearMatch) {
|
|
228
|
+
ref.year = parseInt(yearMatch[1] || yearMatch[2]);
|
|
229
|
+
}
|
|
230
|
+
|
|
231
|
+
// Extract authors (before year or first period)
|
|
232
|
+
const authorsMatch = text.match(/^([^.(]+?)(?:\s*\(\d{4}\)|\.)/);
|
|
233
|
+
if (authorsMatch) {
|
|
234
|
+
ref.authors = authorsMatch[1].split(/,|&|and/).map(a => a.trim());
|
|
235
|
+
}
|
|
236
|
+
|
|
237
|
+
// Extract title (usually in quotes or after year)
|
|
238
|
+
const titleMatch = text.match(/"([^"]+)"|['"]([^'"]+)['"]|\d{4}\)\s*\.?\s*([^.]+)\./);
|
|
239
|
+
if (titleMatch) {
|
|
240
|
+
ref.title = titleMatch[1] || titleMatch[2] || titleMatch[3];
|
|
241
|
+
}
|
|
242
|
+
|
|
243
|
+
// Extract journal (often in italics or after "In")
|
|
244
|
+
const journalMatch = text.match(/(?:In\s+|[.,]\s*)([A-Z][^,.(]+(?:Journal|Conference|Proceedings|Review)[^,.]*)/);
|
|
245
|
+
if (journalMatch) {
|
|
246
|
+
ref.journal = journalMatch[1].trim();
|
|
247
|
+
}
|
|
248
|
+
|
|
249
|
+
return ref;
|
|
250
|
+
}
|
|
251
|
+
|
|
252
|
+
private cleanText(text: string): string {
|
|
253
|
+
return text
|
|
254
|
+
.replace(/\s+/g, ' ')
|
|
255
|
+
.replace(/\n{3,}/g, '\n\n')
|
|
256
|
+
.trim();
|
|
257
|
+
}
|
|
258
|
+
|
|
259
|
+
private formatAsMarkdown(
|
|
260
|
+
structure: AcademicPaperStructure,
|
|
261
|
+
metadata: PDFMetadata,
|
|
262
|
+
references: Reference[]
|
|
263
|
+
): string {
|
|
264
|
+
const sections: string[] = [];
|
|
265
|
+
|
|
266
|
+
// Title and metadata
|
|
267
|
+
if (metadata.title) {
|
|
268
|
+
sections.push(`# ${metadata.title}\n`);
|
|
269
|
+
}
|
|
270
|
+
|
|
271
|
+
if (metadata.author?.length) {
|
|
272
|
+
sections.push(`**Authors:** ${metadata.author.join(', ')}\n`);
|
|
273
|
+
}
|
|
274
|
+
|
|
275
|
+
// Abstract
|
|
276
|
+
if (structure.abstract) {
|
|
277
|
+
sections.push(`## Abstract\n\n${structure.abstract}\n`);
|
|
278
|
+
}
|
|
279
|
+
|
|
280
|
+
// Main sections
|
|
281
|
+
if (structure.introduction) {
|
|
282
|
+
sections.push(`## Introduction\n\n${structure.introduction}\n`);
|
|
283
|
+
}
|
|
284
|
+
|
|
285
|
+
if (structure.methodology) {
|
|
286
|
+
sections.push(`## Methodology\n\n${structure.methodology}\n`);
|
|
287
|
+
}
|
|
288
|
+
|
|
289
|
+
if (structure.results) {
|
|
290
|
+
sections.push(`## Results\n\n${structure.results}\n`);
|
|
291
|
+
}
|
|
292
|
+
|
|
293
|
+
if (structure.discussion) {
|
|
294
|
+
sections.push(`## Discussion\n\n${structure.discussion}\n`);
|
|
295
|
+
}
|
|
296
|
+
|
|
297
|
+
if (structure.conclusion) {
|
|
298
|
+
sections.push(`## Conclusion\n\n${structure.conclusion}\n`);
|
|
299
|
+
}
|
|
300
|
+
|
|
301
|
+
// References
|
|
302
|
+
if (references.length > 0) {
|
|
303
|
+
sections.push(`## References\n`);
|
|
304
|
+
for (const ref of references) {
|
|
305
|
+
const parts = [];
|
|
306
|
+
if (ref.authors?.length) parts.push(ref.authors.join(', '));
|
|
307
|
+
if (ref.year) parts.push(`(${ref.year})`);
|
|
308
|
+
if (ref.title) parts.push(`"${ref.title}"`);
|
|
309
|
+
if (ref.journal) parts.push(ref.journal);
|
|
310
|
+
if (ref.doi) parts.push(`DOI: ${ref.doi}`);
|
|
311
|
+
|
|
312
|
+
sections.push(`- ${parts.join('. ')}\n`);
|
|
313
|
+
}
|
|
314
|
+
}
|
|
315
|
+
|
|
316
|
+
return sections.join('\n');
|
|
317
|
+
}
|
|
318
|
+
|
|
319
|
+
// Extract specific sections for targeted analysis
|
|
320
|
+
async extractSection(buffer: Buffer, sectionName: string): Promise<string | null> {
|
|
321
|
+
try {
|
|
322
|
+
const parser = await loadPdfParse();
|
|
323
|
+
if (!parser) {
|
|
324
|
+
elizaLogger.warn('[PDFExtractor] PDF parser not available');
|
|
325
|
+
return null;
|
|
326
|
+
}
|
|
327
|
+
|
|
328
|
+
const data = await parser(buffer);
|
|
329
|
+
const text = data.text;
|
|
330
|
+
|
|
331
|
+
const sectionRegex = new RegExp(
|
|
332
|
+
`${sectionName}[:\\s]*\\n([\\s\\S]*?)(?=\\n\\s*(?:\\d+\\.|[A-Z][^\\n]*:|References|$))`,
|
|
333
|
+
'i'
|
|
334
|
+
);
|
|
335
|
+
|
|
336
|
+
const match = text.match(sectionRegex);
|
|
337
|
+
return match ? this.cleanText(match[1]) : null;
|
|
338
|
+
} catch (error) {
|
|
339
|
+
elizaLogger.error(`[PDFExtractor] Failed to extract section ${sectionName}:`, error);
|
|
340
|
+
return null;
|
|
341
|
+
}
|
|
342
|
+
}
|
|
343
|
+
|
|
344
|
+
// Check if URL points to a PDF
|
|
345
|
+
static isPDFUrl(url: string): boolean {
|
|
346
|
+
return url.toLowerCase().endsWith('.pdf') ||
|
|
347
|
+
url.includes('pdf') ||
|
|
348
|
+
url.includes('arxiv.org/pdf/');
|
|
349
|
+
}
|
|
350
|
+
}
|