crawlforge-mcp-server 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +315 -0
- package/LICENSE +21 -0
- package/README.md +181 -0
- package/package.json +115 -0
- package/server.js +1963 -0
- package/setup.js +112 -0
- package/src/constants/config.js +615 -0
- package/src/core/ActionExecutor.js +1104 -0
- package/src/core/AlertNotificationSystem.js +601 -0
- package/src/core/AuthManager.js +315 -0
- package/src/core/ChangeTracker.js +2306 -0
- package/src/core/JobManager.js +687 -0
- package/src/core/LLMsTxtAnalyzer.js +753 -0
- package/src/core/LocalizationManager.js +1615 -0
- package/src/core/PerformanceManager.js +828 -0
- package/src/core/ResearchOrchestrator.js +1327 -0
- package/src/core/SnapshotManager.js +1037 -0
- package/src/core/StealthBrowserManager.js +1795 -0
- package/src/core/WebhookDispatcher.js +745 -0
- package/src/core/analysis/ContentAnalyzer.js +749 -0
- package/src/core/analysis/LinkAnalyzer.js +972 -0
- package/src/core/cache/CacheManager.js +821 -0
- package/src/core/connections/ConnectionPool.js +553 -0
- package/src/core/crawlers/BFSCrawler.js +845 -0
- package/src/core/integrations/PerformanceIntegration.js +377 -0
- package/src/core/llm/AnthropicProvider.js +135 -0
- package/src/core/llm/LLMManager.js +415 -0
- package/src/core/llm/LLMProvider.js +97 -0
- package/src/core/llm/OpenAIProvider.js +127 -0
- package/src/core/processing/BrowserProcessor.js +986 -0
- package/src/core/processing/ContentProcessor.js +505 -0
- package/src/core/processing/PDFProcessor.js +448 -0
- package/src/core/processing/StreamProcessor.js +673 -0
- package/src/core/queue/QueueManager.js +98 -0
- package/src/core/workers/WorkerPool.js +585 -0
- package/src/core/workers/worker.js +743 -0
- package/src/monitoring/healthCheck.js +600 -0
- package/src/monitoring/metrics.js +761 -0
- package/src/optimization/wave3-optimizations.js +932 -0
- package/src/security/security-patches.js +120 -0
- package/src/security/security-tests.js +355 -0
- package/src/security/wave3-security.js +652 -0
- package/src/tools/advanced/BatchScrapeTool.js +1089 -0
- package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
- package/src/tools/crawl/crawlDeep.js +449 -0
- package/src/tools/crawl/mapSite.js +400 -0
- package/src/tools/extract/analyzeContent.js +624 -0
- package/src/tools/extract/extractContent.js +329 -0
- package/src/tools/extract/processDocument.js +503 -0
- package/src/tools/extract/summarizeContent.js +376 -0
- package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
- package/src/tools/research/deepResearch.js +706 -0
- package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
- package/src/tools/search/adapters/googleSearch.js +236 -0
- package/src/tools/search/adapters/searchProviderFactory.js +96 -0
- package/src/tools/search/queryExpander.js +543 -0
- package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
- package/src/tools/search/ranking/ResultRanker.js +497 -0
- package/src/tools/search/searchWeb.js +482 -0
- package/src/tools/tracking/trackChanges.js +1355 -0
- package/src/utils/CircuitBreaker.js +515 -0
- package/src/utils/ErrorHandlingConfig.js +342 -0
- package/src/utils/HumanBehaviorSimulator.js +569 -0
- package/src/utils/Logger.js +568 -0
- package/src/utils/MemoryMonitor.js +173 -0
- package/src/utils/RetryManager.js +386 -0
- package/src/utils/contentUtils.js +588 -0
- package/src/utils/domainFilter.js +612 -0
- package/src/utils/inputValidation.js +766 -0
- package/src/utils/rateLimiter.js +196 -0
- package/src/utils/robotsChecker.js +91 -0
- package/src/utils/securityMiddleware.js +416 -0
- package/src/utils/sitemapParser.js +678 -0
- package/src/utils/ssrfProtection.js +640 -0
- package/src/utils/urlNormalizer.js +168 -0
|
@@ -0,0 +1,448 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PDFProcessor - PDF document processing with text and metadata extraction
|
|
3
|
+
* Handles PDF files from URLs or local paths with comprehensive error handling
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
// Use dynamic import for pdf-parse to avoid initialization issues
|
|
7
|
+
import { z } from 'zod';
|
|
8
|
+
import fs from 'fs/promises';
|
|
9
|
+
import path from 'path';
|
|
10
|
+
|
|
11
|
+
// Validates the `params` argument of PDFProcessor.processPDF(): what to load,
// how to interpret the source, and which extraction steps to run.
const PDFProcessorSchema = z.object({
  source: z.string().min(1),                                  // URL, file path, or buffer content
  sourceType: z.enum(['url', 'file', 'buffer']).default('url'),
  options: z.object({
    extractMetadata: z.boolean().default(true),               // include the metadata block in the result
    extractText: z.boolean().default(true),                   // include cleaned text in the result
    password: z.string().optional(),                          // forwarded to the parser for encrypted PDFs
    maxPages: z.number().min(1).max(1000).default(100),       // becomes pdf-parse's `max` page cap
    parseOptions: z.object({
      // NOTE(review): these two flags are spread into the pdf-parse options
      // object as-is — confirm the installed pdf-parse version honors them.
      normalizeWhitespace: z.boolean().default(true),
      disableCombineTextItems: z.boolean().default(false)
    }).optional().default({})                                  // `.default({})` lets zod fill nested defaults
  }).optional().default({})
});
|
|
25
|
+
|
|
26
|
+
// Documents the shape of a processPDF() result. NOTE(review): this schema is
// declared but never used for validation anywhere in this file — presumably it
// serves as a contract/reference for callers; confirm before removing.
const PDFResult = z.object({
  source: z.string(),
  sourceType: z.string(),
  text: z.string().optional(),               // present only when options.extractText is true
  metadata: z.object({                       // present only when options.extractMetadata is true
    title: z.string().nullable(),
    author: z.string().nullable(),
    subject: z.string().nullable(),
    creator: z.string().nullable(),
    producer: z.string().nullable(),
    creationDate: z.string().nullable(),     // ISO string when parseable, else raw PDF date
    modificationDate: z.string().nullable(),
    format: z.string().nullable(),
    pages: z.number().nullable(),
    encrypted: z.boolean().nullable(),
    linearized: z.boolean().nullable(),
    pdfVersion: z.string().nullable()
  }).optional(),
  pageCount: z.number(),
  extractedAt: z.string(),                   // ISO timestamp taken at processing start
  processingTime: z.number(),                // milliseconds
  success: z.boolean(),
  error: z.string().optional()               // set only when success is false
});
|
|
50
|
+
|
|
51
|
+
/**
 * PDFProcessor - processes PDF documents from URLs, local files, or in-memory
 * buffers, extracting text content and document metadata via pdf-parse.
 *
 * processPDF() never throws; all failures are reported through a result object
 * with `success: false` and an `error` message.
 */
export class PDFProcessor {
  constructor() {
    // Fallbacks merged under caller-supplied options in processPDF().
    // Keep in sync with the defaults declared in PDFProcessorSchema.
    this.defaultOptions = {
      extractMetadata: true,
      extractText: true,
      maxPages: 100,
      parseOptions: {
        normalizeWhitespace: true,
        disableCombineTextItems: false
      }
    };
  }

  /**
   * Process a PDF document from any supported source.
   * @param {Object} params - Processing parameters
   * @param {string|Buffer} params.source - PDF source (URL, file path, or buffer)
   * @param {string} [params.sourceType='url'] - Type of source ('url', 'file', 'buffer')
   * @param {Object} [params.options] - Processing options (see PDFProcessorSchema)
   * @returns {Promise<Object>} Result with text, metadata, pageCount, timing,
   *   and `success`/`error` status. Never rejects.
   */
  async processPDF(params) {
    const startTime = Date.now();

    try {
      const validated = PDFProcessorSchema.parse(params);
      const { source, sourceType, options } = validated;
      const processingOptions = { ...this.defaultOptions, ...options };

      const result = {
        source,
        sourceType,
        extractedAt: new Date().toISOString(),
        success: false,
        processingTime: 0,
        // FIX: previously absent on early-error returns below, so error results
        // were missing a field the catch-all return (and PDFResult) include.
        pageCount: 0
      };

      // Load the raw PDF bytes; loader failures are reported, not thrown.
      let pdfBuffer;
      try {
        pdfBuffer = await this.getPDFBuffer(source, sourceType);
      } catch (error) {
        result.error = `Failed to load PDF: ${error.message}`;
        result.processingTime = Date.now() - startTime;
        return result;
      }

      // Assemble pdf-parse options; `max` caps the number of parsed pages.
      const parseOptions = {
        ...processingOptions.parseOptions,
        max: processingOptions.maxPages
      };

      if (processingOptions.password) {
        parseOptions.password = processingOptions.password;
      }

      let pdfData;
      try {
        // Dynamic import keeps pdf-parse's module-load side effects off the
        // startup path (it probes a test fixture file at require time).
        const pdfParse = (await import('pdf-parse')).default;
        pdfData = await pdfParse(pdfBuffer, parseOptions);
      } catch (error) {
        result.error = `PDF parsing failed: ${error.message}`;
        result.processingTime = Date.now() - startTime;
        return result;
      }

      if (processingOptions.extractText) {
        result.text = this.cleanPDFText(pdfData.text);
      }

      if (processingOptions.extractMetadata) {
        result.metadata = this.extractPDFMetadata(pdfData);
      }

      result.pageCount = pdfData.numpages || 0;
      result.processingTime = Date.now() - startTime;
      result.success = true;

      return result;

    } catch (error) {
      // Schema validation failures and any other unexpected error land here.
      return {
        source: params.source || 'unknown',
        sourceType: params.sourceType || 'unknown',
        extractedAt: new Date().toISOString(),
        success: false,
        error: `PDF processing failed: ${error.message}`,
        processingTime: Date.now() - startTime,
        pageCount: 0
      };
    }
  }

  /**
   * Resolve a source to a PDF Buffer according to its declared type.
   * @param {string|Buffer} source - PDF source
   * @param {string} sourceType - 'url' | 'file' | 'buffer'
   * @returns {Promise<Buffer>} PDF bytes
   * @throws {Error} on unsupported source types or loader failures
   */
  async getPDFBuffer(source, sourceType) {
    switch (sourceType) {
      case 'url':
        return await this.downloadPDFFromURL(source);
      case 'file':
        return await this.readPDFFromFile(source);
      case 'buffer':
        return Buffer.isBuffer(source) ? source : Buffer.from(source);
      default:
        throw new Error(`Unsupported source type: ${sourceType}`);
    }
  }

  /**
   * Download a PDF over HTTP(S).
   * @param {string} url - PDF URL
   * @returns {Promise<Buffer>} PDF bytes
   * @throws {Error} on network failure, non-2xx status, or 30s timeout
   */
  async downloadPDFFromURL(url) {
    try {
      const response = await fetch(url, {
        headers: {
          'User-Agent': 'Mozilla/5.0 (compatible; MCP-WebScraper/2.0; PDF-Processor)'
        },
        // BUG FIX: fetch() has no `timeout` option — the original
        // `timeout: 30000` was silently ignored, so a stalled server could
        // hang this request forever. AbortSignal.timeout aborts after 30s.
        signal: AbortSignal.timeout(30000)
      });

      if (!response.ok) {
        throw new Error(`HTTP ${response.status}: ${response.statusText}`);
      }

      // Tolerate wrong Content-Type (many servers mislabel PDFs) but warn.
      const contentType = response.headers.get('content-type');
      if (contentType && !contentType.includes('pdf')) {
        console.warn(`Warning: Content-Type is ${contentType}, expected PDF`);
      }

      const arrayBuffer = await response.arrayBuffer();
      return Buffer.from(arrayBuffer);

    } catch (error) {
      throw new Error(`Failed to download PDF from URL: ${error.message}`);
    }
  }

  /**
   * Read a PDF from the local filesystem.
   * @param {string} filePath - Local file path
   * @returns {Promise<Buffer>} PDF bytes
   * @throws {Error} when the path is missing, not a regular file, or unreadable
   */
  async readPDFFromFile(filePath) {
    try {
      const resolvedPath = path.resolve(filePath);
      const stats = await fs.stat(resolvedPath);

      if (!stats.isFile()) {
        throw new Error('Path is not a file');
      }

      // Wrong extension is tolerated (content is what matters) but warned about.
      const ext = path.extname(resolvedPath).toLowerCase();
      if (ext !== '.pdf') {
        console.warn(`Warning: File extension is ${ext}, expected .pdf`);
      }

      return await fs.readFile(resolvedPath);

    } catch (error) {
      throw new Error(`Failed to read PDF file: ${error.message}`);
    }
  }

  /**
   * Extract and normalize metadata from a pdf-parse result.
   * Prefers the `info` dictionary, falling back to the `metadata` object.
   * @param {Object} pdfData - Parsed PDF data from pdf-parse
   * @returns {Object} Formatted metadata (null for missing fields)
   */
  extractPDFMetadata(pdfData) {
    const info = pdfData.info || {};
    const metadata = pdfData.metadata || {};

    return {
      title: this.cleanMetadataValue(info.Title || metadata.title),
      author: this.cleanMetadataValue(info.Author || metadata.author),
      subject: this.cleanMetadataValue(info.Subject || metadata.subject),
      creator: this.cleanMetadataValue(info.Creator || metadata.creator),
      producer: this.cleanMetadataValue(info.Producer || metadata.producer),
      creationDate: this.formatPDFDate(info.CreationDate || metadata.creationDate),
      modificationDate: this.formatPDFDate(info.ModDate || metadata.modificationDate),
      format: this.cleanMetadataValue(info.Format || metadata.format),
      pages: pdfData.numpages || null,
      encrypted: info.IsEncrypted || false,
      linearized: info.IsLinearized || false,
      pdfVersion: this.cleanMetadataValue(info.PDFFormatVersion || metadata.pdfVersion)
    };
  }

  /**
   * Normalize a raw metadata value to a trimmed string or null.
   * @param {any} value - Raw metadata value
   * @returns {string|null} Trimmed string, or null for missing/empty values
   */
  cleanMetadataValue(value) {
    if (value === undefined || value === null) {
      return null;
    }

    const stringValue = String(value).trim();
    return stringValue.length > 0 ? stringValue : null;
  }

  /**
   * Convert a PDF date string to ISO-8601 where possible.
   * PDF dates typically look like D:YYYYMMDDHHmmSSOHH'mm'.
   * @param {string} dateString - Raw PDF date
   * @returns {string|null} ISO string, the raw value if unparseable, or null
   */
  formatPDFDate(dateString) {
    if (!dateString) return null;

    try {
      let cleanDate = dateString.toString().trim();

      if (cleanDate.startsWith('D:')) {
        cleanDate = cleanDate.substring(2);
      }

      // Pull out the leading YYYYMMDDHHMMSS digits.
      const match = cleanDate.match(/^(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})/);
      if (match) {
        const [, year, month, day, hour, minute, second] = match;
        // NOTE(review): constructed in the host's local timezone — the PDF
        // timezone offset (O HH'mm') is ignored. Confirm whether UTC is wanted.
        const date = new Date(
          parseInt(year, 10),
          parseInt(month, 10) - 1, // Date months are 0-indexed
          parseInt(day, 10),
          parseInt(hour, 10),
          parseInt(minute, 10),
          parseInt(second, 10)
        );

        if (!isNaN(date.getTime())) {
          return date.toISOString();
        }
      }

      // Fall back to native Date parsing (handles ISO-formatted values).
      const date = new Date(cleanDate);
      if (!isNaN(date.getTime())) {
        return date.toISOString();
      }

      return cleanDate; // Return as-is if it cannot be parsed

    } catch (error) {
      return dateString; // Return original if parsing fails
    }
  }

  /**
   * Clean and normalize extracted PDF text: unify line breaks, collapse runs
   * of spaces/tabs, cap blank lines at one, and trim every line.
   * @param {string} text - Raw PDF text
   * @returns {string} Cleaned text ('' for non-string input)
   */
  cleanPDFText(text) {
    if (!text || typeof text !== 'string') {
      return '';
    }

    return text
      // Normalize line breaks
      .replace(/\r\n/g, '\n')
      .replace(/\r/g, '\n')
      // Collapse runs of spaces/tabs
      .replace(/[ \t]+/g, ' ')
      // Cap consecutive blank lines (3+ newlines -> 2)
      .replace(/\n{3,}/g, '\n\n')
      // Trim each line
      .split('\n')
      .map(line => line.trim())
      .join('\n')
      // Trim the whole text
      .trim();
  }

  /**
   * Process multiple PDFs with bounded concurrency.
   * @param {Array<string|Object>} sources - URLs, or param objects for processPDF
   * @param {Object} [options] - Shared options; `concurrency` (default 3) sets batch size
   * @returns {Promise<Array>} One result object per source, in input order
   */
  async processMultiplePDFs(sources, options = {}) {
    const concurrency = options.concurrency || 3;
    const results = [];

    // Process in batches to avoid overwhelming the system.
    for (let i = 0; i < sources.length; i += concurrency) {
      const batch = sources.slice(i, i + concurrency);
      const batchPromises = batch.map(source => {
        const params = typeof source === 'string'
          ? { source, sourceType: 'url', options }
          : { ...source, options: { ...options, ...source.options } };

        // processPDF already reports failures in-band; this catch is a
        // defensive last resort so one rejection cannot sink the batch.
        return this.processPDF(params).catch(error => ({
          source: params.source,
          sourceType: params.sourceType, // FIX: was missing, unlike every other result shape
          success: false,
          error: error.message,
          extractedAt: new Date().toISOString(),
          processingTime: 0,
          pageCount: 0
        }));
      });

      const batchResults = await Promise.all(batchPromises);
      results.push(...batchResults);
    }

    return results;
  }

  /**
   * Compute basic statistics over extracted text.
   * @param {string} text - Extracted text
   * @returns {Object} Character/word/sentence/paragraph/line counts and averages
   */
  getTextStatistics(text) {
    if (!text || typeof text !== 'string') {
      return {
        characters: 0,
        charactersNoSpaces: 0,
        words: 0,
        sentences: 0,
        paragraphs: 0,
        lines: 0,
        averageWordsPerSentence: 0,
        averageCharactersPerWord: 0
      };
    }

    const characters = text.length;
    const charactersNoSpaces = text.replace(/\s/g, '').length;
    const words = text.split(/\s+/).filter(word => word.length > 0);
    const sentences = text.split(/[.!?]+/).filter(sentence => sentence.trim().length > 0);
    const paragraphs = text.split(/\n\s*\n/).filter(paragraph => paragraph.trim().length > 0);
    const lines = text.split('\n').length;

    return {
      characters,
      charactersNoSpaces,
      words: words.length,
      sentences: sentences.length,
      paragraphs: paragraphs.length,
      lines,
      averageWordsPerSentence: sentences.length > 0 ? Math.round((words.length / sentences.length) * 100) / 100 : 0,
      averageCharactersPerWord: words.length > 0 ? Math.round((charactersNoSpaces / words.length) * 100) / 100 : 0
    };
  }

  /**
   * Extract a page range from a PDF. Limited: pdf-parse only supports a `max`
   * page cap, so `endPage` is honored but `startPage` cannot truly skip pages.
   * @param {Object} params - processPDF parameters plus startPage/endPage
   * @returns {Promise<Object>} Processing result for the capped page range
   */
  async extractPDFPages(params) {
    const { startPage = 1, endPage, ...processingParams } = params;

    // Override parse options to cap the parsed page count at endPage.
    const options = {
      ...processingParams.options,
      parseOptions: {
        ...processingParams.options?.parseOptions,
        max: endPage || processingParams.options?.maxPages || 100
      }
    };

    const result = await this.processPDF({
      ...processingParams,
      options
    });

    if (result.success && result.text && startPage > 1) {
      // pdf-parse doesn't provide per-page text; for proper page-by-page
      // extraction, consider pdf2pic or pdf-poppler.
      console.warn('Page-specific extraction is limited with current PDF parser');
    }

    return result;
  }
}
|
|
447
|
+
|
|
448
|
+
// Default export alongside the named export, so callers can use either style.
export default PDFProcessor;
|