crawlforge-mcp-server 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +315 -0
- package/LICENSE +21 -0
- package/README.md +181 -0
- package/package.json +115 -0
- package/server.js +1963 -0
- package/setup.js +112 -0
- package/src/constants/config.js +615 -0
- package/src/core/ActionExecutor.js +1104 -0
- package/src/core/AlertNotificationSystem.js +601 -0
- package/src/core/AuthManager.js +315 -0
- package/src/core/ChangeTracker.js +2306 -0
- package/src/core/JobManager.js +687 -0
- package/src/core/LLMsTxtAnalyzer.js +753 -0
- package/src/core/LocalizationManager.js +1615 -0
- package/src/core/PerformanceManager.js +828 -0
- package/src/core/ResearchOrchestrator.js +1327 -0
- package/src/core/SnapshotManager.js +1037 -0
- package/src/core/StealthBrowserManager.js +1795 -0
- package/src/core/WebhookDispatcher.js +745 -0
- package/src/core/analysis/ContentAnalyzer.js +749 -0
- package/src/core/analysis/LinkAnalyzer.js +972 -0
- package/src/core/cache/CacheManager.js +821 -0
- package/src/core/connections/ConnectionPool.js +553 -0
- package/src/core/crawlers/BFSCrawler.js +845 -0
- package/src/core/integrations/PerformanceIntegration.js +377 -0
- package/src/core/llm/AnthropicProvider.js +135 -0
- package/src/core/llm/LLMManager.js +415 -0
- package/src/core/llm/LLMProvider.js +97 -0
- package/src/core/llm/OpenAIProvider.js +127 -0
- package/src/core/processing/BrowserProcessor.js +986 -0
- package/src/core/processing/ContentProcessor.js +505 -0
- package/src/core/processing/PDFProcessor.js +448 -0
- package/src/core/processing/StreamProcessor.js +673 -0
- package/src/core/queue/QueueManager.js +98 -0
- package/src/core/workers/WorkerPool.js +585 -0
- package/src/core/workers/worker.js +743 -0
- package/src/monitoring/healthCheck.js +600 -0
- package/src/monitoring/metrics.js +761 -0
- package/src/optimization/wave3-optimizations.js +932 -0
- package/src/security/security-patches.js +120 -0
- package/src/security/security-tests.js +355 -0
- package/src/security/wave3-security.js +652 -0
- package/src/tools/advanced/BatchScrapeTool.js +1089 -0
- package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
- package/src/tools/crawl/crawlDeep.js +449 -0
- package/src/tools/crawl/mapSite.js +400 -0
- package/src/tools/extract/analyzeContent.js +624 -0
- package/src/tools/extract/extractContent.js +329 -0
- package/src/tools/extract/processDocument.js +503 -0
- package/src/tools/extract/summarizeContent.js +376 -0
- package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
- package/src/tools/research/deepResearch.js +706 -0
- package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
- package/src/tools/search/adapters/googleSearch.js +236 -0
- package/src/tools/search/adapters/searchProviderFactory.js +96 -0
- package/src/tools/search/queryExpander.js +543 -0
- package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
- package/src/tools/search/ranking/ResultRanker.js +497 -0
- package/src/tools/search/searchWeb.js +482 -0
- package/src/tools/tracking/trackChanges.js +1355 -0
- package/src/utils/CircuitBreaker.js +515 -0
- package/src/utils/ErrorHandlingConfig.js +342 -0
- package/src/utils/HumanBehaviorSimulator.js +569 -0
- package/src/utils/Logger.js +568 -0
- package/src/utils/MemoryMonitor.js +173 -0
- package/src/utils/RetryManager.js +386 -0
- package/src/utils/contentUtils.js +588 -0
- package/src/utils/domainFilter.js +612 -0
- package/src/utils/inputValidation.js +766 -0
- package/src/utils/rateLimiter.js +196 -0
- package/src/utils/robotsChecker.js +91 -0
- package/src/utils/securityMiddleware.js +416 -0
- package/src/utils/sitemapParser.js +678 -0
- package/src/utils/ssrfProtection.js +640 -0
- package/src/utils/urlNormalizer.js +168 -0
|
@@ -0,0 +1,1089 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* BatchScrapeTool - Process multiple URLs simultaneously with job management
|
|
3
|
+
* Features: parallel processing, async/sync modes, webhook notifications, result pagination
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import { z } from 'zod';
|
|
7
|
+
import { EventEmitter } from 'events';
|
|
8
|
+
import JobManager from '../../core/JobManager.js';
|
|
9
|
+
import WebhookDispatcher from '../../core/WebhookDispatcher.js';
|
|
10
|
+
import { load } from 'cheerio';
|
|
11
|
+
|
|
12
|
+
// Schema for individual URL configuration
|
|
13
|
+
// Per-URL configuration: allows each entry in the batch to override
// selectors, headers, timeout and attach arbitrary caller metadata.
const UrlConfigSchema = z.object({
  // Target URL; must be a valid absolute URL
  url: z.string().url(),
  // Optional map of fieldName -> CSS selector for structured extraction
  selectors: z.record(z.string()).optional(),
  // Optional HTTP request headers applied to this URL only
  headers: z.record(z.string()).optional(),
  // Per-request timeout in milliseconds (1s .. 30s)
  timeout: z.number().min(1000).max(30000).optional(),
  // Arbitrary caller-supplied metadata, echoed back on the result record
  metadata: z.record(z.any()).optional()
});
|
|
20
|
+
|
|
21
|
+
// Main batch scrape schema
|
|
22
|
+
// Top-level input schema for a batch scrape request.
const BatchScrapeSchema = z.object({
  // 1..50 entries; each is either a bare URL string or a full per-URL config
  urls: z.array(z.union([
    z.string().url(),
    UrlConfigSchema
  ])).min(1).max(50),

  // Output formats generated for every page
  formats: z.array(z.enum(['markdown', 'html', 'json', 'text'])).default(['json']),
  // 'sync' scrapes inline and returns results; 'async' enqueues a job
  mode: z.enum(['sync', 'async']).default('sync'),

  // Webhook configuration (optional batch lifecycle notifications)
  webhook: z.object({
    url: z.string().url(),
    // Events to notify on; defaults to the two terminal batch events
    events: z.array(z.string()).optional().default(['batch_completed', 'batch_failed']),
    headers: z.record(z.string()).optional(),
    // Secret for signing webhook payloads — presumably consumed by
    // WebhookDispatcher; verify against its implementation
    signingSecret: z.string().optional()
  }).optional(),

  // Structured extraction schema (applied to all URLs, merged over per-URL selectors)
  extractionSchema: z.record(z.string()).optional(),

  // Concurrency and timing
  maxConcurrency: z.number().min(1).max(20).default(10),
  delayBetweenRequests: z.number().min(0).max(10000).default(100),

  // Result handling
  includeMetadata: z.boolean().default(true),
  includeFailed: z.boolean().default(true),
  pageSize: z.number().min(1).max(100).default(25),

  // Job configuration (for async mode)
  jobOptions: z.object({
    priority: z.number().default(0),
    ttl: z.number().min(60000).default(24 * 60 * 60 * 1000), // 24 hours
    maxRetries: z.number().min(0).max(5).default(1),
    tags: z.array(z.string()).default([])
  }).optional()
});
|
|
59
|
+
|
|
60
|
+
export class BatchScrapeTool extends EventEmitter {
|
|
61
|
+
/**
 * Create a BatchScrapeTool.
 *
 * @param {Object} [options] - All fields optional.
 * @param {Object} [options.jobManager] - Injectable JobManager (testing); built here otherwise.
 * @param {Object} [options.webhookDispatcher] - Injectable WebhookDispatcher; built here otherwise.
 * @param {boolean} [options.enableJobPersistence=true] - Passed through to JobManager/WebhookDispatcher.
 * @param {boolean} [options.enableWebhookNotifications=true]
 * @param {number}  [options.defaultTimeout=15000] - Per-request timeout (ms) when URL config has none.
 * @param {number}  [options.maxBatchSize=50]
 * @param {boolean} [options.enableResultCaching=true] - Cache results for later pagination.
 * @param {boolean} [options.enableLogging=true]
 */
constructor(options = {}) {
  super();

  const {
    jobManager = null,
    webhookDispatcher = null,
    enableJobPersistence = true,
    enableWebhookNotifications = true,
    defaultTimeout = 15000,
    maxBatchSize = 50,
    enableResultCaching = true,
    enableLogging = true
  } = options;

  this.jobManager = jobManager || new JobManager({
    enablePersistence: enableJobPersistence,
    defaultTtl: 24 * 60 * 60 * 1000 // 24 hours
  });

  this.webhookDispatcher = webhookDispatcher || new WebhookDispatcher({
    enablePersistence: enableJobPersistence
  });

  this.defaultTimeout = defaultTimeout;
  this.maxBatchSize = maxBatchSize;
  this.enableResultCaching = enableResultCaching;
  this.enableLogging = enableLogging;
  this.enableWebhookNotifications = enableWebhookNotifications;

  // Active batch tracking
  this.activeBatches = new Map();  // batchId -> in-flight batch context
  this.batchResults = new Map();   // batchId -> cached results (TTL-bound)

  // Statistics (cumulative over the lifetime of this instance)
  this.stats = {
    totalBatches: 0,
    completedBatches: 0,
    failedBatches: 0,
    totalUrls: 0,
    successfulUrls: 0,
    failedUrls: 0,
    averageBatchTime: 0,
    lastUpdated: Date.now()
  };

  // Register job executors for async-mode batches
  this.initializeJobExecutors();
}
|
|
109
|
+
|
|
110
|
+
/**
|
|
111
|
+
* Execute batch scraping operation
|
|
112
|
+
* @param {Object} params - Batch scraping parameters
|
|
113
|
+
* @returns {Promise<Object>} Batch result or job info
|
|
114
|
+
*/
|
|
115
|
+
async execute(params) {
|
|
116
|
+
try {
|
|
117
|
+
const validated = BatchScrapeSchema.parse(params);
|
|
118
|
+
|
|
119
|
+
this.stats.totalBatches++;
|
|
120
|
+
const batchId = this.generateBatchId();
|
|
121
|
+
const startTime = Date.now();
|
|
122
|
+
|
|
123
|
+
if (this.enableLogging) {
|
|
124
|
+
console.log(`Starting batch scrape ${batchId} with ${validated.urls.length} URLs in ${validated.mode} mode`);
|
|
125
|
+
}
|
|
126
|
+
|
|
127
|
+
// Normalize URL configurations
|
|
128
|
+
const urlConfigs = this.normalizeUrlConfigs(validated.urls, validated);
|
|
129
|
+
|
|
130
|
+
// Register webhook if provided
|
|
131
|
+
let webhookConfig = null;
|
|
132
|
+
if (validated.webhook && this.enableWebhookNotifications) {
|
|
133
|
+
webhookConfig = await this.registerWebhook(validated.webhook, batchId);
|
|
134
|
+
}
|
|
135
|
+
|
|
136
|
+
if (validated.mode === 'sync') {
|
|
137
|
+
// Process synchronously and return results
|
|
138
|
+
return await this.processBatchSync(batchId, urlConfigs, validated, webhookConfig, startTime);
|
|
139
|
+
} else {
|
|
140
|
+
// Create async job and return job info
|
|
141
|
+
return await this.processBatchAsync(batchId, urlConfigs, validated, webhookConfig, startTime);
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
} catch (error) {
|
|
145
|
+
this.stats.failedBatches++;
|
|
146
|
+
this.log('error', `Batch scrape failed: ${error.message}`);
|
|
147
|
+
throw new Error(`Batch scrape failed: ${error.message}`);
|
|
148
|
+
}
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
/**
|
|
152
|
+
* Process batch synchronously
|
|
153
|
+
* @param {string} batchId - Batch identifier
|
|
154
|
+
* @param {Array} urlConfigs - Normalized URL configurations
|
|
155
|
+
* @param {Object} validated - Validated parameters
|
|
156
|
+
* @param {Object} webhookConfig - Webhook configuration
|
|
157
|
+
* @param {number} startTime - Start time
|
|
158
|
+
* @returns {Promise<Object>} Batch results
|
|
159
|
+
*/
|
|
160
|
+
async processBatchSync(batchId, urlConfigs, validated, webhookConfig, startTime) {
|
|
161
|
+
try {
|
|
162
|
+
const batchContext = {
|
|
163
|
+
id: batchId,
|
|
164
|
+
mode: 'sync',
|
|
165
|
+
startTime,
|
|
166
|
+
urlConfigs,
|
|
167
|
+
validated,
|
|
168
|
+
webhookConfig,
|
|
169
|
+
results: [],
|
|
170
|
+
errors: [],
|
|
171
|
+
completed: 0,
|
|
172
|
+
total: urlConfigs.length
|
|
173
|
+
};
|
|
174
|
+
|
|
175
|
+
this.activeBatches.set(batchId, batchContext);
|
|
176
|
+
|
|
177
|
+
// Process URLs with controlled concurrency
|
|
178
|
+
const results = await this.scrapeUrlsBatch(urlConfigs, validated);
|
|
179
|
+
|
|
180
|
+
// Process and format results
|
|
181
|
+
const processedResults = await this.processResults(results, validated);
|
|
182
|
+
|
|
183
|
+
const executionTime = Date.now() - startTime;
|
|
184
|
+
this.updateAverageBatchTime(executionTime);
|
|
185
|
+
|
|
186
|
+
const batchResult = {
|
|
187
|
+
batchId,
|
|
188
|
+
mode: 'sync',
|
|
189
|
+
success: true,
|
|
190
|
+
executionTime,
|
|
191
|
+
totalUrls: urlConfigs.length,
|
|
192
|
+
successfulUrls: processedResults.filter(r => r.success).length,
|
|
193
|
+
failedUrls: processedResults.filter(r => !r.success).length,
|
|
194
|
+
results: this.paginateResults(processedResults, 0, validated.pageSize),
|
|
195
|
+
pagination: {
|
|
196
|
+
page: 1,
|
|
197
|
+
pageSize: validated.pageSize,
|
|
198
|
+
totalResults: processedResults.length,
|
|
199
|
+
totalPages: Math.ceil(processedResults.length / validated.pageSize)
|
|
200
|
+
},
|
|
201
|
+
formats: validated.formats,
|
|
202
|
+
metadata: {
|
|
203
|
+
concurrency: validated.maxConcurrency,
|
|
204
|
+
extractionSchema: validated.extractionSchema ? Object.keys(validated.extractionSchema) : null,
|
|
205
|
+
timestamp: Date.now()
|
|
206
|
+
}
|
|
207
|
+
};
|
|
208
|
+
|
|
209
|
+
// Cache results for pagination
|
|
210
|
+
if (this.enableResultCaching) {
|
|
211
|
+
this.batchResults.set(batchId, {
|
|
212
|
+
results: processedResults,
|
|
213
|
+
timestamp: Date.now(),
|
|
214
|
+
ttl: 3600000 // 1 hour
|
|
215
|
+
});
|
|
216
|
+
}
|
|
217
|
+
|
|
218
|
+
this.stats.completedBatches++;
|
|
219
|
+
this.stats.totalUrls += urlConfigs.length;
|
|
220
|
+
this.stats.successfulUrls += batchResult.successfulUrls;
|
|
221
|
+
this.stats.failedUrls += batchResult.failedUrls;
|
|
222
|
+
this.updateStats();
|
|
223
|
+
|
|
224
|
+
this.activeBatches.delete(batchId);
|
|
225
|
+
|
|
226
|
+
// Send webhook notification
|
|
227
|
+
if (webhookConfig) {
|
|
228
|
+
await this.sendWebhookNotification('batch_completed', batchResult, webhookConfig);
|
|
229
|
+
}
|
|
230
|
+
|
|
231
|
+
this.emit('batchCompleted', batchResult);
|
|
232
|
+
return batchResult;
|
|
233
|
+
|
|
234
|
+
} catch (error) {
|
|
235
|
+
this.stats.failedBatches++;
|
|
236
|
+
this.activeBatches.delete(batchId);
|
|
237
|
+
|
|
238
|
+
if (webhookConfig) {
|
|
239
|
+
await this.sendWebhookNotification('batch_failed', { batchId, error: error.message }, webhookConfig);
|
|
240
|
+
}
|
|
241
|
+
|
|
242
|
+
throw error;
|
|
243
|
+
}
|
|
244
|
+
}
|
|
245
|
+
|
|
246
|
+
/**
|
|
247
|
+
* Process batch asynchronously using job manager
|
|
248
|
+
* @param {string} batchId - Batch identifier
|
|
249
|
+
* @param {Array} urlConfigs - Normalized URL configurations
|
|
250
|
+
* @param {Object} validated - Validated parameters
|
|
251
|
+
* @param {Object} webhookConfig - Webhook configuration
|
|
252
|
+
* @param {number} startTime - Start time
|
|
253
|
+
* @returns {Promise<Object>} Job information
|
|
254
|
+
*/
|
|
255
|
+
async processBatchAsync(batchId, urlConfigs, validated, webhookConfig, startTime) {
|
|
256
|
+
try {
|
|
257
|
+
const jobData = {
|
|
258
|
+
batchId,
|
|
259
|
+
urlConfigs,
|
|
260
|
+
validated,
|
|
261
|
+
webhookConfig,
|
|
262
|
+
startTime
|
|
263
|
+
};
|
|
264
|
+
|
|
265
|
+
const jobOptions = {
|
|
266
|
+
...validated.jobOptions,
|
|
267
|
+
webhooks: webhookConfig ? [webhookConfig] : [],
|
|
268
|
+
tags: ['batch_scrape', batchId, ...(validated.jobOptions?.tags || [])],
|
|
269
|
+
metadata: {
|
|
270
|
+
batchId,
|
|
271
|
+
urlCount: urlConfigs.length,
|
|
272
|
+
formats: validated.formats,
|
|
273
|
+
extractionSchema: validated.extractionSchema ? Object.keys(validated.extractionSchema) : null
|
|
274
|
+
}
|
|
275
|
+
};
|
|
276
|
+
|
|
277
|
+
const job = await this.jobManager.createJob('batch_scrape', jobData, jobOptions);
|
|
278
|
+
|
|
279
|
+
// Start job execution asynchronously
|
|
280
|
+
this.jobManager.executeJob(job.id).catch(error => {
|
|
281
|
+
this.log('error', `Async batch job ${job.id} failed: ${error.message}`);
|
|
282
|
+
});
|
|
283
|
+
|
|
284
|
+
this.emit('batchJobCreated', job);
|
|
285
|
+
|
|
286
|
+
return {
|
|
287
|
+
batchId,
|
|
288
|
+
mode: 'async',
|
|
289
|
+
jobId: job.id,
|
|
290
|
+
status: 'queued',
|
|
291
|
+
totalUrls: urlConfigs.length,
|
|
292
|
+
createdAt: job.createdAt,
|
|
293
|
+
estimatedCompletion: new Date(job.createdAt + (urlConfigs.length * 2000)), // Rough estimate
|
|
294
|
+
statusCheckUrl: `batch_scrape_status?jobId=${job.id}`,
|
|
295
|
+
webhook: webhookConfig ? {
|
|
296
|
+
url: webhookConfig.url,
|
|
297
|
+
events: webhookConfig.events
|
|
298
|
+
} : null
|
|
299
|
+
};
|
|
300
|
+
|
|
301
|
+
} catch (error) {
|
|
302
|
+
this.stats.failedBatches++;
|
|
303
|
+
throw error;
|
|
304
|
+
}
|
|
305
|
+
}
|
|
306
|
+
|
|
307
|
+
/**
 * Scrape URLs in batch with concurrency control.
 *
 * Every URL yields exactly one result record (success or failure); this
 * method never rejects due to an individual URL failing.
 *
 * @param {Array} urlConfigs - Normalized URL configurations
 * @param {Object} options - Validated batch options (maxConcurrency, delayBetweenRequests, ...)
 * @returns {Promise<Array>} One result object per input URL, in settle order
 */
async scrapeUrlsBatch(urlConfigs, options) {
  const results = [];
  // NOTE(review): Semaphore is not imported in this file's visible header —
  // presumably declared further down in this module; verify it exists.
  const semaphore = new Semaphore(options.maxConcurrency);

  const scrapePromises = urlConfigs.map(async (config, index) => {
    return semaphore.acquire(async () => {
      try {
        // Add delay between requests if configured (first URL starts immediately)
        if (options.delayBetweenRequests > 0 && index > 0) {
          await this.delay(options.delayBetweenRequests);
        }

        return await this.scrapeUrl(config, options);
      } catch (error) {
        // Per-URL failures become failed result records, not rejections
        return {
          success: false,
          url: config.url,
          error: error.message,
          timestamp: Date.now()
        };
      }
    });
  });

  const settledResults = await Promise.allSettled(scrapePromises);

  settledResults.forEach((result, index) => {
    if (result.status === 'fulfilled') {
      results.push(result.value);
    } else {
      // Defensive: reached only if semaphore.acquire itself rejected,
      // since the inner try/catch already shields scrapeUrl errors
      results.push({
        success: false,
        url: urlConfigs[index].url,
        error: result.reason.message || 'Unknown error',
        timestamp: Date.now()
      });
    }
  });

  return results;
}
|
|
354
|
+
|
|
355
|
+
/**
|
|
356
|
+
* Scrape individual URL
|
|
357
|
+
* @param {Object} config - URL configuration
|
|
358
|
+
* @param {Object} options - Scraping options
|
|
359
|
+
* @returns {Promise<Object>} Scrape result
|
|
360
|
+
*/
|
|
361
|
+
async scrapeUrl(config, options) {
|
|
362
|
+
const startTime = Date.now();
|
|
363
|
+
|
|
364
|
+
try {
|
|
365
|
+
const response = await this.fetchUrl(config.url, {
|
|
366
|
+
headers: config.headers,
|
|
367
|
+
timeout: config.timeout || this.defaultTimeout
|
|
368
|
+
});
|
|
369
|
+
|
|
370
|
+
if (!response.ok) {
|
|
371
|
+
throw new Error(`HTTP ${response.status}: ${response.statusText}`);
|
|
372
|
+
}
|
|
373
|
+
|
|
374
|
+
const html = await response.text();
|
|
375
|
+
const $ = load(html);
|
|
376
|
+
|
|
377
|
+
const result = {
|
|
378
|
+
success: true,
|
|
379
|
+
url: config.url,
|
|
380
|
+
timestamp: Date.now(),
|
|
381
|
+
executionTime: Date.now() - startTime,
|
|
382
|
+
metadata: {
|
|
383
|
+
status: response.status,
|
|
384
|
+
contentType: response.headers.get('content-type'),
|
|
385
|
+
contentLength: html.length,
|
|
386
|
+
...(config.metadata || {})
|
|
387
|
+
}
|
|
388
|
+
};
|
|
389
|
+
|
|
390
|
+
// Apply extraction schemas
|
|
391
|
+
if (options.extractionSchema || config.selectors) {
|
|
392
|
+
const selectors = { ...config.selectors, ...options.extractionSchema };
|
|
393
|
+
result.extracted = this.extractStructuredData($, selectors);
|
|
394
|
+
}
|
|
395
|
+
|
|
396
|
+
// Generate different formats
|
|
397
|
+
result.content = this.generateFormats($, html, options.formats);
|
|
398
|
+
|
|
399
|
+
return result;
|
|
400
|
+
|
|
401
|
+
} catch (error) {
|
|
402
|
+
return {
|
|
403
|
+
success: false,
|
|
404
|
+
url: config.url,
|
|
405
|
+
error: error.message,
|
|
406
|
+
timestamp: Date.now(),
|
|
407
|
+
executionTime: Date.now() - startTime,
|
|
408
|
+
metadata: config.metadata || {}
|
|
409
|
+
};
|
|
410
|
+
}
|
|
411
|
+
}
|
|
412
|
+
|
|
413
|
+
/**
|
|
414
|
+
* Extract structured data using selectors
|
|
415
|
+
* @param {Object} $ - Cheerio instance
|
|
416
|
+
* @param {Object} selectors - CSS selectors
|
|
417
|
+
* @returns {Object} Extracted data
|
|
418
|
+
*/
|
|
419
|
+
extractStructuredData($, selectors) {
|
|
420
|
+
const extracted = {};
|
|
421
|
+
|
|
422
|
+
for (const [key, selector] of Object.entries(selectors)) {
|
|
423
|
+
try {
|
|
424
|
+
const elements = $(selector);
|
|
425
|
+
|
|
426
|
+
if (elements.length === 0) {
|
|
427
|
+
extracted[key] = null;
|
|
428
|
+
} else if (elements.length === 1) {
|
|
429
|
+
extracted[key] = elements.text().trim();
|
|
430
|
+
} else {
|
|
431
|
+
extracted[key] = elements.map((_, el) => $(el).text().trim()).get();
|
|
432
|
+
}
|
|
433
|
+
} catch (error) {
|
|
434
|
+
extracted[key] = { error: `Invalid selector: ${selector}` };
|
|
435
|
+
}
|
|
436
|
+
}
|
|
437
|
+
|
|
438
|
+
return extracted;
|
|
439
|
+
}
|
|
440
|
+
|
|
441
|
+
/**
|
|
442
|
+
* Generate content in different formats
|
|
443
|
+
* @param {Object} $ - Cheerio instance
|
|
444
|
+
* @param {string} html - Raw HTML
|
|
445
|
+
* @param {Array} formats - Requested formats
|
|
446
|
+
* @returns {Object} Content in different formats
|
|
447
|
+
*/
|
|
448
|
+
generateFormats($, html, formats) {
|
|
449
|
+
const content = {};
|
|
450
|
+
|
|
451
|
+
if (formats.includes('html')) {
|
|
452
|
+
content.html = html;
|
|
453
|
+
}
|
|
454
|
+
|
|
455
|
+
if (formats.includes('text')) {
|
|
456
|
+
content.text = $('body').text().replace(/\s+/g, ' ').trim();
|
|
457
|
+
}
|
|
458
|
+
|
|
459
|
+
if (formats.includes('markdown')) {
|
|
460
|
+
content.markdown = this.convertToMarkdown($);
|
|
461
|
+
}
|
|
462
|
+
|
|
463
|
+
if (formats.includes('json')) {
|
|
464
|
+
content.json = {
|
|
465
|
+
title: $('title').text().trim(),
|
|
466
|
+
headings: this.extractHeadings($),
|
|
467
|
+
links: this.extractLinks($),
|
|
468
|
+
images: this.extractImages($),
|
|
469
|
+
metadata: this.extractMetadata($)
|
|
470
|
+
};
|
|
471
|
+
}
|
|
472
|
+
|
|
473
|
+
return content;
|
|
474
|
+
}
|
|
475
|
+
|
|
476
|
+
/**
|
|
477
|
+
* Convert HTML to Markdown (basic implementation)
|
|
478
|
+
* @param {Object} $ - Cheerio instance
|
|
479
|
+
* @returns {string} Markdown content
|
|
480
|
+
*/
|
|
481
|
+
convertToMarkdown($) {
|
|
482
|
+
let markdown = '';
|
|
483
|
+
|
|
484
|
+
// Extract title
|
|
485
|
+
const title = $('title').text().trim();
|
|
486
|
+
if (title) {
|
|
487
|
+
markdown += `# ${title}\n\n`;
|
|
488
|
+
}
|
|
489
|
+
|
|
490
|
+
// Extract main content
|
|
491
|
+
const contentSelectors = ['article', 'main', '.content', '#content', '.post-content', '.entry-content'];
|
|
492
|
+
let $content = null;
|
|
493
|
+
|
|
494
|
+
for (const selector of contentSelectors) {
|
|
495
|
+
$content = $(selector);
|
|
496
|
+
if ($content.length > 0) break;
|
|
497
|
+
}
|
|
498
|
+
|
|
499
|
+
if (!$content || $content.length === 0) {
|
|
500
|
+
$content = $('body');
|
|
501
|
+
}
|
|
502
|
+
|
|
503
|
+
// Basic markdown conversion
|
|
504
|
+
$content.find('h1').each((_, el) => {
|
|
505
|
+
markdown += `# ${$(el).text().trim()}\n\n`;
|
|
506
|
+
});
|
|
507
|
+
|
|
508
|
+
$content.find('h2').each((_, el) => {
|
|
509
|
+
markdown += `## ${$(el).text().trim()}\n\n`;
|
|
510
|
+
});
|
|
511
|
+
|
|
512
|
+
$content.find('h3').each((_, el) => {
|
|
513
|
+
markdown += `### ${$(el).text().trim()}\n\n`;
|
|
514
|
+
});
|
|
515
|
+
|
|
516
|
+
$content.find('p').each((_, el) => {
|
|
517
|
+
const text = $(el).text().trim();
|
|
518
|
+
if (text) {
|
|
519
|
+
markdown += `${text}\n\n`;
|
|
520
|
+
}
|
|
521
|
+
});
|
|
522
|
+
|
|
523
|
+
$content.find('ul li').each((_, el) => {
|
|
524
|
+
markdown += `- ${$(el).text().trim()}\n`;
|
|
525
|
+
});
|
|
526
|
+
|
|
527
|
+
$content.find('ol li').each((_, el) => {
|
|
528
|
+
markdown += `1. ${$(el).text().trim()}\n`;
|
|
529
|
+
});
|
|
530
|
+
|
|
531
|
+
return markdown.trim();
|
|
532
|
+
}
|
|
533
|
+
|
|
534
|
+
/**
|
|
535
|
+
* Extract headings
|
|
536
|
+
* @param {Object} $ - Cheerio instance
|
|
537
|
+
* @returns {Array} Headings
|
|
538
|
+
*/
|
|
539
|
+
extractHeadings($) {
|
|
540
|
+
const headings = [];
|
|
541
|
+
|
|
542
|
+
$('h1, h2, h3, h4, h5, h6').each((_, el) => {
|
|
543
|
+
headings.push({
|
|
544
|
+
level: parseInt(el.name.substring(1)),
|
|
545
|
+
text: $(el).text().trim(),
|
|
546
|
+
id: $(el).attr('id') || null
|
|
547
|
+
});
|
|
548
|
+
});
|
|
549
|
+
|
|
550
|
+
return headings;
|
|
551
|
+
}
|
|
552
|
+
|
|
553
|
+
/**
|
|
554
|
+
* Extract links
|
|
555
|
+
* @param {Object} $ - Cheerio instance
|
|
556
|
+
* @returns {Array} Links
|
|
557
|
+
*/
|
|
558
|
+
extractLinks($) {
|
|
559
|
+
const links = [];
|
|
560
|
+
|
|
561
|
+
$('a[href]').each((_, el) => {
|
|
562
|
+
const href = $(el).attr('href');
|
|
563
|
+
const text = $(el).text().trim();
|
|
564
|
+
|
|
565
|
+
if (href && text) {
|
|
566
|
+
links.push({
|
|
567
|
+
href,
|
|
568
|
+
text,
|
|
569
|
+
title: $(el).attr('title') || null
|
|
570
|
+
});
|
|
571
|
+
}
|
|
572
|
+
});
|
|
573
|
+
|
|
574
|
+
return links;
|
|
575
|
+
}
|
|
576
|
+
|
|
577
|
+
/**
|
|
578
|
+
* Extract images
|
|
579
|
+
* @param {Object} $ - Cheerio instance
|
|
580
|
+
* @returns {Array} Images
|
|
581
|
+
*/
|
|
582
|
+
extractImages($) {
|
|
583
|
+
const images = [];
|
|
584
|
+
|
|
585
|
+
$('img[src]').each((_, el) => {
|
|
586
|
+
images.push({
|
|
587
|
+
src: $(el).attr('src'),
|
|
588
|
+
alt: $(el).attr('alt') || null,
|
|
589
|
+
title: $(el).attr('title') || null,
|
|
590
|
+
width: $(el).attr('width') || null,
|
|
591
|
+
height: $(el).attr('height') || null
|
|
592
|
+
});
|
|
593
|
+
});
|
|
594
|
+
|
|
595
|
+
return images;
|
|
596
|
+
}
|
|
597
|
+
|
|
598
|
+
/**
|
|
599
|
+
* Extract metadata
|
|
600
|
+
* @param {Object} $ - Cheerio instance
|
|
601
|
+
* @returns {Object} Metadata
|
|
602
|
+
*/
|
|
603
|
+
extractMetadata($) {
|
|
604
|
+
const metadata = {};
|
|
605
|
+
|
|
606
|
+
// Basic metadata
|
|
607
|
+
metadata.title = $('title').text().trim();
|
|
608
|
+
metadata.description = $('meta[name="description"]').attr('content') || '';
|
|
609
|
+
|
|
610
|
+
// Open Graph
|
|
611
|
+
metadata.og = {};
|
|
612
|
+
$('meta[property^="og:"]').each((_, el) => {
|
|
613
|
+
const property = $(el).attr('property').replace('og:', '');
|
|
614
|
+
metadata.og[property] = $(el).attr('content');
|
|
615
|
+
});
|
|
616
|
+
|
|
617
|
+
// Twitter Cards
|
|
618
|
+
metadata.twitter = {};
|
|
619
|
+
$('meta[name^="twitter:"]').each((_, el) => {
|
|
620
|
+
const name = $(el).attr('name').replace('twitter:', '');
|
|
621
|
+
metadata.twitter[name] = $(el).attr('content');
|
|
622
|
+
});
|
|
623
|
+
|
|
624
|
+
return metadata;
|
|
625
|
+
}
|
|
626
|
+
|
|
627
|
+
/**
|
|
628
|
+
* Process and format results
|
|
629
|
+
* @param {Array} results - Raw results
|
|
630
|
+
* @param {Object} options - Processing options
|
|
631
|
+
* @returns {Promise<Array>} Processed results
|
|
632
|
+
*/
|
|
633
|
+
async processResults(results, options) {
|
|
634
|
+
let processedResults = [...results];
|
|
635
|
+
|
|
636
|
+
// Filter out failed results if not requested
|
|
637
|
+
if (!options.includeFailed) {
|
|
638
|
+
processedResults = processedResults.filter(r => r.success);
|
|
639
|
+
}
|
|
640
|
+
|
|
641
|
+
// Add metadata if requested
|
|
642
|
+
if (options.includeMetadata) {
|
|
643
|
+
processedResults = processedResults.map(result => ({
|
|
644
|
+
...result,
|
|
645
|
+
processingMetadata: {
|
|
646
|
+
formats: options.formats,
|
|
647
|
+
extractionApplied: !!options.extractionSchema,
|
|
648
|
+
processedAt: Date.now()
|
|
649
|
+
}
|
|
650
|
+
}));
|
|
651
|
+
}
|
|
652
|
+
|
|
653
|
+
return processedResults;
|
|
654
|
+
}
|
|
655
|
+
|
|
656
|
+
/**
|
|
657
|
+
* Paginate results
|
|
658
|
+
* @param {Array} results - All results
|
|
659
|
+
* @param {number} offset - Offset
|
|
660
|
+
* @param {number} limit - Limit
|
|
661
|
+
* @returns {Array} Paginated results
|
|
662
|
+
*/
|
|
663
|
+
paginateResults(results, offset, limit) {
|
|
664
|
+
return results.slice(offset, offset + limit);
|
|
665
|
+
}
|
|
666
|
+
|
|
667
|
+
/**
|
|
668
|
+
* Get batch status and results
|
|
669
|
+
* @param {string} batchId - Batch identifier
|
|
670
|
+
* @param {number} page - Page number (1-based)
|
|
671
|
+
* @param {number} pageSize - Page size
|
|
672
|
+
* @returns {Object} Batch status and results
|
|
673
|
+
*/
|
|
674
|
+
async getBatchResults(batchId, page = 1, pageSize = 25) {
|
|
675
|
+
// Check if results are cached
|
|
676
|
+
const cached = this.batchResults.get(batchId);
|
|
677
|
+
if (cached && Date.now() - cached.timestamp < cached.ttl) {
|
|
678
|
+
const offset = (page - 1) * pageSize;
|
|
679
|
+
return {
|
|
680
|
+
batchId,
|
|
681
|
+
success: true,
|
|
682
|
+
results: this.paginateResults(cached.results, offset, pageSize),
|
|
683
|
+
pagination: {
|
|
684
|
+
page,
|
|
685
|
+
pageSize,
|
|
686
|
+
totalResults: cached.results.length,
|
|
687
|
+
totalPages: Math.ceil(cached.results.length / pageSize)
|
|
688
|
+
},
|
|
689
|
+
cached: true,
|
|
690
|
+
timestamp: cached.timestamp
|
|
691
|
+
};
|
|
692
|
+
}
|
|
693
|
+
|
|
694
|
+
// Check active batches
|
|
695
|
+
const active = this.activeBatches.get(batchId);
|
|
696
|
+
if (active) {
|
|
697
|
+
return {
|
|
698
|
+
batchId,
|
|
699
|
+
status: 'in_progress',
|
|
700
|
+
mode: active.mode,
|
|
701
|
+
progress: {
|
|
702
|
+
completed: active.completed,
|
|
703
|
+
total: active.total,
|
|
704
|
+
percentage: Math.round((active.completed / active.total) * 100)
|
|
705
|
+
},
|
|
706
|
+
startTime: active.startTime,
|
|
707
|
+
runningTime: Date.now() - active.startTime
|
|
708
|
+
};
|
|
709
|
+
}
|
|
710
|
+
|
|
711
|
+
throw new Error(`Batch ${batchId} not found`);
|
|
712
|
+
}
|
|
713
|
+
|
|
714
|
+
/**
|
|
715
|
+
* Get job status for async batches
|
|
716
|
+
* @param {string} jobId - Job identifier
|
|
717
|
+
* @returns {Object} Job status
|
|
718
|
+
*/
|
|
719
|
+
async getJobStatus(jobId) {
|
|
720
|
+
const job = this.jobManager.getJob(jobId);
|
|
721
|
+
if (!job) {
|
|
722
|
+
throw new Error(`Job ${jobId} not found`);
|
|
723
|
+
}
|
|
724
|
+
|
|
725
|
+
const status = {
|
|
726
|
+
jobId,
|
|
727
|
+
batchId: job.metadata?.batchId,
|
|
728
|
+
status: job.status,
|
|
729
|
+
progress: job.progress,
|
|
730
|
+
createdAt: job.createdAt,
|
|
731
|
+
startedAt: job.startedAt,
|
|
732
|
+
completedAt: job.completedAt,
|
|
733
|
+
error: job.error,
|
|
734
|
+
metadata: job.metadata
|
|
735
|
+
};
|
|
736
|
+
|
|
737
|
+
// If job is completed, include results
|
|
738
|
+
if (job.status === 'completed' && job.result) {
|
|
739
|
+
status.results = job.result;
|
|
740
|
+
}
|
|
741
|
+
|
|
742
|
+
return status;
|
|
743
|
+
}
|
|
744
|
+
|
|
745
|
+
/**
|
|
746
|
+
* Cancel batch operation
|
|
747
|
+
* @param {string} batchId - Batch identifier
|
|
748
|
+
* @returns {Object} Cancellation result
|
|
749
|
+
*/
|
|
750
|
+
async cancelBatch(batchId) {
|
|
751
|
+
// Check active batches
|
|
752
|
+
if (this.activeBatches.has(batchId)) {
|
|
753
|
+
this.activeBatches.delete(batchId);
|
|
754
|
+
return { success: true, message: `Active batch ${batchId} cancelled` };
|
|
755
|
+
}
|
|
756
|
+
|
|
757
|
+
// Try to cancel job
|
|
758
|
+
const jobs = this.jobManager.getJobsByTag(batchId);
|
|
759
|
+
if (jobs.length > 0) {
|
|
760
|
+
const job = jobs[0];
|
|
761
|
+
await this.jobManager.cancelJob(job.id);
|
|
762
|
+
return { success: true, message: `Job ${job.id} for batch ${batchId} cancelled` };
|
|
763
|
+
}
|
|
764
|
+
|
|
765
|
+
throw new Error(`Batch ${batchId} not found or already completed`);
|
|
766
|
+
}
|
|
767
|
+
|
|
768
|
+
/**
 * Initialize job executors
 *
 * Registers the 'batch_scrape' executor with the job manager. The executor
 * drives an async batch end-to-end: scrape URLs in concurrency-limited
 * slices, report progress, post-process results, cache them, update stats,
 * notify webhooks, and emit 'batchCompleted'.
 */
initializeJobExecutors() {
  this.jobManager.registerExecutor('batch_scrape', async (job) => {
    // Payload assembled when the async batch job was submitted.
    const { batchId, urlConfigs, validated, webhookConfig, startTime } = job.data;

    try {
      // Update job progress
      await this.jobManager.updateJobProgress(job.id, 0, 'Starting batch scrape');

      // Process URLs with progress updates
      const results = [];
      const total = urlConfigs.length;

      // Slice the URL list into chunks of maxConcurrency; each chunk is
      // scraped in parallel, chunks themselves run sequentially.
      for (let i = 0; i < total; i += validated.maxConcurrency) {
        const batch = urlConfigs.slice(i, i + validated.maxConcurrency);
        const batchResults = await this.scrapeUrlsBatch(batch, validated);
        results.push(...batchResults);

        const progress = Math.round(((i + batch.length) / total) * 100);
        await this.jobManager.updateJobProgress(
          job.id,
          progress,
          `Processed ${i + batch.length}/${total} URLs`
        );
      }

      // Process and format results
      const processedResults = await this.processResults(results, validated);
      const executionTime = Date.now() - startTime;

      // Summary object returned as the job result and sent to webhooks.
      const batchResult = {
        batchId,
        mode: 'async',
        success: true,
        executionTime,
        totalUrls: urlConfigs.length,
        successfulUrls: processedResults.filter(r => r.success).length,
        failedUrls: processedResults.filter(r => !r.success).length,
        results: processedResults,
        formats: validated.formats,
        metadata: {
          concurrency: validated.maxConcurrency,
          extractionSchema: validated.extractionSchema ? Object.keys(validated.extractionSchema) : null,
          timestamp: Date.now(),
          jobId: job.id
        }
      };

      // Cache results
      if (this.enableResultCaching) {
        this.batchResults.set(batchId, {
          results: processedResults,
          timestamp: Date.now(),
          ttl: 3600000 // 1 hour
        });
      }

      // Update statistics (completedBatches must be incremented before
      // updateAverageBatchTime, which reads it for the running mean).
      this.stats.completedBatches++;
      this.stats.totalUrls += urlConfigs.length;
      this.stats.successfulUrls += batchResult.successfulUrls;
      this.stats.failedUrls += batchResult.failedUrls;
      this.updateAverageBatchTime(executionTime);
      this.updateStats();

      // Send webhook notification (best-effort; the helper swallows its
      // own delivery errors, so this cannot fail the job).
      if (webhookConfig) {
        await this.sendWebhookNotification('batch_completed', batchResult, webhookConfig);
      }

      this.emit('batchCompleted', batchResult);
      return batchResult;

    } catch (error) {
      this.stats.failedBatches++;

      // Notify failure before rethrowing so subscribers still hear about it.
      if (webhookConfig) {
        await this.sendWebhookNotification('batch_failed', { batchId, error: error.message }, webhookConfig);
      }

      // Rethrow so the job manager marks the job as failed.
      throw error;
    }
  });
}
|
|
854
|
+
|
|
855
|
+
/**
|
|
856
|
+
* Normalize URL configurations
|
|
857
|
+
* @param {Array} urls - Raw URL configurations
|
|
858
|
+
* @param {Object} globalOptions - Global options
|
|
859
|
+
* @returns {Array} Normalized URL configurations
|
|
860
|
+
*/
|
|
861
|
+
normalizeUrlConfigs(urls, globalOptions) {
|
|
862
|
+
return urls.map(url => {
|
|
863
|
+
if (typeof url === 'string') {
|
|
864
|
+
return {
|
|
865
|
+
url,
|
|
866
|
+
selectors: globalOptions.extractionSchema || {},
|
|
867
|
+
headers: {},
|
|
868
|
+
timeout: this.defaultTimeout
|
|
869
|
+
};
|
|
870
|
+
} else {
|
|
871
|
+
return {
|
|
872
|
+
...url,
|
|
873
|
+
selectors: { ...globalOptions.extractionSchema, ...(url.selectors || {}) },
|
|
874
|
+
headers: url.headers || {},
|
|
875
|
+
timeout: url.timeout || this.defaultTimeout
|
|
876
|
+
};
|
|
877
|
+
}
|
|
878
|
+
});
|
|
879
|
+
}
|
|
880
|
+
|
|
881
|
+
/**
|
|
882
|
+
* Register webhook for notifications
|
|
883
|
+
* @param {Object} webhookConfig - Webhook configuration
|
|
884
|
+
* @param {string} batchId - Batch identifier
|
|
885
|
+
* @returns {Promise<Object>} Registered webhook configuration
|
|
886
|
+
*/
|
|
887
|
+
async registerWebhook(webhookConfig, batchId) {
|
|
888
|
+
const config = {
|
|
889
|
+
...webhookConfig,
|
|
890
|
+
metadata: {
|
|
891
|
+
batchId,
|
|
892
|
+
registeredAt: Date.now()
|
|
893
|
+
}
|
|
894
|
+
};
|
|
895
|
+
|
|
896
|
+
const registeredConfig = this.webhookDispatcher.registerWebhook(webhookConfig.url, config);
|
|
897
|
+
return registeredConfig;
|
|
898
|
+
}
|
|
899
|
+
|
|
900
|
+
/**
|
|
901
|
+
* Send webhook notification
|
|
902
|
+
* @param {string} event - Event type
|
|
903
|
+
* @param {Object} data - Event data
|
|
904
|
+
* @param {Object} webhookConfig - Webhook configuration
|
|
905
|
+
*/
|
|
906
|
+
async sendWebhookNotification(event, data, webhookConfig) {
|
|
907
|
+
if (!this.enableWebhookNotifications || !webhookConfig) return;
|
|
908
|
+
|
|
909
|
+
try {
|
|
910
|
+
await this.webhookDispatcher.dispatch(event, data, {
|
|
911
|
+
urls: [webhookConfig.url],
|
|
912
|
+
immediate: false,
|
|
913
|
+
metadata: {
|
|
914
|
+
batchId: data.batchId,
|
|
915
|
+
timestamp: Date.now()
|
|
916
|
+
}
|
|
917
|
+
});
|
|
918
|
+
} catch (error) {
|
|
919
|
+
this.log('warn', `Webhook notification failed: ${error.message}`);
|
|
920
|
+
}
|
|
921
|
+
}
|
|
922
|
+
|
|
923
|
+
/**
|
|
924
|
+
* Fetch URL with error handling
|
|
925
|
+
* @param {string} url - URL to fetch
|
|
926
|
+
* @param {Object} options - Fetch options
|
|
927
|
+
* @returns {Promise<Response>} Response
|
|
928
|
+
*/
|
|
929
|
+
async fetchUrl(url, options = {}) {
|
|
930
|
+
const { timeout = this.defaultTimeout, headers = {} } = options;
|
|
931
|
+
|
|
932
|
+
const controller = new AbortController();
|
|
933
|
+
const timeoutId = setTimeout(() => controller.abort(), timeout);
|
|
934
|
+
|
|
935
|
+
try {
|
|
936
|
+
const response = await fetch(url, {
|
|
937
|
+
signal: controller.signal,
|
|
938
|
+
headers: {
|
|
939
|
+
'User-Agent': 'MCP-WebScraper-BatchTool/1.0.0',
|
|
940
|
+
...headers
|
|
941
|
+
}
|
|
942
|
+
});
|
|
943
|
+
|
|
944
|
+
clearTimeout(timeoutId);
|
|
945
|
+
return response;
|
|
946
|
+
} catch (error) {
|
|
947
|
+
clearTimeout(timeoutId);
|
|
948
|
+
if (error.name === 'AbortError') {
|
|
949
|
+
throw new Error(`Request timeout after ${timeout}ms`);
|
|
950
|
+
}
|
|
951
|
+
throw error;
|
|
952
|
+
}
|
|
953
|
+
}
|
|
954
|
+
|
|
955
|
+
/**
|
|
956
|
+
* Generate unique batch ID
|
|
957
|
+
* @returns {string} Batch ID
|
|
958
|
+
*/
|
|
959
|
+
generateBatchId() {
|
|
960
|
+
return `batch_${Date.now()}_${Math.random().toString(36).substr(2, 9)}`;
|
|
961
|
+
}
|
|
962
|
+
|
|
963
|
+
/**
|
|
964
|
+
* Update average batch time statistic
|
|
965
|
+
* @param {number} batchTime - Batch execution time
|
|
966
|
+
*/
|
|
967
|
+
updateAverageBatchTime(batchTime) {
|
|
968
|
+
const currentAverage = this.stats.averageBatchTime;
|
|
969
|
+
const completedBatches = this.stats.completedBatches;
|
|
970
|
+
|
|
971
|
+
if (completedBatches === 1) {
|
|
972
|
+
this.stats.averageBatchTime = batchTime;
|
|
973
|
+
} else {
|
|
974
|
+
this.stats.averageBatchTime =
|
|
975
|
+
((currentAverage * (completedBatches - 1)) + batchTime) / completedBatches;
|
|
976
|
+
}
|
|
977
|
+
}
|
|
978
|
+
|
|
979
|
+
/**
|
|
980
|
+
* Update statistics
|
|
981
|
+
*/
|
|
982
|
+
updateStats() {
|
|
983
|
+
this.stats.lastUpdated = Date.now();
|
|
984
|
+
}
|
|
985
|
+
|
|
986
|
+
/**
|
|
987
|
+
* Utility delay function
|
|
988
|
+
* @param {number} ms - Milliseconds to delay
|
|
989
|
+
* @returns {Promise} Delay promise
|
|
990
|
+
*/
|
|
991
|
+
delay(ms) {
|
|
992
|
+
return new Promise(resolve => setTimeout(resolve, ms));
|
|
993
|
+
}
|
|
994
|
+
|
|
995
|
+
/**
|
|
996
|
+
* Log message if logging enabled
|
|
997
|
+
* @param {string} level - Log level
|
|
998
|
+
* @param {string} message - Log message
|
|
999
|
+
*/
|
|
1000
|
+
log(level, message) {
|
|
1001
|
+
if (this.enableLogging) {
|
|
1002
|
+
console.log(`[BatchScrapeTool:${level.toUpperCase()}] ${message}`);
|
|
1003
|
+
}
|
|
1004
|
+
}
|
|
1005
|
+
|
|
1006
|
+
/**
|
|
1007
|
+
* Get comprehensive statistics
|
|
1008
|
+
* @returns {Object} Statistics
|
|
1009
|
+
*/
|
|
1010
|
+
getStats() {
|
|
1011
|
+
return {
|
|
1012
|
+
...this.stats,
|
|
1013
|
+
activeBatches: this.activeBatches.size,
|
|
1014
|
+
cachedResults: this.batchResults.size,
|
|
1015
|
+
jobManagerStats: this.jobManager ? this.jobManager.getStats() : null,
|
|
1016
|
+
webhookStats: this.webhookDispatcher ? this.webhookDispatcher.getStats() : null
|
|
1017
|
+
};
|
|
1018
|
+
}
|
|
1019
|
+
|
|
1020
|
+
/**
|
|
1021
|
+
* Cleanup resources
|
|
1022
|
+
*/
|
|
1023
|
+
async destroy() {
|
|
1024
|
+
// Cancel active batches
|
|
1025
|
+
for (const batchId of this.activeBatches.keys()) {
|
|
1026
|
+
try {
|
|
1027
|
+
await this.cancelBatch(batchId);
|
|
1028
|
+
} catch (error) {
|
|
1029
|
+
this.log('warn', `Failed to cancel batch ${batchId}: ${error.message}`);
|
|
1030
|
+
}
|
|
1031
|
+
}
|
|
1032
|
+
|
|
1033
|
+
// Clear caches
|
|
1034
|
+
this.activeBatches.clear();
|
|
1035
|
+
this.batchResults.clear();
|
|
1036
|
+
|
|
1037
|
+
// Cleanup job manager if we own it
|
|
1038
|
+
if (this.jobManager) {
|
|
1039
|
+
this.jobManager.destroy();
|
|
1040
|
+
}
|
|
1041
|
+
|
|
1042
|
+
// Cleanup webhook dispatcher if we own it
|
|
1043
|
+
if (this.webhookDispatcher) {
|
|
1044
|
+
this.webhookDispatcher.destroy();
|
|
1045
|
+
}
|
|
1046
|
+
|
|
1047
|
+
// Remove event listeners
|
|
1048
|
+
this.removeAllListeners();
|
|
1049
|
+
|
|
1050
|
+
this.emit('destroyed');
|
|
1051
|
+
}
|
|
1052
|
+
}
|
|
1053
|
+
|
|
1054
|
+
/**
 * Simple semaphore implementation for concurrency control
 *
 * Tasks passed to acquire() are queued and executed with at most `max`
 * running at once; each acquire() settles with its own task's outcome.
 */
class Semaphore {
  /**
   * @param {number} max - Maximum number of concurrently running tasks
   */
  constructor(max) {
    this.max = max;
    this.current = 0;
    this.queue = [];
  }

  /**
   * Queue a task and run it once a concurrency slot is free.
   * @param {Function} task - Async function to execute
   * @returns {Promise<*>} Resolves/rejects with the task's result
   */
  async acquire(task) {
    return new Promise((resolve, reject) => {
      this.queue.push({ task, resolve, reject });
      this.tryNext();
    });
  }

  /**
   * Start the next queued task if a slot is available; chains itself when
   * each task settles so the queue drains automatically.
   */
  tryNext() {
    if (this.current >= this.max) {
      return;
    }
    const next = this.queue.shift();
    if (!next) {
      return;
    }

    this.current++;
    next.task()
      .then(next.resolve, next.reject)
      .finally(() => {
        this.current--;
        this.tryNext();
      });
  }
}
|
|
1088
|
+
|
|
1089
|
+
export default BatchScrapeTool;
|