crawlforge-mcp-server 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. package/CLAUDE.md +315 -0
  2. package/LICENSE +21 -0
  3. package/README.md +181 -0
  4. package/package.json +115 -0
  5. package/server.js +1963 -0
  6. package/setup.js +112 -0
  7. package/src/constants/config.js +615 -0
  8. package/src/core/ActionExecutor.js +1104 -0
  9. package/src/core/AlertNotificationSystem.js +601 -0
  10. package/src/core/AuthManager.js +315 -0
  11. package/src/core/ChangeTracker.js +2306 -0
  12. package/src/core/JobManager.js +687 -0
  13. package/src/core/LLMsTxtAnalyzer.js +753 -0
  14. package/src/core/LocalizationManager.js +1615 -0
  15. package/src/core/PerformanceManager.js +828 -0
  16. package/src/core/ResearchOrchestrator.js +1327 -0
  17. package/src/core/SnapshotManager.js +1037 -0
  18. package/src/core/StealthBrowserManager.js +1795 -0
  19. package/src/core/WebhookDispatcher.js +745 -0
  20. package/src/core/analysis/ContentAnalyzer.js +749 -0
  21. package/src/core/analysis/LinkAnalyzer.js +972 -0
  22. package/src/core/cache/CacheManager.js +821 -0
  23. package/src/core/connections/ConnectionPool.js +553 -0
  24. package/src/core/crawlers/BFSCrawler.js +845 -0
  25. package/src/core/integrations/PerformanceIntegration.js +377 -0
  26. package/src/core/llm/AnthropicProvider.js +135 -0
  27. package/src/core/llm/LLMManager.js +415 -0
  28. package/src/core/llm/LLMProvider.js +97 -0
  29. package/src/core/llm/OpenAIProvider.js +127 -0
  30. package/src/core/processing/BrowserProcessor.js +986 -0
  31. package/src/core/processing/ContentProcessor.js +505 -0
  32. package/src/core/processing/PDFProcessor.js +448 -0
  33. package/src/core/processing/StreamProcessor.js +673 -0
  34. package/src/core/queue/QueueManager.js +98 -0
  35. package/src/core/workers/WorkerPool.js +585 -0
  36. package/src/core/workers/worker.js +743 -0
  37. package/src/monitoring/healthCheck.js +600 -0
  38. package/src/monitoring/metrics.js +761 -0
  39. package/src/optimization/wave3-optimizations.js +932 -0
  40. package/src/security/security-patches.js +120 -0
  41. package/src/security/security-tests.js +355 -0
  42. package/src/security/wave3-security.js +652 -0
  43. package/src/tools/advanced/BatchScrapeTool.js +1089 -0
  44. package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
  45. package/src/tools/crawl/crawlDeep.js +449 -0
  46. package/src/tools/crawl/mapSite.js +400 -0
  47. package/src/tools/extract/analyzeContent.js +624 -0
  48. package/src/tools/extract/extractContent.js +329 -0
  49. package/src/tools/extract/processDocument.js +503 -0
  50. package/src/tools/extract/summarizeContent.js +376 -0
  51. package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
  52. package/src/tools/research/deepResearch.js +706 -0
  53. package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
  54. package/src/tools/search/adapters/googleSearch.js +236 -0
  55. package/src/tools/search/adapters/searchProviderFactory.js +96 -0
  56. package/src/tools/search/queryExpander.js +543 -0
  57. package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
  58. package/src/tools/search/ranking/ResultRanker.js +497 -0
  59. package/src/tools/search/searchWeb.js +482 -0
  60. package/src/tools/tracking/trackChanges.js +1355 -0
  61. package/src/utils/CircuitBreaker.js +515 -0
  62. package/src/utils/ErrorHandlingConfig.js +342 -0
  63. package/src/utils/HumanBehaviorSimulator.js +569 -0
  64. package/src/utils/Logger.js +568 -0
  65. package/src/utils/MemoryMonitor.js +173 -0
  66. package/src/utils/RetryManager.js +386 -0
  67. package/src/utils/contentUtils.js +588 -0
  68. package/src/utils/domainFilter.js +612 -0
  69. package/src/utils/inputValidation.js +766 -0
  70. package/src/utils/rateLimiter.js +196 -0
  71. package/src/utils/robotsChecker.js +91 -0
  72. package/src/utils/securityMiddleware.js +416 -0
  73. package/src/utils/sitemapParser.js +678 -0
  74. package/src/utils/ssrfProtection.js +640 -0
  75. package/src/utils/urlNormalizer.js +168 -0
@@ -0,0 +1,1089 @@
1
+ /**
2
+ * BatchScrapeTool - Process multiple URLs simultaneously with job management
3
+ * Features: parallel processing, async/sync modes, webhook notifications, result pagination
4
+ */
5
+
6
+ import { z } from 'zod';
7
+ import { EventEmitter } from 'events';
8
+ import JobManager from '../../core/JobManager.js';
9
+ import WebhookDispatcher from '../../core/WebhookDispatcher.js';
10
+ import { load } from 'cheerio';
11
+
12
+ // Schema for individual URL configuration
13
// Shape of an object-form entry in `urls`: only `url` is required, every
// other field is optional.
const UrlConfigSchema = z.object({
  // Must be a string that passes zod's URL check.
  url: z.string().url(),
  // String-to-string map; consumed elsewhere in this file as CSS selectors.
  selectors: z.record(z.string()).optional(),
  // String-to-string map of request headers.
  headers: z.record(z.string()).optional(),
  // Numeric timeout restricted to the 1000..30000 range.
  timeout: z.number().min(1_000).max(30_000).optional(),
  // Free-form values keyed by string.
  metadata: z.record(z.any()).optional(),
});
20
+
21
+ // Main batch scrape schema
22
// Validation schema for the parameters accepted by BatchScrapeTool.execute().
const BatchScrapeSchema = z.object({
  // Between 1 and 50 targets; each is either a bare URL string or a
  // UrlConfigSchema object.
  urls: z.array(z.union([z.string().url(), UrlConfigSchema])).min(1).max(50),

  // Output representations to produce for each page.
  formats: z.array(z.enum(['markdown', 'html', 'json', 'text'])).default(['json']),
  // 'sync' or 'async'; defaults to 'sync'.
  mode: z.enum(['sync', 'async']).default('sync'),

  // Optional webhook target.
  webhook: z
    .object({
      url: z.string().url(),
      events: z.array(z.string()).optional().default(['batch_completed', 'batch_failed']),
      headers: z.record(z.string()).optional(),
      signingSecret: z.string().optional(),
    })
    .optional(),

  // Selector map shared by every URL in the batch.
  extractionSchema: z.record(z.string()).optional(),

  // Concurrency and pacing limits.
  maxConcurrency: z.number().min(1).max(20).default(10),
  delayBetweenRequests: z.number().min(0).max(10_000).default(100),

  // Result shaping.
  includeMetadata: z.boolean().default(true),
  includeFailed: z.boolean().default(true),
  pageSize: z.number().min(1).max(100).default(25),

  // Options forwarded to the job manager in async mode. The inner defaults
  // only apply when a jobOptions object is actually supplied.
  jobOptions: z
    .object({
      priority: z.number().default(0),
      ttl: z.number().min(60_000).default(24 * 60 * 60 * 1000), // 24 hours
      maxRetries: z.number().min(0).max(5).default(1),
      tags: z.array(z.string()).default([]),
    })
    .optional(),
});
59
+
60
+ export class BatchScrapeTool extends EventEmitter {
61
+ constructor(options = {}) {
62
+ super();
63
+
64
+ const {
65
+ jobManager = null,
66
+ webhookDispatcher = null,
67
+ enableJobPersistence = true,
68
+ enableWebhookNotifications = true,
69
+ defaultTimeout = 15000,
70
+ maxBatchSize = 50,
71
+ enableResultCaching = true,
72
+ enableLogging = true
73
+ } = options;
74
+
75
+ this.jobManager = jobManager || new JobManager({
76
+ enablePersistence: enableJobPersistence,
77
+ defaultTtl: 24 * 60 * 60 * 1000 // 24 hours
78
+ });
79
+
80
+ this.webhookDispatcher = webhookDispatcher || new WebhookDispatcher({
81
+ enablePersistence: enableJobPersistence
82
+ });
83
+
84
+ this.defaultTimeout = defaultTimeout;
85
+ this.maxBatchSize = maxBatchSize;
86
+ this.enableResultCaching = enableResultCaching;
87
+ this.enableLogging = enableLogging;
88
+ this.enableWebhookNotifications = enableWebhookNotifications;
89
+
90
+ // Active batch tracking
91
+ this.activeBatches = new Map();
92
+ this.batchResults = new Map();
93
+
94
+ // Statistics
95
+ this.stats = {
96
+ totalBatches: 0,
97
+ completedBatches: 0,
98
+ failedBatches: 0,
99
+ totalUrls: 0,
100
+ successfulUrls: 0,
101
+ failedUrls: 0,
102
+ averageBatchTime: 0,
103
+ lastUpdated: Date.now()
104
+ };
105
+
106
+ // Register job executors
107
+ this.initializeJobExecutors();
108
+ }
109
+
110
+ /**
111
+ * Execute batch scraping operation
112
+ * @param {Object} params - Batch scraping parameters
113
+ * @returns {Promise<Object>} Batch result or job info
114
+ */
115
+ async execute(params) {
116
+ try {
117
+ const validated = BatchScrapeSchema.parse(params);
118
+
119
+ this.stats.totalBatches++;
120
+ const batchId = this.generateBatchId();
121
+ const startTime = Date.now();
122
+
123
+ if (this.enableLogging) {
124
+ console.log(`Starting batch scrape ${batchId} with ${validated.urls.length} URLs in ${validated.mode} mode`);
125
+ }
126
+
127
+ // Normalize URL configurations
128
+ const urlConfigs = this.normalizeUrlConfigs(validated.urls, validated);
129
+
130
+ // Register webhook if provided
131
+ let webhookConfig = null;
132
+ if (validated.webhook && this.enableWebhookNotifications) {
133
+ webhookConfig = await this.registerWebhook(validated.webhook, batchId);
134
+ }
135
+
136
+ if (validated.mode === 'sync') {
137
+ // Process synchronously and return results
138
+ return await this.processBatchSync(batchId, urlConfigs, validated, webhookConfig, startTime);
139
+ } else {
140
+ // Create async job and return job info
141
+ return await this.processBatchAsync(batchId, urlConfigs, validated, webhookConfig, startTime);
142
+ }
143
+
144
+ } catch (error) {
145
+ this.stats.failedBatches++;
146
+ this.log('error', `Batch scrape failed: ${error.message}`);
147
+ throw new Error(`Batch scrape failed: ${error.message}`);
148
+ }
149
+ }
150
+
151
+ /**
152
+ * Process batch synchronously
153
+ * @param {string} batchId - Batch identifier
154
+ * @param {Array} urlConfigs - Normalized URL configurations
155
+ * @param {Object} validated - Validated parameters
156
+ * @param {Object} webhookConfig - Webhook configuration
157
+ * @param {number} startTime - Start time
158
+ * @returns {Promise<Object>} Batch results
159
+ */
160
+ async processBatchSync(batchId, urlConfigs, validated, webhookConfig, startTime) {
161
+ try {
162
+ const batchContext = {
163
+ id: batchId,
164
+ mode: 'sync',
165
+ startTime,
166
+ urlConfigs,
167
+ validated,
168
+ webhookConfig,
169
+ results: [],
170
+ errors: [],
171
+ completed: 0,
172
+ total: urlConfigs.length
173
+ };
174
+
175
+ this.activeBatches.set(batchId, batchContext);
176
+
177
+ // Process URLs with controlled concurrency
178
+ const results = await this.scrapeUrlsBatch(urlConfigs, validated);
179
+
180
+ // Process and format results
181
+ const processedResults = await this.processResults(results, validated);
182
+
183
+ const executionTime = Date.now() - startTime;
184
+ this.updateAverageBatchTime(executionTime);
185
+
186
+ const batchResult = {
187
+ batchId,
188
+ mode: 'sync',
189
+ success: true,
190
+ executionTime,
191
+ totalUrls: urlConfigs.length,
192
+ successfulUrls: processedResults.filter(r => r.success).length,
193
+ failedUrls: processedResults.filter(r => !r.success).length,
194
+ results: this.paginateResults(processedResults, 0, validated.pageSize),
195
+ pagination: {
196
+ page: 1,
197
+ pageSize: validated.pageSize,
198
+ totalResults: processedResults.length,
199
+ totalPages: Math.ceil(processedResults.length / validated.pageSize)
200
+ },
201
+ formats: validated.formats,
202
+ metadata: {
203
+ concurrency: validated.maxConcurrency,
204
+ extractionSchema: validated.extractionSchema ? Object.keys(validated.extractionSchema) : null,
205
+ timestamp: Date.now()
206
+ }
207
+ };
208
+
209
+ // Cache results for pagination
210
+ if (this.enableResultCaching) {
211
+ this.batchResults.set(batchId, {
212
+ results: processedResults,
213
+ timestamp: Date.now(),
214
+ ttl: 3600000 // 1 hour
215
+ });
216
+ }
217
+
218
+ this.stats.completedBatches++;
219
+ this.stats.totalUrls += urlConfigs.length;
220
+ this.stats.successfulUrls += batchResult.successfulUrls;
221
+ this.stats.failedUrls += batchResult.failedUrls;
222
+ this.updateStats();
223
+
224
+ this.activeBatches.delete(batchId);
225
+
226
+ // Send webhook notification
227
+ if (webhookConfig) {
228
+ await this.sendWebhookNotification('batch_completed', batchResult, webhookConfig);
229
+ }
230
+
231
+ this.emit('batchCompleted', batchResult);
232
+ return batchResult;
233
+
234
+ } catch (error) {
235
+ this.stats.failedBatches++;
236
+ this.activeBatches.delete(batchId);
237
+
238
+ if (webhookConfig) {
239
+ await this.sendWebhookNotification('batch_failed', { batchId, error: error.message }, webhookConfig);
240
+ }
241
+
242
+ throw error;
243
+ }
244
+ }
245
+
246
+ /**
247
+ * Process batch asynchronously using job manager
248
+ * @param {string} batchId - Batch identifier
249
+ * @param {Array} urlConfigs - Normalized URL configurations
250
+ * @param {Object} validated - Validated parameters
251
+ * @param {Object} webhookConfig - Webhook configuration
252
+ * @param {number} startTime - Start time
253
+ * @returns {Promise<Object>} Job information
254
+ */
255
+ async processBatchAsync(batchId, urlConfigs, validated, webhookConfig, startTime) {
256
+ try {
257
+ const jobData = {
258
+ batchId,
259
+ urlConfigs,
260
+ validated,
261
+ webhookConfig,
262
+ startTime
263
+ };
264
+
265
+ const jobOptions = {
266
+ ...validated.jobOptions,
267
+ webhooks: webhookConfig ? [webhookConfig] : [],
268
+ tags: ['batch_scrape', batchId, ...(validated.jobOptions?.tags || [])],
269
+ metadata: {
270
+ batchId,
271
+ urlCount: urlConfigs.length,
272
+ formats: validated.formats,
273
+ extractionSchema: validated.extractionSchema ? Object.keys(validated.extractionSchema) : null
274
+ }
275
+ };
276
+
277
+ const job = await this.jobManager.createJob('batch_scrape', jobData, jobOptions);
278
+
279
+ // Start job execution asynchronously
280
+ this.jobManager.executeJob(job.id).catch(error => {
281
+ this.log('error', `Async batch job ${job.id} failed: ${error.message}`);
282
+ });
283
+
284
+ this.emit('batchJobCreated', job);
285
+
286
+ return {
287
+ batchId,
288
+ mode: 'async',
289
+ jobId: job.id,
290
+ status: 'queued',
291
+ totalUrls: urlConfigs.length,
292
+ createdAt: job.createdAt,
293
+ estimatedCompletion: new Date(job.createdAt + (urlConfigs.length * 2000)), // Rough estimate
294
+ statusCheckUrl: `batch_scrape_status?jobId=${job.id}`,
295
+ webhook: webhookConfig ? {
296
+ url: webhookConfig.url,
297
+ events: webhookConfig.events
298
+ } : null
299
+ };
300
+
301
+ } catch (error) {
302
+ this.stats.failedBatches++;
303
+ throw error;
304
+ }
305
+ }
306
+
307
+ /**
308
+ * Scrape URLs in batch with concurrency control
309
+ * @param {Array} urlConfigs - URL configurations
310
+ * @param {Object} options - Scraping options
311
+ * @returns {Promise<Array>} Scraping results
312
+ */
313
+ async scrapeUrlsBatch(urlConfigs, options) {
314
+ const results = [];
315
+ const semaphore = new Semaphore(options.maxConcurrency);
316
+
317
+ const scrapePromises = urlConfigs.map(async (config, index) => {
318
+ return semaphore.acquire(async () => {
319
+ try {
320
+ // Add delay between requests if configured
321
+ if (options.delayBetweenRequests > 0 && index > 0) {
322
+ await this.delay(options.delayBetweenRequests);
323
+ }
324
+
325
+ return await this.scrapeUrl(config, options);
326
+ } catch (error) {
327
+ return {
328
+ success: false,
329
+ url: config.url,
330
+ error: error.message,
331
+ timestamp: Date.now()
332
+ };
333
+ }
334
+ });
335
+ });
336
+
337
+ const settledResults = await Promise.allSettled(scrapePromises);
338
+
339
+ settledResults.forEach((result, index) => {
340
+ if (result.status === 'fulfilled') {
341
+ results.push(result.value);
342
+ } else {
343
+ results.push({
344
+ success: false,
345
+ url: urlConfigs[index].url,
346
+ error: result.reason.message || 'Unknown error',
347
+ timestamp: Date.now()
348
+ });
349
+ }
350
+ });
351
+
352
+ return results;
353
+ }
354
+
355
+ /**
356
+ * Scrape individual URL
357
+ * @param {Object} config - URL configuration
358
+ * @param {Object} options - Scraping options
359
+ * @returns {Promise<Object>} Scrape result
360
+ */
361
+ async scrapeUrl(config, options) {
362
+ const startTime = Date.now();
363
+
364
+ try {
365
+ const response = await this.fetchUrl(config.url, {
366
+ headers: config.headers,
367
+ timeout: config.timeout || this.defaultTimeout
368
+ });
369
+
370
+ if (!response.ok) {
371
+ throw new Error(`HTTP ${response.status}: ${response.statusText}`);
372
+ }
373
+
374
+ const html = await response.text();
375
+ const $ = load(html);
376
+
377
+ const result = {
378
+ success: true,
379
+ url: config.url,
380
+ timestamp: Date.now(),
381
+ executionTime: Date.now() - startTime,
382
+ metadata: {
383
+ status: response.status,
384
+ contentType: response.headers.get('content-type'),
385
+ contentLength: html.length,
386
+ ...(config.metadata || {})
387
+ }
388
+ };
389
+
390
+ // Apply extraction schemas
391
+ if (options.extractionSchema || config.selectors) {
392
+ const selectors = { ...config.selectors, ...options.extractionSchema };
393
+ result.extracted = this.extractStructuredData($, selectors);
394
+ }
395
+
396
+ // Generate different formats
397
+ result.content = this.generateFormats($, html, options.formats);
398
+
399
+ return result;
400
+
401
+ } catch (error) {
402
+ return {
403
+ success: false,
404
+ url: config.url,
405
+ error: error.message,
406
+ timestamp: Date.now(),
407
+ executionTime: Date.now() - startTime,
408
+ metadata: config.metadata || {}
409
+ };
410
+ }
411
+ }
412
+
413
+ /**
414
+ * Extract structured data using selectors
415
+ * @param {Object} $ - Cheerio instance
416
+ * @param {Object} selectors - CSS selectors
417
+ * @returns {Object} Extracted data
418
+ */
419
+ extractStructuredData($, selectors) {
420
+ const extracted = {};
421
+
422
+ for (const [key, selector] of Object.entries(selectors)) {
423
+ try {
424
+ const elements = $(selector);
425
+
426
+ if (elements.length === 0) {
427
+ extracted[key] = null;
428
+ } else if (elements.length === 1) {
429
+ extracted[key] = elements.text().trim();
430
+ } else {
431
+ extracted[key] = elements.map((_, el) => $(el).text().trim()).get();
432
+ }
433
+ } catch (error) {
434
+ extracted[key] = { error: `Invalid selector: ${selector}` };
435
+ }
436
+ }
437
+
438
+ return extracted;
439
+ }
440
+
441
+ /**
442
+ * Generate content in different formats
443
+ * @param {Object} $ - Cheerio instance
444
+ * @param {string} html - Raw HTML
445
+ * @param {Array} formats - Requested formats
446
+ * @returns {Object} Content in different formats
447
+ */
448
+ generateFormats($, html, formats) {
449
+ const content = {};
450
+
451
+ if (formats.includes('html')) {
452
+ content.html = html;
453
+ }
454
+
455
+ if (formats.includes('text')) {
456
+ content.text = $('body').text().replace(/\s+/g, ' ').trim();
457
+ }
458
+
459
+ if (formats.includes('markdown')) {
460
+ content.markdown = this.convertToMarkdown($);
461
+ }
462
+
463
+ if (formats.includes('json')) {
464
+ content.json = {
465
+ title: $('title').text().trim(),
466
+ headings: this.extractHeadings($),
467
+ links: this.extractLinks($),
468
+ images: this.extractImages($),
469
+ metadata: this.extractMetadata($)
470
+ };
471
+ }
472
+
473
+ return content;
474
+ }
475
+
476
+ /**
477
+ * Convert HTML to Markdown (basic implementation)
478
+ * @param {Object} $ - Cheerio instance
479
+ * @returns {string} Markdown content
480
+ */
481
+ convertToMarkdown($) {
482
+ let markdown = '';
483
+
484
+ // Extract title
485
+ const title = $('title').text().trim();
486
+ if (title) {
487
+ markdown += `# ${title}\n\n`;
488
+ }
489
+
490
+ // Extract main content
491
+ const contentSelectors = ['article', 'main', '.content', '#content', '.post-content', '.entry-content'];
492
+ let $content = null;
493
+
494
+ for (const selector of contentSelectors) {
495
+ $content = $(selector);
496
+ if ($content.length > 0) break;
497
+ }
498
+
499
+ if (!$content || $content.length === 0) {
500
+ $content = $('body');
501
+ }
502
+
503
+ // Basic markdown conversion
504
+ $content.find('h1').each((_, el) => {
505
+ markdown += `# ${$(el).text().trim()}\n\n`;
506
+ });
507
+
508
+ $content.find('h2').each((_, el) => {
509
+ markdown += `## ${$(el).text().trim()}\n\n`;
510
+ });
511
+
512
+ $content.find('h3').each((_, el) => {
513
+ markdown += `### ${$(el).text().trim()}\n\n`;
514
+ });
515
+
516
+ $content.find('p').each((_, el) => {
517
+ const text = $(el).text().trim();
518
+ if (text) {
519
+ markdown += `${text}\n\n`;
520
+ }
521
+ });
522
+
523
+ $content.find('ul li').each((_, el) => {
524
+ markdown += `- ${$(el).text().trim()}\n`;
525
+ });
526
+
527
+ $content.find('ol li').each((_, el) => {
528
+ markdown += `1. ${$(el).text().trim()}\n`;
529
+ });
530
+
531
+ return markdown.trim();
532
+ }
533
+
534
+ /**
535
+ * Extract headings
536
+ * @param {Object} $ - Cheerio instance
537
+ * @returns {Array} Headings
538
+ */
539
+ extractHeadings($) {
540
+ const headings = [];
541
+
542
+ $('h1, h2, h3, h4, h5, h6').each((_, el) => {
543
+ headings.push({
544
+ level: parseInt(el.name.substring(1)),
545
+ text: $(el).text().trim(),
546
+ id: $(el).attr('id') || null
547
+ });
548
+ });
549
+
550
+ return headings;
551
+ }
552
+
553
+ /**
554
+ * Extract links
555
+ * @param {Object} $ - Cheerio instance
556
+ * @returns {Array} Links
557
+ */
558
+ extractLinks($) {
559
+ const links = [];
560
+
561
+ $('a[href]').each((_, el) => {
562
+ const href = $(el).attr('href');
563
+ const text = $(el).text().trim();
564
+
565
+ if (href && text) {
566
+ links.push({
567
+ href,
568
+ text,
569
+ title: $(el).attr('title') || null
570
+ });
571
+ }
572
+ });
573
+
574
+ return links;
575
+ }
576
+
577
+ /**
578
+ * Extract images
579
+ * @param {Object} $ - Cheerio instance
580
+ * @returns {Array} Images
581
+ */
582
+ extractImages($) {
583
+ const images = [];
584
+
585
+ $('img[src]').each((_, el) => {
586
+ images.push({
587
+ src: $(el).attr('src'),
588
+ alt: $(el).attr('alt') || null,
589
+ title: $(el).attr('title') || null,
590
+ width: $(el).attr('width') || null,
591
+ height: $(el).attr('height') || null
592
+ });
593
+ });
594
+
595
+ return images;
596
+ }
597
+
598
+ /**
599
+ * Extract metadata
600
+ * @param {Object} $ - Cheerio instance
601
+ * @returns {Object} Metadata
602
+ */
603
+ extractMetadata($) {
604
+ const metadata = {};
605
+
606
+ // Basic metadata
607
+ metadata.title = $('title').text().trim();
608
+ metadata.description = $('meta[name="description"]').attr('content') || '';
609
+
610
+ // Open Graph
611
+ metadata.og = {};
612
+ $('meta[property^="og:"]').each((_, el) => {
613
+ const property = $(el).attr('property').replace('og:', '');
614
+ metadata.og[property] = $(el).attr('content');
615
+ });
616
+
617
+ // Twitter Cards
618
+ metadata.twitter = {};
619
+ $('meta[name^="twitter:"]').each((_, el) => {
620
+ const name = $(el).attr('name').replace('twitter:', '');
621
+ metadata.twitter[name] = $(el).attr('content');
622
+ });
623
+
624
+ return metadata;
625
+ }
626
+
627
+ /**
628
+ * Process and format results
629
+ * @param {Array} results - Raw results
630
+ * @param {Object} options - Processing options
631
+ * @returns {Promise<Array>} Processed results
632
+ */
633
+ async processResults(results, options) {
634
+ let processedResults = [...results];
635
+
636
+ // Filter out failed results if not requested
637
+ if (!options.includeFailed) {
638
+ processedResults = processedResults.filter(r => r.success);
639
+ }
640
+
641
+ // Add metadata if requested
642
+ if (options.includeMetadata) {
643
+ processedResults = processedResults.map(result => ({
644
+ ...result,
645
+ processingMetadata: {
646
+ formats: options.formats,
647
+ extractionApplied: !!options.extractionSchema,
648
+ processedAt: Date.now()
649
+ }
650
+ }));
651
+ }
652
+
653
+ return processedResults;
654
+ }
655
+
656
+ /**
657
+ * Paginate results
658
+ * @param {Array} results - All results
659
+ * @param {number} offset - Offset
660
+ * @param {number} limit - Limit
661
+ * @returns {Array} Paginated results
662
+ */
663
+ paginateResults(results, offset, limit) {
664
+ return results.slice(offset, offset + limit);
665
+ }
666
+
667
+ /**
668
+ * Get batch status and results
669
+ * @param {string} batchId - Batch identifier
670
+ * @param {number} page - Page number (1-based)
671
+ * @param {number} pageSize - Page size
672
+ * @returns {Object} Batch status and results
673
+ */
674
+ async getBatchResults(batchId, page = 1, pageSize = 25) {
675
+ // Check if results are cached
676
+ const cached = this.batchResults.get(batchId);
677
+ if (cached && Date.now() - cached.timestamp < cached.ttl) {
678
+ const offset = (page - 1) * pageSize;
679
+ return {
680
+ batchId,
681
+ success: true,
682
+ results: this.paginateResults(cached.results, offset, pageSize),
683
+ pagination: {
684
+ page,
685
+ pageSize,
686
+ totalResults: cached.results.length,
687
+ totalPages: Math.ceil(cached.results.length / pageSize)
688
+ },
689
+ cached: true,
690
+ timestamp: cached.timestamp
691
+ };
692
+ }
693
+
694
+ // Check active batches
695
+ const active = this.activeBatches.get(batchId);
696
+ if (active) {
697
+ return {
698
+ batchId,
699
+ status: 'in_progress',
700
+ mode: active.mode,
701
+ progress: {
702
+ completed: active.completed,
703
+ total: active.total,
704
+ percentage: Math.round((active.completed / active.total) * 100)
705
+ },
706
+ startTime: active.startTime,
707
+ runningTime: Date.now() - active.startTime
708
+ };
709
+ }
710
+
711
+ throw new Error(`Batch ${batchId} not found`);
712
+ }
713
+
714
+ /**
715
+ * Get job status for async batches
716
+ * @param {string} jobId - Job identifier
717
+ * @returns {Object} Job status
718
+ */
719
+ async getJobStatus(jobId) {
720
+ const job = this.jobManager.getJob(jobId);
721
+ if (!job) {
722
+ throw new Error(`Job ${jobId} not found`);
723
+ }
724
+
725
+ const status = {
726
+ jobId,
727
+ batchId: job.metadata?.batchId,
728
+ status: job.status,
729
+ progress: job.progress,
730
+ createdAt: job.createdAt,
731
+ startedAt: job.startedAt,
732
+ completedAt: job.completedAt,
733
+ error: job.error,
734
+ metadata: job.metadata
735
+ };
736
+
737
+ // If job is completed, include results
738
+ if (job.status === 'completed' && job.result) {
739
+ status.results = job.result;
740
+ }
741
+
742
+ return status;
743
+ }
744
+
745
+ /**
746
+ * Cancel batch operation
747
+ * @param {string} batchId - Batch identifier
748
+ * @returns {Object} Cancellation result
749
+ */
750
+ async cancelBatch(batchId) {
751
+ // Check active batches
752
+ if (this.activeBatches.has(batchId)) {
753
+ this.activeBatches.delete(batchId);
754
+ return { success: true, message: `Active batch ${batchId} cancelled` };
755
+ }
756
+
757
+ // Try to cancel job
758
+ const jobs = this.jobManager.getJobsByTag(batchId);
759
+ if (jobs.length > 0) {
760
+ const job = jobs[0];
761
+ await this.jobManager.cancelJob(job.id);
762
+ return { success: true, message: `Job ${job.id} for batch ${batchId} cancelled` };
763
+ }
764
+
765
+ throw new Error(`Batch ${batchId} not found or already completed`);
766
+ }
767
+
768
+ /**
769
+ * Initialize job executors
770
+ */
771
+ initializeJobExecutors() {
772
+ this.jobManager.registerExecutor('batch_scrape', async (job) => {
773
+ const { batchId, urlConfigs, validated, webhookConfig, startTime } = job.data;
774
+
775
+ try {
776
+ // Update job progress
777
+ await this.jobManager.updateJobProgress(job.id, 0, 'Starting batch scrape');
778
+
779
+ // Process URLs with progress updates
780
+ const results = [];
781
+ const total = urlConfigs.length;
782
+
783
+ for (let i = 0; i < total; i += validated.maxConcurrency) {
784
+ const batch = urlConfigs.slice(i, i + validated.maxConcurrency);
785
+ const batchResults = await this.scrapeUrlsBatch(batch, validated);
786
+ results.push(...batchResults);
787
+
788
+ const progress = Math.round(((i + batch.length) / total) * 100);
789
+ await this.jobManager.updateJobProgress(
790
+ job.id,
791
+ progress,
792
+ `Processed ${i + batch.length}/${total} URLs`
793
+ );
794
+ }
795
+
796
+ // Process and format results
797
+ const processedResults = await this.processResults(results, validated);
798
+ const executionTime = Date.now() - startTime;
799
+
800
+ const batchResult = {
801
+ batchId,
802
+ mode: 'async',
803
+ success: true,
804
+ executionTime,
805
+ totalUrls: urlConfigs.length,
806
+ successfulUrls: processedResults.filter(r => r.success).length,
807
+ failedUrls: processedResults.filter(r => !r.success).length,
808
+ results: processedResults,
809
+ formats: validated.formats,
810
+ metadata: {
811
+ concurrency: validated.maxConcurrency,
812
+ extractionSchema: validated.extractionSchema ? Object.keys(validated.extractionSchema) : null,
813
+ timestamp: Date.now(),
814
+ jobId: job.id
815
+ }
816
+ };
817
+
818
+ // Cache results
819
+ if (this.enableResultCaching) {
820
+ this.batchResults.set(batchId, {
821
+ results: processedResults,
822
+ timestamp: Date.now(),
823
+ ttl: 3600000 // 1 hour
824
+ });
825
+ }
826
+
827
+ // Update statistics
828
+ this.stats.completedBatches++;
829
+ this.stats.totalUrls += urlConfigs.length;
830
+ this.stats.successfulUrls += batchResult.successfulUrls;
831
+ this.stats.failedUrls += batchResult.failedUrls;
832
+ this.updateAverageBatchTime(executionTime);
833
+ this.updateStats();
834
+
835
+ // Send webhook notification
836
+ if (webhookConfig) {
837
+ await this.sendWebhookNotification('batch_completed', batchResult, webhookConfig);
838
+ }
839
+
840
+ this.emit('batchCompleted', batchResult);
841
+ return batchResult;
842
+
843
+ } catch (error) {
844
+ this.stats.failedBatches++;
845
+
846
+ if (webhookConfig) {
847
+ await this.sendWebhookNotification('batch_failed', { batchId, error: error.message }, webhookConfig);
848
+ }
849
+
850
+ throw error;
851
+ }
852
+ });
853
+ }
854
+
855
+ /**
856
+ * Normalize URL configurations
857
+ * @param {Array} urls - Raw URL configurations
858
+ * @param {Object} globalOptions - Global options
859
+ * @returns {Array} Normalized URL configurations
860
+ */
861
+ normalizeUrlConfigs(urls, globalOptions) {
862
+ return urls.map(url => {
863
+ if (typeof url === 'string') {
864
+ return {
865
+ url,
866
+ selectors: globalOptions.extractionSchema || {},
867
+ headers: {},
868
+ timeout: this.defaultTimeout
869
+ };
870
+ } else {
871
+ return {
872
+ ...url,
873
+ selectors: { ...globalOptions.extractionSchema, ...(url.selectors || {}) },
874
+ headers: url.headers || {},
875
+ timeout: url.timeout || this.defaultTimeout
876
+ };
877
+ }
878
+ });
879
+ }
880
+
881
+ /**
882
+ * Register webhook for notifications
883
+ * @param {Object} webhookConfig - Webhook configuration
884
+ * @param {string} batchId - Batch identifier
885
+ * @returns {Promise<Object>} Registered webhook configuration
886
+ */
887
+ async registerWebhook(webhookConfig, batchId) {
888
+ const config = {
889
+ ...webhookConfig,
890
+ metadata: {
891
+ batchId,
892
+ registeredAt: Date.now()
893
+ }
894
+ };
895
+
896
+ const registeredConfig = this.webhookDispatcher.registerWebhook(webhookConfig.url, config);
897
+ return registeredConfig;
898
+ }
899
+
900
+ /**
901
+ * Send webhook notification
902
+ * @param {string} event - Event type
903
+ * @param {Object} data - Event data
904
+ * @param {Object} webhookConfig - Webhook configuration
905
+ */
906
+ async sendWebhookNotification(event, data, webhookConfig) {
907
+ if (!this.enableWebhookNotifications || !webhookConfig) return;
908
+
909
+ try {
910
+ await this.webhookDispatcher.dispatch(event, data, {
911
+ urls: [webhookConfig.url],
912
+ immediate: false,
913
+ metadata: {
914
+ batchId: data.batchId,
915
+ timestamp: Date.now()
916
+ }
917
+ });
918
+ } catch (error) {
919
+ this.log('warn', `Webhook notification failed: ${error.message}`);
920
+ }
921
+ }
922
+
923
+ /**
924
+ * Fetch URL with error handling
925
+ * @param {string} url - URL to fetch
926
+ * @param {Object} options - Fetch options
927
+ * @returns {Promise<Response>} Response
928
+ */
929
+ async fetchUrl(url, options = {}) {
930
+ const { timeout = this.defaultTimeout, headers = {} } = options;
931
+
932
+ const controller = new AbortController();
933
+ const timeoutId = setTimeout(() => controller.abort(), timeout);
934
+
935
+ try {
936
+ const response = await fetch(url, {
937
+ signal: controller.signal,
938
+ headers: {
939
+ 'User-Agent': 'MCP-WebScraper-BatchTool/1.0.0',
940
+ ...headers
941
+ }
942
+ });
943
+
944
+ clearTimeout(timeoutId);
945
+ return response;
946
+ } catch (error) {
947
+ clearTimeout(timeoutId);
948
+ if (error.name === 'AbortError') {
949
+ throw new Error(`Request timeout after ${timeout}ms`);
950
+ }
951
+ throw error;
952
+ }
953
+ }
954
+
955
+ /**
956
+ * Generate unique batch ID
957
+ * @returns {string} Batch ID
958
+ */
959
+ generateBatchId() {
960
+ return `batch_${Date.now()}_${Math.random().toString(36).substr(2, 9)}`;
961
+ }
962
+
963
+ /**
964
+ * Update average batch time statistic
965
+ * @param {number} batchTime - Batch execution time
966
+ */
967
+ updateAverageBatchTime(batchTime) {
968
+ const currentAverage = this.stats.averageBatchTime;
969
+ const completedBatches = this.stats.completedBatches;
970
+
971
+ if (completedBatches === 1) {
972
+ this.stats.averageBatchTime = batchTime;
973
+ } else {
974
+ this.stats.averageBatchTime =
975
+ ((currentAverage * (completedBatches - 1)) + batchTime) / completedBatches;
976
+ }
977
+ }
978
+
979
+ /**
980
+ * Update statistics
981
+ */
982
+ updateStats() {
983
+ this.stats.lastUpdated = Date.now();
984
+ }
985
+
986
+ /**
987
+ * Utility delay function
988
+ * @param {number} ms - Milliseconds to delay
989
+ * @returns {Promise} Delay promise
990
+ */
991
+ delay(ms) {
992
+ return new Promise(resolve => setTimeout(resolve, ms));
993
+ }
994
+
995
+ /**
996
+ * Log message if logging enabled
997
+ * @param {string} level - Log level
998
+ * @param {string} message - Log message
999
+ */
1000
+ log(level, message) {
1001
+ if (this.enableLogging) {
1002
+ console.log(`[BatchScrapeTool:${level.toUpperCase()}] ${message}`);
1003
+ }
1004
+ }
1005
+
1006
+ /**
1007
+ * Get comprehensive statistics
1008
+ * @returns {Object} Statistics
1009
+ */
1010
+ getStats() {
1011
+ return {
1012
+ ...this.stats,
1013
+ activeBatches: this.activeBatches.size,
1014
+ cachedResults: this.batchResults.size,
1015
+ jobManagerStats: this.jobManager ? this.jobManager.getStats() : null,
1016
+ webhookStats: this.webhookDispatcher ? this.webhookDispatcher.getStats() : null
1017
+ };
1018
+ }
1019
+
1020
+ /**
1021
+ * Cleanup resources
1022
+ */
1023
+ async destroy() {
1024
+ // Cancel active batches
1025
+ for (const batchId of this.activeBatches.keys()) {
1026
+ try {
1027
+ await this.cancelBatch(batchId);
1028
+ } catch (error) {
1029
+ this.log('warn', `Failed to cancel batch ${batchId}: ${error.message}`);
1030
+ }
1031
+ }
1032
+
1033
+ // Clear caches
1034
+ this.activeBatches.clear();
1035
+ this.batchResults.clear();
1036
+
1037
+ // Cleanup job manager if we own it
1038
+ if (this.jobManager) {
1039
+ this.jobManager.destroy();
1040
+ }
1041
+
1042
+ // Cleanup webhook dispatcher if we own it
1043
+ if (this.webhookDispatcher) {
1044
+ this.webhookDispatcher.destroy();
1045
+ }
1046
+
1047
+ // Remove event listeners
1048
+ this.removeAllListeners();
1049
+
1050
+ this.emit('destroyed');
1051
+ }
1052
+ }
1053
+
1054
+ /**
1055
+ * Simple semaphore implementation for concurrency control
1056
+ */
1057
class Semaphore {
  /**
   * @param {number} max - Maximum number of tasks allowed to run at once
   */
  constructor(max) {
    this.max = max;
    this.current = 0; // tasks currently running
    this.queue = [];  // pending { task, resolve, reject } entries, FIFO
  }

  /**
   * Queue a task and run it as soon as a slot is free.
   * @param {Function} task - Zero-argument function; may return a promise or a plain value
   * @returns {Promise<*>} Settles with the task's result, or rejects with its error
   */
  async acquire(task) {
    return new Promise((resolve, reject) => {
      this.queue.push({ task, resolve, reject });
      this.tryNext();
    });
  }

  /**
   * Start the next queued task if a slot is available.
   * Called on every acquire() and every completion, so starting a single
   * task per call is enough to keep all slots busy.
   */
  tryNext() {
    if (this.current >= this.max || this.queue.length === 0) {
      return;
    }

    this.current++;
    const { task, resolve, reject } = this.queue.shift();

    // Normalise the outcome to a promise. A task that throws synchronously
    // or returns a non-thenable would otherwise skip the release below and
    // leak its slot, eventually stalling the whole queue.
    let outcome;
    try {
      outcome = Promise.resolve(task());
    } catch (error) {
      outcome = Promise.reject(error);
    }

    outcome
      .then(resolve, reject)
      .finally(() => {
        // Release the slot and hand it to the next waiter, if any.
        this.current--;
        this.tryNext();
      });
  }
}
1088
+
1089
+ export default BatchScrapeTool;