npm - crawlforge-mcp-server - Versions diffs - 3.0.18 → 3.4.0 - Mend

crawlforge-mcp-server 3.0.18 → 3.4.0

This diff compares the contents of two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions exactly as they appear in their respective public registries.

Files changed (50)

package/package.json +5 -2
package/server.js +192 -1277
package/src/core/ActionExecutor.js +2 -43
package/src/core/AuthManager.js +127 -14
package/src/core/BrowserContextPool.js +187 -0
package/src/core/JobManager.js +7 -5
package/src/core/LocalizationManager.js +14 -125
package/src/core/StealthBrowserManager.js +26 -18
package/src/core/cache/CacheManager.js +4 -1
package/src/core/crawlers/BFSCrawler.js +19 -5
package/src/observability/metrics.js +137 -0
package/src/observability/tracing.js +74 -0
package/src/server/auth/oauth.js +388 -0
package/src/server/registerTool.js +41 -0
package/src/server/schemas/common.js +29 -0
package/src/server/transports/http.js +22 -0
package/src/server/transports/stdio.js +16 -0
package/src/server/transports/streamableHttp.js +226 -0
package/src/server/withAuth.js +121 -0
package/src/tools/advanced/BatchScrapeTool.js +12 -1086
package/src/tools/advanced/ScrapeWithActionsTool.js +105 -19
package/src/tools/advanced/batchScrape/index.js +328 -0
package/src/tools/advanced/batchScrape/queue.js +91 -0
package/src/tools/advanced/batchScrape/reporter.js +26 -0
package/src/tools/advanced/batchScrape/schema.js +37 -0
package/src/tools/advanced/batchScrape/worker.js +179 -0
package/src/tools/advanced/scrapeWithActions/recorder.js +188 -0
package/src/tools/basic/_fetch.js +35 -0
package/src/tools/basic/extractLinks.js +74 -0
package/src/tools/basic/extractMetadata.js +74 -0
package/src/tools/basic/extractText.js +46 -0
package/src/tools/basic/fetchUrl.js +44 -0
package/src/tools/basic/scrapeStructured.js +58 -0
package/src/tools/crawl/_sessionContext.js +234 -0
package/src/tools/crawl/crawlDeep.js +55 -5
package/src/tools/crawl/mapSite.js +23 -2
package/src/tools/extract/_fetchAndParse.js +57 -0
package/src/tools/extract/extractStructured.js +3 -19
package/src/tools/extract/extractWithLlm.js +365 -0
package/src/tools/search/providers/searxng.js +126 -0
package/src/tools/search/ranking/ResultDeduplicator.js +18 -11
package/src/tools/search/ranking/ResultRanker.js +17 -10
package/src/tools/search/ranking/SearchResultCache.js +52 -0
package/src/tools/search/searchWeb.js +112 -6
package/src/tools/tracking/trackChanges/differ.js +98 -0
package/src/tools/tracking/trackChanges/index.js +432 -0
package/src/tools/tracking/trackChanges/monitor.js +93 -0
package/src/tools/tracking/trackChanges/notifier.js +105 -0
package/src/tools/tracking/trackChanges/schema.js +127 -0
package/src/tools/tracking/trackChanges.js +12 -1374

package/src/tools/advanced/ScrapeWithActionsTool.js CHANGED Viewed

@@ -11,6 +11,16 @@ import { load } from 'cheerio';
 // Import existing tool for content extraction
 import ExtractContentTool from '../extract/extractContent.js';
+// Recording / replay helpers
+import {
+  validateRecordingName,
+  saveRecording,
+  loadRecording,
+  listRecordings,
+  buildRecordedEntry,
+  recordedEntryToAction
+} from './scrapeWithActions/recorder.js';
 // Action schemas (re-using from ActionExecutor but with tool-specific additions)
 const BaseActionSchema = z.object({
   type: z.string(),
@@ -102,18 +112,18 @@ const FormFieldSchema = z.object({
 // Main scrape with actions schema
 const ScrapeWithActionsSchema = z.object({
   url: z.string().url(),
-  actions: z.array(ActionSchema).min(1).max(20),
+  actions: z.array(ActionSchema).min(1).max(20).optional(),
   // Output formats
   formats: z.array(z.enum(['markdown', 'html', 'json', 'text', 'screenshots'])).default(['json']),
   // Intermediate state capture
   captureIntermediateStates: z.boolean().default(false),
   captureScreenshots: z.boolean().default(true),
   // Form auto-fill
   formAutoFill: z.record(z.string()).optional(),
   // Browser options
   browserOptions: z.object({
     headless: z.boolean().default(true),
@@ -122,7 +132,7 @@ const ScrapeWithActionsSchema = z.object({
     viewportHeight: z.number().min(600).max(1080).default(720),
     timeout: z.number().min(10000).max(120000).default(30000)
   }).optional(),
   // Content extraction options
   extractionOptions: z.object({
     selectors: z.record(z.string()).optional(),
@@ -130,12 +140,39 @@ const ScrapeWithActionsSchema = z.object({
     includeLinks: z.boolean().default(true),
     includeImages: z.boolean().default(true)
   }).optional(),
   // Error handling
   continueOnActionError: z.boolean().default(false),
   maxRetries: z.number().min(0).max(3).default(1),
-  screenshotOnError: z.boolean().default(true)
-});
+  screenshotOnError: z.boolean().default(true),
+  // ── Recording / replay ──────────────────────────────────────────────────
+  // record: true  → execute actions AND persist them as a named recording.
+  record: z.boolean().default(false),
+  // recordingName: required when record=true; also used to name the saved file.
+  recordingName: z.string().optional(),
+  // replayRecording: load a saved recording by name and execute it.
+  //   Special value "__list__" returns the list of available recordings instead.
+  replayRecording: z.string().optional()
+}).refine(
+  (data) => {
+    // actions is required unless replayRecording is set
+    if (!data.replayRecording && (!data.actions || data.actions.length === 0)) {
+      return false;
+    }
+    return true;
+  },
+  { message: 'actions is required when replayRecording is not set' }
+).refine(
+  (data) => {
+    // recordingName is required when record=true
+    if (data.record && !data.recordingName) {
+      return false;
+    }
+    return true;
+  },
+  { message: 'recordingName is required when record is true' }
+);
 export class ScrapeWithActionsTool extends EventEmitter {
   constructor(options = {}) {
@@ -188,8 +225,29 @@ export class ScrapeWithActionsTool extends EventEmitter {
   async execute(params) {
     try {
+      // ── __list__ shortcut — resolve before full schema parse ─────────────
+      if (params.replayRecording === '__list__') {
+        const recordings = await listRecordings();
+        return { success: true, recordings };
+      }
+      // ── Validate recordingName if provided (path-traversal guard) ─────────
+      if (params.recordingName) {
+        validateRecordingName(params.recordingName);
+      }
+      if (params.replayRecording && params.replayRecording !== '__list__') {
+        validateRecordingName(params.replayRecording);
+      }
       const validated = ScrapeWithActionsSchema.parse(params);
+      // ── Replay mode — load saved recording and substitute actions ─────────
+      if (validated.replayRecording) {
+        const recording = await loadRecording(validated.replayRecording);
+        validated.actions = recording.recordedActions.map(recordedEntryToAction);
+        this.log('info', `Replaying recording "${validated.replayRecording}" with ${validated.actions.length} actions on ${validated.url}`);
+      }
       this.stats.totalSessions++;
       const sessionId = this.generateSessionId();
       const startTime = Date.now();
@@ -270,7 +328,7 @@ export class ScrapeWithActionsTool extends EventEmitter {
     // Build action chain with form auto-fill if provided
     let actionChain = [...params.actions];
     if (params.formAutoFill) {
       actionChain = this.insertFormAutoFillActions(actionChain, params.formAutoFill);
     }
@@ -300,9 +358,32 @@ export class ScrapeWithActionsTool extends EventEmitter {
     sessionContext.actionResults = chainResult.results;
     sessionContext.screenshots = chainResult.screenshots || [];
-    // Process action results
+    // ── Recording mode — persist actions after successful execution ─────────
+    let savedRecordingPath;
+    if (params.record && params.recordingName) {
+      const sessionStartTime = sessionContext.startTime;
+      const recordedActions = actionChain.map((action, index) => {
+        // Use actual result timing if available, otherwise estimate from index
+        const result = (chainResult.results || [])[index];
+        const tMs = result?.timestamp
+          ? result.timestamp - sessionStartTime
+          : index * 100; // fallback estimate
+        return buildRecordedEntry(action, tMs);
+      });
+      try {
+        savedRecordingPath = await saveRecording(params.recordingName, recordedActions, {
+          originalUrl: params.url
+        });
+        this.log('info', `Recording saved: ${savedRecordingPath}`);
+      } catch (err) {
+        this.log('warn', `Failed to save recording: ${err.message}`);
+      }
+    }
+    // Process action results
     const actionResults = this.processActionResults(chainResult.results);
-    const intermediateStates = params.captureIntermediateStates ?
+    const intermediateStates = params.captureIntermediateStates ?
       await this.extractIntermediateStates(actionResults, params) : [];
     // Get final page content after all actions
@@ -331,15 +412,20 @@ export class ScrapeWithActionsTool extends EventEmitter {
       successfulActions: actionResults.filter(r => r.success).length,
       failedActions: actionResults.filter(r => !r.success).length,
       actionsExecuted: actionResults.length, // Total executed (for validation)
       content,
       intermediateStates: params.captureIntermediateStates ? intermediateStates : undefined,
       screenshots: params.captureScreenshots ? sessionContext.screenshots : undefined,
       // Form auto-fill flag (for tests/validation)
       formAutoFillApplied: !!params.formAutoFill,
+      // Recording fields
+      recordingSaved: params.record ? !!savedRecordingPath : undefined,
+      recordingPath: savedRecordingPath || undefined,
+      replayedFrom: params.replayRecording || undefined,
       metadata: {
         browserOptions,
         formAutoFillApplied: !!params.formAutoFill,
@@ -348,10 +434,10 @@ export class ScrapeWithActionsTool extends EventEmitter {
         finalUrl: chainResult.metadata?.finalUrl,
         timestamp: Date.now()
       },
       stats: {
         sessionTime: executionTime,
-        averageActionTime: actionResults.length > 0 ?
+        averageActionTime: actionResults.length > 0 ?
           actionResults.reduce((sum, r) => sum + (r.executionTime || 0), 0) / actionResults.length : 0,
         errorRecoveryCount: actionResults.filter(r => r.recovered).length
       }

package/src/tools/advanced/batchScrape/index.js ADDED Viewed

@@ -0,0 +1,328 @@
+/**
+ * batchScrape — entry-point (index.js).
+ *
+ * Preserves the same exports as the original BatchScrapeTool.js:
+ *   export class BatchScrapeTool
+ *   export default BatchScrapeTool
+ *
+ * Heavy work is delegated to:
+ *   schema.js   — Zod input schema
+ *   worker.js   — per-URL fetch + content extraction
+ *   queue.js    — Semaphore concurrency runner
+ *   reporter.js — webhook notification helper
+ *
+ * Reuses JobManager and WebhookDispatcher from src/core/ (no embedded copies).
+ */
+import { EventEmitter } from 'events';
+import JobManager from '../../../core/JobManager.js';
+import WebhookDispatcher from '../../../core/WebhookDispatcher.js';
+import { BatchScrapeSchema } from './schema.js';
+import { scrapeUrlsBatch, processResults, paginateResults } from './queue.js';
+import { sendWebhookNotification } from './reporter.js';
+export class BatchScrapeTool extends EventEmitter {
+  constructor(options = {}) {
+    super();
+    const {
+      jobManager = null,
+      webhookDispatcher = null,
+      enableJobPersistence = true,
+      enableWebhookNotifications = true,
+      defaultTimeout = 15000,
+      maxBatchSize = 50,
+      enableResultCaching = true,
+      enableLogging = true
+    } = options;
+    this.jobManager = jobManager || new JobManager({
+      enablePersistence: enableJobPersistence,
+      defaultTtl: 24 * 60 * 60 * 1000
+    });
+    this.webhookDispatcher = webhookDispatcher || new WebhookDispatcher({
+      enablePersistence: enableJobPersistence
+    });
+    this.defaultTimeout = defaultTimeout;
+    this.maxBatchSize = maxBatchSize;
+    this.enableResultCaching = enableResultCaching;
+    this.enableLogging = enableLogging;
+    this.enableWebhookNotifications = enableWebhookNotifications;
+    this.activeBatches = new Map();
+    this.batchResults = new Map();
+    this.stats = {
+      totalBatches: 0,
+      completedBatches: 0,
+      failedBatches: 0,
+      totalUrls: 0,
+      successfulUrls: 0,
+      failedUrls: 0,
+      averageBatchTime: 0,
+      lastUpdated: Date.now()
+    };
+    this._initializeJobExecutors();
+  }
+  async execute(params) {
+    try {
+      const validated = BatchScrapeSchema.parse(params);
+      this.stats.totalBatches++;
+      const batchId = this._generateBatchId();
+      const startTime = Date.now();
+      this._log('info', `Starting batch scrape ${batchId} with ${validated.urls.length} URLs in ${validated.mode} mode`);
+      const urlConfigs = this._normalizeUrlConfigs(validated.urls, validated);
+      let webhookConfig = null;
+      if (validated.webhook && this.enableWebhookNotifications) {
+        webhookConfig = this._registerWebhook(validated.webhook, batchId);
+      }
+      if (validated.mode === 'sync') {
+        return await this._processBatchSync(batchId, urlConfigs, validated, webhookConfig, startTime);
+      } else {
+        return await this._processBatchAsync(batchId, urlConfigs, validated, webhookConfig, startTime);
+      }
+    } catch (error) {
+      this.stats.failedBatches++;
+      this._log('error', `Batch scrape failed: ${error.message}`);
+      throw new Error(`Batch scrape failed: ${error.message}`);
+    }
+  }
+  async _processBatchSync(batchId, urlConfigs, validated, webhookConfig, startTime) {
+    try {
+      this.activeBatches.set(batchId, { id: batchId, mode: 'sync', startTime, total: urlConfigs.length, completed: 0 });
+      const rawResults = await scrapeUrlsBatch(urlConfigs, validated, this.defaultTimeout);
+      const processedResults = processResults(rawResults, validated);
+      const executionTime = Date.now() - startTime;
+      this._updateAverageBatchTime(executionTime);
+      const batchResult = {
+        batchId, mode: 'sync', success: true, executionTime,
+        totalUrls: urlConfigs.length,
+        successfulUrls: processedResults.filter(r => r.success).length,
+        failedUrls: processedResults.filter(r => !r.success).length,
+        results: paginateResults(processedResults, 0, validated.pageSize),
+        pagination: {
+          page: 1, pageSize: validated.pageSize,
+          totalResults: processedResults.length,
+          totalPages: Math.ceil(processedResults.length / validated.pageSize)
+        },
+        formats: validated.formats,
+        metadata: { concurrency: validated.maxConcurrency, timestamp: Date.now() }
+      };
+      if (this.enableResultCaching) {
+        this.batchResults.set(batchId, { results: processedResults, timestamp: Date.now(), ttl: 3600000 });
+      }
+      this.stats.completedBatches++;
+      this.stats.totalUrls += urlConfigs.length;
+      this.stats.successfulUrls += batchResult.successfulUrls;
+      this.stats.failedUrls += batchResult.failedUrls;
+      this.stats.lastUpdated = Date.now();
+      this.activeBatches.delete(batchId);
+      await sendWebhookNotification('batch_completed', batchResult, webhookConfig, this.webhookDispatcher, this.enableWebhookNotifications);
+      this.emit('batchCompleted', batchResult);
+      return batchResult;
+    } catch (error) {
+      this.stats.failedBatches++;
+      this.activeBatches.delete(batchId);
+      await sendWebhookNotification('batch_failed', { batchId, error: error.message }, webhookConfig, this.webhookDispatcher, this.enableWebhookNotifications);
+      throw error;
+    }
+  }
+  async _processBatchAsync(batchId, urlConfigs, validated, webhookConfig, startTime) {
+    try {
+      const jobData = { batchId, urlConfigs, validated, webhookConfig, startTime };
+      const jobOptions = {
+        ...validated.jobOptions,
+        webhooks: webhookConfig ? [webhookConfig] : [],
+        tags: ['batch_scrape', batchId, ...(validated.jobOptions?.tags || [])],
+        metadata: { batchId, urlCount: urlConfigs.length, formats: validated.formats }
+      };
+      const job = await this.jobManager.createJob('batch_scrape', jobData, jobOptions);
+      this.jobManager.executeJob(job.id).catch(err => {
+        this._log('error', `Async batch job ${job.id} failed: ${err.message}`);
+      });
+      this.emit('batchJobCreated', job);
+      return {
+        batchId, mode: 'async', jobId: job.id, status: 'queued',
+        totalUrls: urlConfigs.length, createdAt: job.createdAt,
+        estimatedCompletion: new Date(job.createdAt + (urlConfigs.length * 2000)),
+        statusCheckUrl: `batch_scrape_status?jobId=${job.id}`,
+        webhook: webhookConfig ? { url: webhookConfig.url, events: webhookConfig.events } : null
+      };
+    } catch (error) {
+      this.stats.failedBatches++;
+      throw error;
+    }
+  }
+  async getBatchResults(batchId, page = 1, pageSize = 25) {
+    const cached = this.batchResults.get(batchId);
+    if (cached && Date.now() - cached.timestamp < cached.ttl) {
+      const offset = (page - 1) * pageSize;
+      return {
+        batchId, success: true,
+        results: paginateResults(cached.results, offset, pageSize),
+        pagination: { page, pageSize, totalResults: cached.results.length, totalPages: Math.ceil(cached.results.length / pageSize) },
+        cached: true, timestamp: cached.timestamp
+      };
+    }
+    const active = this.activeBatches.get(batchId);
+    if (active) {
+      return {
+        batchId, status: 'in_progress', mode: active.mode,
+        progress: { completed: active.completed, total: active.total, percentage: Math.round((active.completed / active.total) * 100) },
+        startTime: active.startTime, runningTime: Date.now() - active.startTime
+      };
+    }
+    throw new Error(`Batch ${batchId} not found`);
+  }
+  async getJobStatus(jobId) {
+    const job = this.jobManager.getJob(jobId);
+    if (!job) throw new Error(`Job ${jobId} not found`);
+    const status = { jobId, batchId: job.metadata?.batchId, status: job.status, progress: job.progress, createdAt: job.createdAt, startedAt: job.startedAt, completedAt: job.completedAt, error: job.error, metadata: job.metadata };
+    if (job.status === 'completed' && job.result) status.results = job.result;
+    return status;
+  }
+  async cancelBatch(batchId) {
+    if (this.activeBatches.has(batchId)) {
+      this.activeBatches.delete(batchId);
+      return { success: true, message: `Active batch ${batchId} cancelled` };
+    }
+    const jobs = this.jobManager.getJobsByTag(batchId);
+    if (jobs.length > 0) {
+      const job = jobs[0];
+      await this.jobManager.cancelJob(job.id);
+      return { success: true, message: `Job ${job.id} for batch ${batchId} cancelled` };
+    }
+    throw new Error(`Batch ${batchId} not found or already completed`);
+  }
+  getStats() {
+    return {
+      ...this.stats,
+      activeBatches: this.activeBatches.size,
+      cachedResults: this.batchResults.size,
+      jobManagerStats: this.jobManager ? this.jobManager.getStats() : null,
+      webhookStats: this.webhookDispatcher ? this.webhookDispatcher.getStats() : null
+    };
+  }
+  async destroy() {
+    for (const batchId of this.activeBatches.keys()) {
+      try { await this.cancelBatch(batchId); } catch (e) { this._log('warn', `Failed to cancel batch ${batchId}: ${e.message}`); }
+    }
+    this.activeBatches.clear();
+    this.batchResults.clear();
+    this.jobManager?.destroy();
+    this.webhookDispatcher?.destroy();
+    this.removeAllListeners();
+    this.emit('destroyed');
+  }
+  // ── Private helpers ──────────────────────────────────────────────────────────
+  _generateBatchId() {
+    return `batch_${Date.now()}_${Math.random().toString(36).substr(2, 9)}`;
+  }
+  _normalizeUrlConfigs(urls, globalOptions) {
+    return urls.map(url => {
+      if (typeof url === 'string') {
+        return { url, selectors: globalOptions.extractionSchema || {}, headers: {}, timeout: this.defaultTimeout };
+      }
+      return {
+        ...url,
+        selectors: { ...globalOptions.extractionSchema, ...(url.selectors || {}) },
+        headers: url.headers || {},
+        timeout: url.timeout || this.defaultTimeout
+      };
+    });
+  }
+  _registerWebhook(webhookConfig, batchId) {
+    const config = { ...webhookConfig, metadata: { batchId, registeredAt: Date.now() } };
+    return this.webhookDispatcher.registerWebhook(webhookConfig.url, config);
+  }
+  _updateAverageBatchTime(batchTime) {
+    const n = this.stats.completedBatches;
+    this.stats.averageBatchTime = n === 1 ? batchTime : ((this.stats.averageBatchTime * (n - 1)) + batchTime) / n;
+  }
+  _log(level, message) {
+    if (this.enableLogging) console.log(`[BatchScrapeTool:${level.toUpperCase()}] ${message}`);
+  }
+  _initializeJobExecutors() {
+    this.jobManager.registerExecutor('batch_scrape', async (job) => {
+      const { batchId, urlConfigs, validated, webhookConfig, startTime } = job.data;
+      try {
+        await this.jobManager.updateJobProgress(job.id, 0, 'Starting batch scrape');
+        const results = [];
+        const total = urlConfigs.length;
+        for (let i = 0; i < total; i += validated.maxConcurrency) {
+          const batch = urlConfigs.slice(i, i + validated.maxConcurrency);
+          results.push(...await scrapeUrlsBatch(batch, validated, this.defaultTimeout));
+          const progress = Math.round(((i + batch.length) / total) * 100);
+          await this.jobManager.updateJobProgress(job.id, progress, `Processed ${i + batch.length}/${total} URLs`);
+        }
+        const processedResults = processResults(results, validated);
+        const executionTime = Date.now() - startTime;
+        const batchResult = {
+          batchId, mode: 'async', success: true, executionTime,
+          totalUrls: urlConfigs.length,
+          successfulUrls: processedResults.filter(r => r.success).length,
+          failedUrls: processedResults.filter(r => !r.success).length,
+          results: processedResults, formats: validated.formats,
+          metadata: { concurrency: validated.maxConcurrency, timestamp: Date.now(), jobId: job.id }
+        };
+        if (this.enableResultCaching) {
+          this.batchResults.set(batchId, { results: processedResults, timestamp: Date.now(), ttl: 3600000 });
+        }
+        this.stats.completedBatches++;
+        this.stats.totalUrls += urlConfigs.length;
+        this.stats.successfulUrls += batchResult.successfulUrls;
+        this.stats.failedUrls += batchResult.failedUrls;
+        this._updateAverageBatchTime(executionTime);
+        this.stats.lastUpdated = Date.now();
+        await sendWebhookNotification('batch_completed', batchResult, webhookConfig, this.webhookDispatcher, this.enableWebhookNotifications);
+        this.emit('batchCompleted', batchResult);
+        return batchResult;
+      } catch (error) {
+        this.stats.failedBatches++;
+        await sendWebhookNotification('batch_failed', { batchId, error: error.message }, webhookConfig, this.webhookDispatcher, this.enableWebhookNotifications);
+        throw error;
+      }
+    });
+  }
+}
+export default BatchScrapeTool;

package/src/tools/advanced/batchScrape/queue.js ADDED Viewed

@@ -0,0 +1,91 @@
+/**
+ * batchScrape — queue module.
+ * Semaphore-based concurrency runner that dispatches work to worker.js.
+ */
+import { scrapeUrl } from './worker.js';
+/** Semaphore for concurrency limiting. */
+class Semaphore {
+  constructor(max) {
+    this.max = max;
+    this.current = 0;
+    this.queue = [];
+  }
+  async acquire(task) {
+    return new Promise((resolve, reject) => {
+      this.queue.push({ task, resolve, reject });
+      this._tryNext();
+    });
+  }
+  _tryNext() {
+    if (this.current >= this.max || this.queue.length === 0) return;
+    this.current++;
+    const { task, resolve, reject } = this.queue.shift();
+    task()
+      .then(resolve)
+      .catch(reject)
+      .finally(() => {
+        this.current--;
+        this._tryNext();
+      });
+  }
+}
+/**
+ * Scrape a list of URL configs with controlled concurrency and optional delay.
+ * @param {Array}  urlConfigs
+ * @param {Object} options    — { maxConcurrency, delayBetweenRequests, formats, extractionSchema, ... }
+ * @param {number} defaultTimeout
+ * @returns {Promise<Array>} raw results array
+ */
+export async function scrapeUrlsBatch(urlConfigs, options, defaultTimeout) {
+  const semaphore = new Semaphore(options.maxConcurrency);
+  const promises = urlConfigs.map((config, index) =>
+    semaphore.acquire(async () => {
+      if (options.delayBetweenRequests > 0 && index > 0) {
+        await new Promise(r => setTimeout(r, options.delayBetweenRequests));
+      }
+      return scrapeUrl(config, options, defaultTimeout);
+    })
+  );
+  const settled = await Promise.allSettled(promises);
+  return settled.map((result, index) => {
+    if (result.status === 'fulfilled') return result.value;
+    return {
+      success: false,
+      url: urlConfigs[index].url,
+      error: result.reason?.message || 'Unknown error',
+      timestamp: Date.now()
+    };
+  });
+}
+/**
+ * Filter and enrich results according to options.
+ */
+export function processResults(results, options) {
+  let out = [...results];
+  if (!options.includeFailed) out = out.filter(r => r.success);
+  if (options.includeMetadata) {
+    out = out.map(r => ({
+      ...r,
+      processingMetadata: {
+        formats: options.formats,
+        extractionApplied: !!options.extractionSchema,
+        processedAt: Date.now()
+      }
+    }));
+  }
+  return out;
+}
+/** Return a page-sized slice of results. */
+export function paginateResults(results, offset, limit) {
+  return results.slice(offset, offset + limit);
+}

package/src/tools/advanced/batchScrape/reporter.js ADDED Viewed

@@ -0,0 +1,26 @@
+/**
+ * batchScrape — reporter module.
+ * Webhook dispatching helper (thin wrapper around WebhookDispatcher).
+ */
+/**
+ * Send a batch event via the webhookDispatcher.
+ * @param {string}  event
+ * @param {Object}  data
+ * @param {Object}  webhookConfig
+ * @param {Object}  webhookDispatcher
+ * @param {boolean} enabled
+ */
+export async function sendWebhookNotification(event, data, webhookConfig, webhookDispatcher, enabled) {
+  if (!enabled || !webhookConfig || !webhookDispatcher) return;
+  try {
+    await webhookDispatcher.dispatch(event, data, {
+      urls: [webhookConfig.url],
+      immediate: false,
+      metadata: { batchId: data.batchId, timestamp: Date.now() }
+    });
+  } catch (error) {
+    console.warn(`[batchScrape] Webhook notification failed: ${error.message}`);
+  }
+}

package/src/tools/advanced/batchScrape/schema.js ADDED Viewed

@@ -0,0 +1,37 @@
+/**
+ * batchScrape — schema module.
+ */
+import { z } from 'zod';
+export const UrlConfigSchema = z.object({
+  url: z.string().url(),
+  selectors: z.record(z.string()).optional(),
+  headers: z.record(z.string()).optional(),
+  timeout: z.number().min(1000).max(30000).optional(),
+  metadata: z.record(z.any()).optional()
+});
+export const BatchScrapeSchema = z.object({
+  urls: z.array(z.union([z.string().url(), UrlConfigSchema])).min(1).max(50),
+  formats: z.array(z.enum(['markdown', 'html', 'json', 'text'])).default(['json']),
+  mode: z.enum(['sync', 'async']).default('sync'),
+  webhook: z.object({
+    url: z.string().url(),
+    events: z.array(z.string()).optional().default(['batch_completed', 'batch_failed']),
+    headers: z.record(z.string()).optional(),
+    signingSecret: z.string().optional()
+  }).optional(),
+  extractionSchema: z.record(z.string()).optional(),
+  maxConcurrency: z.number().min(1).max(20).default(10),
+  delayBetweenRequests: z.number().min(0).max(10000).default(100),
+  includeMetadata: z.boolean().default(true),
+  includeFailed: z.boolean().default(true),
+  pageSize: z.number().min(1).max(100).default(25),
+  jobOptions: z.object({
+    priority: z.number().default(0),
+    ttl: z.number().min(60000).default(24 * 60 * 60 * 1000),
+    maxRetries: z.number().min(0).max(5).default(1),
+    tags: z.array(z.string()).default([])
+  }).optional()
+});