crawlforge-mcp-server 3.0.18 → 3.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. package/package.json +5 -2
  2. package/server.js +192 -1277
  3. package/src/core/ActionExecutor.js +2 -43
  4. package/src/core/AuthManager.js +127 -14
  5. package/src/core/BrowserContextPool.js +187 -0
  6. package/src/core/JobManager.js +7 -5
  7. package/src/core/LocalizationManager.js +14 -125
  8. package/src/core/StealthBrowserManager.js +26 -18
  9. package/src/core/cache/CacheManager.js +4 -1
  10. package/src/core/crawlers/BFSCrawler.js +19 -5
  11. package/src/observability/metrics.js +137 -0
  12. package/src/observability/tracing.js +74 -0
  13. package/src/server/auth/oauth.js +388 -0
  14. package/src/server/registerTool.js +41 -0
  15. package/src/server/schemas/common.js +29 -0
  16. package/src/server/transports/http.js +22 -0
  17. package/src/server/transports/stdio.js +16 -0
  18. package/src/server/transports/streamableHttp.js +226 -0
  19. package/src/server/withAuth.js +121 -0
  20. package/src/tools/advanced/BatchScrapeTool.js +12 -1086
  21. package/src/tools/advanced/ScrapeWithActionsTool.js +105 -19
  22. package/src/tools/advanced/batchScrape/index.js +328 -0
  23. package/src/tools/advanced/batchScrape/queue.js +91 -0
  24. package/src/tools/advanced/batchScrape/reporter.js +26 -0
  25. package/src/tools/advanced/batchScrape/schema.js +37 -0
  26. package/src/tools/advanced/batchScrape/worker.js +179 -0
  27. package/src/tools/advanced/scrapeWithActions/recorder.js +188 -0
  28. package/src/tools/basic/_fetch.js +35 -0
  29. package/src/tools/basic/extractLinks.js +74 -0
  30. package/src/tools/basic/extractMetadata.js +74 -0
  31. package/src/tools/basic/extractText.js +46 -0
  32. package/src/tools/basic/fetchUrl.js +44 -0
  33. package/src/tools/basic/scrapeStructured.js +58 -0
  34. package/src/tools/crawl/_sessionContext.js +234 -0
  35. package/src/tools/crawl/crawlDeep.js +55 -5
  36. package/src/tools/crawl/mapSite.js +23 -2
  37. package/src/tools/extract/_fetchAndParse.js +57 -0
  38. package/src/tools/extract/extractStructured.js +3 -19
  39. package/src/tools/extract/extractWithLlm.js +365 -0
  40. package/src/tools/search/providers/searxng.js +126 -0
  41. package/src/tools/search/ranking/ResultDeduplicator.js +18 -11
  42. package/src/tools/search/ranking/ResultRanker.js +17 -10
  43. package/src/tools/search/ranking/SearchResultCache.js +52 -0
  44. package/src/tools/search/searchWeb.js +112 -6
  45. package/src/tools/tracking/trackChanges/differ.js +98 -0
  46. package/src/tools/tracking/trackChanges/index.js +432 -0
  47. package/src/tools/tracking/trackChanges/monitor.js +93 -0
  48. package/src/tools/tracking/trackChanges/notifier.js +105 -0
  49. package/src/tools/tracking/trackChanges/schema.js +127 -0
  50. package/src/tools/tracking/trackChanges.js +12 -1374
@@ -11,6 +11,16 @@ import { load } from 'cheerio';
11
11
  // Import existing tool for content extraction
12
12
  import ExtractContentTool from '../extract/extractContent.js';
13
13
 
14
+ // Recording / replay helpers
15
+ import {
16
+ validateRecordingName,
17
+ saveRecording,
18
+ loadRecording,
19
+ listRecordings,
20
+ buildRecordedEntry,
21
+ recordedEntryToAction
22
+ } from './scrapeWithActions/recorder.js';
23
+
14
24
  // Action schemas (re-using from ActionExecutor but with tool-specific additions)
15
25
  const BaseActionSchema = z.object({
16
26
  type: z.string(),
@@ -102,18 +112,18 @@ const FormFieldSchema = z.object({
102
112
  // Main scrape with actions schema
103
113
  const ScrapeWithActionsSchema = z.object({
104
114
  url: z.string().url(),
105
- actions: z.array(ActionSchema).min(1).max(20),
106
-
115
+ actions: z.array(ActionSchema).min(1).max(20).optional(),
116
+
107
117
  // Output formats
108
118
  formats: z.array(z.enum(['markdown', 'html', 'json', 'text', 'screenshots'])).default(['json']),
109
-
119
+
110
120
  // Intermediate state capture
111
121
  captureIntermediateStates: z.boolean().default(false),
112
122
  captureScreenshots: z.boolean().default(true),
113
-
123
+
114
124
  // Form auto-fill
115
125
  formAutoFill: z.record(z.string()).optional(),
116
-
126
+
117
127
  // Browser options
118
128
  browserOptions: z.object({
119
129
  headless: z.boolean().default(true),
@@ -122,7 +132,7 @@ const ScrapeWithActionsSchema = z.object({
122
132
  viewportHeight: z.number().min(600).max(1080).default(720),
123
133
  timeout: z.number().min(10000).max(120000).default(30000)
124
134
  }).optional(),
125
-
135
+
126
136
  // Content extraction options
127
137
  extractionOptions: z.object({
128
138
  selectors: z.record(z.string()).optional(),
@@ -130,12 +140,39 @@ const ScrapeWithActionsSchema = z.object({
130
140
  includeLinks: z.boolean().default(true),
131
141
  includeImages: z.boolean().default(true)
132
142
  }).optional(),
133
-
143
+
134
144
  // Error handling
135
145
  continueOnActionError: z.boolean().default(false),
136
146
  maxRetries: z.number().min(0).max(3).default(1),
137
- screenshotOnError: z.boolean().default(true)
138
- });
147
+ screenshotOnError: z.boolean().default(true),
148
+
149
+ // ── Recording / replay ──────────────────────────────────────────────────
150
+ // record: true → execute actions AND persist them as a named recording.
151
+ record: z.boolean().default(false),
152
+ // recordingName: required when record=true; also used to name the saved file.
153
+ recordingName: z.string().optional(),
154
+ // replayRecording: load a saved recording by name and execute it.
155
+ // Special value "__list__" returns the list of available recordings instead.
156
+ replayRecording: z.string().optional()
157
+ }).refine(
158
+ (data) => {
159
+ // actions is required unless replayRecording is set
160
+ if (!data.replayRecording && (!data.actions || data.actions.length === 0)) {
161
+ return false;
162
+ }
163
+ return true;
164
+ },
165
+ { message: 'actions is required when replayRecording is not set' }
166
+ ).refine(
167
+ (data) => {
168
+ // recordingName is required when record=true
169
+ if (data.record && !data.recordingName) {
170
+ return false;
171
+ }
172
+ return true;
173
+ },
174
+ { message: 'recordingName is required when record is true' }
175
+ );
139
176
 
140
177
  export class ScrapeWithActionsTool extends EventEmitter {
141
178
  constructor(options = {}) {
@@ -188,8 +225,29 @@ export class ScrapeWithActionsTool extends EventEmitter {
188
225
 
189
226
  async execute(params) {
190
227
  try {
228
+ // ── __list__ shortcut — resolve before full schema parse ─────────────
229
+ if (params.replayRecording === '__list__') {
230
+ const recordings = await listRecordings();
231
+ return { success: true, recordings };
232
+ }
233
+
234
+ // ── Validate recordingName if provided (path-traversal guard) ─────────
235
+ if (params.recordingName) {
236
+ validateRecordingName(params.recordingName);
237
+ }
238
+ if (params.replayRecording && params.replayRecording !== '__list__') {
239
+ validateRecordingName(params.replayRecording);
240
+ }
241
+
191
242
  const validated = ScrapeWithActionsSchema.parse(params);
192
-
243
+
244
+ // ── Replay mode — load saved recording and substitute actions ─────────
245
+ if (validated.replayRecording) {
246
+ const recording = await loadRecording(validated.replayRecording);
247
+ validated.actions = recording.recordedActions.map(recordedEntryToAction);
248
+ this.log('info', `Replaying recording "${validated.replayRecording}" with ${validated.actions.length} actions on ${validated.url}`);
249
+ }
250
+
193
251
  this.stats.totalSessions++;
194
252
  const sessionId = this.generateSessionId();
195
253
  const startTime = Date.now();
@@ -270,7 +328,7 @@ export class ScrapeWithActionsTool extends EventEmitter {
270
328
 
271
329
  // Build action chain with form auto-fill if provided
272
330
  let actionChain = [...params.actions];
273
-
331
+
274
332
  if (params.formAutoFill) {
275
333
  actionChain = this.insertFormAutoFillActions(actionChain, params.formAutoFill);
276
334
  }
@@ -300,9 +358,32 @@ export class ScrapeWithActionsTool extends EventEmitter {
300
358
  sessionContext.actionResults = chainResult.results;
301
359
  sessionContext.screenshots = chainResult.screenshots || [];
302
360
 
303
- // Process action results
361
+ // ── Recording mode — persist actions after successful execution ─────────
362
+ let savedRecordingPath;
363
+ if (params.record && params.recordingName) {
364
+ const sessionStartTime = sessionContext.startTime;
365
+ const recordedActions = actionChain.map((action, index) => {
366
+ // Use actual result timing if available, otherwise estimate from index
367
+ const result = (chainResult.results || [])[index];
368
+ const tMs = result?.timestamp
369
+ ? result.timestamp - sessionStartTime
370
+ : index * 100; // fallback estimate
371
+ return buildRecordedEntry(action, tMs);
372
+ });
373
+
374
+ try {
375
+ savedRecordingPath = await saveRecording(params.recordingName, recordedActions, {
376
+ originalUrl: params.url
377
+ });
378
+ this.log('info', `Recording saved: ${savedRecordingPath}`);
379
+ } catch (err) {
380
+ this.log('warn', `Failed to save recording: ${err.message}`);
381
+ }
382
+ }
383
+
384
+ // Process action results
304
385
  const actionResults = this.processActionResults(chainResult.results);
305
- const intermediateStates = params.captureIntermediateStates ?
386
+ const intermediateStates = params.captureIntermediateStates ?
306
387
  await this.extractIntermediateStates(actionResults, params) : [];
307
388
 
308
389
  // Get final page content after all actions
@@ -331,15 +412,20 @@ export class ScrapeWithActionsTool extends EventEmitter {
331
412
  successfulActions: actionResults.filter(r => r.success).length,
332
413
  failedActions: actionResults.filter(r => !r.success).length,
333
414
  actionsExecuted: actionResults.length, // Total executed (for validation)
334
-
415
+
335
416
  content,
336
-
417
+
337
418
  intermediateStates: params.captureIntermediateStates ? intermediateStates : undefined,
338
419
  screenshots: params.captureScreenshots ? sessionContext.screenshots : undefined,
339
-
420
+
340
421
  // Form auto-fill flag (for tests/validation)
341
422
  formAutoFillApplied: !!params.formAutoFill,
342
-
423
+
424
+ // Recording fields
425
+ recordingSaved: params.record ? !!savedRecordingPath : undefined,
426
+ recordingPath: savedRecordingPath || undefined,
427
+ replayedFrom: params.replayRecording || undefined,
428
+
343
429
  metadata: {
344
430
  browserOptions,
345
431
  formAutoFillApplied: !!params.formAutoFill,
@@ -348,10 +434,10 @@ export class ScrapeWithActionsTool extends EventEmitter {
348
434
  finalUrl: chainResult.metadata?.finalUrl,
349
435
  timestamp: Date.now()
350
436
  },
351
-
437
+
352
438
  stats: {
353
439
  sessionTime: executionTime,
354
- averageActionTime: actionResults.length > 0 ?
440
+ averageActionTime: actionResults.length > 0 ?
355
441
  actionResults.reduce((sum, r) => sum + (r.executionTime || 0), 0) / actionResults.length : 0,
356
442
  errorRecoveryCount: actionResults.filter(r => r.recovered).length
357
443
  }
@@ -0,0 +1,328 @@
1
+ /**
2
+ * batchScrape — entry-point (index.js).
3
+ *
4
+ * Preserves the same exports as the original BatchScrapeTool.js:
5
+ * export class BatchScrapeTool
6
+ * export default BatchScrapeTool
7
+ *
8
+ * Heavy work is delegated to:
9
+ * schema.js — Zod input schema
10
+ * worker.js — per-URL fetch + content extraction
11
+ * queue.js — Semaphore concurrency runner
12
+ * reporter.js — webhook notification helper
13
+ *
14
+ * Reuses JobManager and WebhookDispatcher from src/core/ (no embedded copies).
15
+ */
16
+
17
+ import { EventEmitter } from 'events';
18
+ import JobManager from '../../../core/JobManager.js';
19
+ import WebhookDispatcher from '../../../core/WebhookDispatcher.js';
20
+ import { BatchScrapeSchema } from './schema.js';
21
+ import { scrapeUrlsBatch, processResults, paginateResults } from './queue.js';
22
+ import { sendWebhookNotification } from './reporter.js';
23
+
24
/**
 * Scrapes a batch of URLs either inline ("sync" mode) or through a background
 * job ("async" mode).
 *
 * Per-URL work is delegated to `scrapeUrlsBatch`, result shaping to
 * `processResults` / `paginateResults`, and webhook delivery to
 * `sendWebhookNotification`.
 *
 * Events emitted by this class:
 *   - `batchCompleted`  (batchResult) after a sync or async batch finishes
 *   - `batchJobCreated` (job)         after an async job has been created
 *   - `destroyed`                     at the start of listener teardown in destroy()
 */
export class BatchScrapeTool extends EventEmitter {
  /**
   * @param {Object}  [options]
   * @param {Object}  [options.jobManager=null] - Job manager to use; a new JobManager is created when omitted.
   * @param {Object}  [options.webhookDispatcher=null] - Dispatcher to use; a new WebhookDispatcher is created when omitted.
   * @param {boolean} [options.enableJobPersistence=true] - Forwarded as `enablePersistence` to the created managers.
   * @param {boolean} [options.enableWebhookNotifications=true]
   * @param {number}  [options.defaultTimeout=15000] - Per-URL timeout used when a URL config has none.
   * @param {number}  [options.maxBatchSize=50] - Stored on the instance; not otherwise read by this class.
   * @param {boolean} [options.enableResultCaching=true] - Keep processed results in memory for getBatchResults().
   * @param {boolean} [options.enableLogging=true]
   */
  constructor(options = {}) {
    super();

    const {
      jobManager = null,
      webhookDispatcher = null,
      enableJobPersistence = true,
      enableWebhookNotifications = true,
      defaultTimeout = 15000,
      maxBatchSize = 50,
      enableResultCaching = true,
      enableLogging = true
    } = options;

    this.jobManager = jobManager || new JobManager({
      enablePersistence: enableJobPersistence,
      defaultTtl: 24 * 60 * 60 * 1000
    });

    this.webhookDispatcher = webhookDispatcher || new WebhookDispatcher({
      enablePersistence: enableJobPersistence
    });

    this.defaultTimeout = defaultTimeout;
    this.maxBatchSize = maxBatchSize;
    this.enableResultCaching = enableResultCaching;
    this.enableLogging = enableLogging;
    this.enableWebhookNotifications = enableWebhookNotifications;

    // batchId -> progress record for sync batches currently running
    this.activeBatches = new Map();
    // batchId -> { results, timestamp, ttl } for finished batches
    this.batchResults = new Map();

    this.stats = {
      totalBatches: 0,
      completedBatches: 0,
      failedBatches: 0,
      totalUrls: 0,
      successfulUrls: 0,
      failedUrls: 0,
      averageBatchTime: 0,
      lastUpdated: Date.now()
    };

    this._initializeJobExecutors();
  }

  /**
   * Validate the request and run it in the requested mode.
   *
   * @param {Object} params - Raw input; validated with BatchScrapeSchema.
   * @returns {Promise<Object>} The sync batch result, or the async job descriptor.
   * @throws {Error} "Batch scrape failed: <reason>"; the original error is attached as `cause`.
   */
  async execute(params) {
    try {
      const validated = BatchScrapeSchema.parse(params);
      this.stats.totalBatches++;
      const batchId = this._generateBatchId();
      const startTime = Date.now();

      this._log('info', `Starting batch scrape ${batchId} with ${validated.urls.length} URLs in ${validated.mode} mode`);

      const urlConfigs = this._normalizeUrlConfigs(validated.urls, validated);

      let webhookConfig = null;
      if (validated.webhook && this.enableWebhookNotifications) {
        webhookConfig = this._registerWebhook(validated.webhook, batchId);
      }

      if (validated.mode === 'sync') {
        return await this._processBatchSync(batchId, urlConfigs, validated, webhookConfig, startTime);
      }
      return await this._processBatchAsync(batchId, urlConfigs, validated, webhookConfig, startTime);
    } catch (error) {
      // This is the single place where a failed execute() call is counted.
      // The mode-specific helpers rethrow without counting, so one failure
      // is never recorded twice.
      this.stats.failedBatches++;
      this._log('error', `Batch scrape failed: ${error.message}`);
      throw new Error(`Batch scrape failed: ${error.message}`, { cause: error });
    }
  }

  /**
   * Run a batch inline and return the first page of results.
   * Failure counting is left to execute(); this method only cleans up,
   * sends the `batch_failed` webhook, and rethrows.
   *
   * @returns {Promise<Object>} Batch result with the first page of results and pagination info.
   */
  async _processBatchSync(batchId, urlConfigs, validated, webhookConfig, startTime) {
    try {
      this.activeBatches.set(batchId, { id: batchId, mode: 'sync', startTime, total: urlConfigs.length, completed: 0 });

      const rawResults = await scrapeUrlsBatch(urlConfigs, validated, this.defaultTimeout);
      const processedResults = processResults(rawResults, validated);
      const executionTime = Date.now() - startTime;

      const batchResult = {
        batchId, mode: 'sync', success: true, executionTime,
        totalUrls: urlConfigs.length,
        successfulUrls: processedResults.filter(r => r.success).length,
        failedUrls: processedResults.filter(r => !r.success).length,
        results: paginateResults(processedResults, 0, validated.pageSize),
        pagination: {
          page: 1, pageSize: validated.pageSize,
          totalResults: processedResults.length,
          totalPages: Math.ceil(processedResults.length / validated.pageSize)
        },
        formats: validated.formats,
        metadata: { concurrency: validated.maxConcurrency, timestamp: Date.now() }
      };

      if (this.enableResultCaching) {
        this.batchResults.set(batchId, { results: processedResults, timestamp: Date.now(), ttl: 3600000 });
      }

      this.stats.completedBatches++;
      this.stats.totalUrls += urlConfigs.length;
      this.stats.successfulUrls += batchResult.successfulUrls;
      this.stats.failedUrls += batchResult.failedUrls;
      // Must run after completedBatches++: the running average divides by
      // the number of completed batches, this one included.
      this._updateAverageBatchTime(executionTime);
      this.stats.lastUpdated = Date.now();
      this.activeBatches.delete(batchId);

      await sendWebhookNotification('batch_completed', batchResult, webhookConfig, this.webhookDispatcher, this.enableWebhookNotifications);
      this.emit('batchCompleted', batchResult);
      return batchResult;
    } catch (error) {
      this.activeBatches.delete(batchId);
      await sendWebhookNotification('batch_failed', { batchId, error: error.message }, webhookConfig, this.webhookDispatcher, this.enableWebhookNotifications);
      throw error;
    }
  }

  /**
   * Create a `batch_scrape` job, start it without awaiting it, and return a
   * descriptor the caller can use to poll for status.
   * Errors from job creation propagate to execute(), which counts them.
   *
   * @returns {Promise<Object>} Descriptor with batchId, jobId, status 'queued' and webhook summary.
   */
  async _processBatchAsync(batchId, urlConfigs, validated, webhookConfig, startTime) {
    const jobData = { batchId, urlConfigs, validated, webhookConfig, startTime };
    const jobOptions = {
      ...validated.jobOptions,
      webhooks: webhookConfig ? [webhookConfig] : [],
      tags: ['batch_scrape', batchId, ...(validated.jobOptions?.tags || [])],
      metadata: { batchId, urlCount: urlConfigs.length, formats: validated.formats }
    };

    const job = await this.jobManager.createJob('batch_scrape', jobData, jobOptions);
    // Deliberately not awaited: the job runs in the background and its
    // failure is only logged here.
    this.jobManager.executeJob(job.id).catch(err => {
      this._log('error', `Async batch job ${job.id} failed: ${err.message}`);
    });

    this.emit('batchJobCreated', job);

    return {
      batchId, mode: 'async', jobId: job.id, status: 'queued',
      totalUrls: urlConfigs.length, createdAt: job.createdAt,
      // Rough estimate of 2 seconds per URL.
      // NOTE(review): assumes job.createdAt is an epoch-milliseconds number — confirm against JobManager.
      estimatedCompletion: new Date(job.createdAt + (urlConfigs.length * 2000)),
      statusCheckUrl: `batch_scrape_status?jobId=${job.id}`,
      webhook: webhookConfig ? { url: webhookConfig.url, events: webhookConfig.events } : null
    };
  }

  /**
   * Return one page of cached results, or progress for a batch still running.
   *
   * @param {string} batchId
   * @param {number} [page=1] - 1-based page number.
   * @param {number} [pageSize=25]
   * @returns {Promise<Object>}
   * @throws {Error} When the batch is neither cached (and unexpired) nor active.
   */
  async getBatchResults(batchId, page = 1, pageSize = 25) {
    const cached = this.batchResults.get(batchId);
    if (cached) {
      if (Date.now() - cached.timestamp < cached.ttl) {
        const offset = (page - 1) * pageSize;
        return {
          batchId, success: true,
          results: paginateResults(cached.results, offset, pageSize),
          pagination: { page, pageSize, totalResults: cached.results.length, totalPages: Math.ceil(cached.results.length / pageSize) },
          cached: true, timestamp: cached.timestamp
        };
      }
      // Expired entries are dropped on access so they do not accumulate.
      this.batchResults.delete(batchId);
    }

    const active = this.activeBatches.get(batchId);
    if (active) {
      return {
        batchId, status: 'in_progress', mode: active.mode,
        progress: { completed: active.completed, total: active.total, percentage: Math.round((active.completed / active.total) * 100) },
        startTime: active.startTime, runningTime: Date.now() - active.startTime
      };
    }

    throw new Error(`Batch ${batchId} not found`);
  }

  /**
   * Report the state of an async job as known to the job manager.
   *
   * @param {string} jobId
   * @returns {Promise<Object>} Status record; `results` is included once the job has completed with a result.
   * @throws {Error} When the job manager does not know the job.
   */
  async getJobStatus(jobId) {
    const job = this.jobManager.getJob(jobId);
    if (!job) throw new Error(`Job ${jobId} not found`);
    const status = { jobId, batchId: job.metadata?.batchId, status: job.status, progress: job.progress, createdAt: job.createdAt, startedAt: job.startedAt, completedAt: job.completedAt, error: job.error, metadata: job.metadata };
    if (job.status === 'completed' && job.result) status.results = job.result;
    return status;
  }

  /**
   * Cancel a batch by id. An active sync batch is only removed from the
   * tracking map; otherwise the first job tagged with the batch id is
   * cancelled through the job manager.
   *
   * @param {string} batchId
   * @returns {Promise<{success: boolean, message: string}>}
   * @throws {Error} When no active batch or tagged job is found.
   */
  async cancelBatch(batchId) {
    if (this.activeBatches.has(batchId)) {
      this.activeBatches.delete(batchId);
      return { success: true, message: `Active batch ${batchId} cancelled` };
    }
    const jobs = this.jobManager.getJobsByTag(batchId);
    if (jobs.length > 0) {
      const job = jobs[0];
      await this.jobManager.cancelJob(job.id);
      return { success: true, message: `Job ${job.id} for batch ${batchId} cancelled` };
    }
    throw new Error(`Batch ${batchId} not found or already completed`);
  }

  /**
   * @returns {Object} Counters plus the current map sizes and the stats of the job manager and webhook dispatcher.
   */
  getStats() {
    return {
      ...this.stats,
      activeBatches: this.activeBatches.size,
      cachedResults: this.batchResults.size,
      jobManagerStats: this.jobManager ? this.jobManager.getStats() : null,
      webhookStats: this.webhookDispatcher ? this.webhookDispatcher.getStats() : null
    };
  }

  /**
   * Cancel active batches, clear caches, destroy the collaborators, then
   * emit `destroyed` and remove all listeners.
   */
  async destroy() {
    for (const batchId of this.activeBatches.keys()) {
      try {
        await this.cancelBatch(batchId);
      } catch (e) {
        this._log('warn', `Failed to cancel batch ${batchId}: ${e.message}`);
      }
    }
    this.activeBatches.clear();
    this.batchResults.clear();
    this.jobManager?.destroy();
    this.webhookDispatcher?.destroy();
    // Emit before removing listeners; the reverse order would notify nobody.
    this.emit('destroyed');
    this.removeAllListeners();
  }

  // ── Private helpers ──────────────────────────────────────────────────────────

  /** @returns {string} Identifier of the form `batch_<epoch ms>_<up to 9 base-36 chars>`. */
  _generateBatchId() {
    return `batch_${Date.now()}_${Math.random().toString(36).slice(2, 11)}`;
  }

  /**
   * Turn the mixed string/object URL list into uniform config objects.
   * Per-URL selectors override the global extraction schema key by key.
   *
   * @param {Array<string|Object>} urls
   * @param {Object} globalOptions - Validated request; only `extractionSchema` is read.
   * @returns {Array<Object>}
   */
  _normalizeUrlConfigs(urls, globalOptions) {
    return urls.map(url => {
      if (typeof url === 'string') {
        return { url, selectors: globalOptions.extractionSchema || {}, headers: {}, timeout: this.defaultTimeout };
      }
      return {
        ...url,
        selectors: { ...globalOptions.extractionSchema, ...(url.selectors || {}) },
        headers: url.headers || {},
        timeout: url.timeout || this.defaultTimeout
      };
    });
  }

  /** Register the request's webhook with the dispatcher, tagging it with the batch id. */
  _registerWebhook(webhookConfig, batchId) {
    const config = { ...webhookConfig, metadata: { batchId, registeredAt: Date.now() } };
    return this.webhookDispatcher.registerWebhook(webhookConfig.url, config);
  }

  /**
   * Fold one batch duration into the running average.
   * Expects `stats.completedBatches` to already include the batch being added.
   *
   * @param {number} batchTime - Duration of the batch in milliseconds.
   */
  _updateAverageBatchTime(batchTime) {
    const n = this.stats.completedBatches;
    // n <= 1 also covers a call made before the counter was incremented,
    // which would otherwise divide by zero.
    this.stats.averageBatchTime = n <= 1 ? batchTime : ((this.stats.averageBatchTime * (n - 1)) + batchTime) / n;
  }

  /** Write a prefixed log line when logging is enabled. */
  _log(level, message) {
    if (this.enableLogging) console.log(`[BatchScrapeTool:${level.toUpperCase()}] ${message}`);
  }

  /**
   * Register the `batch_scrape` executor with the job manager. The executor
   * processes the URL list in slices of `maxConcurrency`, reporting progress
   * after each slice, and updates stats, cache and webhooks on completion.
   */
  _initializeJobExecutors() {
    this.jobManager.registerExecutor('batch_scrape', async (job) => {
      const { batchId, urlConfigs, validated, webhookConfig, startTime } = job.data;
      try {
        await this.jobManager.updateJobProgress(job.id, 0, 'Starting batch scrape');

        const results = [];
        const total = urlConfigs.length;

        for (let i = 0; i < total; i += validated.maxConcurrency) {
          const batch = urlConfigs.slice(i, i + validated.maxConcurrency);
          results.push(...await scrapeUrlsBatch(batch, validated, this.defaultTimeout));
          const progress = Math.round(((i + batch.length) / total) * 100);
          await this.jobManager.updateJobProgress(job.id, progress, `Processed ${i + batch.length}/${total} URLs`);
        }

        const processedResults = processResults(results, validated);
        const executionTime = Date.now() - startTime;

        const batchResult = {
          batchId, mode: 'async', success: true, executionTime,
          totalUrls: urlConfigs.length,
          successfulUrls: processedResults.filter(r => r.success).length,
          failedUrls: processedResults.filter(r => !r.success).length,
          results: processedResults, formats: validated.formats,
          metadata: { concurrency: validated.maxConcurrency, timestamp: Date.now(), jobId: job.id }
        };

        if (this.enableResultCaching) {
          this.batchResults.set(batchId, { results: processedResults, timestamp: Date.now(), ttl: 3600000 });
        }

        this.stats.completedBatches++;
        this.stats.totalUrls += urlConfigs.length;
        this.stats.successfulUrls += batchResult.successfulUrls;
        this.stats.failedUrls += batchResult.failedUrls;
        this._updateAverageBatchTime(executionTime);
        this.stats.lastUpdated = Date.now();

        await sendWebhookNotification('batch_completed', batchResult, webhookConfig, this.webhookDispatcher, this.enableWebhookNotifications);
        this.emit('batchCompleted', batchResult);
        return batchResult;
      } catch (error) {
        // The job runs outside execute()'s try/catch, so the failure is counted here.
        this.stats.failedBatches++;
        await sendWebhookNotification('batch_failed', { batchId, error: error.message }, webhookConfig, this.webhookDispatcher, this.enableWebhookNotifications);
        throw error;
      }
    });
  }
}
327
+
328
+ export default BatchScrapeTool;
@@ -0,0 +1,91 @@
1
+ /**
2
+ * batchScrape — queue module.
3
+ * Semaphore-based concurrency runner that dispatches work to worker.js.
4
+ */
5
+
6
+ import { scrapeUrl } from './worker.js';
7
+
8
/**
 * Counting semaphore that runs queued tasks with at most `max` in flight.
 * Tasks are started in the order they were submitted.
 */
class Semaphore {
  /**
   * @param {number} max - Maximum number of tasks allowed to run at once.
   */
  constructor(max) {
    this.max = max;
    this.current = 0;
    this.queue = [];
  }

  /**
   * Queue a task and run it as soon as a slot is free.
   *
   * @param {Function} task - Zero-argument function; may return a value or a promise, or throw.
   * @returns {Promise<*>} Settles with the task's result or its error.
   */
  acquire(task) {
    return new Promise((resolve, reject) => {
      this.queue.push({ task, resolve, reject });
      this._tryNext();
    });
  }

  /** Start the next queued task if a slot is available. */
  _tryNext() {
    if (this.current >= this.max || this.queue.length === 0) return;
    this.current++;
    const { task, resolve, reject } = this.queue.shift();
    // Call the task inside a promise executor so that a synchronous throw
    // or a non-promise return value still flows through the chain below.
    // Calling task() directly would skip `finally` on a synchronous throw
    // and leave the slot permanently occupied.
    new Promise((settle) => settle(task()))
      .then(resolve)
      .catch(reject)
      .finally(() => {
        this.current--;
        this._tryNext();
      });
  }
}
36
+
37
/**
 * Run `scrapeUrl` over every URL config, limiting how many run at once and
 * optionally pausing before each request except the first.
 *
 * A rejected scrape never rejects the whole call: it is converted into a
 * `{ success: false, url, error, timestamp }` record at the same position.
 *
 * @param {Array} urlConfigs - Objects with at least a `url` property.
 * @param {Object} options — reads `maxConcurrency` and `delayBetweenRequests`; passed through to scrapeUrl.
 * @param {number} defaultTimeout - Passed through to scrapeUrl.
 * @returns {Promise<Array>} One raw result per input config, in input order.
 */
export async function scrapeUrlsBatch(urlConfigs, options, defaultTimeout) {
  const limiter = new Semaphore(options.maxConcurrency);

  const pause = () =>
    new Promise((wake) => setTimeout(wake, options.delayBetweenRequests));

  const pending = urlConfigs.map((urlConfig, position) => {
    const runOne = async () => {
      const shouldPause = options.delayBetweenRequests > 0 && position > 0;
      if (shouldPause) {
        await pause();
      }
      return scrapeUrl(urlConfig, options, defaultTimeout);
    };
    return limiter.acquire(runOne);
  });

  const outcomes = await Promise.allSettled(pending);

  return outcomes.map((outcome, position) => {
    if (outcome.status !== 'fulfilled') {
      return {
        success: false,
        url: urlConfigs[position].url,
        error: outcome.reason?.message || 'Unknown error',
        timestamp: Date.now()
      };
    }
    return outcome.value;
  });
}
68
+
69
/**
 * Shape raw scrape results for output.
 *
 * Drops unsuccessful entries unless `options.includeFailed` is set, and when
 * `options.includeMetadata` is set attaches a `processingMetadata` object to
 * every remaining entry. The input array is not modified.
 *
 * @param {Iterable<Object>} results - Raw results, each with a `success` flag.
 * @param {Object} options - Reads `includeFailed`, `includeMetadata`, `formats`, `extractionSchema`.
 * @returns {Array<Object>} New array of (possibly enriched) results.
 */
export function processResults(results, options) {
  const everything = [...results];
  const kept = options.includeFailed
    ? everything
    : everything.filter((entry) => entry.success);

  if (!options.includeMetadata) {
    return kept;
  }

  const describe = () => ({
    formats: options.formats,
    extractionApplied: !!options.extractionSchema,
    processedAt: Date.now()
  });

  return kept.map((entry) => ({ ...entry, processingMetadata: describe() }));
}
87
+
88
/**
 * Take one page out of a result list.
 *
 * @param {Array} results - Full result list.
 * @param {number} offset - Index of the first item on the page.
 * @param {number} limit - Maximum number of items on the page.
 * @returns {Array} New array holding up to `limit` items starting at `offset`.
 */
export function paginateResults(results, offset, limit) {
  const pageEnd = offset + limit;
  return results.slice(offset, pageEnd);
}
@@ -0,0 +1,26 @@
1
+ /**
2
+ * batchScrape — reporter module.
3
+ * Webhook dispatching helper (thin wrapper around WebhookDispatcher).
4
+ */
5
+
6
/**
 * Dispatch a batch event to the batch's webhook URL, best-effort.
 *
 * Does nothing when notifications are disabled or when either the webhook
 * config or the dispatcher is missing. A dispatch failure is logged with
 * console.warn and never propagated to the caller.
 *
 * @param {string} event - Event name passed to the dispatcher.
 * @param {Object} data - Event payload; `data.batchId` is copied into the dispatch metadata.
 * @param {Object} webhookConfig - Only `url` is read.
 * @param {Object} webhookDispatcher - Object exposing `dispatch(event, data, options)`.
 * @param {boolean} enabled - Master switch for notifications.
 * @returns {Promise<void>}
 */
export async function sendWebhookNotification(event, data, webhookConfig, webhookDispatcher, enabled) {
  if (!enabled) return;
  if (!webhookConfig) return;
  if (!webhookDispatcher) return;

  try {
    const dispatchOptions = {
      urls: [webhookConfig.url],
      immediate: false,
      metadata: { batchId: data.batchId, timestamp: Date.now() }
    };
    await webhookDispatcher.dispatch(event, data, dispatchOptions);
  } catch (error) {
    console.warn(`[batchScrape] Webhook notification failed: ${error.message}`);
  }
}
@@ -0,0 +1,37 @@
1
+ /**
2
+ * batchScrape — schema module.
3
+ */
4
+
5
+ import { z } from 'zod';
6
+
7
/**
 * Schema for a single URL entry given in object form.
 * Only `url` is required; `timeout` must lie between 1000 and 30000.
 */
const urlConfigShape = {
  url: z.string().url(),
  selectors: z.record(z.string()).optional(),
  headers: z.record(z.string()).optional(),
  timeout: z.number().min(1000).max(30000).optional(),
  metadata: z.record(z.any()).optional()
};

export const UrlConfigSchema = z.object(urlConfigShape);
14
+
15
/** Webhook settings accepted on a batch request; `events` defaults to both batch events. */
const batchWebhookSchema = z.object({
  url: z.string().url(),
  events: z.array(z.string()).optional().default(['batch_completed', 'batch_failed']),
  headers: z.record(z.string()).optional(),
  signingSecret: z.string().optional()
});

/** Job settings accepted on a batch request; every field has a default. */
const batchJobOptionsSchema = z.object({
  priority: z.number().default(0),
  ttl: z.number().min(60000).default(24 * 60 * 60 * 1000),
  maxRetries: z.number().min(0).max(5).default(1),
  tags: z.array(z.string()).default([])
});

/**
 * Input schema for a batch scrape request.
 * Accepts 1 to 50 URLs, each either a URL string or a UrlConfigSchema object.
 * Defaults: formats ['json'], mode 'sync', maxConcurrency 10,
 * delayBetweenRequests 100, includeMetadata true, includeFailed true,
 * pageSize 25.
 */
export const BatchScrapeSchema = z.object({
  urls: z.array(z.union([z.string().url(), UrlConfigSchema])).min(1).max(50),
  formats: z.array(z.enum(['markdown', 'html', 'json', 'text'])).default(['json']),
  mode: z.enum(['sync', 'async']).default('sync'),
  webhook: batchWebhookSchema.optional(),
  extractionSchema: z.record(z.string()).optional(),
  maxConcurrency: z.number().min(1).max(20).default(10),
  delayBetweenRequests: z.number().min(0).max(10000).default(100),
  includeMetadata: z.boolean().default(true),
  includeFailed: z.boolean().default(true),
  pageSize: z.number().min(1).max(100).default(25),
  jobOptions: batchJobOptionsSchema.optional()
});