crawlforge-mcp-server 3.0.17 → 3.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +2 -0
- package/README.md +1 -0
- package/package.json +6 -2
- package/server.js +192 -1277
- package/src/constants/config.js +2 -1
- package/src/core/ActionExecutor.js +2 -43
- package/src/core/AuthManager.js +230 -32
- package/src/core/BrowserContextPool.js +187 -0
- package/src/core/JobManager.js +7 -5
- package/src/core/LocalizationManager.js +14 -125
- package/src/core/ResearchOrchestrator.js +86 -5
- package/src/core/StealthBrowserManager.js +26 -18
- package/src/core/cache/CacheManager.js +4 -1
- package/src/core/crawlers/BFSCrawler.js +19 -5
- package/src/core/endpointGuard.js +37 -0
- package/src/observability/metrics.js +137 -0
- package/src/observability/tracing.js +74 -0
- package/src/server/auth/oauth.js +388 -0
- package/src/server/registerTool.js +41 -0
- package/src/server/schemas/common.js +29 -0
- package/src/server/transports/http.js +22 -0
- package/src/server/transports/stdio.js +16 -0
- package/src/server/transports/streamableHttp.js +226 -0
- package/src/server/withAuth.js +121 -0
- package/src/tools/advanced/BatchScrapeTool.js +12 -1086
- package/src/tools/advanced/ScrapeWithActionsTool.js +105 -19
- package/src/tools/advanced/batchScrape/index.js +328 -0
- package/src/tools/advanced/batchScrape/queue.js +91 -0
- package/src/tools/advanced/batchScrape/reporter.js +26 -0
- package/src/tools/advanced/batchScrape/schema.js +37 -0
- package/src/tools/advanced/batchScrape/worker.js +179 -0
- package/src/tools/advanced/scrapeWithActions/recorder.js +188 -0
- package/src/tools/basic/_fetch.js +35 -0
- package/src/tools/basic/extractLinks.js +74 -0
- package/src/tools/basic/extractMetadata.js +74 -0
- package/src/tools/basic/extractText.js +46 -0
- package/src/tools/basic/fetchUrl.js +44 -0
- package/src/tools/basic/scrapeStructured.js +58 -0
- package/src/tools/crawl/_sessionContext.js +234 -0
- package/src/tools/crawl/crawlDeep.js +55 -5
- package/src/tools/crawl/mapSite.js +23 -2
- package/src/tools/extract/_fetchAndParse.js +57 -0
- package/src/tools/extract/extractStructured.js +3 -19
- package/src/tools/extract/extractWithLlm.js +295 -0
- package/src/tools/research/deepResearch.js +33 -8
- package/src/tools/search/providers/searxng.js +126 -0
- package/src/tools/search/ranking/ResultDeduplicator.js +18 -11
- package/src/tools/search/ranking/ResultRanker.js +17 -10
- package/src/tools/search/ranking/SearchResultCache.js +52 -0
- package/src/tools/search/searchWeb.js +112 -6
- package/src/tools/tracking/trackChanges/differ.js +98 -0
- package/src/tools/tracking/trackChanges/index.js +432 -0
- package/src/tools/tracking/trackChanges/monitor.js +93 -0
- package/src/tools/tracking/trackChanges/notifier.js +105 -0
- package/src/tools/tracking/trackChanges/schema.js +127 -0
- package/src/tools/tracking/trackChanges.js +12 -1374
|
@@ -11,6 +11,16 @@ import { load } from 'cheerio';
|
|
|
11
11
|
// Import existing tool for content extraction
|
|
12
12
|
import ExtractContentTool from '../extract/extractContent.js';
|
|
13
13
|
|
|
14
|
+
// Recording / replay helpers
|
|
15
|
+
import {
|
|
16
|
+
validateRecordingName,
|
|
17
|
+
saveRecording,
|
|
18
|
+
loadRecording,
|
|
19
|
+
listRecordings,
|
|
20
|
+
buildRecordedEntry,
|
|
21
|
+
recordedEntryToAction
|
|
22
|
+
} from './scrapeWithActions/recorder.js';
|
|
23
|
+
|
|
14
24
|
// Action schemas (re-using from ActionExecutor but with tool-specific additions)
|
|
15
25
|
const BaseActionSchema = z.object({
|
|
16
26
|
type: z.string(),
|
|
@@ -102,18 +112,18 @@ const FormFieldSchema = z.object({
|
|
|
102
112
|
// Main scrape with actions schema
|
|
103
113
|
const ScrapeWithActionsSchema = z.object({
|
|
104
114
|
url: z.string().url(),
|
|
105
|
-
actions: z.array(ActionSchema).min(1).max(20),
|
|
106
|
-
|
|
115
|
+
actions: z.array(ActionSchema).min(1).max(20).optional(),
|
|
116
|
+
|
|
107
117
|
// Output formats
|
|
108
118
|
formats: z.array(z.enum(['markdown', 'html', 'json', 'text', 'screenshots'])).default(['json']),
|
|
109
|
-
|
|
119
|
+
|
|
110
120
|
// Intermediate state capture
|
|
111
121
|
captureIntermediateStates: z.boolean().default(false),
|
|
112
122
|
captureScreenshots: z.boolean().default(true),
|
|
113
|
-
|
|
123
|
+
|
|
114
124
|
// Form auto-fill
|
|
115
125
|
formAutoFill: z.record(z.string()).optional(),
|
|
116
|
-
|
|
126
|
+
|
|
117
127
|
// Browser options
|
|
118
128
|
browserOptions: z.object({
|
|
119
129
|
headless: z.boolean().default(true),
|
|
@@ -122,7 +132,7 @@ const ScrapeWithActionsSchema = z.object({
|
|
|
122
132
|
viewportHeight: z.number().min(600).max(1080).default(720),
|
|
123
133
|
timeout: z.number().min(10000).max(120000).default(30000)
|
|
124
134
|
}).optional(),
|
|
125
|
-
|
|
135
|
+
|
|
126
136
|
// Content extraction options
|
|
127
137
|
extractionOptions: z.object({
|
|
128
138
|
selectors: z.record(z.string()).optional(),
|
|
@@ -130,12 +140,39 @@ const ScrapeWithActionsSchema = z.object({
|
|
|
130
140
|
includeLinks: z.boolean().default(true),
|
|
131
141
|
includeImages: z.boolean().default(true)
|
|
132
142
|
}).optional(),
|
|
133
|
-
|
|
143
|
+
|
|
134
144
|
// Error handling
|
|
135
145
|
continueOnActionError: z.boolean().default(false),
|
|
136
146
|
maxRetries: z.number().min(0).max(3).default(1),
|
|
137
|
-
screenshotOnError: z.boolean().default(true)
|
|
138
|
-
|
|
147
|
+
screenshotOnError: z.boolean().default(true),
|
|
148
|
+
|
|
149
|
+
// ── Recording / replay ──────────────────────────────────────────────────
|
|
150
|
+
// record: true → execute actions AND persist them as a named recording.
|
|
151
|
+
record: z.boolean().default(false),
|
|
152
|
+
// recordingName: required when record=true; also used to name the saved file.
|
|
153
|
+
recordingName: z.string().optional(),
|
|
154
|
+
// replayRecording: load a saved recording by name and execute it.
|
|
155
|
+
// Special value "__list__" returns the list of available recordings instead.
|
|
156
|
+
replayRecording: z.string().optional()
|
|
157
|
+
}).refine(
|
|
158
|
+
(data) => {
|
|
159
|
+
// actions is required unless replayRecording is set
|
|
160
|
+
if (!data.replayRecording && (!data.actions || data.actions.length === 0)) {
|
|
161
|
+
return false;
|
|
162
|
+
}
|
|
163
|
+
return true;
|
|
164
|
+
},
|
|
165
|
+
{ message: 'actions is required when replayRecording is not set' }
|
|
166
|
+
).refine(
|
|
167
|
+
(data) => {
|
|
168
|
+
// recordingName is required when record=true
|
|
169
|
+
if (data.record && !data.recordingName) {
|
|
170
|
+
return false;
|
|
171
|
+
}
|
|
172
|
+
return true;
|
|
173
|
+
},
|
|
174
|
+
{ message: 'recordingName is required when record is true' }
|
|
175
|
+
);
|
|
139
176
|
|
|
140
177
|
export class ScrapeWithActionsTool extends EventEmitter {
|
|
141
178
|
constructor(options = {}) {
|
|
@@ -188,8 +225,29 @@ export class ScrapeWithActionsTool extends EventEmitter {
|
|
|
188
225
|
|
|
189
226
|
async execute(params) {
|
|
190
227
|
try {
|
|
228
|
+
// ── __list__ shortcut — resolve before full schema parse ─────────────
|
|
229
|
+
if (params.replayRecording === '__list__') {
|
|
230
|
+
const recordings = await listRecordings();
|
|
231
|
+
return { success: true, recordings };
|
|
232
|
+
}
|
|
233
|
+
|
|
234
|
+
// ── Validate recordingName if provided (path-traversal guard) ─────────
|
|
235
|
+
if (params.recordingName) {
|
|
236
|
+
validateRecordingName(params.recordingName);
|
|
237
|
+
}
|
|
238
|
+
if (params.replayRecording && params.replayRecording !== '__list__') {
|
|
239
|
+
validateRecordingName(params.replayRecording);
|
|
240
|
+
}
|
|
241
|
+
|
|
191
242
|
const validated = ScrapeWithActionsSchema.parse(params);
|
|
192
|
-
|
|
243
|
+
|
|
244
|
+
// ── Replay mode — load saved recording and substitute actions ─────────
|
|
245
|
+
if (validated.replayRecording) {
|
|
246
|
+
const recording = await loadRecording(validated.replayRecording);
|
|
247
|
+
validated.actions = recording.recordedActions.map(recordedEntryToAction);
|
|
248
|
+
this.log('info', `Replaying recording "${validated.replayRecording}" with ${validated.actions.length} actions on ${validated.url}`);
|
|
249
|
+
}
|
|
250
|
+
|
|
193
251
|
this.stats.totalSessions++;
|
|
194
252
|
const sessionId = this.generateSessionId();
|
|
195
253
|
const startTime = Date.now();
|
|
@@ -270,7 +328,7 @@ export class ScrapeWithActionsTool extends EventEmitter {
|
|
|
270
328
|
|
|
271
329
|
// Build action chain with form auto-fill if provided
|
|
272
330
|
let actionChain = [...params.actions];
|
|
273
|
-
|
|
331
|
+
|
|
274
332
|
if (params.formAutoFill) {
|
|
275
333
|
actionChain = this.insertFormAutoFillActions(actionChain, params.formAutoFill);
|
|
276
334
|
}
|
|
@@ -300,9 +358,32 @@ export class ScrapeWithActionsTool extends EventEmitter {
|
|
|
300
358
|
sessionContext.actionResults = chainResult.results;
|
|
301
359
|
sessionContext.screenshots = chainResult.screenshots || [];
|
|
302
360
|
|
|
303
|
-
//
|
|
361
|
+
// ── Recording mode — persist actions after successful execution ─────────
|
|
362
|
+
let savedRecordingPath;
|
|
363
|
+
if (params.record && params.recordingName) {
|
|
364
|
+
const sessionStartTime = sessionContext.startTime;
|
|
365
|
+
const recordedActions = actionChain.map((action, index) => {
|
|
366
|
+
// Use actual result timing if available, otherwise estimate from index
|
|
367
|
+
const result = (chainResult.results || [])[index];
|
|
368
|
+
const tMs = result?.timestamp
|
|
369
|
+
? result.timestamp - sessionStartTime
|
|
370
|
+
: index * 100; // fallback estimate
|
|
371
|
+
return buildRecordedEntry(action, tMs);
|
|
372
|
+
});
|
|
373
|
+
|
|
374
|
+
try {
|
|
375
|
+
savedRecordingPath = await saveRecording(params.recordingName, recordedActions, {
|
|
376
|
+
originalUrl: params.url
|
|
377
|
+
});
|
|
378
|
+
this.log('info', `Recording saved: ${savedRecordingPath}`);
|
|
379
|
+
} catch (err) {
|
|
380
|
+
this.log('warn', `Failed to save recording: ${err.message}`);
|
|
381
|
+
}
|
|
382
|
+
}
|
|
383
|
+
|
|
384
|
+
// Process action results
|
|
304
385
|
const actionResults = this.processActionResults(chainResult.results);
|
|
305
|
-
const intermediateStates = params.captureIntermediateStates ?
|
|
386
|
+
const intermediateStates = params.captureIntermediateStates ?
|
|
306
387
|
await this.extractIntermediateStates(actionResults, params) : [];
|
|
307
388
|
|
|
308
389
|
// Get final page content after all actions
|
|
@@ -331,15 +412,20 @@ export class ScrapeWithActionsTool extends EventEmitter {
|
|
|
331
412
|
successfulActions: actionResults.filter(r => r.success).length,
|
|
332
413
|
failedActions: actionResults.filter(r => !r.success).length,
|
|
333
414
|
actionsExecuted: actionResults.length, // Total executed (for validation)
|
|
334
|
-
|
|
415
|
+
|
|
335
416
|
content,
|
|
336
|
-
|
|
417
|
+
|
|
337
418
|
intermediateStates: params.captureIntermediateStates ? intermediateStates : undefined,
|
|
338
419
|
screenshots: params.captureScreenshots ? sessionContext.screenshots : undefined,
|
|
339
|
-
|
|
420
|
+
|
|
340
421
|
// Form auto-fill flag (for tests/validation)
|
|
341
422
|
formAutoFillApplied: !!params.formAutoFill,
|
|
342
|
-
|
|
423
|
+
|
|
424
|
+
// Recording fields
|
|
425
|
+
recordingSaved: params.record ? !!savedRecordingPath : undefined,
|
|
426
|
+
recordingPath: savedRecordingPath || undefined,
|
|
427
|
+
replayedFrom: params.replayRecording || undefined,
|
|
428
|
+
|
|
343
429
|
metadata: {
|
|
344
430
|
browserOptions,
|
|
345
431
|
formAutoFillApplied: !!params.formAutoFill,
|
|
@@ -348,10 +434,10 @@ export class ScrapeWithActionsTool extends EventEmitter {
|
|
|
348
434
|
finalUrl: chainResult.metadata?.finalUrl,
|
|
349
435
|
timestamp: Date.now()
|
|
350
436
|
},
|
|
351
|
-
|
|
437
|
+
|
|
352
438
|
stats: {
|
|
353
439
|
sessionTime: executionTime,
|
|
354
|
-
averageActionTime: actionResults.length > 0 ?
|
|
440
|
+
averageActionTime: actionResults.length > 0 ?
|
|
355
441
|
actionResults.reduce((sum, r) => sum + (r.executionTime || 0), 0) / actionResults.length : 0,
|
|
356
442
|
errorRecoveryCount: actionResults.filter(r => r.recovered).length
|
|
357
443
|
}
|
|
@@ -0,0 +1,328 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* batchScrape — entry-point (index.js).
|
|
3
|
+
*
|
|
4
|
+
* Preserves the same exports as the original BatchScrapeTool.js:
|
|
5
|
+
* export class BatchScrapeTool
|
|
6
|
+
* export default BatchScrapeTool
|
|
7
|
+
*
|
|
8
|
+
* Heavy work is delegated to:
|
|
9
|
+
* schema.js — Zod input schema
|
|
10
|
+
* worker.js — per-URL fetch + content extraction
|
|
11
|
+
* queue.js — Semaphore concurrency runner
|
|
12
|
+
* reporter.js — webhook notification helper
|
|
13
|
+
*
|
|
14
|
+
* Reuses JobManager and WebhookDispatcher from src/core/ (no embedded copies).
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
import { EventEmitter } from 'events';
|
|
18
|
+
import JobManager from '../../../core/JobManager.js';
|
|
19
|
+
import WebhookDispatcher from '../../../core/WebhookDispatcher.js';
|
|
20
|
+
import { BatchScrapeSchema } from './schema.js';
|
|
21
|
+
import { scrapeUrlsBatch, processResults, paginateResults } from './queue.js';
|
|
22
|
+
import { sendWebhookNotification } from './reporter.js';
|
|
23
|
+
|
|
24
|
+
/**
 * Scrapes a list of URLs either inline ("sync" mode) or as a background job
 * ("async" mode) and keeps running statistics about the batches it handled.
 *
 * Events emitted: `batchCompleted` (with the batch result), `batchJobCreated`
 * (with the created job) and `destroyed`.
 */
export class BatchScrapeTool extends EventEmitter {
  /**
   * @param {Object} [options]
   * @param {Object|null} [options.jobManager] - Job manager to use; a new JobManager is created when omitted.
   * @param {Object|null} [options.webhookDispatcher] - Dispatcher to use; a new WebhookDispatcher is created when omitted.
   * @param {boolean} [options.enableJobPersistence=true] - Forwarded as `enablePersistence` to the managers created here.
   * @param {boolean} [options.enableWebhookNotifications=true] - When false, webhooks are neither registered nor notified.
   * @param {number} [options.defaultTimeout=15000] - Timeout applied to URL configs that do not set their own.
   * @param {number} [options.maxBatchSize=50] - Stored on the instance; not read anywhere else in this class.
   * @param {boolean} [options.enableResultCaching=true] - When true, processed results are kept in memory for one hour.
   * @param {boolean} [options.enableLogging=true] - When false, `_log` is a no-op.
   */
  constructor(options = {}) {
    super();

    const {
      jobManager = null,
      webhookDispatcher = null,
      enableJobPersistence = true,
      enableWebhookNotifications = true,
      defaultTimeout = 15000,
      maxBatchSize = 50,
      enableResultCaching = true,
      enableLogging = true
    } = options;

    this.jobManager = jobManager || new JobManager({
      enablePersistence: enableJobPersistence,
      defaultTtl: 24 * 60 * 60 * 1000
    });

    this.webhookDispatcher = webhookDispatcher || new WebhookDispatcher({
      enablePersistence: enableJobPersistence
    });

    this.defaultTimeout = defaultTimeout;
    this.maxBatchSize = maxBatchSize;
    this.enableResultCaching = enableResultCaching;
    this.enableLogging = enableLogging;
    this.enableWebhookNotifications = enableWebhookNotifications;

    // batchId -> progress record for sync batches currently running.
    this.activeBatches = new Map();
    // batchId -> { results, timestamp, ttl } for finished batches.
    this.batchResults = new Map();

    this.stats = {
      totalBatches: 0,
      completedBatches: 0,
      failedBatches: 0,
      totalUrls: 0,
      successfulUrls: 0,
      failedUrls: 0,
      averageBatchTime: 0,
      lastUpdated: Date.now()
    };

    this._initializeJobExecutors();
  }

  /**
   * Validate the request and run it in the requested mode.
   *
   * @param {Object} params - Raw input, validated against BatchScrapeSchema.
   * @returns {Promise<Object>} The sync batch result, or the queued-job descriptor in async mode.
   * @throws {Error} "Batch scrape failed: …" wrapping any validation or processing error (original kept as `cause`).
   */
  async execute(params) {
    // Once control reaches a _processBatch* method, that method records the
    // failure itself; counting again here would double-count failedBatches.
    let handedOff = false;

    try {
      const validated = BatchScrapeSchema.parse(params);
      this.stats.totalBatches++;
      const batchId = this._generateBatchId();
      const startTime = Date.now();

      this._log('info', `Starting batch scrape ${batchId} with ${validated.urls.length} URLs in ${validated.mode} mode`);

      const urlConfigs = this._normalizeUrlConfigs(validated.urls, validated);

      let webhookConfig = null;
      if (validated.webhook && this.enableWebhookNotifications) {
        webhookConfig = this._registerWebhook(validated.webhook, batchId);
      }

      handedOff = true;
      if (validated.mode === 'sync') {
        return await this._processBatchSync(batchId, urlConfigs, validated, webhookConfig, startTime);
      }
      return await this._processBatchAsync(batchId, urlConfigs, validated, webhookConfig, startTime);
    } catch (error) {
      if (!handedOff) {
        this.stats.failedBatches++;
      }
      this._log('error', `Batch scrape failed: ${error.message}`);
      throw new Error(`Batch scrape failed: ${error.message}`, { cause: error });
    }
  }

  /**
   * Scrape every URL before returning and respond with the first page of results.
   *
   * @returns {Promise<Object>} Batch result including counts, first page and pagination info.
   * @throws Rethrows any processing error after recording the failure and sending `batch_failed`.
   */
  async _processBatchSync(batchId, urlConfigs, validated, webhookConfig, startTime) {
    try {
      this.activeBatches.set(batchId, { id: batchId, mode: 'sync', startTime, total: urlConfigs.length, completed: 0 });

      const rawResults = await scrapeUrlsBatch(urlConfigs, validated, this.defaultTimeout);
      const processedResults = processResults(rawResults, validated);
      const executionTime = Date.now() - startTime;

      const batchResult = {
        batchId, mode: 'sync', success: true, executionTime,
        totalUrls: urlConfigs.length,
        successfulUrls: processedResults.filter(r => r.success).length,
        failedUrls: processedResults.filter(r => !r.success).length,
        results: paginateResults(processedResults, 0, validated.pageSize),
        pagination: {
          page: 1, pageSize: validated.pageSize,
          totalResults: processedResults.length,
          totalPages: Math.ceil(processedResults.length / validated.pageSize)
        },
        formats: validated.formats,
        metadata: { concurrency: validated.maxConcurrency, timestamp: Date.now() }
      };

      if (this.enableResultCaching) {
        this.batchResults.set(batchId, { results: processedResults, timestamp: Date.now(), ttl: 3600000 });
      }

      this.stats.completedBatches++;
      this.stats.totalUrls += urlConfigs.length;
      this.stats.successfulUrls += batchResult.successfulUrls;
      this.stats.failedUrls += batchResult.failedUrls;
      // Must run after completedBatches is incremented: the running average
      // divides by that counter (same order as the async executor).
      this._updateAverageBatchTime(executionTime);
      this.stats.lastUpdated = Date.now();
      this.activeBatches.delete(batchId);

      await sendWebhookNotification('batch_completed', batchResult, webhookConfig, this.webhookDispatcher, this.enableWebhookNotifications);
      this.emit('batchCompleted', batchResult);
      return batchResult;
    } catch (error) {
      this.stats.failedBatches++;
      this.activeBatches.delete(batchId);
      await sendWebhookNotification('batch_failed', { batchId, error: error.message }, webhookConfig, this.webhookDispatcher, this.enableWebhookNotifications);
      throw error;
    }
  }

  /**
   * Create a `batch_scrape` job, start it without waiting, and return a descriptor
   * the caller can use to poll for status.
   *
   * @returns {Promise<Object>} Queued-job descriptor (batchId, jobId, status, estimate, webhook summary).
   * @throws Rethrows job-creation errors after recording the failure.
   */
  async _processBatchAsync(batchId, urlConfigs, validated, webhookConfig, startTime) {
    try {
      const jobData = { batchId, urlConfigs, validated, webhookConfig, startTime };
      const jobOptions = {
        ...validated.jobOptions,
        webhooks: webhookConfig ? [webhookConfig] : [],
        tags: ['batch_scrape', batchId, ...(validated.jobOptions?.tags || [])],
        metadata: { batchId, urlCount: urlConfigs.length, formats: validated.formats }
      };

      const job = await this.jobManager.createJob('batch_scrape', jobData, jobOptions);

      // Deliberately not awaited: the job runs in the background and its
      // failure is only logged here.
      this.jobManager.executeJob(job.id).catch(err => {
        this._log('error', `Async batch job ${job.id} failed: ${err.message}`);
      });

      this.emit('batchJobCreated', job);

      return {
        batchId, mode: 'async', jobId: job.id, status: 'queued',
        totalUrls: urlConfigs.length, createdAt: job.createdAt,
        // Rough estimate: 2 seconds per URL from job creation.
        estimatedCompletion: new Date(job.createdAt + (urlConfigs.length * 2000)),
        statusCheckUrl: `batch_scrape_status?jobId=${job.id}`,
        webhook: webhookConfig ? { url: webhookConfig.url, events: webhookConfig.events } : null
      };
    } catch (error) {
      this.stats.failedBatches++;
      throw error;
    }
  }

  /**
   * Return one page of cached results, or a progress snapshot for a batch still running.
   *
   * @param {string} batchId
   * @param {number} [page=1] - 1-based page number.
   * @param {number} [pageSize=25]
   * @returns {Promise<Object>}
   * @throws {Error} When the batch is neither cached (and unexpired) nor active.
   */
  async getBatchResults(batchId, page = 1, pageSize = 25) {
    const cached = this.batchResults.get(batchId);
    if (cached) {
      if (Date.now() - cached.timestamp < cached.ttl) {
        const offset = (page - 1) * pageSize;
        return {
          batchId, success: true,
          results: paginateResults(cached.results, offset, pageSize),
          pagination: { page, pageSize, totalResults: cached.results.length, totalPages: Math.ceil(cached.results.length / pageSize) },
          cached: true, timestamp: cached.timestamp
        };
      }
      // Entry outlived its ttl: drop it so the cache does not grow without bound.
      this.batchResults.delete(batchId);
    }

    const active = this.activeBatches.get(batchId);
    if (active) {
      return {
        batchId, status: 'in_progress', mode: active.mode,
        progress: { completed: active.completed, total: active.total, percentage: Math.round((active.completed / active.total) * 100) },
        startTime: active.startTime, runningTime: Date.now() - active.startTime
      };
    }

    throw new Error(`Batch ${batchId} not found`);
  }

  /**
   * Summarize a job held by the job manager.
   *
   * @param {string} jobId
   * @returns {Promise<Object>} Status summary; includes `results` once the job completed with a result.
   * @throws {Error} When the job manager does not know the job.
   */
  async getJobStatus(jobId) {
    const job = this.jobManager.getJob(jobId);
    if (!job) throw new Error(`Job ${jobId} not found`);
    const status = { jobId, batchId: job.metadata?.batchId, status: job.status, progress: job.progress, createdAt: job.createdAt, startedAt: job.startedAt, completedAt: job.completedAt, error: job.error, metadata: job.metadata };
    if (job.status === 'completed' && job.result) status.results = job.result;
    return status;
  }

  /**
   * Cancel a batch by id.
   *
   * For an active sync batch this only removes the tracking record; nothing in
   * this method interrupts scraping that is already in flight. Otherwise the
   * first job tagged with the batch id is cancelled through the job manager.
   *
   * @param {string} batchId
   * @returns {Promise<{success: boolean, message: string}>}
   * @throws {Error} When no active batch or tagged job exists.
   */
  async cancelBatch(batchId) {
    if (this.activeBatches.has(batchId)) {
      this.activeBatches.delete(batchId);
      return { success: true, message: `Active batch ${batchId} cancelled` };
    }
    const jobs = this.jobManager.getJobsByTag(batchId);
    if (jobs.length > 0) {
      const job = jobs[0];
      await this.jobManager.cancelJob(job.id);
      return { success: true, message: `Job ${job.id} for batch ${batchId} cancelled` };
    }
    throw new Error(`Batch ${batchId} not found or already completed`);
  }

  /**
   * @returns {Object} Copy of the counters plus map sizes and the collaborators' own stats.
   */
  getStats() {
    return {
      ...this.stats,
      activeBatches: this.activeBatches.size,
      cachedResults: this.batchResults.size,
      jobManagerStats: this.jobManager ? this.jobManager.getStats() : null,
      webhookStats: this.webhookDispatcher ? this.webhookDispatcher.getStats() : null
    };
  }

  /**
   * Cancel active batches, clear in-memory state, destroy the collaborators and
   * announce `destroyed` before detaching every listener.
   */
  async destroy() {
    // Snapshot the ids: cancelBatch removes entries from the map as it goes.
    for (const batchId of [...this.activeBatches.keys()]) {
      try {
        await this.cancelBatch(batchId);
      } catch (e) {
        this._log('warn', `Failed to cancel batch ${batchId}: ${e.message}`);
      }
    }
    this.activeBatches.clear();
    this.batchResults.clear();
    this.jobManager?.destroy();
    this.webhookDispatcher?.destroy();
    // Emit first; once listeners are removed nobody could observe the event.
    this.emit('destroyed');
    this.removeAllListeners();
  }

  // ── Private helpers ──────────────────────────────────────────────────────────

  /** Build a batch id from the current time and a random base-36 suffix (not cryptographically secure). */
  _generateBatchId() {
    return `batch_${Date.now()}_${Math.random().toString(36).slice(2, 11)}`;
  }

  /**
   * Turn the mixed string/object `urls` input into uniform config objects.
   * Per-URL selectors override keys from the global `extractionSchema`.
   */
  _normalizeUrlConfigs(urls, globalOptions) {
    return urls.map(url => {
      if (typeof url === 'string') {
        return { url, selectors: globalOptions.extractionSchema || {}, headers: {}, timeout: this.defaultTimeout };
      }
      return {
        ...url,
        selectors: { ...globalOptions.extractionSchema, ...(url.selectors || {}) },
        headers: url.headers || {},
        timeout: url.timeout || this.defaultTimeout
      };
    });
  }

  /** Register the webhook with the dispatcher, tagging it with the batch id and registration time. */
  _registerWebhook(webhookConfig, batchId) {
    const config = { ...webhookConfig, metadata: { batchId, registeredAt: Date.now() } };
    return this.webhookDispatcher.registerWebhook(webhookConfig.url, config);
  }

  /**
   * Fold one batch duration into the running mean.
   * Expects `stats.completedBatches` to already include the batch being added.
   */
  _updateAverageBatchTime(batchTime) {
    const n = this.stats.completedBatches;
    this.stats.averageBatchTime = n === 1 ? batchTime : ((this.stats.averageBatchTime * (n - 1)) + batchTime) / n;
  }

  /** Write a prefixed message via console.log when logging is enabled. */
  _log(level, message) {
    if (this.enableLogging) console.log(`[BatchScrapeTool:${level.toUpperCase()}] ${message}`);
  }

  /**
   * Register the `batch_scrape` executor with the job manager. The executor
   * scrapes in chunks of `maxConcurrency` URLs so progress can be reported
   * after every chunk, then records stats and notifies like the sync path.
   */
  _initializeJobExecutors() {
    this.jobManager.registerExecutor('batch_scrape', async (job) => {
      const { batchId, urlConfigs, validated, webhookConfig, startTime } = job.data;
      try {
        await this.jobManager.updateJobProgress(job.id, 0, 'Starting batch scrape');

        const results = [];
        const total = urlConfigs.length;

        for (let i = 0; i < total; i += validated.maxConcurrency) {
          const batch = urlConfigs.slice(i, i + validated.maxConcurrency);
          results.push(...await scrapeUrlsBatch(batch, validated, this.defaultTimeout));
          const progress = Math.round(((i + batch.length) / total) * 100);
          await this.jobManager.updateJobProgress(job.id, progress, `Processed ${i + batch.length}/${total} URLs`);
        }

        const processedResults = processResults(results, validated);
        const executionTime = Date.now() - startTime;

        const batchResult = {
          batchId, mode: 'async', success: true, executionTime,
          totalUrls: urlConfigs.length,
          successfulUrls: processedResults.filter(r => r.success).length,
          failedUrls: processedResults.filter(r => !r.success).length,
          results: processedResults, formats: validated.formats,
          metadata: { concurrency: validated.maxConcurrency, timestamp: Date.now(), jobId: job.id }
        };

        if (this.enableResultCaching) {
          this.batchResults.set(batchId, { results: processedResults, timestamp: Date.now(), ttl: 3600000 });
        }

        this.stats.completedBatches++;
        this.stats.totalUrls += urlConfigs.length;
        this.stats.successfulUrls += batchResult.successfulUrls;
        this.stats.failedUrls += batchResult.failedUrls;
        this._updateAverageBatchTime(executionTime);
        this.stats.lastUpdated = Date.now();

        await sendWebhookNotification('batch_completed', batchResult, webhookConfig, this.webhookDispatcher, this.enableWebhookNotifications);
        this.emit('batchCompleted', batchResult);
        return batchResult;
      } catch (error) {
        this.stats.failedBatches++;
        await sendWebhookNotification('batch_failed', { batchId, error: error.message }, webhookConfig, this.webhookDispatcher, this.enableWebhookNotifications);
        throw error;
      }
    });
  }
}
|
|
327
|
+
|
|
328
|
+
export default BatchScrapeTool;
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* batchScrape — queue module.
|
|
3
|
+
* Semaphore-based concurrency runner that dispatches work to worker.js.
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import { scrapeUrl } from './worker.js';
|
|
7
|
+
|
|
8
|
+
/**
 * Counting semaphore that runs queued tasks with at most `max` in flight.
 *
 * Tasks are started in FIFO order. A permit is always handed back when a task
 * finishes, whether it resolved, rejected, or threw before returning a promise.
 */
class Semaphore {
  /**
   * @param {number} max - Maximum number of tasks allowed to run at once.
   */
  constructor(max) {
    this.max = max;
    this.current = 0;
    this.queue = [];
  }

  /**
   * Queue a task and run it as soon as a permit is free.
   *
   * @param {Function} task - Zero-argument function; may return a value or a promise.
   * @returns {Promise<*>} Settles with the task's outcome.
   */
  acquire(task) {
    return new Promise((resolve, reject) => {
      this.queue.push({ task, resolve, reject });
      this._tryNext();
    });
  }

  /** Start the next queued task if a permit is available. */
  _tryNext() {
    if (this.current >= this.max || this.queue.length === 0) return;
    this.current++;
    const { task, resolve, reject } = this.queue.shift();
    // Invoke the task inside a promise executor: a synchronous throw or a
    // non-promise return value is then routed through the same settle/release
    // path instead of escaping with the permit still held.
    new Promise((settle) => settle(task()))
      .then(resolve, reject)
      .finally(() => {
        this.current--;
        this._tryNext();
      });
  }
}
|
|
36
|
+
|
|
37
|
+
/**
 * Run `scrapeUrl` for every config, never exceeding `options.maxConcurrency`
 * simultaneous scrapes. Every task except the one at index 0 first waits
 * `options.delayBetweenRequests` ms once it has been started.
 *
 * The returned array lines up index-for-index with `urlConfigs`; a scrape that
 * rejected is represented by a `{ success: false, url, error, timestamp }` entry.
 *
 * @param {Array} urlConfigs
 * @param {Object} options — { maxConcurrency, delayBetweenRequests, formats, extractionSchema, ... }
 * @param {number} defaultTimeout
 * @returns {Promise<Array>} raw results array
 */
export async function scrapeUrlsBatch(urlConfigs, options, defaultTimeout) {
  const limiter = new Semaphore(options.maxConcurrency);

  // Build the unit of work for one config; `position` is its index in urlConfigs.
  const makeJob = (config, position) => async () => {
    const shouldPause = options.delayBetweenRequests > 0 && position > 0;
    if (shouldPause) {
      await new Promise((wake) => setTimeout(wake, options.delayBetweenRequests));
    }
    return scrapeUrl(config, options, defaultTimeout);
  };

  const pending = urlConfigs.map((config, position) => limiter.acquire(makeJob(config, position)));
  const outcomes = await Promise.allSettled(pending);

  return outcomes.map((outcome, position) => {
    if (outcome.status !== 'fulfilled') {
      return {
        success: false,
        url: urlConfigs[position].url,
        error: outcome.reason?.message || 'Unknown error',
        timestamp: Date.now()
      };
    }
    return outcome.value;
  });
}
|
|
68
|
+
|
|
69
|
+
/**
 * Post-process raw scrape results.
 *
 * Unsuccessful entries are dropped unless `options.includeFailed` is truthy,
 * and when `options.includeMetadata` is truthy each remaining entry is copied
 * with a `processingMetadata` object attached. The input array is not modified.
 *
 * @param {Array<Object>} results - Raw results; each entry carries a `success` flag.
 * @param {Object} options - { includeFailed, includeMetadata, formats, extractionSchema }
 * @returns {Array<Object>} New array of (possibly enriched) results.
 */
export function processResults(results, options) {
  const snapshot = [...results];
  const kept = options.includeFailed ? snapshot : snapshot.filter((entry) => entry.success);

  if (!options.includeMetadata) {
    return kept;
  }

  return kept.map((entry) => {
    const processingMetadata = {
      formats: options.formats,
      extractionApplied: !!options.extractionSchema,
      processedAt: Date.now()
    };
    return { ...entry, processingMetadata };
  });
}
|
|
87
|
+
|
|
88
|
+
/**
 * Pick out one page of results.
 *
 * @param {Array} results - Full result list.
 * @param {number} offset - Index of the first item to include.
 * @param {number} limit - Maximum number of items to return.
 * @returns {Array} New array holding up to `limit` items starting at `offset`.
 */
export function paginateResults(results, offset, limit) {
  const end = offset + limit;
  return results.slice(offset, end);
}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* batchScrape — reporter module.
|
|
3
|
+
* Webhook dispatching helper (thin wrapper around WebhookDispatcher).
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
/**
 * Best-effort delivery of a batch event through the webhook dispatcher.
 *
 * Does nothing unless notifications are enabled and both a webhook config and
 * a dispatcher are supplied. Dispatch failures are logged with console.warn
 * and never propagate to the caller.
 *
 * @param {string} event - Event name passed to the dispatcher.
 * @param {Object} data - Event payload; its `batchId` is copied into the dispatch metadata.
 * @param {Object} webhookConfig - Its `url` is the only delivery target.
 * @param {Object} webhookDispatcher - Object exposing `dispatch(event, data, options)`.
 * @param {boolean} enabled - Master switch for notifications.
 * @returns {Promise<void>}
 */
export async function sendWebhookNotification(event, data, webhookConfig, webhookDispatcher, enabled) {
  const ready = enabled && webhookConfig && webhookDispatcher;
  if (!ready) {
    return;
  }

  try {
    const delivery = {
      urls: [webhookConfig.url],
      immediate: false,
      metadata: { batchId: data.batchId, timestamp: Date.now() }
    };
    await webhookDispatcher.dispatch(event, data, delivery);
  } catch (failure) {
    console.warn(`[batchScrape] Webhook notification failed: ${failure.message}`);
  }
}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* batchScrape — schema module.
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import { z } from 'zod';
|
|
6
|
+
|
|
7
|
+
// Field validators for a per-URL configuration object. Only `url` is mandatory.
const urlConfigShape = {
  url: z.string().url(),
  // String-to-string maps; both may be omitted.
  selectors: z.record(z.string()).optional(),
  headers: z.record(z.string()).optional(),
  // Optional per-URL timeout, bounded to 1000–30000.
  timeout: z.number().min(1000).max(30000).optional(),
  // Free-form record with values of any type.
  metadata: z.record(z.any()).optional()
};

/** Object form of a batch entry, accepted by BatchScrapeSchema in place of a bare URL string. */
export const UrlConfigSchema = z.object(urlConfigShape);
|
|
14
|
+
|
|
15
|
+
// Optional webhook target; `events` falls back to both batch lifecycle events.
const batchWebhookSchema = z.object({
  url: z.string().url(),
  events: z.array(z.string()).optional().default(['batch_completed', 'batch_failed']),
  headers: z.record(z.string()).optional(),
  signingSecret: z.string().optional()
});

// Optional job settings, spread into the job options when a batch runs in async mode.
const batchJobOptionsSchema = z.object({
  priority: z.number().default(0),
  // At least 60000; defaults to 24 hours expressed in milliseconds.
  ttl: z.number().min(60000).default(24 * 60 * 60 * 1000),
  maxRetries: z.number().min(0).max(5).default(1),
  tags: z.array(z.string()).default([])
});

/**
 * Input schema for a batch scrape request: 1–50 entries, each either a URL
 * string or a UrlConfigSchema object, plus output, concurrency and paging options.
 */
export const BatchScrapeSchema = z.object({
  urls: z.array(z.union([z.string().url(), UrlConfigSchema])).min(1).max(50),
  formats: z.array(z.enum(['markdown', 'html', 'json', 'text'])).default(['json']),
  mode: z.enum(['sync', 'async']).default('sync'),
  webhook: batchWebhookSchema.optional(),
  extractionSchema: z.record(z.string()).optional(),
  maxConcurrency: z.number().min(1).max(20).default(10),
  delayBetweenRequests: z.number().min(0).max(10000).default(100),
  includeMetadata: z.boolean().default(true),
  includeFailed: z.boolean().default(true),
  pageSize: z.number().min(1).max(100).default(25),
  jobOptions: batchJobOptionsSchema.optional()
});
|