crawlforge-mcp-server 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +315 -0
- package/LICENSE +21 -0
- package/README.md +181 -0
- package/package.json +115 -0
- package/server.js +1963 -0
- package/setup.js +112 -0
- package/src/constants/config.js +615 -0
- package/src/core/ActionExecutor.js +1104 -0
- package/src/core/AlertNotificationSystem.js +601 -0
- package/src/core/AuthManager.js +315 -0
- package/src/core/ChangeTracker.js +2306 -0
- package/src/core/JobManager.js +687 -0
- package/src/core/LLMsTxtAnalyzer.js +753 -0
- package/src/core/LocalizationManager.js +1615 -0
- package/src/core/PerformanceManager.js +828 -0
- package/src/core/ResearchOrchestrator.js +1327 -0
- package/src/core/SnapshotManager.js +1037 -0
- package/src/core/StealthBrowserManager.js +1795 -0
- package/src/core/WebhookDispatcher.js +745 -0
- package/src/core/analysis/ContentAnalyzer.js +749 -0
- package/src/core/analysis/LinkAnalyzer.js +972 -0
- package/src/core/cache/CacheManager.js +821 -0
- package/src/core/connections/ConnectionPool.js +553 -0
- package/src/core/crawlers/BFSCrawler.js +845 -0
- package/src/core/integrations/PerformanceIntegration.js +377 -0
- package/src/core/llm/AnthropicProvider.js +135 -0
- package/src/core/llm/LLMManager.js +415 -0
- package/src/core/llm/LLMProvider.js +97 -0
- package/src/core/llm/OpenAIProvider.js +127 -0
- package/src/core/processing/BrowserProcessor.js +986 -0
- package/src/core/processing/ContentProcessor.js +505 -0
- package/src/core/processing/PDFProcessor.js +448 -0
- package/src/core/processing/StreamProcessor.js +673 -0
- package/src/core/queue/QueueManager.js +98 -0
- package/src/core/workers/WorkerPool.js +585 -0
- package/src/core/workers/worker.js +743 -0
- package/src/monitoring/healthCheck.js +600 -0
- package/src/monitoring/metrics.js +761 -0
- package/src/optimization/wave3-optimizations.js +932 -0
- package/src/security/security-patches.js +120 -0
- package/src/security/security-tests.js +355 -0
- package/src/security/wave3-security.js +652 -0
- package/src/tools/advanced/BatchScrapeTool.js +1089 -0
- package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
- package/src/tools/crawl/crawlDeep.js +449 -0
- package/src/tools/crawl/mapSite.js +400 -0
- package/src/tools/extract/analyzeContent.js +624 -0
- package/src/tools/extract/extractContent.js +329 -0
- package/src/tools/extract/processDocument.js +503 -0
- package/src/tools/extract/summarizeContent.js +376 -0
- package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
- package/src/tools/research/deepResearch.js +706 -0
- package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
- package/src/tools/search/adapters/googleSearch.js +236 -0
- package/src/tools/search/adapters/searchProviderFactory.js +96 -0
- package/src/tools/search/queryExpander.js +543 -0
- package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
- package/src/tools/search/ranking/ResultRanker.js +497 -0
- package/src/tools/search/searchWeb.js +482 -0
- package/src/tools/tracking/trackChanges.js +1355 -0
- package/src/utils/CircuitBreaker.js +515 -0
- package/src/utils/ErrorHandlingConfig.js +342 -0
- package/src/utils/HumanBehaviorSimulator.js +569 -0
- package/src/utils/Logger.js +568 -0
- package/src/utils/MemoryMonitor.js +173 -0
- package/src/utils/RetryManager.js +386 -0
- package/src/utils/contentUtils.js +588 -0
- package/src/utils/domainFilter.js +612 -0
- package/src/utils/inputValidation.js +766 -0
- package/src/utils/rateLimiter.js +196 -0
- package/src/utils/robotsChecker.js +91 -0
- package/src/utils/securityMiddleware.js +416 -0
- package/src/utils/sitemapParser.js +678 -0
- package/src/utils/ssrfProtection.js +640 -0
- package/src/utils/urlNormalizer.js +168 -0
|
@@ -0,0 +1,669 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* ScrapeWithActionsTool - Execute action chains before scraping with result collection
|
|
3
|
+
* Features: action chains, form interactions, intermediate state capture, error recovery
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import { z } from 'zod';
|
|
7
|
+
import { EventEmitter } from 'events';
|
|
8
|
+
import ActionExecutor from '../../core/ActionExecutor.js';
|
|
9
|
+
import { load } from 'cheerio';
|
|
10
|
+
|
|
11
|
+
// Import existing tool for content extraction
|
|
12
|
+
import ExtractContentTool from '../extract/extractContent.js';
|
|
13
|
+
|
|
14
|
+
// Validation schemas for the individual browser actions accepted by this tool.
// They mirror the ActionExecutor action shapes and add tool-specific fields.

// Fields shared by every action, whatever its type.
const BaseActionSchema = z.object({
  type: z.string(),
  timeout: z.number().optional(),
  description: z.string().optional(),
  continueOnError: z.boolean().default(false),
  retries: z.number().min(0).max(5).default(0),
  // Capture content after this action
  captureAfter: z.boolean().default(false)
});

/**
 * Derive a concrete action schema from the base schema.
 * The `type` field is narrowed to a single literal and the action-specific
 * fields are appended after it.
 */
const defineAction = (type, fields) =>
  BaseActionSchema.extend({
    type: z.literal(type),
    ...fields
  });

const WaitActionSchema = defineAction('wait', {
  duration: z.number().min(0).max(30000).optional(),
  selector: z.string().optional(),
  condition: z.enum(['visible', 'hidden', 'enabled', 'disabled', 'stable']).optional(),
  text: z.string().optional()
});

const ClickActionSchema = defineAction('click', {
  selector: z.string(),
  button: z.enum(['left', 'right', 'middle']).default('left'),
  clickCount: z.number().min(1).max(3).default(1),
  delay: z.number().min(0).max(1000).default(0),
  force: z.boolean().default(false),
  position: z.object({
    x: z.number(),
    y: z.number()
  }).optional()
});

const TypeActionSchema = defineAction('type', {
  selector: z.string(),
  text: z.string(),
  delay: z.number().min(0).max(1000).default(0),
  clear: z.boolean().default(false)
});

const PressActionSchema = defineAction('press', {
  key: z.string(),
  modifiers: z.array(z.enum(['Alt', 'Control', 'Meta', 'Shift'])).default([]),
  selector: z.string().optional()
});

const ScrollActionSchema = defineAction('scroll', {
  selector: z.string().optional(),
  direction: z.enum(['up', 'down', 'left', 'right']).default('down'),
  distance: z.number().min(0).default(100),
  smooth: z.boolean().default(true),
  toElement: z.string().optional()
});

const ScreenshotActionSchema = defineAction('screenshot', {
  selector: z.string().optional(),
  fullPage: z.boolean().default(false),
  quality: z.number().min(0).max(100).default(80),
  format: z.enum(['png', 'jpeg']).default('png')
});

const ExecuteJavaScriptActionSchema = defineAction('executeJavaScript', {
  script: z.string(),
  args: z.array(z.any()).default([]),
  returnResult: z.boolean().default(true)
});

// Any single action: the union is tried in the order listed here.
const ActionSchema = z.union([
  WaitActionSchema,
  ClickActionSchema,
  TypeActionSchema,
  PressActionSchema,
  ScrollActionSchema,
  ScreenshotActionSchema,
  ExecuteJavaScriptActionSchema
]);
|
|
93
|
+
|
|
94
|
+
// Describes one form field for auto-fill: where it is, what to put in it,
// what kind of control it is, and how long to pause afterwards (milliseconds).
const FORM_FIELD_TYPES = ['text', 'select', 'checkbox', 'radio', 'file'];

const FormFieldSchema = z.object({
  selector: z.string(),
  value: z.string(),
  type: z.enum(FORM_FIELD_TYPES).default('text'),
  waitAfter: z.number().min(0).max(5000).default(100)
});
|
|
101
|
+
|
|
102
|
+
// Input schema for ScrapeWithActionsTool.execute(), composed from the
// sub-schemas below.

// Output formats a caller may request.
const OutputFormatSchema = z.enum(['markdown', 'html', 'json', 'text', 'screenshots']);

// Browser settings; the whole object is optional on the request.
const BrowserOptionsSchema = z.object({
  headless: z.boolean().default(true),
  userAgent: z.string().optional(),
  viewportWidth: z.number().min(800).max(1920).default(1280),
  viewportHeight: z.number().min(600).max(1080).default(720),
  timeout: z.number().min(10000).max(120000).default(30000)
});

// Content extraction settings; the whole object is optional on the request.
const ExtractionOptionsSchema = z.object({
  selectors: z.record(z.string()).optional(),
  includeMetadata: z.boolean().default(true),
  includeLinks: z.boolean().default(true),
  includeImages: z.boolean().default(true)
});

const ScrapeWithActionsSchema = z.object({
  // Target page and the 1-20 actions to run against it
  url: z.string().url(),
  actions: z.array(ActionSchema).min(1).max(20),

  // Requested output formats
  formats: z.array(OutputFormatSchema).default(['json']),

  // State capture between actions
  captureIntermediateStates: z.boolean().default(false),
  captureScreenshots: z.boolean().default(true),

  // Form auto-fill: string keys mapped to string values
  formAutoFill: z.record(z.string()).optional(),

  browserOptions: BrowserOptionsSchema.optional(),
  extractionOptions: ExtractionOptionsSchema.optional(),

  // Error handling
  continueOnActionError: z.boolean().default(false),
  maxRetries: z.number().min(0).max(3).default(1),
  screenshotOnError: z.boolean().default(true)
});
|
|
139
|
+
|
|
140
|
+
/**
 * Runs a chain of browser actions against a URL through an ActionExecutor and
 * then assembles the requested output formats from the extracted page content.
 *
 * Events emitted: `sessionStarted`, `sessionCompleted`, `sessionFailed`, `destroyed`.
 */
export class ScrapeWithActionsTool extends EventEmitter {
  /**
   * @param {object} [options]
   * @param {object|null} [options.actionExecutor] - Executor to use; a new ActionExecutor is created when omitted.
   * @param {object|null} [options.extractContentTool] - Extraction tool to use; a new ExtractContentTool is created when omitted.
   * @param {boolean} [options.enableLogging=true] - Whether to write log lines to the console.
   * @param {boolean} [options.enableCaching=false] - Whether successful results are stored in `sessionResults`.
   * @param {number} [options.maxConcurrentSessions=3] - Limit on simultaneously active sessions.
   * @param {object} [options.defaultBrowserOptions={}] - Browser options merged underneath per-request options.
   * @param {string} [options.screenshotPath='./screenshots'] - Passed to the ActionExecutor created here.
   */
  constructor(options = {}) {
    super();

    const {
      actionExecutor = null,
      extractContentTool = null,
      enableLogging = true,
      enableCaching = false,
      maxConcurrentSessions = 3,
      defaultBrowserOptions = {},
      screenshotPath = './screenshots'
    } = options;

    this.actionExecutor = actionExecutor || new ActionExecutor({
      enableLogging,
      enableScreenshotOnError: true,
      screenshotPath
    });

    this.extractContentTool = extractContentTool || new ExtractContentTool();
    this.enableLogging = enableLogging;
    this.enableCaching = enableCaching;
    this.maxConcurrentSessions = maxConcurrentSessions;
    this.defaultBrowserOptions = defaultBrowserOptions;

    // Active sessions tracking
    this.activeSessions = new Map();
    this.sessionResults = new Map();

    // Statistics
    this.stats = {
      totalSessions: 0,
      successfulSessions: 0,
      failedSessions: 0,
      totalActions: 0,
      successfulActions: 0,
      failedActions: 0,
      averageSessionTime: 0,
      averageActionsPerSession: 0,
      lastUpdated: Date.now()
    };
  }

  /**
   * Validate the request, run one scrape session and return its result.
   *
   * @param {object} params - Raw request; validated with ScrapeWithActionsSchema.
   * @returns {Promise<object>} The session result built by executeSession().
   * @throws {Error} "Scrape with actions failed: ..." for validation errors, a
   *   reached concurrency limit, or a failed session. The underlying error is
   *   attached as `cause`.
   */
  async execute(params) {
    try {
      const validated = ScrapeWithActionsSchema.parse(params);

      // Check the concurrency limit before touching any counters, so a
      // refused request is not recorded as a session.
      if (this.activeSessions.size >= this.maxConcurrentSessions) {
        throw new Error(`Maximum concurrent sessions (${this.maxConcurrentSessions}) reached`);
      }

      this.stats.totalSessions++;
      const sessionId = this.generateSessionId();
      const startTime = Date.now();

      if (this.enableLogging) {
        console.log(`Starting scrape session ${sessionId} with ${validated.actions.length} actions on ${validated.url}`);
      }

      // Create session context
      const sessionContext = {
        id: sessionId,
        url: validated.url,
        startTime,
        params: validated,
        states: [],
        screenshots: [],
        actionResults: [],
        errors: [],
        status: 'initializing'
      };

      this.activeSessions.set(sessionId, sessionContext);
      this.emit('sessionStarted', sessionContext);

      try {
        const result = await this.executeSession(sessionContext);

        this.stats.successfulSessions++;
        this.stats.totalActions += validated.actions.length;
        this.stats.successfulActions += result.actionResults.filter(r => r.success).length;
        this.stats.failedActions += result.actionResults.filter(r => !r.success).length;

        const executionTime = Date.now() - startTime;
        this.updateAverageSessionTime(executionTime);
        this.updateAverageActionsPerSession(validated.actions.length);
        this.updateStats();

        if (this.enableCaching) {
          // Drop entries past their ttl so the cache cannot grow without bound.
          this.pruneExpiredSessionResults();
          this.sessionResults.set(sessionId, {
            result,
            timestamp: Date.now(),
            ttl: 3600000
          });
        }

        this.activeSessions.delete(sessionId);
        this.emit('sessionCompleted', result);

        return result;

      } catch (error) {
        this.stats.failedSessions++;
        // The session-time average divides by successful + failed sessions,
        // so failed sessions must contribute a sample as well.
        this.updateAverageSessionTime(Date.now() - startTime);
        this.updateStats();
        this.activeSessions.delete(sessionId);
        this.emit('sessionFailed', { sessionId, url: validated.url, error });
        throw error;
      }

    } catch (error) {
      this.log('error', `Scrape with actions failed: ${error.message}`);
      // Keep the original error (and its stack) reachable through `cause`.
      throw new Error(`Scrape with actions failed: ${error.message}`, { cause: error });
    }
  }

  /**
   * Run the action chain for one session and build the result object.
   *
   * @param {object} sessionContext - Context created by execute(); its `status`,
   *   `actionResults` and `screenshots` fields are updated in place.
   * @returns {Promise<object>} Result with action outcomes, content per format,
   *   optional intermediate states and screenshots, metadata and timing stats.
   */
  async executeSession(sessionContext) {
    const { params } = sessionContext;
    sessionContext.status = 'running';

    // Per-request browser options override the tool-level defaults
    const browserOptions = {
      ...this.defaultBrowserOptions,
      ...params.browserOptions
    };

    // Build action chain with form auto-fill if provided
    let actionChain = [...params.actions];

    if (params.formAutoFill) {
      actionChain = this.insertFormAutoFillActions(actionChain, params.formAutoFill);
    }

    // Add capture actions if intermediate states requested
    if (params.captureIntermediateStates) {
      actionChain = this.insertCaptureActions(actionChain);
    }

    // Execute action chain
    const chainResult = await this.actionExecutor.executeActionChain(
      params.url,
      {
        actions: actionChain,
        continueOnError: params.continueOnActionError,
        timeout: browserOptions.timeout || 30000,
        retryChain: params.maxRetries,
        metadata: {
          sessionId: sessionContext.id,
          originalActionCount: params.actions.length,
          formAutoFill: !!params.formAutoFill
        }
      },
      browserOptions
    );

    const rawResults = chainResult.results ?? [];
    sessionContext.actionResults = rawResults;
    sessionContext.screenshots = chainResult.screenshots || [];

    // Process action results
    const actionResults = this.processActionResults(rawResults);
    const intermediateStates = params.captureIntermediateStates
      ? await this.extractIntermediateStates(actionResults, params)
      : [];

    // Get final page content after all actions
    const finalContent = await this.extractFinalContent(params);

    // Generate different formats
    const content = this.generateFormats(finalContent, params.formats, {
      actionResults,
      intermediateStates,
      screenshots: sessionContext.screenshots
    });

    const executionTime = Date.now() - sessionContext.startTime;

    return {
      success: chainResult.success,
      sessionId: sessionContext.id,
      url: params.url,
      executionTime,

      actionResults,
      totalActions: params.actions.length,
      successfulActions: actionResults.filter(r => r.success).length,
      failedActions: actionResults.filter(r => !r.success).length,
      actionsExecuted: actionResults.length, // Total executed (for validation)

      content,

      intermediateStates: params.captureIntermediateStates ? intermediateStates : undefined,
      screenshots: params.captureScreenshots ? sessionContext.screenshots : undefined,

      // Form auto-fill flag (for tests/validation)
      formAutoFillApplied: !!params.formAutoFill,

      metadata: {
        browserOptions,
        formAutoFillApplied: !!params.formAutoFill,
        intermediateStatesCount: intermediateStates.length,
        screenshotsCount: sessionContext.screenshots.length,
        finalUrl: chainResult.metadata?.finalUrl,
        timestamp: Date.now()
      },

      stats: {
        sessionTime: executionTime,
        averageActionTime: actionResults.length > 0
          ? actionResults.reduce((sum, r) => sum + (r.executionTime || 0), 0) / actionResults.length
          : 0,
        errorRecoveryCount: actionResults.filter(r => r.recovered).length
      }
    };
  }

  /**
   * Build `type` actions from a selector -> value map and splice them into the
   * action chain after any leading `wait` actions.
   *
   * The keys `submitSelector` and `waitAfterSubmit` are not treated as fields:
   * `submitSelector` adds a click on that selector followed by a wait, whose
   * duration is `waitAfterSubmit` parsed as a base-10 integer (2000 ms when it
   * is missing, unparsable or not positive).
   *
   * @param {object[]} actions - Original action chain (not mutated).
   * @param {Object<string, string>} formAutoFill - Selector/value pairs plus the special keys.
   * @returns {object[]} New action chain containing the fill actions.
   */
  insertFormAutoFillActions(actions, formAutoFill) {
    const fillActions = [];

    // Convert object with key-value pairs to fill actions
    for (const [selector, value] of Object.entries(formAutoFill)) {
      if (selector === 'submitSelector' || selector === 'waitAfterSubmit') {
        continue; // Skip special keys
      }

      fillActions.push({
        type: 'type',
        selector,
        text: value,
        description: `Auto-fill field: ${selector}`,
        continueOnError: true,
        retries: 1
      });
    }

    // Add submit action if specified
    if (formAutoFill.submitSelector) {
      fillActions.push({
        type: 'click',
        selector: formAutoFill.submitSelector,
        description: 'Auto-submit form',
        continueOnError: false,
        retries: 2
      });

      // Add wait after submit; fall back to 2000 ms for anything unusable
      const requestedWait = Number.parseInt(formAutoFill.waitAfterSubmit, 10);
      const waitTime = requestedWait > 0 ? requestedWait : 2000;
      fillActions.push({
        type: 'wait',
        duration: waitTime,
        description: 'Wait after form submission'
      });
    }

    // Fill actions go right after the leading waits. When every action is a
    // wait, that means the end of the chain rather than the front.
    const firstNonWait = actions.findIndex((action) => action.type !== 'wait');
    const insertIndex = firstNonWait === -1 ? actions.length : firstNonWait;

    return [
      ...actions.slice(0, insertIndex),
      ...fillActions,
      ...actions.slice(insertIndex)
    ];
  }

  /**
   * Return a copy of the chain in which an `executeJavaScript` state-capture
   * action follows every action that is a click/type/press or has `captureAfter` set.
   *
   * @param {object[]} actions - Action chain (not mutated).
   * @returns {object[]} Chain with capture actions interleaved.
   */
  insertCaptureActions(actions) {
    const modifiedActions = [];

    actions.forEach((action, index) => {
      modifiedActions.push(action);

      if (this.shouldCaptureAfterAction(action) || action.captureAfter) {
        modifiedActions.push({
          type: 'executeJavaScript',
          script: `return {url: window.location.href, title: document.title, html: document.documentElement.outerHTML, timestamp: Date.now(), capturePoint: ${index + 1}};`,
          description: `Capture state after action ${index + 1}`,
          returnResult: true,
          continueOnError: true
        });
      }
    });

    return modifiedActions;
  }

  /**
   * @param {object} action
   * @returns {boolean} True when the action type is click, type or press.
   */
  shouldCaptureAfterAction(action) {
    const captureAfterTypes = ['click', 'type', 'press'];
    return captureAfterTypes.includes(action.type);
  }

  /**
   * Project raw executor results onto the fields this tool reports.
   * For `executeJavaScript` results, `jsResult` exposes `result.result.result`.
   *
   * @param {object[]} [rawResults] - Results from the executor; a missing value is treated as empty.
   * @returns {object[]} Normalised action results.
   */
  processActionResults(rawResults) {
    return (rawResults ?? []).map(result => ({
      id: result.id,
      type: result.type,
      success: result.success,
      description: result.description,
      executionTime: result.executionTime,
      timestamp: result.timestamp,
      error: result.error,
      result: result.result,
      recovered: result.recovered,
      recoveryStrategy: result.recoveryStrategy,
      jsResult: result.type === 'executeJavaScript' && result.result ? result.result.result : undefined
    }));
  }

  /**
   * Build one state record per `executeJavaScript` result that carries captured HTML.
   * A state that fails to process is logged and skipped.
   *
   * @param {object[]} actionResults - Output of processActionResults().
   * @param {object} params - Validated request; `formats` and `extractionOptions.selectors` are read.
   * @returns {Promise<object[]>} Captured states in result order.
   */
  async extractIntermediateStates(actionResults, params) {
    const states = [];

    for (const result of actionResults) {
      if (result.type === 'executeJavaScript' && result.jsResult && result.jsResult.html) {
        try {
          const stateData = result.jsResult;
          const $ = load(stateData.html);

          const state = {
            capturePoint: stateData.capturePoint,
            url: stateData.url,
            title: stateData.title,
            timestamp: stateData.timestamp,
            content: {}
          };

          if (params.formats.includes('text')) {
            state.content.text = $('body').text().replace(/\s+/g, ' ').trim();
          }

          if (params.formats.includes('html')) {
            state.content.html = stateData.html;
          }

          if (params.formats.includes('json')) {
            state.content.json = {
              title: stateData.title,
              headings: this.extractHeadings($),
              links: this.extractLinks($)
            };
          }

          if (params.extractionOptions?.selectors) {
            state.content.extracted = this.extractWithSelectors($, params.extractionOptions.selectors);
          }

          states.push(state);
        } catch (error) {
          this.log('warn', `Failed to process intermediate state: ${error.message}`);
        }
      }
    }

    return states;
  }

  /**
   * Ask the extraction tool for the page content. On failure a warning is
   * logged and a `{ success: false, error, content: {}, metadata: {} }`
   * placeholder is returned instead of throwing.
   *
   * NOTE(review): extraction is keyed on the original `params.url`, not on the
   * page produced by the action chain — confirm that ExtractContentTool really
   * sees the post-action state.
   *
   * @param {object} params - Validated request.
   * @returns {Promise<object>} Extraction result or failure placeholder.
   */
  async extractFinalContent(params) {
    try {
      const extractResult = await this.extractContentTool.execute({
        url: params.url,
        options: {
          includeMetadata: params.extractionOptions?.includeMetadata !== false,
          includeLinks: params.extractionOptions?.includeLinks !== false,
          includeImages: params.extractionOptions?.includeImages !== false,
          customSelectors: params.extractionOptions?.selectors
        }
      });

      return extractResult;
    } catch (error) {
      this.log('warn', `Final content extraction failed: ${error.message}`);
      return {
        success: false,
        error: error.message,
        content: {},
        metadata: {}
      };
    }
  }

  /**
   * Assemble the `content` object with one key per requested format.
   *
   * @param {object} finalContent - Result of extractFinalContent().
   * @param {string[]} formats - Requested formats.
   * @param {object} additionalData - `{ actionResults, intermediateStates, screenshots }`.
   * @returns {object} Content keyed by format name.
   */
  generateFormats(finalContent, formats, additionalData) {
    const content = {};

    if (formats.includes('json')) {
      content.json = {
        finalContent: finalContent.content || {},
        metadata: finalContent.metadata || {},
        actionSummary: {
          totalActions: additionalData.actionResults.length,
          successfulActions: additionalData.actionResults.filter(r => r.success).length,
          failedActions: additionalData.actionResults.filter(r => !r.success).length,
          actions: additionalData.actionResults.map(r => ({
            type: r.type,
            success: r.success,
            description: r.description,
            executionTime: r.executionTime
          }))
        }
      };
    }

    if (formats.includes('html')) {
      content.html = finalContent.content?.html || '';
    }

    if (formats.includes('text')) {
      content.text = finalContent.content?.text || '';
    }

    if (formats.includes('markdown')) {
      content.markdown = finalContent.content?.markdown || 'Content not available in markdown format';
    }

    if (formats.includes('screenshots')) {
      content.screenshots = additionalData.screenshots || [];
    }

    return content;
  }

  /**
   * @param {Function} $ - Cheerio root for a loaded document.
   * @returns {{level: number, text: string, id: (string|null)}[]} All h1-h6 headings in document order.
   */
  extractHeadings($) {
    const headings = [];
    $('h1, h2, h3, h4, h5, h6').each((_, el) => {
      headings.push({
        // Tag name is "h<digit>"; the digit is the heading level
        level: Number.parseInt(el.name.substring(1), 10),
        text: $(el).text().trim(),
        id: $(el).attr('id') || null
      });
    });
    return headings;
  }

  /**
   * @param {Function} $ - Cheerio root for a loaded document.
   * @returns {{href: string, text: string, title: (string|null)}[]} Anchors that have both an href and visible text.
   */
  extractLinks($) {
    const links = [];
    $('a[href]').each((_, el) => {
      const href = $(el).attr('href');
      const text = $(el).text().trim();

      if (href && text) {
        links.push({
          href,
          text,
          title: $(el).attr('title') || null
        });
      }
    });
    return links;
  }

  /**
   * Evaluate each named selector against the document.
   * No match yields null, one match its trimmed text, several matches an
   * array of trimmed texts, and a selector that throws an `{ error }` object.
   *
   * @param {Function} $ - Cheerio root for a loaded document.
   * @param {Object<string, string>} selectors - Output key -> CSS selector.
   * @returns {object} Extracted values keyed like `selectors`.
   */
  extractWithSelectors($, selectors) {
    const extracted = {};

    for (const [key, selector] of Object.entries(selectors)) {
      try {
        const elements = $(selector);

        if (elements.length === 0) {
          extracted[key] = null;
        } else if (elements.length === 1) {
          extracted[key] = elements.text().trim();
        } else {
          extracted[key] = elements.map((_, el) => $(el).text().trim()).get();
        }
      } catch {
        extracted[key] = { error: `Invalid selector: ${selector}` };
      }
    }

    return extracted;
  }

  /**
   * @returns {string} Identifier of the form `session_<epoch ms>_<random base-36 suffix>`.
   */
  generateSessionId() {
    return `session_${Date.now()}_${Math.random().toString(36).slice(2, 11)}`;
  }

  /**
   * Fold one finished session's duration into the running average.
   * Expects the successful/failed counter for that session to be incremented
   * already; execute() calls this once per finished session, success or failure.
   *
   * @param {number} sessionTime - Duration in milliseconds.
   */
  updateAverageSessionTime(sessionTime) {
    const currentAverage = this.stats.averageSessionTime;
    const completedSessions = this.stats.successfulSessions + this.stats.failedSessions;

    if (completedSessions <= 1) {
      this.stats.averageSessionTime = sessionTime;
    } else {
      this.stats.averageSessionTime =
        ((currentAverage * (completedSessions - 1)) + sessionTime) / completedSessions;
    }
  }

  /**
   * Fold one successful session's action count into the running average.
   * Only successful sessions contribute a sample, so the divisor is the
   * successful-session count (expected to be incremented already).
   *
   * @param {number} actionCount - Number of requested actions in the session.
   */
  updateAverageActionsPerSession(actionCount) {
    const currentAverage = this.stats.averageActionsPerSession;
    const sampleCount = this.stats.successfulSessions;

    if (sampleCount <= 1) {
      this.stats.averageActionsPerSession = actionCount;
    } else {
      this.stats.averageActionsPerSession =
        ((currentAverage * (sampleCount - 1)) + actionCount) / sampleCount;
    }
  }

  /** Stamp the statistics with the current time. */
  updateStats() {
    this.stats.lastUpdated = Date.now();
  }

  /**
   * Remove cached session results whose ttl has elapsed.
   *
   * @param {number} [now=Date.now()] - Reference time in epoch milliseconds.
   */
  pruneExpiredSessionResults(now = Date.now()) {
    for (const [sessionId, entry] of this.sessionResults) {
      if (now - entry.timestamp >= entry.ttl) {
        this.sessionResults.delete(sessionId);
      }
    }
  }

  /**
   * Write a prefixed log line to the console when logging is enabled.
   *
   * @param {string} level - Label included in the prefix (upper-cased).
   * @param {string} message
   */
  log(level, message) {
    if (this.enableLogging) {
      console.log(`[ScrapeWithActionsTool:${level.toUpperCase()}] ${message}`);
    }
  }

  /**
   * @returns {object} Copy of the statistics plus live session/cache sizes and the executor's own stats.
   */
  getStats() {
    return {
      ...this.stats,
      activeSessions: this.activeSessions.size,
      cachedResults: this.sessionResults.size,
      actionExecutorStats: this.actionExecutor ? this.actionExecutor.getStats() : null
    };
  }

  /**
   * Clear tracked sessions and cached results, destroy the executor, emit
   * `destroyed`, and finally detach all listeners.
   */
  async destroy() {
    this.activeSessions.clear();
    this.sessionResults.clear();

    if (this.actionExecutor) {
      await this.actionExecutor.destroy();
    }

    // Emit before detaching listeners; in the reverse order nobody would hear it.
    this.emit('destroyed');
    this.removeAllListeners();
  }
}
|
|
668
|
+
|
|
669
|
+
export default ScrapeWithActionsTool;
|