crawlforge-mcp-server 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. package/CLAUDE.md +315 -0
  2. package/LICENSE +21 -0
  3. package/README.md +181 -0
  4. package/package.json +115 -0
  5. package/server.js +1963 -0
  6. package/setup.js +112 -0
  7. package/src/constants/config.js +615 -0
  8. package/src/core/ActionExecutor.js +1104 -0
  9. package/src/core/AlertNotificationSystem.js +601 -0
  10. package/src/core/AuthManager.js +315 -0
  11. package/src/core/ChangeTracker.js +2306 -0
  12. package/src/core/JobManager.js +687 -0
  13. package/src/core/LLMsTxtAnalyzer.js +753 -0
  14. package/src/core/LocalizationManager.js +1615 -0
  15. package/src/core/PerformanceManager.js +828 -0
  16. package/src/core/ResearchOrchestrator.js +1327 -0
  17. package/src/core/SnapshotManager.js +1037 -0
  18. package/src/core/StealthBrowserManager.js +1795 -0
  19. package/src/core/WebhookDispatcher.js +745 -0
  20. package/src/core/analysis/ContentAnalyzer.js +749 -0
  21. package/src/core/analysis/LinkAnalyzer.js +972 -0
  22. package/src/core/cache/CacheManager.js +821 -0
  23. package/src/core/connections/ConnectionPool.js +553 -0
  24. package/src/core/crawlers/BFSCrawler.js +845 -0
  25. package/src/core/integrations/PerformanceIntegration.js +377 -0
  26. package/src/core/llm/AnthropicProvider.js +135 -0
  27. package/src/core/llm/LLMManager.js +415 -0
  28. package/src/core/llm/LLMProvider.js +97 -0
  29. package/src/core/llm/OpenAIProvider.js +127 -0
  30. package/src/core/processing/BrowserProcessor.js +986 -0
  31. package/src/core/processing/ContentProcessor.js +505 -0
  32. package/src/core/processing/PDFProcessor.js +448 -0
  33. package/src/core/processing/StreamProcessor.js +673 -0
  34. package/src/core/queue/QueueManager.js +98 -0
  35. package/src/core/workers/WorkerPool.js +585 -0
  36. package/src/core/workers/worker.js +743 -0
  37. package/src/monitoring/healthCheck.js +600 -0
  38. package/src/monitoring/metrics.js +761 -0
  39. package/src/optimization/wave3-optimizations.js +932 -0
  40. package/src/security/security-patches.js +120 -0
  41. package/src/security/security-tests.js +355 -0
  42. package/src/security/wave3-security.js +652 -0
  43. package/src/tools/advanced/BatchScrapeTool.js +1089 -0
  44. package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
  45. package/src/tools/crawl/crawlDeep.js +449 -0
  46. package/src/tools/crawl/mapSite.js +400 -0
  47. package/src/tools/extract/analyzeContent.js +624 -0
  48. package/src/tools/extract/extractContent.js +329 -0
  49. package/src/tools/extract/processDocument.js +503 -0
  50. package/src/tools/extract/summarizeContent.js +376 -0
  51. package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
  52. package/src/tools/research/deepResearch.js +706 -0
  53. package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
  54. package/src/tools/search/adapters/googleSearch.js +236 -0
  55. package/src/tools/search/adapters/searchProviderFactory.js +96 -0
  56. package/src/tools/search/queryExpander.js +543 -0
  57. package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
  58. package/src/tools/search/ranking/ResultRanker.js +497 -0
  59. package/src/tools/search/searchWeb.js +482 -0
  60. package/src/tools/tracking/trackChanges.js +1355 -0
  61. package/src/utils/CircuitBreaker.js +515 -0
  62. package/src/utils/ErrorHandlingConfig.js +342 -0
  63. package/src/utils/HumanBehaviorSimulator.js +569 -0
  64. package/src/utils/Logger.js +568 -0
  65. package/src/utils/MemoryMonitor.js +173 -0
  66. package/src/utils/RetryManager.js +386 -0
  67. package/src/utils/contentUtils.js +588 -0
  68. package/src/utils/domainFilter.js +612 -0
  69. package/src/utils/inputValidation.js +766 -0
  70. package/src/utils/rateLimiter.js +196 -0
  71. package/src/utils/robotsChecker.js +91 -0
  72. package/src/utils/securityMiddleware.js +416 -0
  73. package/src/utils/sitemapParser.js +678 -0
  74. package/src/utils/ssrfProtection.js +640 -0
  75. package/src/utils/urlNormalizer.js +168 -0
@@ -0,0 +1,669 @@
1
+ /**
2
+ * ScrapeWithActionsTool - Execute action chains before scraping with result collection
3
+ * Features: action chains, form interactions, intermediate state capture, error recovery
4
+ */
5
+
6
+ import { z } from 'zod';
7
+ import { EventEmitter } from 'events';
8
+ import ActionExecutor from '../../core/ActionExecutor.js';
9
+ import { load } from 'cheerio';
10
+
11
+ // Import existing tool for content extraction
12
+ import ExtractContentTool from '../extract/extractContent.js';
13
+
// Action schemas (re-using from ActionExecutor but with tool-specific additions).

// Fields every action accepts, whatever its type.
const BaseActionSchema = z.object({
  type: z.string(),
  timeout: z.number().optional(),
  description: z.string().optional(),
  continueOnError: z.boolean().default(false),
  retries: z.number().min(0).max(5).default(0),
  // Capture content after this action
  captureAfter: z.boolean().default(false)
});

// Builds one concrete action schema: the base fields, with `type` narrowed to a
// literal, followed by that action's own fields.
const defineAction = (type, fields) =>
  BaseActionSchema.extend({ type: z.literal(type), ...fields });

// Millisecond delay shared by the click and type actions (0-1000, default 0).
const inputDelay = () => z.number().min(0).max(1000).default(0);

// wait: pause for a duration and/or until a selector/text condition holds.
const WaitActionSchema = defineAction('wait', {
  duration: z.number().min(0).max(30000).optional(),
  selector: z.string().optional(),
  condition: z.enum(['visible', 'hidden', 'enabled', 'disabled', 'stable']).optional(),
  text: z.string().optional()
});

// click: mouse click on a selector, optionally at an explicit position.
const ClickActionSchema = defineAction('click', {
  selector: z.string(),
  button: z.enum(['left', 'right', 'middle']).default('left'),
  clickCount: z.number().min(1).max(3).default(1),
  delay: inputDelay(),
  force: z.boolean().default(false),
  position: z.object({
    x: z.number(),
    y: z.number()
  }).optional()
});

// type: enter text into a selector, optionally clearing it first.
const TypeActionSchema = defineAction('type', {
  selector: z.string(),
  text: z.string(),
  delay: inputDelay(),
  clear: z.boolean().default(false)
});

// press: key press with optional modifier keys, optionally targeted at a selector.
const PressActionSchema = defineAction('press', {
  key: z.string(),
  modifiers: z.array(z.enum(['Alt', 'Control', 'Meta', 'Shift'])).default([]),
  selector: z.string().optional()
});

// scroll: scroll the page or an element by a distance, or towards an element.
const ScrollActionSchema = defineAction('scroll', {
  selector: z.string().optional(),
  direction: z.enum(['up', 'down', 'left', 'right']).default('down'),
  distance: z.number().min(0).default(100),
  smooth: z.boolean().default(true),
  toElement: z.string().optional()
});

// screenshot: capture the page or a single element.
const ScreenshotActionSchema = defineAction('screenshot', {
  selector: z.string().optional(),
  fullPage: z.boolean().default(false),
  quality: z.number().min(0).max(100).default(80),
  format: z.enum(['png', 'jpeg']).default('png')
});

// executeJavaScript: run a script with arguments and optionally return its result.
const ExecuteJavaScriptActionSchema = defineAction('executeJavaScript', {
  script: z.string(),
  args: z.array(z.any()).default([]),
  returnResult: z.boolean().default(true)
});

// Any single action accepted in an action chain.
const ActionSchema = z.union([
  WaitActionSchema,
  ClickActionSchema,
  TypeActionSchema,
  PressActionSchema,
  ScrollActionSchema,
  ScreenshotActionSchema,
  ExecuteJavaScriptActionSchema
]);
93
+
// Kinds of form control a form field entry may describe.
const FORM_FIELD_TYPES = ['text', 'select', 'checkbox', 'radio', 'file'];

// Form field schema for auto-fill: which element to fill, with what value,
// and how long to pause afterwards (0-5000, default 100).
const FormFieldSchema = z.object({
  selector: z.string(),
  value: z.string(),
  type: z.enum(FORM_FIELD_TYPES).default('text'),
  waitAfter: z.number().min(0).max(5000).default(100)
});
101
+
// Output formats a caller may request.
const OUTPUT_FORMATS = ['markdown', 'html', 'json', 'text', 'screenshots'];

// Browser options: headless flag, user agent, bounded viewport and timeout.
const BrowserOptionsSchema = z.object({
  headless: z.boolean().default(true),
  userAgent: z.string().optional(),
  viewportWidth: z.number().min(800).max(1920).default(1280),
  viewportHeight: z.number().min(600).max(1080).default(720),
  timeout: z.number().min(10000).max(120000).default(30000)
});

// Content extraction options: named selectors plus metadata/link/image toggles.
const ExtractionOptionsSchema = z.object({
  selectors: z.record(z.string()).optional(),
  includeMetadata: z.boolean().default(true),
  includeLinks: z.boolean().default(true),
  includeImages: z.boolean().default(true)
});

// Main scrape with actions schema
const ScrapeWithActionsSchema = z.object({
  url: z.string().url(),
  // Between 1 and 20 actions per request.
  actions: z.array(ActionSchema).min(1).max(20),

  // Output formats
  formats: z.array(z.enum(OUTPUT_FORMATS)).default(['json']),

  // Intermediate state capture
  captureIntermediateStates: z.boolean().default(false),
  captureScreenshots: z.boolean().default(true),

  // Form auto-fill: string-to-string record
  formAutoFill: z.record(z.string()).optional(),

  // Browser options
  browserOptions: BrowserOptionsSchema.optional(),

  // Content extraction options
  extractionOptions: ExtractionOptionsSchema.optional(),

  // Error handling
  continueOnActionError: z.boolean().default(false),
  maxRetries: z.number().min(0).max(3).default(1),
  screenshotOnError: z.boolean().default(true)
});
139
+
/**
 * Runs a chain of browser actions against a URL through an ActionExecutor,
 * then extracts content through an ExtractContentTool and returns the action
 * results, the requested output formats, and optional intermediate states.
 *
 * Events emitted: `sessionStarted` (session context), `sessionCompleted`
 * (result object), `sessionFailed` ({ sessionId, url, error }), `destroyed`.
 */
export class ScrapeWithActionsTool extends EventEmitter {
  /**
   * @param {object} [options]
   * @param {object|null} [options.actionExecutor=null] - Executor to use; a new ActionExecutor is created when omitted.
   * @param {object|null} [options.extractContentTool=null] - Extraction tool to use; a new ExtractContentTool is created when omitted.
   * @param {boolean} [options.enableLogging=true] - Write log lines to the console.
   * @param {boolean} [options.enableCaching=false] - Keep successful results in `sessionResults`.
   * @param {number} [options.maxConcurrentSessions=3] - Upper bound on simultaneously active sessions.
   * @param {object} [options.defaultBrowserOptions={}] - Browser options merged underneath per-call options.
   * @param {string} [options.screenshotPath='./screenshots'] - Passed to the default ActionExecutor.
   */
  constructor(options = {}) {
    super();

    const {
      actionExecutor = null,
      extractContentTool = null,
      enableLogging = true,
      enableCaching = false,
      maxConcurrentSessions = 3,
      defaultBrowserOptions = {},
      screenshotPath = './screenshots'
    } = options;

    this.actionExecutor = actionExecutor || new ActionExecutor({
      enableLogging,
      enableScreenshotOnError: true,
      screenshotPath
    });

    this.extractContentTool = extractContentTool || new ExtractContentTool();
    this.enableLogging = enableLogging;
    this.enableCaching = enableCaching;
    this.maxConcurrentSessions = maxConcurrentSessions;
    this.defaultBrowserOptions = defaultBrowserOptions;

    // Active sessions tracking
    this.activeSessions = new Map();
    this.sessionResults = new Map();

    // Statistics
    this.stats = {
      totalSessions: 0,
      successfulSessions: 0,
      failedSessions: 0,
      totalActions: 0,
      successfulActions: 0,
      failedActions: 0,
      averageSessionTime: 0,
      averageActionsPerSession: 0,
      lastUpdated: Date.now()
    };
  }

  /**
   * Validates the parameters, runs one scrape session and updates statistics.
   *
   * @param {object} params - Raw parameters, validated against ScrapeWithActionsSchema.
   * @returns {Promise<object>} The session result produced by `executeSession`.
   * @throws {Error} Always an Error whose message starts with
   *   "Scrape with actions failed: "; the underlying error is attached as `cause`.
   */
  async execute(params) {
    try {
      const validated = ScrapeWithActionsSchema.parse(params);

      // Reject before counting, so totalSessions only covers sessions that
      // actually started (and therefore end up as successful or failed).
      if (this.activeSessions.size >= this.maxConcurrentSessions) {
        throw new Error(`Maximum concurrent sessions (${this.maxConcurrentSessions}) reached`);
      }

      this.stats.totalSessions++;
      const sessionId = this.generateSessionId();
      const startTime = Date.now();

      if (this.enableLogging) {
        console.log(`Starting scrape session ${sessionId} with ${validated.actions.length} actions on ${validated.url}`);
      }

      // Create session context
      const sessionContext = {
        id: sessionId,
        url: validated.url,
        startTime,
        params: validated,
        states: [],
        screenshots: [],
        actionResults: [],
        errors: [],
        status: 'initializing'
      };

      this.activeSessions.set(sessionId, sessionContext);
      this.emit('sessionStarted', sessionContext);

      let result;
      try {
        result = await this.executeSession(sessionContext);
      } catch (error) {
        sessionContext.status = 'failed';
        this.stats.failedSessions++;
        this.activeSessions.delete(sessionId);
        this.emit('sessionFailed', { sessionId, url: validated.url, error });
        throw error;
      }

      // Release the slot first so later bookkeeping cannot leak the session.
      sessionContext.status = 'completed';
      this.activeSessions.delete(sessionId);

      const succeededCount = result.actionResults.filter(r => r.success).length;
      this.stats.successfulSessions++;
      this.stats.totalActions += validated.actions.length;
      this.stats.successfulActions += succeededCount;
      this.stats.failedActions += result.actionResults.length - succeededCount;

      this.updateAverageSessionTime(Date.now() - startTime);
      this.updateAverageActionsPerSession(validated.actions.length);
      this.updateStats();

      if (this.enableCaching) {
        // Entries carry a ttl; drop the expired ones so the cache stays bounded.
        this.pruneExpiredSessionResults();
        this.sessionResults.set(sessionId, {
          result,
          timestamp: Date.now(),
          ttl: 3600000
        });
      }

      this.emit('sessionCompleted', result);

      return result;
    } catch (error) {
      this.log('error', `Scrape with actions failed: ${error.message}`);
      throw new Error(`Scrape with actions failed: ${error.message}`, { cause: error });
    }
  }

  /**
   * Builds the final action chain, runs it and assembles the session result.
   *
   * @param {object} sessionContext - Context created by `execute`; its `status`,
   *   `actionResults` and `screenshots` fields are updated in place.
   * @returns {Promise<object>} Result with action results, content by format,
   *   optional intermediate states/screenshots, metadata and timing stats.
   */
  async executeSession(sessionContext) {
    const { params } = sessionContext;
    sessionContext.status = 'running';

    // Per-call browser options override the tool-level defaults.
    const browserOptions = {
      ...this.defaultBrowserOptions,
      ...params.browserOptions
    };

    // Build action chain with form auto-fill if provided
    let actionChain = [...params.actions];

    if (params.formAutoFill) {
      actionChain = this.insertFormAutoFillActions(actionChain, params.formAutoFill);
    }

    // Add capture actions if intermediate states requested
    if (params.captureIntermediateStates) {
      actionChain = this.insertCaptureActions(actionChain);
    }

    // Execute action chain
    const chainResult = await this.actionExecutor.executeActionChain(
      params.url,
      {
        actions: actionChain,
        continueOnError: params.continueOnActionError,
        timeout: browserOptions.timeout || 30000,
        retryChain: params.maxRetries,
        metadata: {
          sessionId: sessionContext.id,
          originalActionCount: params.actions.length,
          formAutoFill: !!params.formAutoFill
        }
      },
      browserOptions
    );

    // Tolerate an executor result that carries no results array.
    const rawResults = chainResult.results ?? [];
    sessionContext.actionResults = rawResults;
    sessionContext.screenshots = chainResult.screenshots || [];

    // Process action results
    const actionResults = this.processActionResults(rawResults);
    const intermediateStates = params.captureIntermediateStates
      ? await this.extractIntermediateStates(actionResults, params)
      : [];

    const finalContent = await this.extractFinalContent(params);

    // Generate different formats
    const content = this.generateFormats(finalContent, params.formats, {
      actionResults,
      intermediateStates,
      screenshots: sessionContext.screenshots
    });

    const executionTime = Date.now() - sessionContext.startTime;
    const succeededCount = actionResults.filter(r => r.success).length;
    const totalActionTime = actionResults.reduce((sum, r) => sum + (r.executionTime || 0), 0);

    return {
      success: chainResult.success,
      sessionId: sessionContext.id,
      url: params.url,
      executionTime,

      actionResults,
      // totalActions counts the actions the caller requested; the counts below
      // cover every executed action, including inserted auto-fill/capture steps.
      totalActions: params.actions.length,
      successfulActions: succeededCount,
      failedActions: actionResults.length - succeededCount,
      actionsExecuted: actionResults.length, // Total executed (for validation)

      content,

      intermediateStates: params.captureIntermediateStates ? intermediateStates : undefined,
      screenshots: params.captureScreenshots ? sessionContext.screenshots : undefined,

      // Form auto-fill flag (for tests/validation)
      formAutoFillApplied: !!params.formAutoFill,

      metadata: {
        browserOptions,
        formAutoFillApplied: !!params.formAutoFill,
        intermediateStatesCount: intermediateStates.length,
        screenshotsCount: sessionContext.screenshots.length,
        finalUrl: chainResult.metadata?.finalUrl,
        timestamp: Date.now()
      },

      stats: {
        sessionTime: executionTime,
        averageActionTime: actionResults.length > 0 ? totalActionTime / actionResults.length : 0,
        errorRecoveryCount: actionResults.filter(r => r.recovered).length
      }
    };
  }

  /**
   * Converts a form auto-fill record into `type` actions (plus an optional
   * submit click and wait) and splices them in after the leading `wait` actions.
   *
   * The keys `submitSelector` and `waitAfterSubmit` are control keys; every
   * other key is treated as a selector whose value is typed into it.
   *
   * @param {object[]} actions - The caller's action chain (not mutated).
   * @param {Object<string, string>} formAutoFill - Selector-to-value record.
   * @returns {object[]} A new action chain including the fill actions.
   */
  insertFormAutoFillActions(actions, formAutoFill) {
    const fillActions = [];

    // Convert object with key-value pairs to fill actions
    for (const [selector, value] of Object.entries(formAutoFill)) {
      if (selector === 'submitSelector' || selector === 'waitAfterSubmit') {
        continue; // Skip special keys
      }

      fillActions.push({
        type: 'type',
        selector,
        text: value,
        description: `Auto-fill field: ${selector}`,
        continueOnError: true,
        retries: 1
      });
    }

    // Add submit action if specified
    if (formAutoFill.submitSelector) {
      fillActions.push({
        type: 'click',
        selector: formAutoFill.submitSelector,
        description: 'Auto-submit form',
        continueOnError: false,
        retries: 2
      });

      // An explicit "0" is honoured; only a missing, unparsable or negative
      // value falls back to the 2000 ms default.
      const parsedWait = Number.parseInt(formAutoFill.waitAfterSubmit, 10);
      const waitTime = Number.isNaN(parsedWait) || parsedWait < 0 ? 2000 : parsedWait;
      fillActions.push({
        type: 'wait',
        duration: waitTime,
        description: 'Wait after form submission'
      });
    }

    // Insert after the leading waits. When every action is a wait (or there
    // are none), that means appending at the end rather than prepending.
    const firstNonWait = actions.findIndex(action => action.type !== 'wait');
    const insertIndex = firstNonWait === -1 ? actions.length : firstNonWait;

    return [
      ...actions.slice(0, insertIndex),
      ...fillActions,
      ...actions.slice(insertIndex)
    ];
  }

  /**
   * Adds an `executeJavaScript` state-capture action after every action that
   * is a click/type/press or has `captureAfter` set.
   *
   * @param {object[]} actions - Action chain (not mutated).
   * @returns {object[]} New chain with capture actions interleaved.
   */
  insertCaptureActions(actions) {
    const modifiedActions = [];

    actions.forEach((action, index) => {
      modifiedActions.push(action);

      if (this.shouldCaptureAfterAction(action) || action.captureAfter) {
        modifiedActions.push({
          type: 'executeJavaScript',
          script: `return {url: window.location.href, title: document.title, html: document.documentElement.outerHTML, timestamp: Date.now(), capturePoint: ${index + 1}};`,
          description: `Capture state after action ${index + 1}`,
          returnResult: true,
          continueOnError: true
        });
      }
    });

    return modifiedActions;
  }

  /**
   * @param {object} action
   * @returns {boolean} True for click, type and press actions.
   */
  shouldCaptureAfterAction(action) {
    const captureAfterTypes = ['click', 'type', 'press'];
    return captureAfterTypes.includes(action.type);
  }

  /**
   * Projects raw executor results onto the fields this tool reports.
   *
   * @param {object[]} [rawResults] - Executor results; a missing value is treated as empty.
   * @returns {object[]} One entry per raw result; `jsResult` holds
   *   `result.result.result` for executeJavaScript actions that returned something.
   */
  processActionResults(rawResults) {
    return (rawResults ?? []).map(result => ({
      id: result.id,
      type: result.type,
      success: result.success,
      description: result.description,
      executionTime: result.executionTime,
      timestamp: result.timestamp,
      error: result.error,
      result: result.result,
      recovered: result.recovered,
      recoveryStrategy: result.recoveryStrategy,
      jsResult: result.type === 'executeJavaScript' && result.result ? result.result.result : undefined
    }));
  }

  /**
   * Turns captured page snapshots (executeJavaScript results carrying `html`)
   * into intermediate state records in the requested formats.
   *
   * A snapshot that fails to process is logged and skipped rather than
   * failing the whole session.
   *
   * @param {object[]} actionResults - Output of `processActionResults`.
   * @param {object} params - Validated parameters (`formats`, `extractionOptions`).
   * @returns {Promise<object[]>} Intermediate states in capture order.
   */
  async extractIntermediateStates(actionResults, params) {
    const states = [];

    for (const result of actionResults) {
      const stateData = result.jsResult;
      if (result.type !== 'executeJavaScript' || !stateData || !stateData.html) {
        continue;
      }

      try {
        const $ = load(stateData.html);

        const state = {
          capturePoint: stateData.capturePoint,
          url: stateData.url,
          title: stateData.title,
          timestamp: stateData.timestamp,
          content: {}
        };

        if (params.formats.includes('text')) {
          state.content.text = $('body').text().replace(/\s+/g, ' ').trim();
        }

        if (params.formats.includes('html')) {
          state.content.html = stateData.html;
        }

        if (params.formats.includes('json')) {
          state.content.json = {
            title: stateData.title,
            headings: this.extractHeadings($),
            links: this.extractLinks($)
          };
        }

        if (params.extractionOptions?.selectors) {
          state.content.extracted = this.extractWithSelectors($, params.extractionOptions.selectors);
        }

        states.push(state);
      } catch (error) {
        this.log('warn', `Failed to process intermediate state: ${error.message}`);
      }
    }

    return states;
  }

  /**
   * Asks the extract-content tool for the content of `params.url`.
   *
   * NOTE(review): this passes the original URL to the extraction tool rather
   * than anything from the executed chain, so the result presumably reflects a
   * separate load of the page, not its state after the actions - confirm
   * against ExtractContentTool and ActionExecutor.
   *
   * @param {object} params - Validated parameters.
   * @returns {Promise<object>} The tool's result, or a
   *   `{ success: false, error, content: {}, metadata: {} }` fallback on failure.
   */
  async extractFinalContent(params) {
    try {
      return await this.extractContentTool.execute({
        url: params.url,
        options: {
          includeMetadata: params.extractionOptions?.includeMetadata !== false,
          includeLinks: params.extractionOptions?.includeLinks !== false,
          includeImages: params.extractionOptions?.includeImages !== false,
          customSelectors: params.extractionOptions?.selectors
        }
      });
    } catch (error) {
      // Best effort: a failed extraction degrades the result, it does not fail the session.
      this.log('warn', `Final content extraction failed: ${error.message}`);
      return {
        success: false,
        error: error.message,
        content: {},
        metadata: {}
      };
    }
  }

  /**
   * Builds the `content` object, with one key per requested format.
   *
   * @param {object} finalContent - Result of `extractFinalContent`.
   * @param {string[]} formats - Requested output formats.
   * @param {{actionResults: object[], intermediateStates: object[], screenshots: Array}} additionalData
   * @returns {object} Content keyed by format name.
   */
  generateFormats(finalContent, formats, additionalData) {
    const content = {};
    const { actionResults } = additionalData;

    if (formats.includes('json')) {
      const succeededCount = actionResults.filter(r => r.success).length;
      content.json = {
        finalContent: finalContent.content || {},
        metadata: finalContent.metadata || {},
        actionSummary: {
          totalActions: actionResults.length,
          successfulActions: succeededCount,
          failedActions: actionResults.length - succeededCount,
          actions: actionResults.map(r => ({
            type: r.type,
            success: r.success,
            description: r.description,
            executionTime: r.executionTime
          }))
        }
      };
    }

    if (formats.includes('html')) {
      content.html = finalContent.content?.html || '';
    }

    if (formats.includes('text')) {
      content.text = finalContent.content?.text || '';
    }

    if (formats.includes('markdown')) {
      content.markdown = finalContent.content?.markdown || 'Content not available in markdown format';
    }

    if (formats.includes('screenshots')) {
      content.screenshots = additionalData.screenshots || [];
    }

    return content;
  }

  /**
   * @param {Function} $ - Cheerio root for a loaded document.
   * @returns {{level: number, text: string, id: (string|null)}[]} All h1-h6 headings in document order.
   */
  extractHeadings($) {
    const headings = [];
    $('h1, h2, h3, h4, h5, h6').each((_, el) => {
      headings.push({
        // Tag name is "h1".."h6"; the digit is the heading level.
        level: Number.parseInt(el.name.slice(1), 10),
        text: $(el).text().trim(),
        id: $(el).attr('id') || null
      });
    });
    return headings;
  }

  /**
   * @param {Function} $ - Cheerio root for a loaded document.
   * @returns {{href: string, text: string, title: (string|null)}[]} Anchors that
   *   have both a non-empty href and non-empty text.
   */
  extractLinks($) {
    const links = [];
    $('a[href]').each((_, el) => {
      const href = $(el).attr('href');
      const text = $(el).text().trim();

      if (href && text) {
        links.push({
          href,
          text,
          title: $(el).attr('title') || null
        });
      }
    });
    return links;
  }

  /**
   * Evaluates each named selector against the document.
   *
   * @param {Function} $ - Cheerio root for a loaded document.
   * @param {Object<string, string>} selectors - Output key to CSS selector.
   * @returns {object} Per key: `null` (no match), trimmed text (one match),
   *   an array of trimmed texts (several), or `{ error }` if the selector threw.
   */
  extractWithSelectors($, selectors) {
    const extracted = {};

    for (const [key, selector] of Object.entries(selectors)) {
      try {
        const elements = $(selector);

        if (elements.length === 0) {
          extracted[key] = null;
        } else if (elements.length === 1) {
          extracted[key] = elements.text().trim();
        } else {
          extracted[key] = elements.map((_, el) => $(el).text().trim()).get();
        }
      } catch (error) {
        extracted[key] = { error: `Invalid selector: ${selector}` };
      }
    }

    return extracted;
  }

  /**
   * @returns {string} Identifier of the form `session_<ms timestamp>_<random base36>`.
   *   Built from Math.random, so not suitable as a secret.
   */
  generateSessionId() {
    return `session_${Date.now()}_${Math.random().toString(36).slice(2, 11)}`;
  }

  /**
   * Folds one successful session's duration into the running average.
   * Expects `stats.successfulSessions` to already include that session.
   *
   * @param {number} sessionTime - Duration in milliseconds.
   */
  updateAverageSessionTime(sessionTime) {
    // Only successful sessions report a time, so they are the sample count.
    const sampleCount = this.stats.successfulSessions;

    if (sampleCount <= 1) {
      this.stats.averageSessionTime = sessionTime;
    } else {
      this.stats.averageSessionTime =
        ((this.stats.averageSessionTime * (sampleCount - 1)) + sessionTime) / sampleCount;
    }
  }

  /**
   * Folds one successful session's action count into the running average.
   * Expects `stats.successfulSessions` to already include that session.
   *
   * @param {number} actionCount - Number of actions requested in the session.
   */
  updateAverageActionsPerSession(actionCount) {
    // Only successful sessions report a count, so they are the sample count.
    const sampleCount = this.stats.successfulSessions;

    if (sampleCount <= 1) {
      this.stats.averageActionsPerSession = actionCount;
    } else {
      this.stats.averageActionsPerSession =
        ((this.stats.averageActionsPerSession * (sampleCount - 1)) + actionCount) / sampleCount;
    }
  }

  /** Stamps the statistics with the current time. */
  updateStats() {
    this.stats.lastUpdated = Date.now();
  }

  /**
   * Removes cached session results whose ttl has elapsed.
   *
   * @param {number} [now=Date.now()] - Reference time in milliseconds.
   */
  pruneExpiredSessionResults(now = Date.now()) {
    for (const [sessionId, entry] of this.sessionResults) {
      if (now - entry.timestamp >= entry.ttl) {
        this.sessionResults.delete(sessionId);
      }
    }
  }

  /**
   * Writes a prefixed log line when logging is enabled.
   *
   * @param {string} level - Severity label, upper-cased into the prefix.
   * @param {string} message
   */
  log(level, message) {
    if (this.enableLogging) {
      console.log(`[ScrapeWithActionsTool:${level.toUpperCase()}] ${message}`);
    }
  }

  /**
   * @returns {object} Copy of the statistics plus the active session count,
   *   cached result count and the executor's own stats (or null).
   */
  getStats() {
    return {
      ...this.stats,
      activeSessions: this.activeSessions.size,
      cachedResults: this.sessionResults.size,
      actionExecutorStats: this.actionExecutor ? this.actionExecutor.getStats() : null
    };
  }

  /**
   * Clears session state, destroys the executor, emits `destroyed`, then
   * detaches all listeners.
   *
   * @returns {Promise<void>}
   */
  async destroy() {
    this.activeSessions.clear();
    this.sessionResults.clear();

    if (this.actionExecutor) {
      await this.actionExecutor.destroy();
    }

    // Emit before detaching; otherwise no listener can observe the event.
    this.emit('destroyed');
    this.removeAllListeners();
  }
}
668
+
669
+ export default ScrapeWithActionsTool;