crawlforge-mcp-server 4.2.12 → 4.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +19 -7
- package/README.md +11 -3
- package/package.json +3 -2
- package/server.js +195 -22
- package/src/cli/commands/init.js +107 -0
- package/src/cli/index.js +2 -0
- package/src/constants/config.js +5 -0
- package/src/core/ActionExecutor.js +13 -1
- package/src/core/AgentOrchestrator.js +300 -0
- package/src/core/AuthManager.js +21 -1
- package/src/core/ChangeTracker.js +8 -5
- package/src/core/LLMsTxtAnalyzer.js +71 -47
- package/src/core/LocalizationManager.js +7 -4
- package/src/core/ResearchOrchestrator.js +10 -6
- package/src/core/StealthBrowserManager.js +52 -13
- package/src/core/analysis/ContentAnalyzer.js +2 -2
- package/src/core/crawlers/BFSCrawler.js +23 -12
- package/src/core/processing/ContentProcessor.js +19 -3
- package/src/core/processing/PDFProcessor.js +72 -23
- package/src/tools/advanced/ScrapeWithActionsTool.js +63 -25
- package/src/tools/advanced/batchScrape/index.js +3 -1
- package/src/tools/advanced/batchScrape/reporter.js +5 -1
- package/src/tools/advanced/batchScrape/worker.js +6 -1
- package/src/tools/agent/agent.js +71 -0
- package/src/tools/basic/_fetch.js +78 -5
- package/src/tools/basic/extractLinks.js +1 -1
- package/src/tools/basic/extractMetadata.js +65 -1
- package/src/tools/basic/extractText.js +73 -5
- package/src/tools/basic/scrapeStructured.js +48 -10
- package/src/tools/crawl/crawlDeep.js +13 -5
- package/src/tools/crawl/mapSite.js +53 -52
- package/src/tools/extract/analyzeContent.js +11 -6
- package/src/tools/extract/extractContent.js +23 -5
- package/src/tools/extract/extractStructured.js +65 -16
- package/src/tools/extract/extractWithLlm.js +192 -11
- package/src/tools/extract/listOllamaModels.js +19 -8
- package/src/tools/extract/processDocument.js +10 -4
- package/src/tools/extract/summarizeContent.js +58 -1
- package/src/tools/llmstxt/generateLLMsTxt.js +124 -3
- package/src/tools/research/deepResearch.js +43 -4
- package/src/tools/scrape/unifiedScrape.js +314 -0
- package/src/tools/search/providers/searxng.js +2 -2
- package/src/tools/search/ranking/ResultDeduplicator.js +32 -9
- package/src/tools/search/ranking/ResultRanker.js +13 -4
- package/src/tools/search/searchWeb.js +5 -5
- package/src/tools/templates/TemplateRegistry.js +3 -2
- package/src/tools/tracking/trackChanges/differ.js +33 -1
- package/src/utils/htmlToMarkdown.js +5 -1
|
@@ -47,14 +47,17 @@ const StealthConfigSchema = z.object({
|
|
|
47
47
|
spoofMediaDevices: z.boolean().default(true),
|
|
48
48
|
spoofBatteryAPI: z.boolean().default(true)
|
|
49
49
|
}).optional(),
|
|
50
|
-
|
|
50
|
+
|
|
51
51
|
fingerprinting: z.object({
|
|
52
52
|
canvasNoise: z.boolean().default(true),
|
|
53
53
|
webglSpoofing: z.boolean().default(true),
|
|
54
54
|
audioContextSpoofing: z.boolean().default(true),
|
|
55
55
|
fontSpoofing: z.boolean().default(true),
|
|
56
56
|
hardwareSpoofing: z.boolean().default(true)
|
|
57
|
-
}).optional()
|
|
57
|
+
}).optional(),
|
|
58
|
+
|
|
59
|
+
// C2: browser engine selection — 'chromium' (default) or 'camoufox' (Firefox-based)
|
|
60
|
+
engine: z.enum(['chromium', 'camoufox']).optional().default('chromium')
|
|
58
61
|
});
|
|
59
62
|
|
|
60
63
|
export class StealthBrowserManager {
|
|
@@ -232,16 +235,41 @@ export class StealthBrowserManager {
|
|
|
232
235
|
}
|
|
233
236
|
|
|
234
237
|
/**
|
|
235
|
-
* Launch stealth browser with anti-detection configurations
|
|
238
|
+
* Launch stealth browser with anti-detection configurations.
|
|
239
|
+
* C2: honours config.engine — 'chromium' (default) or 'camoufox' (Firefox-based).
|
|
236
240
|
*/
|
|
237
241
|
async launchStealthBrowser(config = {}) {
|
|
242
|
+
const validatedConfig = StealthConfigSchema.parse({ ...this.defaultConfig, ...config });
|
|
243
|
+
|
|
244
|
+
// C2: if the requested engine differs from the running browser, tear it down first.
|
|
245
|
+
if (this.browser && this._launchedEngine && this._launchedEngine !== validatedConfig.engine) {
|
|
246
|
+
await this.browser.close().catch(() => {});
|
|
247
|
+
this.browser = null;
|
|
248
|
+
}
|
|
249
|
+
|
|
238
250
|
if (this.browser) {
|
|
239
251
|
return this.browser;
|
|
240
252
|
}
|
|
241
253
|
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
254
|
+
// C2: delegate to CamoufoxAdapter when engine === 'camoufox'
|
|
255
|
+
if (validatedConfig.engine === 'camoufox') {
|
|
256
|
+
const adapter = new CamoufoxAdapter();
|
|
257
|
+
const available = await adapter.isAvailable();
|
|
258
|
+
if (!available) {
|
|
259
|
+
throw new Error(
|
|
260
|
+
'camoufox is not installed. Run: npm install camoufox to use the Firefox-based stealth engine.'
|
|
261
|
+
);
|
|
262
|
+
}
|
|
263
|
+
this.browser = await adapter.launch({
|
|
264
|
+
headless: true,
|
|
265
|
+
launchOptions: {}
|
|
266
|
+
});
|
|
267
|
+
this._launchedEngine = 'camoufox';
|
|
268
|
+
return this.browser;
|
|
269
|
+
}
|
|
270
|
+
|
|
271
|
+
this._launchedEngine = 'chromium';
|
|
272
|
+
// Base browser args for stealth (Chromium path)
|
|
245
273
|
const stealthArgs = [
|
|
246
274
|
'--no-sandbox',
|
|
247
275
|
'--disable-dev-shm-usage',
|
|
@@ -498,6 +526,9 @@ export class StealthBrowserManager {
|
|
|
498
526
|
* Generate advanced HTTP headers with realistic patterns
|
|
499
527
|
*/
|
|
500
528
|
generateAdvancedHeaders(config, selectedOS) {
|
|
529
|
+
// Resolve the UA first so sec-ch-ua brand version can match.
|
|
530
|
+
const resolvedUA = this.selectRealisticUserAgent(config, selectedOS);
|
|
531
|
+
|
|
501
532
|
const headers = {
|
|
502
533
|
'Accept-Language': `${(config.locale || 'en-US').toLowerCase()},en;q=0.9`,
|
|
503
534
|
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
|
|
@@ -512,8 +543,8 @@ export class StealthBrowserManager {
|
|
|
512
543
|
'sec-ch-ua-platform': this.generateSecChUaPlatform(selectedOS)
|
|
513
544
|
};
|
|
514
545
|
|
|
515
|
-
//
|
|
516
|
-
headers['sec-ch-ua'] = this.generateSecChUaHeader();
|
|
546
|
+
// C2: pass UA so sec-ch-ua brand version matches the Chrome major version.
|
|
547
|
+
headers['sec-ch-ua'] = this.generateSecChUaHeader(resolvedUA);
|
|
517
548
|
|
|
518
549
|
// Randomize some headers
|
|
519
550
|
if (Math.random() < 0.25) {
|
|
@@ -533,15 +564,23 @@ export class StealthBrowserManager {
|
|
|
533
564
|
}
|
|
534
565
|
|
|
535
566
|
/**
|
|
536
|
-
* Generate sec-ch-ua header
|
|
567
|
+
* Generate sec-ch-ua header.
|
|
568
|
+
* C2: brand versions are derived from the UA's Chrome major version so
|
|
569
|
+
* sec-ch-ua and the User-Agent header stay consistent.
|
|
570
|
+
* @param {string} [userAgent] — the selected user agent string
|
|
537
571
|
*/
|
|
538
|
-
generateSecChUaHeader() {
|
|
572
|
+
generateSecChUaHeader(userAgent = '') {
|
|
573
|
+
// Extract Chrome major version from the UA (e.g. "Chrome/121.0.0.0" → "121").
|
|
574
|
+
// Fall back to 121 if the UA is not a Chrome UA.
|
|
575
|
+
const match = userAgent.match(/Chrome\/(\d+)/i);
|
|
576
|
+
const version = match ? match[1] : '121';
|
|
577
|
+
|
|
539
578
|
const brands = [
|
|
540
579
|
{ brand: 'Not_A Brand', version: '8' },
|
|
541
|
-
{ brand: 'Chromium', version
|
|
542
|
-
{ brand: 'Google Chrome', version
|
|
580
|
+
{ brand: 'Chromium', version },
|
|
581
|
+
{ brand: 'Google Chrome', version }
|
|
543
582
|
];
|
|
544
|
-
|
|
583
|
+
|
|
545
584
|
return brands
|
|
546
585
|
.map(b => `"${b.brand}";v="${b.version}"`)
|
|
547
586
|
.join(', ');
|
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
*/
|
|
5
5
|
|
|
6
6
|
import { SummarizerManager } from 'node-summarizer';
|
|
7
|
-
import { franc } from 'franc';
|
|
7
|
+
import { franc, francAll } from 'franc';
|
|
8
8
|
import nlp from 'compromise';
|
|
9
9
|
import { z } from 'zod';
|
|
10
10
|
import { splitSentences } from './sentenceUtils.js';
|
|
@@ -316,7 +316,7 @@ export class ContentAnalyzer {
|
|
|
316
316
|
const confidence = Math.min(1, 0.5 + (text.length / 500) * 0.5);
|
|
317
317
|
|
|
318
318
|
// Get alternative languages using franc.all
|
|
319
|
-
const alternatives =
|
|
319
|
+
const alternatives = francAll(text, {
|
|
320
320
|
minLength: 10,
|
|
321
321
|
whitelist: Object.keys(LANGUAGE_NAMES)
|
|
322
322
|
})
|
|
@@ -6,6 +6,9 @@ import { RobotsChecker } from '../../utils/robotsChecker.js';
|
|
|
6
6
|
import { DomainFilter } from '../../utils/domainFilter.js';
|
|
7
7
|
import { LinkAnalyzer } from '../analysis/LinkAnalyzer.js';
|
|
8
8
|
import { normalizeUrl, extractLinks, isValidUrl } from '../../utils/urlNormalizer.js';
|
|
9
|
+
import { Logger } from '../../utils/Logger.js';
|
|
10
|
+
|
|
11
|
+
const logger = new Logger('BFSCrawler');
|
|
9
12
|
|
|
10
13
|
export class BFSCrawler {
|
|
11
14
|
constructor(options = {}) {
|
|
@@ -43,7 +46,10 @@ export class BFSCrawler {
|
|
|
43
46
|
|
|
44
47
|
this.queue = new QueueManager({ concurrency, timeout });
|
|
45
48
|
this.cache = new CacheManager({ ttl: 3600000 }); // 1 hour cache
|
|
49
|
+
// C1: per-domain rate-limiter map — reuse existing limiter when
|
|
50
|
+
// effectiveRateLimit hasn't changed, rather than recreating it on every URL.
|
|
46
51
|
this.rateLimiter = new RateLimiter({ requestsPerSecond: 10 });
|
|
52
|
+
this._domainRateLimiters = new Map();
|
|
47
53
|
this.robotsChecker = respectRobots ? new RobotsChecker(userAgent) : null;
|
|
48
54
|
|
|
49
55
|
// Initialize domain filter (create new if not provided)
|
|
@@ -142,13 +148,13 @@ export class BFSCrawler {
|
|
|
142
148
|
});
|
|
143
149
|
|
|
144
150
|
if (!filterDecision.allowed) {
|
|
145
|
-
|
|
151
|
+
logger.debug(`Domain filter blocks: ${normalizedUrl} - ${filterDecision.reason}`);
|
|
146
152
|
return;
|
|
147
153
|
}
|
|
148
|
-
|
|
154
|
+
|
|
149
155
|
// Backward compatibility: also check legacy patterns
|
|
150
156
|
if (!this.shouldCrawlUrl(normalizedUrl)) {
|
|
151
|
-
|
|
157
|
+
logger.debug(`Legacy pattern blocks: ${normalizedUrl}`);
|
|
152
158
|
return;
|
|
153
159
|
}
|
|
154
160
|
|
|
@@ -156,7 +162,7 @@ export class BFSCrawler {
|
|
|
156
162
|
if (this.respectRobots && this.robotsChecker) {
|
|
157
163
|
const canFetch = await this.robotsChecker.canFetch(normalizedUrl);
|
|
158
164
|
if (!canFetch) {
|
|
159
|
-
|
|
165
|
+
logger.debug(`Robots.txt blocks: ${normalizedUrl}`);
|
|
160
166
|
return;
|
|
161
167
|
}
|
|
162
168
|
}
|
|
@@ -171,17 +177,22 @@ export class BFSCrawler {
|
|
|
171
177
|
|
|
172
178
|
if (!pageData) {
|
|
173
179
|
// Apply domain-specific rate limiting
|
|
180
|
+
// C1: reuse per-domain limiter from the map to avoid recreating on each URL.
|
|
174
181
|
const urlObj = new URL(normalizedUrl);
|
|
175
|
-
const
|
|
176
|
-
|
|
177
|
-
// Use domain-specific rate limit if available
|
|
182
|
+
const domain = urlObj.hostname;
|
|
183
|
+
const domainRules = this.domainFilter.getDomainRules(domain);
|
|
178
184
|
const effectiveRateLimit = domainRules.rateLimit || 10;
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
this.
|
|
185
|
+
|
|
186
|
+
if (!this._domainRateLimiters.has(domain)) {
|
|
187
|
+
this._domainRateLimiters.set(domain, new RateLimiter({ requestsPerSecond: effectiveRateLimit }));
|
|
188
|
+
} else {
|
|
189
|
+
const existing = this._domainRateLimiters.get(domain);
|
|
190
|
+
if (existing.requestsPerSecond !== effectiveRateLimit) {
|
|
191
|
+
this._domainRateLimiters.set(domain, new RateLimiter({ requestsPerSecond: effectiveRateLimit }));
|
|
192
|
+
}
|
|
182
193
|
}
|
|
183
|
-
|
|
184
|
-
await this.
|
|
194
|
+
|
|
195
|
+
await this._domainRateLimiters.get(domain).checkLimit(normalizedUrl);
|
|
185
196
|
|
|
186
197
|
// Fetch the page
|
|
187
198
|
pageData = await this.fetchPage(normalizedUrl);
|
|
@@ -401,9 +401,10 @@ export class ContentProcessor {
|
|
|
401
401
|
|
|
402
402
|
const avgWordsPerSentence = words.length / sentences.length;
|
|
403
403
|
const avgCharsPerWord = charactersNoSpaces / words.length;
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
404
|
+
const avgSyllablesPerWord = words.reduce((sum, w) => sum + this._countSyllables(w), 0) / words.length;
|
|
405
|
+
|
|
406
|
+
// Flesch Reading-Ease: higher score = easier to read
|
|
407
|
+
const readabilityScore = 206.835 - (1.015 * avgWordsPerSentence) - (84.6 * avgSyllablesPerWord);
|
|
407
408
|
|
|
408
409
|
return {
|
|
409
410
|
sentences: sentences.length,
|
|
@@ -412,6 +413,7 @@ export class ContentProcessor {
|
|
|
412
413
|
charactersNoSpaces,
|
|
413
414
|
avgWordsPerSentence: Math.round(avgWordsPerSentence * 100) / 100,
|
|
414
415
|
avgCharsPerWord: Math.round(avgCharsPerWord * 100) / 100,
|
|
416
|
+
avgSyllablesPerWord: Math.round(avgSyllablesPerWord * 100) / 100,
|
|
415
417
|
readabilityScore: Math.round(readabilityScore * 100) / 100,
|
|
416
418
|
readabilityLevel: this.getReadabilityLevel(readabilityScore)
|
|
417
419
|
};
|
|
@@ -432,6 +434,20 @@ export class ContentProcessor {
|
|
|
432
434
|
return 'Very Difficult';
|
|
433
435
|
}
|
|
434
436
|
|
|
437
|
+
/**
|
|
438
|
+
* Count syllables in a word (heuristic)
|
|
439
|
+
* @param {string} word
|
|
440
|
+
* @returns {number}
|
|
441
|
+
*/
|
|
442
|
+
_countSyllables(word) {
|
|
443
|
+
const w = word.toLowerCase().replace(/[^a-z]/g, '');
|
|
444
|
+
if (w.length <= 3) return 1;
|
|
445
|
+
// Remove trailing silent e
|
|
446
|
+
const stripped = w.replace(/e$/, '');
|
|
447
|
+
const matches = stripped.match(/[aeiouy]+/g);
|
|
448
|
+
return Math.max(1, matches ? matches.length : 1);
|
|
449
|
+
}
|
|
450
|
+
|
|
435
451
|
/**
|
|
436
452
|
* Extract fallback content when Readability fails
|
|
437
453
|
* @param {string} html - HTML content
|
|
@@ -16,6 +16,12 @@ const PDFProcessorSchema = z.object({
|
|
|
16
16
|
extractText: z.boolean().default(true),
|
|
17
17
|
password: z.string().optional(),
|
|
18
18
|
maxPages: z.number().min(1).max(1000).default(100),
|
|
19
|
+
// C3: true page-range extraction (1-based, inclusive). When set, only the
|
|
20
|
+
// text from pages [start..end] is returned.
|
|
21
|
+
pageRange: z.object({
|
|
22
|
+
start: z.number().min(1).default(1),
|
|
23
|
+
end: z.number().min(1).optional()
|
|
24
|
+
}).optional(),
|
|
19
25
|
parseOptions: z.object({
|
|
20
26
|
normalizeWhitespace: z.boolean().default(true),
|
|
21
27
|
disableCombineTextItems: z.boolean().default(false)
|
|
@@ -95,12 +101,29 @@ export class PDFProcessor {
|
|
|
95
101
|
return result;
|
|
96
102
|
}
|
|
97
103
|
|
|
104
|
+
// C3: when a page range is requested, capture per-page text so we can
|
|
105
|
+
// return exactly pages [start..end] (pdf-parse otherwise concatenates the
|
|
106
|
+
// whole document and its `max` option only caps the *upper* page bound).
|
|
107
|
+
const pageRange = processingOptions.pageRange;
|
|
108
|
+
const capturedPages = [];
|
|
109
|
+
|
|
98
110
|
// Parse PDF with options
|
|
99
111
|
const parseOptions = {
|
|
100
112
|
...processingOptions.parseOptions,
|
|
101
113
|
max: processingOptions.maxPages
|
|
102
114
|
};
|
|
103
115
|
|
|
116
|
+
// If extracting a range, raise `max` to at least the requested end page
|
|
117
|
+
// and install a pagerender that records each page's text.
|
|
118
|
+
if (pageRange) {
|
|
119
|
+
if (pageRange.end) {
|
|
120
|
+
parseOptions.max = Math.max(parseOptions.max, pageRange.end);
|
|
121
|
+
} else {
|
|
122
|
+
parseOptions.max = processingOptions.maxPages;
|
|
123
|
+
}
|
|
124
|
+
parseOptions.pagerender = (pageData) => this._renderPage(pageData, capturedPages);
|
|
125
|
+
}
|
|
126
|
+
|
|
104
127
|
if (processingOptions.password) {
|
|
105
128
|
parseOptions.password = processingOptions.password;
|
|
106
129
|
}
|
|
@@ -118,7 +141,15 @@ export class PDFProcessor {
|
|
|
118
141
|
|
|
119
142
|
// Extract text content
|
|
120
143
|
if (processingOptions.extractText) {
|
|
121
|
-
|
|
144
|
+
if (pageRange) {
|
|
145
|
+
const start = pageRange.start || 1;
|
|
146
|
+
const end = pageRange.end || capturedPages.length;
|
|
147
|
+
const slice = capturedPages.slice(start - 1, end);
|
|
148
|
+
result.text = this.cleanPDFText(slice.join('\n\n'));
|
|
149
|
+
result.extractedPages = { start, end, count: slice.length };
|
|
150
|
+
} else {
|
|
151
|
+
result.text = this.cleanPDFText(pdfData.text);
|
|
152
|
+
}
|
|
122
153
|
}
|
|
123
154
|
|
|
124
155
|
// Extract metadata
|
|
@@ -414,34 +445,52 @@ export class PDFProcessor {
|
|
|
414
445
|
}
|
|
415
446
|
|
|
416
447
|
/**
|
|
417
|
-
*
|
|
418
|
-
*
|
|
419
|
-
*
|
|
448
|
+
* Render a single PDF page to text and record it.
|
|
449
|
+
* Mirrors pdf-parse's default render (newline on Y-position change) but
|
|
450
|
+
* accumulates per-page text so callers can slice a true page range.
|
|
451
|
+
* Note: like pdf-parse, this does not reconstruct multi-column / table
|
|
452
|
+
* layout — column order follows the PDF's text-item stream.
|
|
453
|
+
* @param {Object} pageData - pdf.js page proxy from pdf-parse
|
|
454
|
+
* @param {string[]} sink - array that receives this page's text
|
|
455
|
+
* @returns {Promise<string>}
|
|
456
|
+
*/
|
|
457
|
+
async _renderPage(pageData, sink) {
|
|
458
|
+
const textContent = await pageData.getTextContent({
|
|
459
|
+
normalizeWhitespace: true,
|
|
460
|
+
disableCombineTextItems: false
|
|
461
|
+
});
|
|
462
|
+
let lastY;
|
|
463
|
+
let text = '';
|
|
464
|
+
for (const item of textContent.items) {
|
|
465
|
+
if (lastY === item.transform[5] || lastY === undefined) {
|
|
466
|
+
text += item.str;
|
|
467
|
+
} else {
|
|
468
|
+
text += '\n' + item.str;
|
|
469
|
+
}
|
|
470
|
+
lastY = item.transform[5];
|
|
471
|
+
}
|
|
472
|
+
sink.push(text);
|
|
473
|
+
// pdf-parse joins page renders with '\n\n' for pdfData.text
|
|
474
|
+
return text;
|
|
475
|
+
}
|
|
476
|
+
|
|
477
|
+
/**
|
|
478
|
+
* Extract a specific page range from a PDF (1-based, inclusive).
|
|
479
|
+
* @param {Object} params - Processing parameters
|
|
480
|
+
* @param {number} [params.startPage=1] - First page to include
|
|
481
|
+
* @param {number} [params.endPage] - Last page to include (defaults to end)
|
|
482
|
+
* @returns {Promise<Object>} - Processing result for the requested pages
|
|
420
483
|
*/
|
|
421
484
|
async extractPDFPages(params) {
|
|
422
485
|
const { startPage = 1, endPage, ...processingParams } = params;
|
|
423
|
-
|
|
424
|
-
// Override parse options to limit page range
|
|
425
|
-
const options = {
|
|
426
|
-
...processingParams.options,
|
|
427
|
-
parseOptions: {
|
|
428
|
-
...processingParams.options?.parseOptions,
|
|
429
|
-
max: endPage || processingParams.options?.maxPages || 100
|
|
430
|
-
}
|
|
431
|
-
};
|
|
432
486
|
|
|
433
|
-
|
|
487
|
+
return this.processPDF({
|
|
434
488
|
...processingParams,
|
|
435
|
-
options
|
|
489
|
+
options: {
|
|
490
|
+
...processingParams.options,
|
|
491
|
+
pageRange: { start: startPage, ...(endPage ? { end: endPage } : {}) }
|
|
492
|
+
}
|
|
436
493
|
});
|
|
437
|
-
|
|
438
|
-
if (result.success && result.text && startPage > 1) {
|
|
439
|
-
// This is a simplified approach - pdf-parse doesn't provide per-page text
|
|
440
|
-
// For proper page-by-page extraction, consider using pdf2pic or pdf-poppler
|
|
441
|
-
console.warn('Page-specific extraction is limited with current PDF parser');
|
|
442
|
-
}
|
|
443
|
-
|
|
444
|
-
return result;
|
|
445
494
|
}
|
|
446
495
|
}
|
|
447
496
|
|
|
@@ -121,8 +121,16 @@ const ScrapeWithActionsSchema = z.object({
|
|
|
121
121
|
captureIntermediateStates: z.boolean().default(false),
|
|
122
122
|
captureScreenshots: z.boolean().default(true),
|
|
123
123
|
|
|
124
|
-
// Form auto-fill
|
|
125
|
-
|
|
124
|
+
// Form auto-fill — structured shape ({fields:[{selector,value,...}], submitSelector, waitAfterSubmit}).
|
|
125
|
+
// A flat z.record(string) of selector→value is still accepted for backward compatibility.
|
|
126
|
+
formAutoFill: z.union([
|
|
127
|
+
z.object({
|
|
128
|
+
fields: z.array(FormFieldSchema),
|
|
129
|
+
submitSelector: z.string().optional(),
|
|
130
|
+
waitAfterSubmit: z.number().min(0).max(30000).default(2000)
|
|
131
|
+
}),
|
|
132
|
+
z.record(z.string())
|
|
133
|
+
]).optional(),
|
|
126
134
|
|
|
127
135
|
// Browser options
|
|
128
136
|
browserOptions: z.object({
|
|
@@ -386,8 +394,9 @@ export class ScrapeWithActionsTool extends EventEmitter {
|
|
|
386
394
|
const intermediateStates = params.captureIntermediateStates ?
|
|
387
395
|
await this.extractIntermediateStates(actionResults, params) : [];
|
|
388
396
|
|
|
389
|
-
// Get final page content after all actions
|
|
390
|
-
|
|
397
|
+
// Get final page content after all actions (reads the post-action live page
|
|
398
|
+
// captured by ActionExecutor, falling back to a fresh fetch only if missing).
|
|
399
|
+
const finalContent = await this.extractFinalContent(params, chainResult);
|
|
391
400
|
|
|
392
401
|
// Generate different formats
|
|
393
402
|
const content = this.generateFormats(finalContent, params.formats, {
|
|
@@ -446,21 +455,37 @@ export class ScrapeWithActionsTool extends EventEmitter {
|
|
|
446
455
|
|
|
447
456
|
insertFormAutoFillActions(actions, formAutoFill) {
|
|
448
457
|
const fillActions = [];
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
458
|
+
|
|
459
|
+
if (Array.isArray(formAutoFill.fields)) {
|
|
460
|
+
// Structured shape: { fields: [{selector, value, type, waitAfter}], submitSelector, waitAfterSubmit }
|
|
461
|
+
for (const field of formAutoFill.fields) {
|
|
462
|
+
fillActions.push({
|
|
463
|
+
type: 'type',
|
|
464
|
+
selector: field.selector,
|
|
465
|
+
text: field.value,
|
|
466
|
+
description: `Auto-fill field: ${field.selector}`,
|
|
467
|
+
continueOnError: true,
|
|
468
|
+
retries: 1
|
|
469
|
+
});
|
|
470
|
+
if (field.waitAfter) {
|
|
471
|
+
fillActions.push({ type: 'wait', duration: field.waitAfter });
|
|
472
|
+
}
|
|
473
|
+
}
|
|
474
|
+
} else {
|
|
475
|
+
// Backward-compatible flat shape: { selector: value, ... }
|
|
476
|
+
for (const [selector, value] of Object.entries(formAutoFill)) {
|
|
477
|
+
if (selector === 'submitSelector' || selector === 'waitAfterSubmit' || selector === 'fields') {
|
|
478
|
+
continue; // Skip special keys
|
|
479
|
+
}
|
|
480
|
+
fillActions.push({
|
|
481
|
+
type: 'type',
|
|
482
|
+
selector,
|
|
483
|
+
text: value,
|
|
484
|
+
description: `Auto-fill field: ${selector}`,
|
|
485
|
+
continueOnError: true,
|
|
486
|
+
retries: 1
|
|
487
|
+
});
|
|
454
488
|
}
|
|
455
|
-
|
|
456
|
-
fillActions.push({
|
|
457
|
-
type: 'type',
|
|
458
|
-
selector,
|
|
459
|
-
text: value,
|
|
460
|
-
description: `Auto-fill field: ${selector}`,
|
|
461
|
-
continueOnError: true,
|
|
462
|
-
retries: 1
|
|
463
|
-
});
|
|
464
489
|
}
|
|
465
490
|
|
|
466
491
|
// Add submit action if specified
|
|
@@ -585,16 +610,29 @@ export class ScrapeWithActionsTool extends EventEmitter {
|
|
|
585
610
|
return states;
|
|
586
611
|
}
|
|
587
612
|
|
|
588
|
-
async extractFinalContent(params) {
|
|
613
|
+
async extractFinalContent(params, chainResult = null) {
|
|
589
614
|
try {
|
|
615
|
+
const options = {
|
|
616
|
+
includeMetadata: params.extractionOptions?.includeMetadata !== false,
|
|
617
|
+
includeLinks: params.extractionOptions?.includeLinks !== false,
|
|
618
|
+
includeImages: params.extractionOptions?.includeImages !== false,
|
|
619
|
+
customSelectors: params.extractionOptions?.selectors
|
|
620
|
+
};
|
|
621
|
+
|
|
622
|
+
// Prefer the post-action live page HTML captured during action execution.
|
|
623
|
+
// This ensures the final content reflects clicks/typing/navigation rather
|
|
624
|
+
// than re-fetching the original (pre-action) URL.
|
|
625
|
+
if (chainResult?.finalHtml) {
|
|
626
|
+
return await this.extractContentTool.execute({
|
|
627
|
+
url: chainResult.finalUrl || params.url,
|
|
628
|
+
html: chainResult.finalHtml,
|
|
629
|
+
options
|
|
630
|
+
});
|
|
631
|
+
}
|
|
632
|
+
|
|
590
633
|
const extractResult = await this.extractContentTool.execute({
|
|
591
634
|
url: params.url,
|
|
592
|
-
options
|
|
593
|
-
includeMetadata: params.extractionOptions?.includeMetadata !== false,
|
|
594
|
-
includeLinks: params.extractionOptions?.includeLinks !== false,
|
|
595
|
-
includeImages: params.extractionOptions?.includeImages !== false,
|
|
596
|
-
customSelectors: params.extractionOptions?.selectors
|
|
597
|
-
}
|
|
635
|
+
options
|
|
598
636
|
});
|
|
599
637
|
|
|
600
638
|
return extractResult;
|
|
@@ -161,7 +161,9 @@ export class BatchScrapeTool extends EventEmitter {
|
|
|
161
161
|
this.stats.lastUpdated = Date.now();
|
|
162
162
|
this.activeBatches.delete(batchId);
|
|
163
163
|
|
|
164
|
-
|
|
164
|
+
// C3: include webhook delivery status in the result
|
|
165
|
+
const webhookStatus = await sendWebhookNotification('batch_completed', batchResult, webhookConfig, this.webhookDispatcher, this.enableWebhookNotifications);
|
|
166
|
+
if (webhookStatus) batchResult.webhookDelivery = webhookStatus;
|
|
165
167
|
this.emit('batchCompleted', batchResult);
|
|
166
168
|
return batchResult;
|
|
167
169
|
} catch (error) {
|
|
@@ -5,14 +5,16 @@
|
|
|
5
5
|
|
|
6
6
|
/**
|
|
7
7
|
* Send a batch event via the webhookDispatcher.
|
|
8
|
+
* C3: returns a delivery status object so callers can include it in the result.
|
|
8
9
|
* @param {string} event
|
|
9
10
|
* @param {Object} data
|
|
10
11
|
* @param {Object} webhookConfig
|
|
11
12
|
* @param {Object} webhookDispatcher
|
|
12
13
|
* @param {boolean} enabled
|
|
14
|
+
* @returns {Promise<{queued: boolean, url?: string, error?: string}|null>}
|
|
13
15
|
*/
|
|
14
16
|
export async function sendWebhookNotification(event, data, webhookConfig, webhookDispatcher, enabled) {
|
|
15
|
-
if (!enabled || !webhookConfig || !webhookDispatcher) return;
|
|
17
|
+
if (!enabled || !webhookConfig || !webhookDispatcher) return null;
|
|
16
18
|
|
|
17
19
|
try {
|
|
18
20
|
await webhookDispatcher.dispatch(event, data, {
|
|
@@ -20,7 +22,9 @@ export async function sendWebhookNotification(event, data, webhookConfig, webhoo
|
|
|
20
22
|
immediate: false,
|
|
21
23
|
metadata: { batchId: data.batchId, timestamp: Date.now() }
|
|
22
24
|
});
|
|
25
|
+
return { queued: true, url: webhookConfig.url };
|
|
23
26
|
} catch (error) {
|
|
24
27
|
console.warn(`[batchScrape] Webhook notification failed: ${error.message}`);
|
|
28
|
+
return { queued: false, url: webhookConfig.url, error: error.message };
|
|
25
29
|
}
|
|
26
30
|
}
|
|
@@ -111,7 +111,6 @@ function generateFormats($, html, formats) {
|
|
|
111
111
|
function buildMarkdown($) {
|
|
112
112
|
let md = '';
|
|
113
113
|
const title = $('title').text().trim();
|
|
114
|
-
if (title) md += `# ${title}\n\n`;
|
|
115
114
|
|
|
116
115
|
const selectors = ['article', 'main', '.content', '#content', '.post-content', '.entry-content'];
|
|
117
116
|
let $body = null;
|
|
@@ -121,6 +120,12 @@ function buildMarkdown($) {
|
|
|
121
120
|
}
|
|
122
121
|
if (!$body || $body.length === 0) $body = $('body');
|
|
123
122
|
|
|
123
|
+
// C3: de-dup title — only emit the <title> heading if the page has no <h1>
|
|
124
|
+
// or if the first <h1> text differs from the <title> text (case-insensitive).
|
|
125
|
+
const firstH1 = $body.find('h1').first().text().trim();
|
|
126
|
+
const titleDuplicated = firstH1 && firstH1.toLowerCase() === title.toLowerCase();
|
|
127
|
+
if (title && !titleDuplicated) md += `# ${title}\n\n`;
|
|
128
|
+
|
|
124
129
|
$body.find('h1').each((_, el) => { md += `# ${$(el).text().trim()}\n\n`; });
|
|
125
130
|
$body.find('h2').each((_, el) => { md += `## ${$(el).text().trim()}\n\n`; });
|
|
126
131
|
$body.find('h3').each((_, el) => { md += `### ${$(el).text().trim()}\n\n`; });
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* agent tool — NL prompt → autonomous search/navigate/extract → answer.
|
|
3
|
+
*
|
|
4
|
+
* Wraps AgentOrchestrator for MCP registration.
|
|
5
|
+
* Mirrors the setMcpServer pattern from extractStructured.js.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import { z } from 'zod';
|
|
9
|
+
import { AgentOrchestrator } from '../../core/AgentOrchestrator.js';
|
|
10
|
+
import { ElicitationHelper } from '../../core/ElicitationHelper.js';
|
|
11
|
+
import { getToolConfig } from '../../constants/config.js';
|
|
12
|
+
|
|
13
|
+
export const AgentInputSchema = z.object({
|
|
14
|
+
prompt: z.string().min(1).max(2000).describe('Natural-language task or question'),
|
|
15
|
+
urls: z.array(z.string().url()).max(20).optional().describe('Optional seed URLs to include (max 20)'),
|
|
16
|
+
schema: z.record(z.any()).optional().describe('Optional JSON schema for structured output'),
|
|
17
|
+
model: z.enum(['default', 'pro']).optional().default('default').describe('"default" = SamplingClient loop; "pro" = full ResearchOrchestrator'),
|
|
18
|
+
maxSteps: z.number().min(1).max(10).optional().default(5).describe('Max fetch iterations (hard cap: 10)'),
|
|
19
|
+
maxUrls: z.number().min(1).max(20).optional().default(10).describe('Max URLs to fetch (hard cap: 20)')
|
|
20
|
+
});
|
|
21
|
+
|
|
22
|
+
export class AgentTool {
|
|
23
|
+
constructor(options = {}) {
|
|
24
|
+
this._orchestrator = new AgentOrchestrator({
|
|
25
|
+
mcpServer: null,
|
|
26
|
+
searchConfig: getToolConfig('search_web') || {},
|
|
27
|
+
llmConfig: options.llmConfig || {}
|
|
28
|
+
});
|
|
29
|
+
this._elicitation = new ElicitationHelper({});
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
/** Wire MCP server for SamplingClient + Elicitation (called from server.js). */
|
|
33
|
+
setMcpServer(mcpServer) {
|
|
34
|
+
this._orchestrator.setMcpServer(mcpServer);
|
|
35
|
+
this._elicitation = new ElicitationHelper({ mcpServer });
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
async execute(params) {
|
|
39
|
+
const validated = AgentInputSchema.parse(params);
|
|
40
|
+
|
|
41
|
+
// Request confirmation before a pro run (expensive)
|
|
42
|
+
if (validated.model === 'pro') {
|
|
43
|
+
const proceed = await this._elicitation.confirm(
|
|
44
|
+
'agent tool: pro model uses ResearchOrchestrator and may incur significant costs.',
|
|
45
|
+
{ model: 'pro', maxUrls: validated.maxUrls, note: 'External LLM API costs billed separately if keys are set.' }
|
|
46
|
+
);
|
|
47
|
+
if (!proceed) {
|
|
48
|
+
return {
|
|
49
|
+
success: false,
|
|
50
|
+
cancelled: true,
|
|
51
|
+
reason: 'User cancelled pro agent run.'
|
|
52
|
+
};
|
|
53
|
+
}
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
return this._orchestrator.run({
|
|
57
|
+
prompt: validated.prompt,
|
|
58
|
+
urls: validated.urls,
|
|
59
|
+
schema: validated.schema,
|
|
60
|
+
model: validated.model,
|
|
61
|
+
maxSteps: validated.maxSteps,
|
|
62
|
+
maxUrls: validated.maxUrls
|
|
63
|
+
});
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
async destroy() {
|
|
67
|
+
await this._orchestrator.destroy();
|
|
68
|
+
}
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
export default AgentTool;
|