brave-real-browser-mcp-server 2.24.4 → 2.25.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/handlers/advanced-tools.js +516 -12
- package/dist/index.js +8 -1
- package/dist/tool-definitions.js +112 -0
- package/package.json +4 -2
|
@@ -586,22 +586,78 @@ export async function handleDeepAnalysis(page, args) {
|
|
|
586
586
|
}
|
|
587
587
|
/**
|
|
588
588
|
* Record full network traffic - Uses response events to avoid crashes
|
|
589
|
+
* ULTRA POWERFUL: API detection, media URLs, smart categorization
|
|
589
590
|
*/
|
|
590
591
|
export async function handleNetworkRecorder(page, args) {
|
|
591
592
|
const requests = [];
|
|
592
593
|
const duration = args.duration || 10000;
|
|
593
594
|
let totalSize = 0;
|
|
595
|
+
const categories = {};
|
|
596
|
+
const apis = [];
|
|
597
|
+
const mediaUrls = [];
|
|
598
|
+
const seen = new Set();
|
|
599
|
+
// ============================================================
|
|
600
|
+
// SMART CATEGORIZATION HELPER
|
|
601
|
+
// ============================================================
|
|
602
|
+
/**
 * Classify a network response into a coarse category.
 *
 * URL-based rules run first (API patterns, then file extensions); the
 * browser-reported resource type is the fallback for each category.
 * Extensions only count when they terminate a URL component, i.e. they are
 * followed by `?`, `#`, `&` or the end of the string. Without that boundary
 * `.ts` matched hosts such as `www.tsn.ca` and `.png` matched `pngtree.com`.
 *
 * @param {string} url - Absolute URL of the response.
 * @param {string} resourceType - Resource type reported for the request.
 * @returns {'api'|'media'|'image'|'script'|'style'|'font'|'xhr'|'document'|'other'}
 */
const categorizeUrl = (url, resourceType) => {
    // API endpoints
    if (/\/api\/|\/v\d+\/|\.json(\?|$)|graphql/i.test(url)) {
        return 'api';
    }
    // Media
    if (/\.(mp4|webm|m3u8|ts|mp3|flac|ogg)(?=[?#&]|$)/i.test(url)) {
        return 'media';
    }
    if (resourceType === 'media' || resourceType === 'video' || resourceType === 'audio') {
        return 'media';
    }
    // Images
    if (/\.(jpg|jpeg|png|gif|webp|svg|ico)(?=[?#&]|$)/i.test(url) || resourceType === 'image') {
        return 'image';
    }
    // Scripts
    if (/\.js(?=[?#&]|$)/i.test(url) || resourceType === 'script') {
        return 'script';
    }
    // Styles
    if (/\.css(?=[?#&]|$)/i.test(url) || resourceType === 'stylesheet') {
        return 'style';
    }
    // Fonts
    if (/\.(woff2?|ttf|eot|otf)(?=[?#&]|$)/i.test(url) || resourceType === 'font') {
        return 'font';
    }
    // XHR/Fetch that did not look like an API endpoint by URL
    if (resourceType === 'xhr' || resourceType === 'fetch') {
        return 'xhr';
    }
    // Documents
    if (resourceType === 'document') {
        return 'document';
    }
    return 'other';
};
|
|
594
632
|
// Response handler - safer than request interception
|
|
595
633
|
const responseHandler = (response) => {
|
|
596
634
|
try {
|
|
597
635
|
const url = response.url();
|
|
636
|
+
// Dedup
|
|
637
|
+
if (seen.has(url))
|
|
638
|
+
return;
|
|
639
|
+
seen.add(url);
|
|
598
640
|
if (args.filterUrl && !url.includes(args.filterUrl)) {
|
|
599
641
|
return;
|
|
600
642
|
}
|
|
643
|
+
const resourceType = response.request()?.resourceType?.() || 'unknown';
|
|
644
|
+
const method = response.request()?.method?.() || 'GET';
|
|
645
|
+
const category = categorizeUrl(url, resourceType);
|
|
646
|
+
categories[category] = (categories[category] || 0) + 1;
|
|
647
|
+
// Collect API endpoints
|
|
648
|
+
if (category === 'api' || resourceType === 'xhr' || resourceType === 'fetch') {
|
|
649
|
+
apis.push({ url, method, type: resourceType });
|
|
650
|
+
}
|
|
651
|
+
// Collect media URLs
|
|
652
|
+
if (category === 'media' || /\.(mp4|webm|m3u8|ts|mp3)/i.test(url)) {
|
|
653
|
+
mediaUrls.push(url);
|
|
654
|
+
}
|
|
601
655
|
const entry = {
|
|
602
656
|
url,
|
|
603
657
|
status: response.status(),
|
|
604
|
-
resourceType
|
|
658
|
+
resourceType,
|
|
659
|
+
category,
|
|
660
|
+
method,
|
|
605
661
|
timestamp: Date.now(),
|
|
606
662
|
};
|
|
607
663
|
if (args.includeHeaders) {
|
|
@@ -612,7 +668,6 @@ export async function handleNetworkRecorder(page, args) {
|
|
|
612
668
|
entry.headers = {};
|
|
613
669
|
}
|
|
614
670
|
}
|
|
615
|
-
// Note: Response body requires async handling, skip for stability
|
|
616
671
|
requests.push(entry);
|
|
617
672
|
// Track size from headers
|
|
618
673
|
try {
|
|
@@ -647,6 +702,10 @@ export async function handleNetworkRecorder(page, args) {
|
|
|
647
702
|
requests: requests.slice(0, 500),
|
|
648
703
|
count: requests.length,
|
|
649
704
|
totalSize,
|
|
705
|
+
categories,
|
|
706
|
+
apis: apis.length > 0 ? apis : undefined,
|
|
707
|
+
mediaUrls: mediaUrls.length > 0 ? mediaUrls : undefined,
|
|
708
|
+
message: `📡 Recorded ${requests.length} requests (${Math.round(totalSize / 1024)}KB) | APIs: ${apis.length} | Media: ${mediaUrls.length}`
|
|
650
709
|
};
|
|
651
710
|
}
|
|
652
711
|
/**
|
|
@@ -776,6 +835,7 @@ export async function handleAdProtectionDetector(page, args) {
|
|
|
776
835
|
}
|
|
777
836
|
/**
|
|
778
837
|
* Wait for dynamic AJAX loading
|
|
838
|
+
* ULTRA POWERFUL: Infinite scroll, lazy load, mutation observer
|
|
779
839
|
*/
|
|
780
840
|
export async function handleAjaxContentWaiter(page, args) {
|
|
781
841
|
const timeout = args.timeout || 30000;
|
|
@@ -783,6 +843,79 @@ export async function handleAjaxContentWaiter(page, args) {
|
|
|
783
843
|
const startTime = Date.now();
|
|
784
844
|
let content;
|
|
785
845
|
let loaded = false;
|
|
846
|
+
let newElementsCount = 0;
|
|
847
|
+
let scrollDepth = 0;
|
|
848
|
+
// ============================================================
|
|
849
|
+
// 1. MUTATION OBSERVER: Track DOM changes in real-time
|
|
850
|
+
// ============================================================
|
|
851
|
+
/**
 * Watch the page's <body> for two seconds and report how much the DOM moved.
 *
 * Runs entirely inside the page: a MutationObserver counts added nodes and
 * attribute/character-data mutations, then disconnects when the timer fires.
 *
 * @returns {Promise<{added: number, modified: number}>}
 */
const setupMutationObserver = async () => {
    // Self-contained so it can be shipped to the page context as-is.
    const observeInPage = () => new Promise((done) => {
        const tally = { added: 0, modified: 0 };
        const watcher = new MutationObserver((records) => {
            for (const record of records) {
                tally.added += record.addedNodes.length;
                if (record.type === 'attributes' || record.type === 'characterData') {
                    tally.modified += 1;
                }
            }
        });
        watcher.observe(document.body, {
            childList: true,
            subtree: true,
            attributes: true,
            characterData: true,
        });
        // Fixed 2 second observation window
        setTimeout(() => {
            watcher.disconnect();
            done({ added: tally.added, modified: tally.modified });
        }, 2000);
    });
    return await page.evaluate(observeInPage);
};
|
|
877
|
+
// ============================================================
|
|
878
|
+
// 2. INFINITE SCROLL DETECTION
|
|
879
|
+
// ============================================================
|
|
880
|
+
/**
 * Scroll to the bottom of the page once and measure what changed.
 *
 * Document height and total element count are sampled before the scroll and
 * again one second after it.
 *
 * @returns {Promise<{scrolled: boolean, newElements: number, scrollDepth: number}>}
 *   `scrolled` is true when the document grew taller, `newElements` is the
 *   element-count delta, `scrollDepth` is the document height after scrolling.
 */
const handleInfiniteScroll = async () => {
    const readHeight = () => page.evaluate(() => document.body.scrollHeight);
    const countNodes = () => page.evaluate(() => document.querySelectorAll('*').length);

    const heightBefore = await readHeight();
    const nodesBefore = await countNodes();

    // Jump to the bottom, then give the page a second to fetch more content
    await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
    await new Promise((wake) => setTimeout(wake, 1000));

    const heightAfter = await readHeight();
    const nodesAfter = await countNodes();

    return {
        scrolled: heightAfter > heightBefore,
        newElements: nodesAfter - nodesBefore,
        scrollDepth: heightAfter,
    };
};
|
|
895
|
+
// ============================================================
|
|
896
|
+
// 3. LAZY LOAD DETECTION
|
|
897
|
+
// ============================================================
|
|
898
|
+
/**
 * Collect the deferred source URLs of lazy-loaded elements on the page.
 *
 * Looks at elements matching common lazy-load markers (`data-src`,
 * `data-lazy`, `loading="lazy"`, `.lazy`, `.lazyload`) and reads their
 * `data-src` (preferred) or `data-lazy` attribute.
 *
 * Each URL is reported once, in first-seen order. The former second pass over
 * `img[data-src], img.lazy` matched only elements the first selector already
 * covers, so it did nothing except list every lazy image twice.
 *
 * @returns {Promise<string[]>} Unique deferred source URLs.
 */
const detectLazyLoad = async () => {
    return await page.evaluate(() => {
        const sources = new Set();
        const lazySelector = '[data-src], [data-lazy], [loading="lazy"], .lazy, .lazyload';
        document.querySelectorAll(lazySelector).forEach((el) => {
            const deferredSrc = el.getAttribute('data-src') || el.getAttribute('data-lazy');
            if (deferredSrc) {
                sources.add(deferredSrc);
            }
        });
        return Array.from(sources);
    });
};
|
|
916
|
+
// ============================================================
|
|
917
|
+
// 4. MAIN WAITING LOGIC
|
|
918
|
+
// ============================================================
|
|
786
919
|
while (Date.now() - startTime < timeout) {
|
|
787
920
|
if (args.selector) {
|
|
788
921
|
const element = await page.$(args.selector);
|
|
@@ -795,17 +928,35 @@ export async function handleAjaxContentWaiter(page, args) {
|
|
|
795
928
|
}
|
|
796
929
|
}
|
|
797
930
|
else {
|
|
798
|
-
//
|
|
799
|
-
|
|
800
|
-
|
|
801
|
-
|
|
931
|
+
// Smart waiting: Check for ongoing activity
|
|
932
|
+
const mutationResult = await setupMutationObserver();
|
|
933
|
+
newElementsCount = mutationResult.added;
|
|
934
|
+
if (mutationResult.added === 0 && mutationResult.modified === 0) {
|
|
935
|
+
// No DOM changes, content likely loaded
|
|
936
|
+
loaded = true;
|
|
937
|
+
break;
|
|
938
|
+
}
|
|
939
|
+
}
|
|
940
|
+
// Try infinite scroll to load more content
|
|
941
|
+
const scrollResult = await handleInfiniteScroll();
|
|
942
|
+
if (scrollResult.scrolled) {
|
|
943
|
+
scrollDepth = scrollResult.scrollDepth;
|
|
944
|
+
newElementsCount += scrollResult.newElements;
|
|
802
945
|
}
|
|
803
946
|
await new Promise((r) => setTimeout(r, pollInterval));
|
|
804
947
|
}
|
|
948
|
+
// Detect any lazy-loaded content
|
|
949
|
+
const lazyElements = await detectLazyLoad();
|
|
805
950
|
return {
|
|
806
951
|
loaded,
|
|
807
952
|
waitTime: Date.now() - startTime,
|
|
808
953
|
content,
|
|
954
|
+
newElementsCount,
|
|
955
|
+
scrollDepth,
|
|
956
|
+
lazyElements: lazyElements.length > 0 ? lazyElements : undefined,
|
|
957
|
+
message: loaded
|
|
958
|
+
? `✅ Content loaded in ${Date.now() - startTime}ms (${newElementsCount} new elements, scroll: ${scrollDepth}px)`
|
|
959
|
+
: `⏱️ Timeout after ${timeout}ms`
|
|
809
960
|
};
|
|
810
961
|
}
|
|
811
962
|
/**
|
|
@@ -1002,20 +1153,124 @@ export async function handleVideoRecording(page, args, recorderState) {
|
|
|
1002
1153
|
}
|
|
1003
1154
|
/**
|
|
1004
1155
|
* Harvest all links from page
|
|
1156
|
+
* ULTRA POWERFUL: Pagination detection, smart categorization, file types
|
|
1005
1157
|
*/
|
|
1006
1158
|
export async function handleLinkHarvester(page, args) {
|
|
1007
1159
|
const currentUrl = new URL(page.url());
|
|
1008
|
-
|
|
1009
|
-
|
|
1010
|
-
|
|
1011
|
-
|
|
1012
|
-
|
|
1013
|
-
|
|
1160
|
+
// ============================================================
|
|
1161
|
+
// 1. EXTRACT ALL LINKS WITH SMART CATEGORIZATION
|
|
1162
|
+
// ============================================================
|
|
1163
|
+
const allLinks = await page.evaluate(() => {
|
|
1164
|
+
const links = [];
|
|
1165
|
+
document.querySelectorAll('a[href]').forEach((a) => {
|
|
1166
|
+
const anchor = a;
|
|
1167
|
+
links.push({
|
|
1168
|
+
url: anchor.href,
|
|
1169
|
+
text: a.textContent?.trim()?.substring(0, 100) || '',
|
|
1170
|
+
attrs: {
|
|
1171
|
+
rel: anchor.rel || '',
|
|
1172
|
+
target: anchor.target || '',
|
|
1173
|
+
class: anchor.className || '',
|
|
1174
|
+
id: anchor.id || '',
|
|
1175
|
+
download: anchor.download || '',
|
|
1176
|
+
}
|
|
1177
|
+
});
|
|
1178
|
+
});
|
|
1179
|
+
return links;
|
|
1180
|
+
});
|
|
1181
|
+
// ============================================================
|
|
1182
|
+
// 2. PAGINATION DETECTION
|
|
1183
|
+
// ============================================================
|
|
1184
|
+
// Detect next/previous page links and the highest visible page number.
// Resolves to { nextPage, prevPage, totalPages }; each field is undefined
// when nothing matching was found.
const pagination = await page.evaluate(() => {
    // Lookup by CSS selector.
    const bySelector = (selector) => () => document.querySelector(selector);
    // Lookup by link text. `:has-text()` is not CSS, so querySelector rejected
    // it and the text-based candidates never ran; match anchor text directly.
    const byText = (pattern) => () => Array.from(document.querySelectorAll('a[href]'))
        .find((a) => pattern.test((a.textContent || '').trim()));

    // Candidates are tried in order; the first element with an href wins.
    const nextFinders = [
        bySelector('a[rel="next"]'), bySelector('a.next'), bySelector('a.pagination-next'),
        bySelector('[aria-label="Next"]'), byText(/\bnext\b/i), byText(/^>$/),
        bySelector('.pagination a:last-child'), bySelector('a.page-link:last-child'),
    ];
    const prevFinders = [
        bySelector('a[rel="prev"]'), bySelector('a.prev'), bySelector('a.pagination-prev'),
        bySelector('[aria-label="Previous"]'), byText(/\bprev(ious)?\b/i), byText(/^<$/),
    ];

    const firstHref = (finders) => {
        for (const find of finders) {
            try {
                const el = find();
                if (el?.href) {
                    return el.href;
                }
            }
            catch { /* selector not supported by this browser */ }
        }
        return undefined;
    };

    const nextPage = firstHref(nextFinders);
    const prevPage = firstHref(prevFinders);

    // Count page numbers. Only links whose whole text is a number qualify, so
    // a nav link such as "2024 Archive" is not mistaken for page 2024.
    const pageNumbers = Array.from(document.querySelectorAll('.pagination a, .page-numbers a, nav a'))
        .map((a) => (a.textContent || '').trim())
        .filter((label) => /^\d+$/.test(label))
        .map((label) => parseInt(label, 10))
        .filter((n) => n > 0);
    const totalPages = pageNumbers.length > 0 ? Math.max(...pageNumbers) : undefined;

    return { nextPage, prevPage, totalPages };
});
|
|
1227
|
+
// ============================================================
|
|
1228
|
+
// 3. SMART LINK CATEGORIZATION
|
|
1229
|
+
// ============================================================
|
|
1230
|
+
/**
 * Assign a harvested link to a coarse category.
 *
 * Rules are checked in priority order: file type by extension, explicit
 * `download` attribute, pagination, social networks, auth, search, info
 * pages; anything else is plain navigation.
 *
 * File extensions are matched with the `#fragment` removed, so
 * `manual.pdf#page=2` is still recognised as a document.
 *
 * @param {string} url - Absolute link URL.
 * @param {string} text - Visible link text.
 * @param {{download?: string}} [attrs] - Anchor attributes; only `download` is read.
 * @returns {'document'|'video'|'audio'|'image'|'download'|'pagination'|'social'|'auth'|'search'|'info'|'navigation'}
 */
const categorizeLink = (url, text, attrs) => {
    const textLower = String(text ?? '').toLowerCase();
    // Fragments never change which file is fetched; drop them before looking
    // at the extension (the optional query string is handled by the regexes).
    const fileUrl = url.split('#', 1)[0];
    // File downloads
    if (/\.(pdf|doc|docx|xls|xlsx|zip|rar|7z|tar|gz)(\?.*)?$/i.test(fileUrl)) {
        return 'document';
    }
    if (/\.(mp4|mkv|avi|mov|webm|flv)(\?.*)?$/i.test(fileUrl)) {
        return 'video';
    }
    if (/\.(mp3|wav|flac|aac|ogg)(\?.*)?$/i.test(fileUrl)) {
        return 'audio';
    }
    if (/\.(jpg|jpeg|png|gif|webp|svg|bmp)(\?.*)?$/i.test(fileUrl)) {
        return 'image';
    }
    if (attrs?.download) {
        return 'download';
    }
    // Navigation
    if (/\/(next|page|p)\/\d+|[?&]page=\d+/i.test(url)) {
        return 'pagination';
    }
    if (textLower.includes('next') || textLower.includes('prev')) {
        return 'pagination';
    }
    // Social
    if (/facebook|twitter|instagram|linkedin|youtube|tiktok/i.test(url)) {
        return 'social';
    }
    // Common patterns
    if (/login|signin|sign-in|register|signup|sign-up/i.test(url)) {
        return 'auth';
    }
    if (/search|query|q=/i.test(url)) {
        return 'search';
    }
    if (/contact|about|faq|help/i.test(url)) {
        return 'info';
    }
    return 'navigation';
};
|
|
1014
1263
|
const processedLinks = [];
|
|
1264
|
+
const categories = {};
|
|
1265
|
+
const seen = new Set();
|
|
1015
1266
|
let internal = 0;
|
|
1016
1267
|
let external = 0;
|
|
1017
1268
|
for (const link of allLinks) {
|
|
1018
1269
|
try {
|
|
1270
|
+
// Dedup by URL
|
|
1271
|
+
if (seen.has(link.url))
|
|
1272
|
+
continue;
|
|
1273
|
+
seen.add(link.url);
|
|
1019
1274
|
const linkUrl = new URL(link.url);
|
|
1020
1275
|
const isInternal = linkUrl.hostname === currentUrl.hostname;
|
|
1021
1276
|
if (args.filter && !link.url.includes(args.filter) && !link.text.includes(args.filter)) {
|
|
@@ -1025,10 +1280,13 @@ export async function handleLinkHarvester(page, args) {
|
|
|
1025
1280
|
continue;
|
|
1026
1281
|
if (!isInternal && args.includeExternal === false)
|
|
1027
1282
|
continue;
|
|
1283
|
+
const category = categorizeLink(link.url, link.text, link.attrs);
|
|
1284
|
+
categories[category] = (categories[category] || 0) + 1;
|
|
1028
1285
|
processedLinks.push({
|
|
1029
1286
|
url: link.url,
|
|
1030
1287
|
text: link.text,
|
|
1031
1288
|
type: isInternal ? 'internal' : 'external',
|
|
1289
|
+
category,
|
|
1032
1290
|
});
|
|
1033
1291
|
if (isInternal)
|
|
1034
1292
|
internal++;
|
|
@@ -1045,6 +1303,10 @@ export async function handleLinkHarvester(page, args) {
|
|
|
1045
1303
|
links: processedLinks,
|
|
1046
1304
|
internal,
|
|
1047
1305
|
external,
|
|
1306
|
+
pagination: (pagination.nextPage || pagination.prevPage || pagination.totalPages) ? pagination : undefined,
|
|
1307
|
+
categories,
|
|
1308
|
+
message: `🔗 Found ${processedLinks.length} links (${internal} internal, ${external} external)` +
|
|
1309
|
+
(pagination.nextPage ? ` | Next: ${pagination.nextPage}` : '')
|
|
1048
1310
|
};
|
|
1049
1311
|
}
|
|
1050
1312
|
/**
|
|
@@ -2614,3 +2876,245 @@ export async function handleStreamExtractor(page, args) {
|
|
|
2614
2876
|
: 'No direct URLs found',
|
|
2615
2877
|
};
|
|
2616
2878
|
}
|
|
2879
|
+
/**
 * Crawl a set of start URLs with Crawlee's PuppeteerCrawler.
 *
 * Packages are loaded lazily. `crawlee` is required: when it cannot be
 * imported a failure result is returned. `brave-real-launcher` (Brave
 * executable path) and `brave-real-puppeteer-core` (launcher) are optional
 * and skipped when unavailable.
 *
 * Options read from `args` (defaults in parentheses): startUrls, maxDepth (3),
 * maxPages (50), concurrency (3), rateLimit (2 requests/second),
 * retryCount (3), timeout (30000 ms, applied to the request handler),
 * includePattern / excludePattern (case-insensitive regex sources),
 * proxyList, userAgent, headers, extractSelectors, followLinks (true),
 * downloadMedia + savePath (media URLs are collected into
 * `extractedData.mediaUrls`; nothing is written to disk here).
 * Other properties of `args` are not read by this handler.
 *
 * Each call uses its own uniquely named request queue, dropped afterwards,
 * so URLs handled by an earlier crawl in the same process are not skipped.
 *
 * @param {*} page - Not used; the crawler launches its own browser.
 * @param {object} args - Crawl options, see above.
 * @returns {Promise<{success: boolean, crawledPages: number, results: object[], errors: string[], message: string}>}
 *   Never rejects for crawl failures; problems are reported via `errors` and `message`.
 */
export async function handleWebCrawler(page, args) {
    // Import Crawlee dynamically to avoid load-time errors if not installed
    let crawlee;
    try {
        crawlee = await import('crawlee');
    }
    catch {
        return {
            success: false,
            crawledPages: 0,
            results: [],
            errors: ['Crawlee not installed. Run: npm install crawlee'],
            message: '❌ Crawlee package not found',
        };
    }
    const { PuppeteerCrawler, RequestQueue, Configuration, ProxyConfiguration } = crawlee;

    // Optional: brave-real-launcher supplies the Brave executable path
    let getBravePath;
    try {
        ({ getBravePath } = await import('brave-real-launcher'));
    }
    catch {
        // Fallback - the launcher's default browser is used
    }
    // Optional: brave-real-puppeteer-core supplies the stealth launcher
    let braveRealPuppeteerCore;
    try {
        braveRealPuppeteerCore = await import('brave-real-puppeteer-core');
    }
    catch {
        // Will use Crawlee's default launcher
    }
    // A dynamic import may expose `launch` on the namespace or on `default`;
    // prefer whichever has it, else pass the namespace through as before.
    const launcher = [braveRealPuppeteerCore, braveRealPuppeteerCore?.default]
        .find((candidate) => typeof candidate?.launch === 'function') ?? braveRealPuppeteerCore;

    const results = [];
    const errors = [];
    const visited = new Set();

    // Configuration
    const maxDepth = args.maxDepth ?? 3;
    const maxPages = args.maxPages ?? 50;
    const concurrency = args.concurrency ?? 3;
    const rateLimit = args.rateLimit ?? 2;
    const retryCount = args.retryCount ?? 3;
    const timeout = args.timeout ?? 30000;

    // Rate limiting. Each caller reserves the next free time slot before it
    // sleeps, so concurrent requests are spaced out instead of all waiting for
    // the same instant. A non-positive rateLimit disables throttling.
    const rateLimitDelay = rateLimit > 0 ? 1000 / rateLimit : 0;
    let nextSlot = 0;
    const enforceRateLimit = async () => {
        if (rateLimitDelay <= 0) {
            return;
        }
        const now = Date.now();
        const slot = Math.max(now, nextSlot);
        nextSlot = slot + rateLimitDelay;
        if (slot > now) {
            await new Promise((resolve) => setTimeout(resolve, slot - now));
        }
    };

    let requestQueue;
    try {
        if (!Array.isArray(args.startUrls) || args.startUrls.length === 0) {
            throw new Error('startUrls must be a non-empty array of URLs');
        }
        // URL filtering patterns (compiled here so a bad regex is reported
        // through the normal error result instead of escaping the handler)
        const includePattern = args.includePattern ? new RegExp(args.includePattern, 'i') : null;
        const excludePattern = args.excludePattern ? new RegExp(args.excludePattern, 'i') : null;
        const isAllowed = (candidateUrl) => {
            if (includePattern && !includePattern.test(candidateUrl)) {
                return false;
            }
            if (excludePattern && excludePattern.test(candidateUrl)) {
                return false;
            }
            return true;
        };

        // Configure Crawlee to use memory storage (no disk)
        Configuration.getGlobalConfig().set('persistStorage', false);

        // A fresh, uniquely named queue per call: the default queue would be
        // shared with earlier crawls in this process.
        const queueName = `web-crawler-${Date.now()}-${Math.random().toString(36).slice(2)}`;
        requestQueue = await RequestQueue.open(queueName);
        for (const url of args.startUrls) {
            await requestQueue.addRequest({
                url,
                userData: { depth: 0 },
            });
        }

        // Get Brave executable path if available
        let executablePath;
        try {
            executablePath = getBravePath?.();
        }
        catch {
            // Use default
        }

        // Proxy rotation is delegated to Crawlee
        const proxyConfiguration = Array.isArray(args.proxyList) && args.proxyList.length > 0 && ProxyConfiguration
            ? new ProxyConfiguration({ proxyUrls: args.proxyList })
            : undefined;

        const crawler = new PuppeteerCrawler({
            requestQueue,
            maxConcurrency: concurrency,
            maxRequestRetries: retryCount,
            requestHandlerTimeoutSecs: timeout / 1000,
            // Stop pulling from the queue once the page budget is used up;
            // otherwise every remaining queued URL is still loaded and discarded.
            ...(Number.isFinite(maxPages) && maxPages > 0 ? { maxRequestsPerCrawl: Math.ceil(maxPages) } : {}),
            ...(proxyConfiguration ? { proxyConfiguration } : {}),
            launchContext: {
                launcher: launcher || undefined,
                launchOptions: {
                    headless: true,
                    executablePath,
                    args: [
                        '--no-sandbox',
                        '--disable-setuid-sandbox',
                        '--disable-blink-features=AutomationControlled',
                        '--disable-dev-shm-usage',
                        '--disable-accelerated-2d-canvas',
                        '--disable-gpu',
                    ],
                },
            },
            // Browser pool configuration
            browserPoolOptions: {
                maxOpenPagesPerBrowser: 1,
            },
            // Pre-navigation hook: throttle, then apply per-request overrides
            preNavigationHooks: [
                async (crawlingContext) => {
                    await enforceRateLimit();
                    if (args.userAgent) {
                        await crawlingContext.page.setUserAgent(args.userAgent);
                    }
                    if (args.headers) {
                        await crawlingContext.page.setExtraHTTPHeaders(args.headers);
                    }
                },
            ],
            // Main request handler
            requestHandler: async ({ request, page: crawlerPage }) => {
                const depth = request.userData.depth ?? 0;
                const url = request.url;
                // Skip if already visited or max pages reached
                if (visited.has(url) || results.length >= maxPages) {
                    return;
                }
                visited.add(url);
                if (!isAllowed(url)) {
                    return;
                }
                const result = { url, depth };
                try {
                    result.title = await crawlerPage.title();
                    // Extract data using selectors: one match yields a string,
                    // several matches yield an array, no match leaves the key unset.
                    if (args.extractSelectors) {
                        result.extractedData = {};
                        for (const [key, selector] of Object.entries(args.extractSelectors)) {
                            try {
                                const elements = await crawlerPage.$$(selector);
                                if (elements.length === 1) {
                                    result.extractedData[key] = await crawlerPage.$eval(selector, (el) => el.textContent?.trim() || el.getAttribute('href') || el.getAttribute('src'));
                                }
                                else if (elements.length > 1) {
                                    result.extractedData[key] = await crawlerPage.$$eval(selector, (els) => els.map((el) => el.textContent?.trim() || el.getAttribute('href') || el.getAttribute('src')).filter(Boolean));
                                }
                            }
                            catch {
                                // Invalid selector or element vanished; leave this key unset
                            }
                        }
                    }
                    // Follow links if enabled and depth allows
                    if (args.followLinks !== false && depth < maxDepth && results.length < maxPages) {
                        const pageLinks = await crawlerPage.$$eval('a[href]', (anchors) => anchors.map((a) => a.href).filter((href) => href.startsWith('http')));
                        result.links = pageLinks.slice(0, 100); // Limit stored links
                        const linksToEnqueue = pageLinks.filter((link) => !visited.has(link) && isAllowed(link));
                        // At most 50 new URLs are queued per page
                        for (const link of linksToEnqueue.slice(0, 50)) {
                            try {
                                await requestQueue.addRequest({
                                    url: link,
                                    userData: { depth: depth + 1 },
                                });
                            }
                            catch {
                                // Queue rejected the URL; keep crawling the rest
                            }
                        }
                    }
                    // Collect media URLs if enabled (nothing is downloaded here)
                    if (args.downloadMedia && args.savePath) {
                        const mediaUrls = await crawlerPage.$$eval('img[src], video source[src], a[href$=".pdf"], a[href$=".jpg"], a[href$=".png"]', (els) => els.map((el) => el.getAttribute('src') || el.getAttribute('href')).filter(Boolean));
                        result.extractedData = result.extractedData || {};
                        result.extractedData.mediaUrls = mediaUrls;
                    }
                    results.push(result);
                }
                catch (error) {
                    // Keep the partial result so the caller sees which page failed
                    result.error = error instanceof Error ? error.message : String(error);
                    errors.push(`${url}: ${result.error}`);
                    results.push(result);
                }
            },
            // Called once a request has exhausted its retries
            failedRequestHandler: async ({ request }, error) => {
                errors.push(`Failed: ${request.url} - ${error.message}`);
            },
        });

        await crawler.run();
        return {
            success: results.length > 0,
            crawledPages: results.length,
            results,
            errors,
            message: `🕷️ Crawled ${results.length} pages (depth: ${maxDepth}, errors: ${errors.length})`,
        };
    }
    catch (error) {
        const reason = error instanceof Error ? error.message : String(error);
        return {
            success: false,
            crawledPages: results.length,
            results,
            errors: [...errors, reason],
            message: `❌ Crawler error: ${reason}`,
        };
    }
    finally {
        if (requestQueue) {
            try {
                await requestQueue.drop();
            }
            catch {
                // Best-effort cleanup of the per-call queue; the crawl result stands
            }
        }
    }
}
|
package/dist/index.js
CHANGED
|
@@ -61,7 +61,9 @@ import { handleBreadcrumbNavigator, handleUrlRedirectTracer, handleSearchContent
|
|
|
61
61
|
// Download tools
|
|
62
62
|
handleFileDownloader,
|
|
63
63
|
// Enhanced streaming/download tools
|
|
64
|
-
handleIframeHandler, handleStreamExtractor,
|
|
64
|
+
handleIframeHandler, handleStreamExtractor,
|
|
65
|
+
// Web crawler
|
|
66
|
+
handleWebCrawler, } from './handlers/advanced-tools.js';
|
|
65
67
|
// State for video recording
|
|
66
68
|
const recorderState = new Map();
|
|
67
69
|
debug('All modules loaded successfully');
|
|
@@ -254,6 +256,11 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
254
256
|
if (!page)
|
|
255
257
|
throw new Error('Browser not initialized. Call browser_init first.');
|
|
256
258
|
return { content: [{ type: 'text', text: JSON.stringify(await handleStreamExtractor(page, args)) }] };
|
|
259
|
+
// Web Crawler (Crawlee + brave-real-launcher)
|
|
260
|
+
case TOOL_NAMES.WEB_CRAWLER:
|
|
261
|
+
if (!page)
|
|
262
|
+
throw new Error('Browser not initialized. Call browser_init first.');
|
|
263
|
+
return { content: [{ type: 'text', text: JSON.stringify(await handleWebCrawler(page, args)) }] };
|
|
257
264
|
default:
|
|
258
265
|
throw new Error(`Unknown tool: ${name}`);
|
|
259
266
|
}
|
package/dist/tool-definitions.js
CHANGED
|
@@ -622,6 +622,116 @@ export const TOOLS = [
|
|
|
622
622
|
},
|
|
623
623
|
},
|
|
624
624
|
},
|
|
625
|
+
// ============================================================
|
|
626
|
+
// WEB CRAWLER TOOL (Crawlee-inspired)
|
|
627
|
+
// ============================================================
|
|
628
|
+
{
|
|
629
|
+
name: 'web_crawler',
|
|
630
|
+
description: 'Advanced web crawler with Crawlee-like features: URL queue (breadth/depth-first), proxy rotation, session management, auto-retry, rate limiting, concurrency control, and data extraction. Supports both browser and HTTP modes.',
|
|
631
|
+
inputSchema: {
|
|
632
|
+
type: 'object',
|
|
633
|
+
additionalProperties: false,
|
|
634
|
+
properties: {
|
|
635
|
+
startUrls: {
|
|
636
|
+
type: 'array',
|
|
637
|
+
items: { type: 'string' },
|
|
638
|
+
description: 'Initial URLs to start crawling from'
|
|
639
|
+
},
|
|
640
|
+
maxDepth: {
|
|
641
|
+
type: 'number',
|
|
642
|
+
description: 'Maximum crawl depth (1 = only start URLs)',
|
|
643
|
+
default: 3
|
|
644
|
+
},
|
|
645
|
+
maxPages: {
|
|
646
|
+
type: 'number',
|
|
647
|
+
description: 'Maximum pages to crawl',
|
|
648
|
+
default: 50
|
|
649
|
+
},
|
|
650
|
+
concurrency: {
|
|
651
|
+
type: 'number',
|
|
652
|
+
description: 'Number of concurrent requests',
|
|
653
|
+
default: 3
|
|
654
|
+
},
|
|
655
|
+
rateLimit: {
|
|
656
|
+
type: 'number',
|
|
657
|
+
description: 'Maximum requests per second',
|
|
658
|
+
default: 2
|
|
659
|
+
},
|
|
660
|
+
crawlStrategy: {
|
|
661
|
+
type: 'string',
|
|
662
|
+
enum: ['breadth-first', 'depth-first'],
|
|
663
|
+
description: 'URL queue strategy',
|
|
664
|
+
default: 'breadth-first'
|
|
665
|
+
},
|
|
666
|
+
includePattern: {
|
|
667
|
+
type: 'string',
|
|
668
|
+
description: 'Regex pattern for URLs to include'
|
|
669
|
+
},
|
|
670
|
+
excludePattern: {
|
|
671
|
+
type: 'string',
|
|
672
|
+
description: 'Regex pattern for URLs to exclude'
|
|
673
|
+
},
|
|
674
|
+
extractSelectors: {
|
|
675
|
+
type: 'object',
|
|
676
|
+
description: 'CSS selectors for data extraction (e.g., {"title": "h1", "links": "a[href]"})'
|
|
677
|
+
},
|
|
678
|
+
followLinks: {
|
|
679
|
+
type: 'boolean',
|
|
680
|
+
description: 'Follow discovered links',
|
|
681
|
+
default: true
|
|
682
|
+
},
|
|
683
|
+
downloadMedia: {
|
|
684
|
+
type: 'boolean',
|
|
685
|
+
description: 'Download images/videos/files',
|
|
686
|
+
default: false
|
|
687
|
+
},
|
|
688
|
+
savePath: {
|
|
689
|
+
type: 'string',
|
|
690
|
+
description: 'Path to save downloaded files'
|
|
691
|
+
},
|
|
692
|
+
proxyList: {
|
|
693
|
+
type: 'array',
|
|
694
|
+
items: { type: 'string' },
|
|
695
|
+
description: 'Proxy URLs for rotation (format: protocol://host:port)'
|
|
696
|
+
},
|
|
697
|
+
retryCount: {
|
|
698
|
+
type: 'number',
|
|
699
|
+
description: 'Number of retries for failed requests',
|
|
700
|
+
default: 3
|
|
701
|
+
},
|
|
702
|
+
retryDelayMs: {
|
|
703
|
+
type: 'number',
|
|
704
|
+
description: 'Delay between retries in ms (exponential backoff)',
|
|
705
|
+
default: 1000
|
|
706
|
+
},
|
|
707
|
+
timeout: {
|
|
708
|
+
type: 'number',
|
|
709
|
+
description: 'Request timeout in ms',
|
|
710
|
+
default: 30000
|
|
711
|
+
},
|
|
712
|
+
mode: {
|
|
713
|
+
type: 'string',
|
|
714
|
+
enum: ['browser', 'http'],
|
|
715
|
+
description: 'Crawl mode (browser = Puppeteer, http = fast HTTP)',
|
|
716
|
+
default: 'browser'
|
|
717
|
+
},
|
|
718
|
+
respectRobotsTxt: {
|
|
719
|
+
type: 'boolean',
|
|
720
|
+
description: 'Respect robots.txt rules',
|
|
721
|
+
default: true
|
|
722
|
+
},
|
|
723
|
+
userAgent: {
|
|
724
|
+
type: 'string',
|
|
725
|
+
description: 'Custom User-Agent string'
|
|
726
|
+
},
|
|
727
|
+
headers: {
|
|
728
|
+
type: 'object',
|
|
729
|
+
description: 'Custom headers for all requests'
|
|
730
|
+
},
|
|
731
|
+
},
|
|
732
|
+
required: ['startUrls'],
|
|
733
|
+
},
|
|
734
|
+
},
|
|
625
735
|
];
|
|
626
736
|
// Tool name constants for type safety
|
|
627
737
|
export const TOOL_NAMES = {
|
|
@@ -659,6 +769,8 @@ export const TOOL_NAMES = {
|
|
|
659
769
|
// Enhanced tools
|
|
660
770
|
IFRAME_HANDLER: 'iframe_handler',
|
|
661
771
|
STREAM_EXTRACTOR: 'stream_extractor',
|
|
772
|
+
// Crawler tool
|
|
773
|
+
WEB_CRAWLER: 'web_crawler',
|
|
662
774
|
};
|
|
663
775
|
// Tool categories for organization
|
|
664
776
|
export const TOOL_CATEGORIES = {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "brave-real-browser-mcp-server",
|
|
3
|
-
"version": "2.
|
|
3
|
+
"version": "2.25.0",
|
|
4
4
|
"description": "🦁 MCP server for Brave Real Browser - NPM Workspaces Monorepo with anti-detection features, SSE streaming, and LSP compatibility",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "dist/index.js",
|
|
@@ -50,7 +50,9 @@
|
|
|
50
50
|
"dependencies": {
|
|
51
51
|
"@modelcontextprotocol/sdk": "latest",
|
|
52
52
|
"@types/turndown": "latest",
|
|
53
|
-
"brave-real-browser": "^2.
|
|
53
|
+
"brave-real-browser": "^2.6.0",
|
|
54
|
+
"crawlee": "^3.15.3",
|
|
55
|
+
"puppeteer-core": "^24.35.0",
|
|
54
56
|
"turndown": "latest",
|
|
55
57
|
"vscode-languageserver": "^9.0.1",
|
|
56
58
|
"vscode-languageserver-textdocument": "^1.0.12"
|