brave-real-browser-mcp-server 2.24.4 → 2.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -586,22 +586,78 @@ export async function handleDeepAnalysis(page, args) {
586
586
  }
587
587
  /**
588
588
  * Record full network traffic - Uses response events to avoid crashes
589
+ * ULTRA POWERFUL: API detection, media URLs, smart categorization
589
590
  */
590
591
  export async function handleNetworkRecorder(page, args) {
591
592
  const requests = [];
592
593
  const duration = args.duration || 10000;
593
594
  let totalSize = 0;
595
+ const categories = {};
596
+ const apis = [];
597
+ const mediaUrls = [];
598
+ const seen = new Set();
599
+ // ============================================================
600
+ // SMART CATEGORIZATION HELPER
601
+ // ============================================================
602
/**
 * Classify a recorded response into a coarse category.
 *
 * Checks run in priority order: API-looking URLs, media (by file extension,
 * then by resource type), images, scripts, styles, fonts, XHR/fetch, documents.
 * File extensions are matched against the END of the URL path only, so a
 * hostname or path segment that merely contains ".ts" or ".ico"
 * (e.g. "www.tsn.ca", "app.tsx", "logo.icon") is not mistaken for a media or
 * image file, and query strings / fragments never hide a real extension.
 *
 * @param {string} url - URL of the response.
 * @param {string} resourceType - Resource type reported for the originating request.
 * @returns {string} One of 'api', 'media', 'image', 'script', 'style', 'font',
 *   'xhr', 'document' or 'other'.
 */
const categorizeUrl = (url, resourceType) => {
  // Isolate the path so extension checks ignore the host, query and fragment.
  let path;
  try {
    path = new URL(url).pathname;
  }
  catch {
    // Relative or otherwise unparseable URL: strip query/fragment by hand.
    path = String(url).split(/[?#]/, 1)[0];
  }
  const pathEndsWith = (extensionPattern) => extensionPattern.test(path);

  // API endpoints
  if (/\/api\/|\/v\d+\/|graphql/i.test(url) || pathEndsWith(/\.json$/i)) {
    return 'api';
  }
  // Media
  if (pathEndsWith(/\.(mp4|webm|m3u8|ts|mp3|flac|ogg)$/i)) {
    return 'media';
  }
  if (resourceType === 'media' || resourceType === 'video' || resourceType === 'audio') {
    return 'media';
  }
  // Images
  if (pathEndsWith(/\.(jpg|jpeg|png|gif|webp|svg|ico)$/i) || resourceType === 'image') {
    return 'image';
  }
  // Scripts
  if (pathEndsWith(/\.js$/i) || resourceType === 'script') {
    return 'script';
  }
  // Styles
  if (pathEndsWith(/\.css$/i) || resourceType === 'stylesheet') {
    return 'style';
  }
  // Fonts
  if (pathEndsWith(/\.(woff2?|ttf|eot|otf)$/i) || resourceType === 'font') {
    return 'font';
  }
  // XHR/Fetch
  if (resourceType === 'xhr' || resourceType === 'fetch') {
    return 'xhr';
  }
  // Documents
  if (resourceType === 'document') {
    return 'document';
  }
  return 'other';
};
594
632
  // Response handler - safer than request interception
595
633
  const responseHandler = (response) => {
596
634
  try {
597
635
  const url = response.url();
636
+ // Dedup
637
+ if (seen.has(url))
638
+ return;
639
+ seen.add(url);
598
640
  if (args.filterUrl && !url.includes(args.filterUrl)) {
599
641
  return;
600
642
  }
643
+ const resourceType = response.request()?.resourceType?.() || 'unknown';
644
+ const method = response.request()?.method?.() || 'GET';
645
+ const category = categorizeUrl(url, resourceType);
646
+ categories[category] = (categories[category] || 0) + 1;
647
+ // Collect API endpoints
648
+ if (category === 'api' || resourceType === 'xhr' || resourceType === 'fetch') {
649
+ apis.push({ url, method, type: resourceType });
650
+ }
651
+ // Collect media URLs
652
+ if (category === 'media' || /\.(mp4|webm|m3u8|ts|mp3)/i.test(url)) {
653
+ mediaUrls.push(url);
654
+ }
601
655
  const entry = {
602
656
  url,
603
657
  status: response.status(),
604
- resourceType: response.request()?.resourceType?.() || 'unknown',
658
+ resourceType,
659
+ category,
660
+ method,
605
661
  timestamp: Date.now(),
606
662
  };
607
663
  if (args.includeHeaders) {
@@ -612,7 +668,6 @@ export async function handleNetworkRecorder(page, args) {
612
668
  entry.headers = {};
613
669
  }
614
670
  }
615
- // Note: Response body requires async handling, skip for stability
616
671
  requests.push(entry);
617
672
  // Track size from headers
618
673
  try {
@@ -647,6 +702,10 @@ export async function handleNetworkRecorder(page, args) {
647
702
  requests: requests.slice(0, 500),
648
703
  count: requests.length,
649
704
  totalSize,
705
+ categories,
706
+ apis: apis.length > 0 ? apis : undefined,
707
+ mediaUrls: mediaUrls.length > 0 ? mediaUrls : undefined,
708
+ message: `📡 Recorded ${requests.length} requests (${Math.round(totalSize / 1024)}KB) | APIs: ${apis.length} | Media: ${mediaUrls.length}`
650
709
  };
651
710
  }
652
711
  /**
@@ -776,6 +835,7 @@ export async function handleAdProtectionDetector(page, args) {
776
835
  }
777
836
  /**
778
837
  * Wait for dynamic AJAX loading
838
+ * ULTRA POWERFUL: Infinite scroll, lazy load, mutation observer
779
839
  */
780
840
  export async function handleAjaxContentWaiter(page, args) {
781
841
  const timeout = args.timeout || 30000;
@@ -783,6 +843,79 @@ export async function handleAjaxContentWaiter(page, args) {
783
843
  const startTime = Date.now();
784
844
  let content;
785
845
  let loaded = false;
846
+ let newElementsCount = 0;
847
+ let scrollDepth = 0;
848
+ // ============================================================
849
+ // 1. MUTATION OBSERVER: Track DOM changes in real-time
850
+ // ============================================================
851
/**
 * Watch the page's DOM for a fixed window and report how much it changed.
 *
 * Runs inside the page via `page.evaluate`: a MutationObserver counts added
 * nodes and attribute/text mutations, then disconnects when the window ends.
 *
 * @param {number} [observeMs=2000] - How long to observe, in milliseconds.
 * @returns {Promise<{added: number, modified: number}>} Number of added nodes
 *   and number of attribute/characterData mutation records seen.
 */
const setupMutationObserver = async (observeMs = 2000) => {
  // The callback is serialized into the page, so the window length must be
  // passed as an argument rather than captured from this closure.
  return page.evaluate((windowMs) => {
    return new Promise((resolve) => {
      let added = 0;
      let modified = 0;
      // A document may have no <body> yet (or at all); observing null throws.
      const root = document.body || document.documentElement;
      if (!root) {
        resolve({ added, modified });
        return;
      }
      const observer = new MutationObserver((mutations) => {
        for (const mutation of mutations) {
          added += mutation.addedNodes.length;
          if (mutation.type === 'attributes' || mutation.type === 'characterData') {
            modified++;
          }
        }
      });
      observer.observe(root, {
        childList: true,
        subtree: true,
        attributes: true,
        characterData: true,
      });
      // Report once the observation window has elapsed.
      setTimeout(() => {
        observer.disconnect();
        resolve({ added, modified });
      }, windowMs);
    });
  }, observeMs);
};
877
+ // ============================================================
878
+ // 2. INFINITE SCROLL DETECTION
879
+ // ============================================================
880
/**
 * Scroll to the bottom of the page once and report whether that grew the page.
 *
 * Height and element count are sampled together in a single in-page call
 * before scrolling and again after the settle delay, so the two numbers of
 * each snapshot describe the same DOM state.
 *
 * @param {number} [settleMs=1000] - Time to wait after scrolling for new
 *   content to load, in milliseconds.
 * @returns {Promise<{scrolled: boolean, newElements: number, scrollDepth: number}>}
 *   `scrolled` is true when the body's scrollHeight increased, `newElements`
 *   is the change in total element count, `scrollDepth` is the scrollHeight
 *   after scrolling.
 */
const handleInfiniteScroll = async (settleMs = 1000) => {
  // Snapshot, then scroll, in one round-trip to the page.
  const before = await page.evaluate(() => {
    const snapshot = {
      // body can be missing on non-HTML documents; treat that as height 0.
      height: document.body?.scrollHeight ?? 0,
      count: document.querySelectorAll('*').length,
    };
    window.scrollTo(0, snapshot.height);
    return snapshot;
  });
  await new Promise((resolve) => setTimeout(resolve, settleMs));
  // Check if new content loaded
  const after = await page.evaluate(() => ({
    height: document.body?.scrollHeight ?? 0,
    count: document.querySelectorAll('*').length,
  }));
  return {
    scrolled: after.height > before.height,
    newElements: after.count - before.count,
    scrollDepth: after.height,
  };
};
895
+ // ============================================================
896
+ // 3. LAZY LOAD DETECTION
897
+ // ============================================================
898
/**
 * Collect the deferred source URLs of lazy-loaded elements on the page.
 *
 * Looks at elements matching common lazy-load markers and reads their
 * `data-src` (falling back to `data-lazy`) attribute. Each URL is reported
 * once, in first-seen order: the selector list already covers
 * `img[data-src]` and `img.lazy`, so no second pass over images is needed.
 *
 * @returns {Promise<string[]>} Unique lazy source URLs found in the DOM.
 */
const detectLazyLoad = async () => {
  return page.evaluate(() => {
    const sources = new Set();
    const candidates = document.querySelectorAll('[data-src], [data-lazy], [loading="lazy"], .lazy, .lazyload');
    for (const element of candidates) {
      const deferredSrc = element.getAttribute('data-src') || element.getAttribute('data-lazy');
      if (deferredSrc) {
        sources.add(deferredSrc);
      }
    }
    return [...sources];
  });
};
916
+ // ============================================================
917
+ // 4. MAIN WAITING LOGIC
918
+ // ============================================================
786
919
  while (Date.now() - startTime < timeout) {
787
920
  if (args.selector) {
788
921
  const element = await page.$(args.selector);
@@ -795,17 +928,35 @@ export async function handleAjaxContentWaiter(page, args) {
795
928
  }
796
929
  }
797
930
  else {
798
- // Wait for network to be idle
799
- await page.waitForNetworkIdle({ timeout: pollInterval }).catch(() => { });
800
- loaded = true;
801
- break;
931
+ // Smart waiting: Check for ongoing activity
932
+ const mutationResult = await setupMutationObserver();
933
+ newElementsCount = mutationResult.added;
934
+ if (mutationResult.added === 0 && mutationResult.modified === 0) {
935
+ // No DOM changes, content likely loaded
936
+ loaded = true;
937
+ break;
938
+ }
939
+ }
940
+ // Try infinite scroll to load more content
941
+ const scrollResult = await handleInfiniteScroll();
942
+ if (scrollResult.scrolled) {
943
+ scrollDepth = scrollResult.scrollDepth;
944
+ newElementsCount += scrollResult.newElements;
802
945
  }
803
946
  await new Promise((r) => setTimeout(r, pollInterval));
804
947
  }
948
+ // Detect any lazy-loaded content
949
+ const lazyElements = await detectLazyLoad();
805
950
  return {
806
951
  loaded,
807
952
  waitTime: Date.now() - startTime,
808
953
  content,
954
+ newElementsCount,
955
+ scrollDepth,
956
+ lazyElements: lazyElements.length > 0 ? lazyElements : undefined,
957
+ message: loaded
958
+ ? `✅ Content loaded in ${Date.now() - startTime}ms (${newElementsCount} new elements, scroll: ${scrollDepth}px)`
959
+ : `⏱️ Timeout after ${timeout}ms`
809
960
  };
810
961
  }
811
962
  /**
@@ -1002,20 +1153,124 @@ export async function handleVideoRecording(page, args, recorderState) {
1002
1153
  }
1003
1154
  /**
1004
1155
  * Harvest all links from page
1156
+ * ULTRA POWERFUL: Pagination detection, smart categorization, file types
1005
1157
  */
1006
1158
  export async function handleLinkHarvester(page, args) {
1007
1159
  const currentUrl = new URL(page.url());
1008
- const allLinks = await page.evaluate((filter) => {
1009
- return Array.from(document.querySelectorAll('a[href]')).map((a) => ({
1010
- url: a.href,
1011
- text: a.textContent?.trim()?.substring(0, 100) || '',
1012
- }));
1013
- }, args.filter);
1160
+ // ============================================================
1161
+ // 1. EXTRACT ALL LINKS WITH SMART CATEGORIZATION
1162
+ // ============================================================
1163
+ const allLinks = await page.evaluate(() => {
1164
+ const links = [];
1165
+ document.querySelectorAll('a[href]').forEach((a) => {
1166
+ const anchor = a;
1167
+ links.push({
1168
+ url: anchor.href,
1169
+ text: a.textContent?.trim()?.substring(0, 100) || '',
1170
+ attrs: {
1171
+ rel: anchor.rel || '',
1172
+ target: anchor.target || '',
1173
+ class: anchor.className || '',
1174
+ id: anchor.id || '',
1175
+ download: anchor.download || '',
1176
+ }
1177
+ });
1178
+ });
1179
+ return links;
1180
+ });
1181
+ // ============================================================
1182
+ // 2. PAGINATION DETECTION
1183
+ // ============================================================
1184
// Detect pagination controls in the page: the next/previous page links and
// the highest numbered page link.
// Only standard CSS selectors are handed to querySelector. "Next"/"Prev" link
// text is matched in JavaScript, because text pseudo-classes such as
// :has-text() are not valid CSS and make querySelector throw.
const pagination = await page.evaluate(() => {
  const anchors = Array.from(document.querySelectorAll('a[href]'));
  const bySelector = (selector) => () => document.querySelector(selector);
  const byText = (pattern) => () => anchors.find((anchor) => pattern.test((anchor.textContent || '').trim()));
  // Try each finder in priority order; skip matches that carry no usable href
  // (e.g. a <button aria-label="Next">).
  const firstHref = (finders) => {
    for (const find of finders) {
      const href = find()?.href;
      if (typeof href === 'string' && href) {
        return href;
      }
    }
    return undefined;
  };
  const nextPage = firstHref([
    bySelector('a[rel="next"]'),
    bySelector('a.next'),
    bySelector('a.pagination-next'),
    bySelector('[aria-label="Next"]'),
    byText(/^next\b/i),
    byText(/^[>›»→]+$/),
    // Positional heuristics come last: they are the least specific.
    bySelector('.pagination a:last-child'),
    bySelector('a.page-link:last-child'),
  ]);
  const prevPage = firstHref([
    bySelector('a[rel="prev"]'),
    bySelector('a.prev'),
    bySelector('a.pagination-prev'),
    bySelector('[aria-label="Previous"]'),
    byText(/^[\s<‹«←]*prev(ious)?\b/i),
    byText(/^[<‹«←]+$/),
  ]);
  // Count page numbers: only links whose whole label is a number qualify, so
  // text like "2024 report" inside a <nav> is not read as page 2024.
  const pageNumbers = Array.from(document.querySelectorAll('.pagination a, .page-numbers a, nav a'))
    .map((anchor) => (anchor.textContent || '').trim())
    .filter((label) => /^\d+$/.test(label))
    .map((label) => Number.parseInt(label, 10))
    .filter((pageNumber) => pageNumber > 0);
  const totalPages = pageNumbers.length > 0 ? Math.max(...pageNumbers) : undefined;
  return { nextPage, prevPage, totalPages };
});
1227
+ // ============================================================
1228
+ // 3. SMART LINK CATEGORIZATION
1229
+ // ============================================================
1230
/**
 * Assign a harvested link to a coarse category.
 *
 * File-type checks look at the URL path only (query string and #fragment are
 * ignored), and the social check looks at the hostname only, so a blog post
 * whose slug mentions "twitter" is not reported as a social link.
 *
 * @param {string} url - Link target.
 * @param {string} text - Visible link text.
 * @param {{download?: string}} [attrs={}] - Anchor attributes; only
 *   `download` is consulted.
 * @returns {string} One of 'document', 'video', 'audio', 'image', 'download',
 *   'pagination', 'social', 'auth', 'search', 'info' or 'navigation'.
 */
const categorizeLink = (url, text, attrs = {}) => {
  const textLower = (text || '').toLowerCase();
  let pathname;
  let hostname = null;
  try {
    const parsed = new URL(url);
    pathname = parsed.pathname;
    hostname = parsed.hostname;
  }
  catch {
    // Unparseable URL: fall back to the raw string minus query/fragment.
    pathname = String(url).split(/[?#]/, 1)[0];
  }
  // File downloads
  if (/\.(pdf|doc|docx|xls|xlsx|zip|rar|7z|tar|gz)$/i.test(pathname)) {
    return 'document';
  }
  if (/\.(mp4|mkv|avi|mov|webm|flv)$/i.test(pathname)) {
    return 'video';
  }
  if (/\.(mp3|wav|flac|aac|ogg)$/i.test(pathname)) {
    return 'audio';
  }
  if (/\.(jpg|jpeg|png|gif|webp|svg|bmp)$/i.test(pathname)) {
    return 'image';
  }
  if (attrs?.download) {
    return 'download';
  }
  // Navigation
  if (/\/(next|page|p)\/\d+|[?&]page=\d+/i.test(url)) {
    return 'pagination';
  }
  // Whole words only, so "context" or "Nextcloud" do not count as pagination.
  if (/\b(next|prev(ious)?)\b/.test(textLower)) {
    return 'pagination';
  }
  // Social: judged by host when the URL parses, by substring otherwise.
  const socialNames = 'facebook|twitter|instagram|linkedin|youtube|tiktok';
  const isSocial = hostname === null
    ? new RegExp(socialNames, 'i').test(url)
    : new RegExp(`(^|\\.)(${socialNames})\\.`, 'i').test(hostname);
  if (isSocial) {
    return 'social';
  }
  // Common patterns
  if (/login|signin|sign-in|register|signup|sign-up/i.test(url)) {
    return 'auth';
  }
  // "q=" must be a real query parameter, not the tail of e.g. "faq=".
  if (/search|query|[?&]q=/i.test(url)) {
    return 'search';
  }
  if (/contact|about|faq|help/i.test(url)) {
    return 'info';
  }
  return 'navigation';
};
1014
1263
  const processedLinks = [];
1264
+ const categories = {};
1265
+ const seen = new Set();
1015
1266
  let internal = 0;
1016
1267
  let external = 0;
1017
1268
  for (const link of allLinks) {
1018
1269
  try {
1270
+ // Dedup by URL
1271
+ if (seen.has(link.url))
1272
+ continue;
1273
+ seen.add(link.url);
1019
1274
  const linkUrl = new URL(link.url);
1020
1275
  const isInternal = linkUrl.hostname === currentUrl.hostname;
1021
1276
  if (args.filter && !link.url.includes(args.filter) && !link.text.includes(args.filter)) {
@@ -1025,10 +1280,13 @@ export async function handleLinkHarvester(page, args) {
1025
1280
  continue;
1026
1281
  if (!isInternal && args.includeExternal === false)
1027
1282
  continue;
1283
+ const category = categorizeLink(link.url, link.text, link.attrs);
1284
+ categories[category] = (categories[category] || 0) + 1;
1028
1285
  processedLinks.push({
1029
1286
  url: link.url,
1030
1287
  text: link.text,
1031
1288
  type: isInternal ? 'internal' : 'external',
1289
+ category,
1032
1290
  });
1033
1291
  if (isInternal)
1034
1292
  internal++;
@@ -1045,6 +1303,10 @@ export async function handleLinkHarvester(page, args) {
1045
1303
  links: processedLinks,
1046
1304
  internal,
1047
1305
  external,
1306
+ pagination: (pagination.nextPage || pagination.prevPage || pagination.totalPages) ? pagination : undefined,
1307
+ categories,
1308
+ message: `🔗 Found ${processedLinks.length} links (${internal} internal, ${external} external)` +
1309
+ (pagination.nextPage ? ` | Next: ${pagination.nextPage}` : '')
1048
1310
  };
1049
1311
  }
1050
1312
  /**
@@ -2614,3 +2876,245 @@ export async function handleStreamExtractor(page, args) {
2614
2876
  : 'No direct URLs found',
2615
2877
  };
2616
2878
  }
2879
/**
 * Crawl a set of start URLs with Crawlee's PuppeteerCrawler.
 *
 * The crawler launches its own browser (Brave via brave-real-launcher and
 * brave-real-puppeteer-core when those optional packages load, otherwise
 * Crawlee's default). For every accepted page it records the title, optional
 * selector-based extracted data, outgoing links, and (when `downloadMedia` and
 * `savePath` are both set) the media URLs found on the page.
 *
 * Options read from `args`: startUrls, maxDepth, maxPages, concurrency,
 * rateLimit, retryCount, timeout, includePattern, excludePattern,
 * extractSelectors, followLinks, downloadMedia, savePath, proxyList,
 * userAgent, headers. Other schema options (crawlStrategy, retryDelayMs,
 * mode, respectRobotsTxt) are not consulted by this function.
 *
 * @param {object} page - Unused: the crawler drives its own browser pages.
 *   Kept so the handler has the same signature as its siblings.
 * @param {object} args - Tool arguments (see list above).
 * @returns {Promise<{success: boolean, crawledPages: number, results: object[],
 *   errors: string[], message: string}>} Never rejects for bad input or crawl
 *   failures; problems are reported through `success`, `errors` and `message`.
 */
export async function handleWebCrawler(page, args) {
  const options = args ?? {};
  const failure = (reason, message) => ({
    success: false,
    crawledPages: 0,
    results: [],
    errors: [reason],
    message,
  });

  // Validate caller input before loading any package, so bad arguments get a
  // structured error instead of an exception escaping the handler.
  const startUrls = Array.isArray(options.startUrls)
    ? options.startUrls.filter((candidate) => typeof candidate === 'string' && candidate.trim() !== '')
    : [];
  if (startUrls.length === 0) {
    return failure('startUrls must be a non-empty array of URL strings', '❌ No start URLs provided');
  }
  // URL filtering patterns
  let includePattern = null;
  let excludePattern = null;
  try {
    includePattern = options.includePattern ? new RegExp(options.includePattern, 'i') : null;
  }
  catch (error) {
    return failure(`Invalid includePattern: ${error.message}`, '❌ Invalid includePattern regex');
  }
  try {
    excludePattern = options.excludePattern ? new RegExp(options.excludePattern, 'i') : null;
  }
  catch (error) {
    return failure(`Invalid excludePattern: ${error.message}`, '❌ Invalid excludePattern regex');
  }

  // Import Crawlee dynamically to avoid load-time errors if not installed
  let crawlee;
  try {
    crawlee = await import('crawlee');
  }
  catch {
    return failure('Crawlee not installed. Run: npm install crawlee', '❌ Crawlee package not found');
  }
  const { PuppeteerCrawler, RequestQueue, Configuration, ProxyConfiguration } = crawlee;

  // Optional: brave-real-launcher locates the Brave executable.
  let getBravePath;
  try {
    ({ getBravePath } = await import('brave-real-launcher'));
  }
  catch {
    // Not installed: fall back to the default browser.
  }
  // Optional: brave-real-puppeteer-core is used as the launcher when present.
  let braveRealPuppeteerCore;
  try {
    braveRealPuppeteerCore = await import('brave-real-puppeteer-core');
  }
  catch {
    // Not installed: Crawlee uses its default puppeteer.
  }

  const results = [];
  const errors = [];
  const visited = new Set();
  // Pages admitted for processing. Incremented synchronously so concurrent
  // handlers cannot jointly exceed maxPages while earlier pages are in flight.
  let accepted = 0;

  // Configuration (non-finite or out-of-range values fall back to defaults)
  const positiveOr = (value, fallback) => (Number.isFinite(value) && value > 0 ? value : fallback);
  const nonNegativeOr = (value, fallback) => (Number.isFinite(value) && value >= 0 ? value : fallback);
  const maxDepth = nonNegativeOr(options.maxDepth, 3);
  const maxPages = nonNegativeOr(options.maxPages, 50);
  const concurrency = Math.max(1, Math.floor(positiveOr(options.concurrency, 3)));
  const rateLimit = positiveOr(options.rateLimit, 2);
  const retryCount = Math.floor(nonNegativeOr(options.retryCount, 3));
  const timeout = positiveOr(options.timeout, 30000);

  // Rate limiting: each caller reserves the next free time slot before it
  // waits, so concurrent navigations are spaced out instead of all waking at
  // the same moment.
  const rateLimitDelay = 1000 / rateLimit;
  let nextSlot = 0;
  const enforceRateLimit = async () => {
    const now = Date.now();
    const slot = Math.max(now, nextSlot);
    nextSlot = slot + rateLimitDelay;
    if (slot > now) {
      await new Promise((resolve) => setTimeout(resolve, slot - now));
    }
  };

  let requestQueue;
  try {
    // Configure Crawlee to use memory storage (no disk)
    Configuration.getGlobalConfig().set('persistStorage', false);
    // Use a queue private to this invocation: the default queue is shared by
    // every call in the process and would skip URLs handled by earlier crawls.
    const queueName = `web-crawler-${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 10)}`;
    requestQueue = await RequestQueue.open(queueName);
    for (const url of startUrls) {
      await requestQueue.addRequest({
        url,
        userData: { depth: 0 },
      });
    }
    // Get Brave executable path if available
    let executablePath;
    try {
      if (getBravePath) {
        executablePath = getBravePath();
      }
    }
    catch {
      // Brave not found: use the default executable.
    }
    // Proxy rotation is delegated to Crawlee, which picks a proxy per session.
    const proxyConfiguration = Array.isArray(options.proxyList) && options.proxyList.length > 0
      ? new ProxyConfiguration({ proxyUrls: options.proxyList })
      : undefined;

    const crawler = new PuppeteerCrawler({
      requestQueue,
      maxConcurrency: concurrency,
      maxRequestRetries: retryCount,
      requestHandlerTimeoutSecs: Math.max(1, Math.ceil(timeout / 1000)),
      ...(proxyConfiguration ? { proxyConfiguration } : {}),
      launchContext: {
        // Use brave-real-puppeteer-core as custom launcher when it loaded
        launcher: braveRealPuppeteerCore || undefined,
        launchOptions: {
          headless: true,
          executablePath,
          args: [
            '--no-sandbox',
            '--disable-setuid-sandbox',
            '--disable-blink-features=AutomationControlled',
            '--disable-dev-shm-usage',
            '--disable-accelerated-2d-canvas',
            '--disable-gpu',
          ],
        },
      },
      // Browser pool configuration
      browserPoolOptions: {
        maxOpenPagesPerBrowser: 1,
      },
      // Pre-navigation hook for rate limiting, user agent and headers
      preNavigationHooks: [
        async (crawlingContext) => {
          await enforceRateLimit();
          if (options.userAgent) {
            await crawlingContext.page.setUserAgent(options.userAgent);
          }
          if (options.headers) {
            await crawlingContext.page.setExtraHTTPHeaders(options.headers);
          }
        },
      ],
      // Main request handler
      requestHandler: async ({ request, page: crawlerPage }) => {
        const depth = request.userData.depth || 0;
        const url = request.url;
        if (accepted >= maxPages) {
          // Budget spent. Once every admitted page has been recorded, stop
          // the pool so the remaining queue is not navigated for nothing.
          if (results.length >= maxPages) {
            await crawler.autoscaledPool?.abort();
          }
          return;
        }
        if (visited.has(url)) {
          return;
        }
        visited.add(url);
        // URL filtering
        if (includePattern && !includePattern.test(url)) {
          return;
        }
        if (excludePattern && excludePattern.test(url)) {
          return;
        }
        accepted++;
        const result = {
          url,
          depth,
        };
        try {
          result.title = await crawlerPage.title();
          // Extract data using selectors: one match yields a single value,
          // several matches yield an array of the non-empty values.
          if (options.extractSelectors) {
            result.extractedData = {};
            for (const [key, selector] of Object.entries(options.extractSelectors)) {
              try {
                const values = await crawlerPage.$$eval(selector, (els) => els.map((el) => el.textContent?.trim() || el.getAttribute('href') || el.getAttribute('src')));
                if (values.length === 1) {
                  result.extractedData[key] = values[0];
                }
                else if (values.length > 1) {
                  result.extractedData[key] = values.filter(Boolean);
                }
              }
              catch {
                // Invalid selector or evaluation failure: leave this key unset.
              }
            }
          }
          // Follow links if enabled and depth allows
          if (options.followLinks !== false && depth < maxDepth) {
            const pageLinks = await crawlerPage.$$eval('a[href]', (anchors) => anchors.map((a) => a.href).filter((href) => href.startsWith('http')));
            result.links = pageLinks.slice(0, 100); // Limit stored links
            // Only enqueue while there is page budget left to spend on them.
            if (accepted < maxPages) {
              const linksToEnqueue = pageLinks.filter((link) => {
                if (visited.has(link)) {
                  return false;
                }
                if (includePattern && !includePattern.test(link)) {
                  return false;
                }
                if (excludePattern && excludePattern.test(link)) {
                  return false;
                }
                return true;
              });
              for (const link of linksToEnqueue.slice(0, 50)) {
                try {
                  await requestQueue.addRequest({
                    url: link,
                    userData: { depth: depth + 1 },
                  });
                }
                catch {
                  // Best effort: a link the queue rejects is simply not crawled.
                }
              }
            }
          }
          // Collect media URLs if enabled (URLs are recorded, not downloaded)
          if (options.downloadMedia && options.savePath) {
            const mediaUrls = await crawlerPage.$$eval('img[src], video source[src], a[href$=".pdf"], a[href$=".jpg"], a[href$=".png"]', (els) => els.map((el) => el.getAttribute('src') || el.getAttribute('href')).filter(Boolean));
            result.extractedData = result.extractedData || {};
            result.extractedData.mediaUrls = mediaUrls;
          }
          results.push(result);
        }
        catch (error) {
          result.error = error instanceof Error ? error.message : String(error);
          errors.push(`${url}: ${result.error}`);
          results.push(result);
        }
      },
      // Failed request handler
      failedRequestHandler: async ({ request }, error) => {
        errors.push(`Failed: ${request.url} - ${error.message}`);
      },
    });
    // Run the crawler
    await crawler.run();
    return {
      success: results.length > 0,
      crawledPages: results.length,
      results,
      errors,
      message: `🕷️ Crawled ${results.length} pages (depth: ${maxDepth}, errors: ${errors.length})`,
    };
  }
  catch (error) {
    return {
      success: false,
      crawledPages: results.length,
      results,
      errors: [...errors, error instanceof Error ? error.message : String(error)],
      message: `❌ Crawler error: ${error instanceof Error ? error.message : String(error)}`,
    };
  }
  finally {
    // Release the per-invocation queue so repeated crawls do not accumulate.
    try {
      await requestQueue?.drop();
    }
    catch {
      // Cleanup is best effort; the crawl result is already determined.
    }
  }
}
package/dist/index.js CHANGED
@@ -61,7 +61,9 @@ import { handleBreadcrumbNavigator, handleUrlRedirectTracer, handleSearchContent
61
61
  // Download tools
62
62
  handleFileDownloader,
63
63
  // Enhanced streaming/download tools
64
- handleIframeHandler, handleStreamExtractor, } from './handlers/advanced-tools.js';
64
+ handleIframeHandler, handleStreamExtractor,
65
+ // Web crawler
66
+ handleWebCrawler, } from './handlers/advanced-tools.js';
65
67
  // State for video recording
66
68
  const recorderState = new Map();
67
69
  debug('All modules loaded successfully');
@@ -254,6 +256,11 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
254
256
  if (!page)
255
257
  throw new Error('Browser not initialized. Call browser_init first.');
256
258
  return { content: [{ type: 'text', text: JSON.stringify(await handleStreamExtractor(page, args)) }] };
259
+ // Web Crawler (Crawlee + brave-real-launcher)
260
+ case TOOL_NAMES.WEB_CRAWLER:
261
+ if (!page)
262
+ throw new Error('Browser not initialized. Call browser_init first.');
263
+ return { content: [{ type: 'text', text: JSON.stringify(await handleWebCrawler(page, args)) }] };
257
264
  default:
258
265
  throw new Error(`Unknown tool: ${name}`);
259
266
  }
@@ -622,6 +622,116 @@ export const TOOLS = [
622
622
  },
623
623
  },
624
624
  },
625
+ // ============================================================
626
+ // WEB CRAWLER TOOL (Crawlee-inspired)
627
+ // ============================================================
628
+ {
629
+ name: 'web_crawler',
630
+ description: 'Advanced web crawler with Crawlee-like features: URL queue (breadth/depth-first), proxy rotation, session management, auto-retry, rate limiting, concurrency control, and data extraction. Supports both browser and HTTP modes.',
631
+ inputSchema: {
632
+ type: 'object',
633
+ additionalProperties: false,
634
+ properties: {
635
+ startUrls: {
636
+ type: 'array',
637
+ items: { type: 'string' },
638
+ description: 'Initial URLs to start crawling from'
639
+ },
640
+ maxDepth: {
641
+ type: 'number',
642
+ description: 'Maximum crawl depth (1 = only start URLs)',
643
+ default: 3
644
+ },
645
+ maxPages: {
646
+ type: 'number',
647
+ description: 'Maximum pages to crawl',
648
+ default: 50
649
+ },
650
+ concurrency: {
651
+ type: 'number',
652
+ description: 'Number of concurrent requests',
653
+ default: 3
654
+ },
655
+ rateLimit: {
656
+ type: 'number',
657
+ description: 'Maximum requests per second',
658
+ default: 2
659
+ },
660
+ crawlStrategy: {
661
+ type: 'string',
662
+ enum: ['breadth-first', 'depth-first'],
663
+ description: 'URL queue strategy',
664
+ default: 'breadth-first'
665
+ },
666
+ includePattern: {
667
+ type: 'string',
668
+ description: 'Regex pattern for URLs to include'
669
+ },
670
+ excludePattern: {
671
+ type: 'string',
672
+ description: 'Regex pattern for URLs to exclude'
673
+ },
674
+ extractSelectors: {
675
+ type: 'object',
676
+ description: 'CSS selectors for data extraction (e.g., {"title": "h1", "links": "a[href]"})'
677
+ },
678
+ followLinks: {
679
+ type: 'boolean',
680
+ description: 'Follow discovered links',
681
+ default: true
682
+ },
683
+ downloadMedia: {
684
+ type: 'boolean',
685
+ description: 'Download images/videos/files',
686
+ default: false
687
+ },
688
+ savePath: {
689
+ type: 'string',
690
+ description: 'Path to save downloaded files'
691
+ },
692
+ proxyList: {
693
+ type: 'array',
694
+ items: { type: 'string' },
695
+ description: 'Proxy URLs for rotation (format: protocol://host:port)'
696
+ },
697
+ retryCount: {
698
+ type: 'number',
699
+ description: 'Number of retries for failed requests',
700
+ default: 3
701
+ },
702
+ retryDelayMs: {
703
+ type: 'number',
704
+ description: 'Delay between retries in ms (exponential backoff)',
705
+ default: 1000
706
+ },
707
+ timeout: {
708
+ type: 'number',
709
+ description: 'Request timeout in ms',
710
+ default: 30000
711
+ },
712
+ mode: {
713
+ type: 'string',
714
+ enum: ['browser', 'http'],
715
+ description: 'Crawl mode (browser = Puppeteer, http = fast HTTP)',
716
+ default: 'browser'
717
+ },
718
+ respectRobotsTxt: {
719
+ type: 'boolean',
720
+ description: 'Respect robots.txt rules',
721
+ default: true
722
+ },
723
+ userAgent: {
724
+ type: 'string',
725
+ description: 'Custom User-Agent string'
726
+ },
727
+ headers: {
728
+ type: 'object',
729
+ description: 'Custom headers for all requests'
730
+ },
731
+ },
732
+ required: ['startUrls'],
733
+ },
734
+ },
625
735
  ];
626
736
  // Tool name constants for type safety
627
737
  export const TOOL_NAMES = {
@@ -659,6 +769,8 @@ export const TOOL_NAMES = {
659
769
  // Enhanced tools
660
770
  IFRAME_HANDLER: 'iframe_handler',
661
771
  STREAM_EXTRACTOR: 'stream_extractor',
772
+ // Crawler tool
773
+ WEB_CRAWLER: 'web_crawler',
662
774
  };
663
775
  // Tool categories for organization
664
776
  export const TOOL_CATEGORIES = {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "brave-real-browser-mcp-server",
3
- "version": "2.24.4",
3
+ "version": "2.25.0",
4
4
  "description": "🦁 MCP server for Brave Real Browser - NPM Workspaces Monorepo with anti-detection features, SSE streaming, and LSP compatibility",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",
@@ -50,7 +50,9 @@
50
50
  "dependencies": {
51
51
  "@modelcontextprotocol/sdk": "latest",
52
52
  "@types/turndown": "latest",
53
- "brave-real-browser": "^2.5.4",
53
+ "brave-real-browser": "^2.6.0",
54
+ "crawlee": "^3.15.3",
55
+ "puppeteer-core": "^24.35.0",
54
56
  "turndown": "latest",
55
57
  "vscode-languageserver": "^9.0.1",
56
58
  "vscode-languageserver-textdocument": "^1.0.12"