brave-real-browser-mcp-server 2.24.5 → 2.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2876,3 +2876,245 @@ export async function handleStreamExtractor(page, args) {
2876
2876
  : 'No direct URLs found',
2877
2877
  };
2878
2878
  }
2879
/**
 * Crawl a set of start URLs with Crawlee's PuppeteerCrawler, optionally
 * launching Brave (via brave-real-launcher) and a custom Puppeteer build
 * (via brave-real-puppeteer-core) when those packages can be imported.
 *
 * Depth semantics: start URLs are depth 0, and links found on a page are
 * followed while that page's depth is strictly below `maxDepth`.
 *
 * Options read from `args`: startUrls, maxDepth, maxPages, concurrency,
 * rateLimit, retryCount, timeout, includePattern, excludePattern, proxyList,
 * userAgent, headers, extractSelectors, followLinks, downloadMedia, savePath.
 * Other keys are not read by this function. `downloadMedia` only collects
 * media URLs into `extractedData.mediaUrls`; nothing is written to `savePath`.
 *
 * @param {unknown} page - Not used; the crawler launches its own browser.
 *   Kept so the signature matches the other tool handlers.
 * @param {object} args - Crawl options (see above). `startUrls` is required.
 * @returns {Promise<{success: boolean, crawledPages: number, results: object[],
 *   errors: string[], message: string}>} Never rejects for invalid input,
 *   a missing Crawlee install, or crawler failures; those are reported through
 *   `success: false` and `errors`.
 */
export async function handleWebCrawler(page, args) {
    const options = args ?? {};
    const results = [];
    const errors = [];
    const visited = new Set();
    const describeError = (error) => (error instanceof Error ? error.message : String(error));
    // Single builder for every failure result so the shape stays uniform.
    const fail = (message, detail) => ({
        success: false,
        crawledPages: results.length,
        results,
        errors: [...errors, detail],
        message,
    });

    // --- Validate input before loading anything heavy -----------------------
    const startUrls = Array.isArray(options.startUrls)
        ? options.startUrls.filter((url) => typeof url === 'string' && url.trim() !== '')
        : [];
    if (startUrls.length === 0) {
        return fail('❌ Invalid crawler arguments', 'startUrls must be a non-empty array of URL strings');
    }
    // User-supplied regexes can be syntactically invalid; report that as a
    // structured error instead of letting the SyntaxError escape.
    const patterns = { includePattern: null, excludePattern: null };
    for (const name of Object.keys(patterns)) {
        if (!options[name]) {
            continue;
        }
        try {
            patterns[name] = new RegExp(options[name], 'i');
        }
        catch (error) {
            return fail('❌ Invalid crawler arguments', `Invalid ${name}: ${describeError(error)}`);
        }
    }
    const { includePattern, excludePattern } = patterns;
    const isAllowedUrl = (candidate) => {
        if (includePattern && !includePattern.test(candidate)) {
            return false;
        }
        if (excludePattern && excludePattern.test(candidate)) {
            return false;
        }
        return true;
    };

    // --- Load Crawlee dynamically so a missing install is a soft failure ----
    let crawlee;
    try {
        crawlee = await import('crawlee');
    }
    catch (e) {
        return fail('❌ Crawlee package not found', 'Crawlee not installed. Run: npm install crawlee');
    }
    const { PuppeteerCrawler, RequestQueue, Configuration, ProxyConfiguration } = crawlee;

    // Optional: Brave executable lookup. Best-effort; Crawlee's default
    // browser is used when the package or the binary is unavailable.
    let executablePath;
    try {
        const { getBravePath } = await import('brave-real-launcher');
        if (typeof getBravePath === 'function') {
            executablePath = getBravePath();
        }
    }
    catch (e) {
        // Intentionally ignored: fall back to the default executable.
    }

    // Optional: custom Puppeteer build. Crawlee needs an object exposing
    // `launch`, which may live on the module namespace or its default export.
    let launcher;
    try {
        const stealthModule = await import('brave-real-puppeteer-core');
        launcher = [stealthModule, stealthModule.default]
            .find((candidate) => typeof candidate?.launch === 'function');
    }
    catch (e) {
        // Intentionally ignored: fall back to Crawlee's default Puppeteer.
    }

    // --- Configuration ------------------------------------------------------
    const maxDepth = options.maxDepth ?? 3;
    const maxPages = options.maxPages ?? 50;
    const concurrency = options.concurrency ?? 3;
    const rateLimit = options.rateLimit ?? 2;
    const retryCount = options.retryCount ?? 3;
    const timeout = options.timeout ?? 30000;
    const proxyUrls = Array.isArray(options.proxyList)
        ? options.proxyList.filter((proxy) => typeof proxy === 'string' && proxy !== '')
        : [];

    // --- Rate limiting ------------------------------------------------------
    // Each caller synchronously reserves the next free time slot before
    // awaiting, so concurrent hooks cannot all observe the same timestamp and
    // fire together.
    const rateLimitDelay = Number.isFinite(rateLimit) && rateLimit > 0 ? 1000 / rateLimit : 0;
    let nextFreeSlot = 0;
    const enforceRateLimit = async () => {
        if (rateLimitDelay === 0) {
            return;
        }
        const now = Date.now();
        const slot = Math.max(now, nextFreeSlot);
        nextFreeSlot = slot + rateLimitDelay;
        if (slot > now) {
            await new Promise((resolve) => setTimeout(resolve, slot - now));
        }
    };

    let requestQueue;
    try {
        // Keep Crawlee storage in memory (no disk writes).
        Configuration.getGlobalConfig().set('persistStorage', false);
        // Use a uniquely named queue per call: the default queue is shared for
        // the lifetime of the process and would treat URLs from a previous
        // crawl as already handled.
        const queueName = `web-crawler-${Date.now()}-${Math.random().toString(36).slice(2, 10)}`;
        requestQueue = await RequestQueue.open(queueName);
        for (const url of startUrls) {
            await requestQueue.addRequest({
                url,
                userData: { depth: 0 },
            });
        }

        // Store a page result without exceeding maxPages, and ask the crawler
        // to wind down once the cap is hit so the rest of the queue is not
        // navigated needlessly. `stop` is optional-called for older Crawlee
        // versions that lack it.
        const record = (entry) => {
            if (results.length < maxPages) {
                results.push(entry);
            }
            if (results.length >= maxPages) {
                crawler.stop?.();
            }
        };

        const crawlerOptions = {
            requestQueue,
            maxConcurrency: concurrency,
            maxRequestRetries: retryCount,
            requestHandlerTimeoutSecs: timeout / 1000,
            launchContext: {
                launcher,
                launchOptions: {
                    headless: true,
                    executablePath,
                    args: [
                        '--no-sandbox',
                        '--disable-setuid-sandbox',
                        '--disable-blink-features=AutomationControlled',
                        '--disable-dev-shm-usage',
                        '--disable-accelerated-2d-canvas',
                        '--disable-gpu',
                    ],
                },
            },
            browserPoolOptions: {
                maxOpenPagesPerBrowser: 1,
            },
            // Throttle and apply per-request browser settings before navigation.
            preNavigationHooks: [
                async ({ page: crawlerPage }) => {
                    await enforceRateLimit();
                    if (options.userAgent) {
                        await crawlerPage.setUserAgent(options.userAgent);
                    }
                    if (options.headers) {
                        await crawlerPage.setExtraHTTPHeaders(options.headers);
                    }
                },
            ],
            requestHandler: async ({ request, page: crawlerPage }) => {
                const depth = request.userData.depth ?? 0;
                const url = request.url;
                // Skip if already visited or the page budget is spent.
                if (visited.has(url) || results.length >= maxPages) {
                    return;
                }
                visited.add(url);
                if (!isAllowedUrl(url)) {
                    return;
                }
                const result = { url, depth };
                try {
                    result.title = await crawlerPage.title();

                    // Extract data using the caller's selectors. One match
                    // yields a scalar, several yield an array of non-empty
                    // values, none leaves the key unset.
                    if (options.extractSelectors) {
                        result.extractedData = {};
                        for (const [key, selector] of Object.entries(options.extractSelectors)) {
                            try {
                                const values = await crawlerPage.$$eval(selector, (els) => els.map((el) => el.textContent?.trim() || el.getAttribute('href') || el.getAttribute('src')));
                                if (values.length === 1) {
                                    result.extractedData[key] = values[0];
                                }
                                else if (values.length > 1) {
                                    result.extractedData[key] = values.filter(Boolean);
                                }
                            }
                            catch (e) {
                                // Best-effort: an invalid or unmatched selector
                                // must not fail the whole page.
                            }
                        }
                    }

                    // Follow links if enabled and depth/page budget allow.
                    if (options.followLinks !== false && depth < maxDepth && results.length < maxPages) {
                        const pageLinks = await crawlerPage.$$eval('a[href]', (anchors) => anchors.map((a) => a.href).filter((href) => href.startsWith('http')));
                        result.links = pageLinks.slice(0, 100); // Limit stored links
                        const linksToEnqueue = [...new Set(pageLinks)]
                            .filter((link) => !visited.has(link) && isAllowedUrl(link))
                            .slice(0, 50);
                        for (const link of linksToEnqueue) {
                            try {
                                await requestQueue.addRequest({
                                    url: link,
                                    userData: { depth: depth + 1 },
                                });
                            }
                            catch (e) {
                                // Best-effort: a single un-enqueueable link
                                // should not abort processing of this page.
                            }
                        }
                    }

                    // Collect media URLs (collection only; nothing is saved).
                    if (options.downloadMedia && options.savePath) {
                        const mediaUrls = await crawlerPage.$$eval('img[src], video source[src], a[href$=".pdf"], a[href$=".jpg"], a[href$=".png"]', (els) => els.map((el) => el.getAttribute('src') || el.getAttribute('href')).filter(Boolean));
                        result.extractedData = result.extractedData || {};
                        result.extractedData.mediaUrls = mediaUrls;
                    }
                    record(result);
                }
                catch (error) {
                    result.error = describeError(error);
                    errors.push(`${url}: ${result.error}`);
                    record(result);
                }
            },
            failedRequestHandler: async ({ request }, error) => {
                errors.push(`Failed: ${request.url} - ${error.message}`);
            },
        };
        // Hand proxy rotation to Crawlee when a proxy list was supplied.
        if (proxyUrls.length > 0) {
            crawlerOptions.proxyConfiguration = new ProxyConfiguration({ proxyUrls });
        }

        const crawler = new PuppeteerCrawler(crawlerOptions);
        await crawler.run();
        return {
            success: results.length > 0,
            crawledPages: results.length,
            results,
            errors,
            message: `🕷️ Crawled ${results.length} pages (depth: ${maxDepth}, errors: ${errors.length})`,
        };
    }
    catch (error) {
        const detail = describeError(error);
        return fail(`❌ Crawler error: ${detail}`, detail);
    }
    finally {
        // Release the per-call queue so repeated crawls do not accumulate
        // in-memory storage. Cleanup failures must not mask the crawl result.
        try {
            await requestQueue?.drop();
        }
        catch (e) {
            // Intentionally ignored.
        }
    }
}
package/dist/index.js CHANGED
@@ -61,7 +61,9 @@ import { handleBreadcrumbNavigator, handleUrlRedirectTracer, handleSearchContent
61
61
  // Download tools
62
62
  handleFileDownloader,
63
63
  // Enhanced streaming/download tools
64
- handleIframeHandler, handleStreamExtractor, } from './handlers/advanced-tools.js';
64
+ handleIframeHandler, handleStreamExtractor,
65
+ // Web crawler
66
+ handleWebCrawler, } from './handlers/advanced-tools.js';
65
67
  // State for video recording
66
68
  const recorderState = new Map();
67
69
  debug('All modules loaded successfully');
@@ -254,6 +256,11 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
254
256
  if (!page)
255
257
  throw new Error('Browser not initialized. Call browser_init first.');
256
258
  return { content: [{ type: 'text', text: JSON.stringify(await handleStreamExtractor(page, args)) }] };
259
+ // Web Crawler (Crawlee + brave-real-launcher)
260
+ case TOOL_NAMES.WEB_CRAWLER:
261
+ if (!page)
262
+ throw new Error('Browser not initialized. Call browser_init first.');
263
+ return { content: [{ type: 'text', text: JSON.stringify(await handleWebCrawler(page, args)) }] };
257
264
  default:
258
265
  throw new Error(`Unknown tool: ${name}`);
259
266
  }
@@ -622,6 +622,116 @@ export const TOOLS = [
622
622
  },
623
623
  },
624
624
  },
625
+ // ============================================================
626
+ // WEB CRAWLER TOOL (Crawlee-inspired)
627
+ // ============================================================
628
+ {
629
+ name: 'web_crawler',
630
+ description: 'Advanced web crawler with Crawlee-like features: URL queue (breadth/depth-first), proxy rotation, session management, auto-retry, rate limiting, concurrency control, and data extraction. Supports both browser and HTTP modes.',
631
+ inputSchema: {
632
+ type: 'object',
633
+ additionalProperties: false,
634
+ properties: {
635
+ startUrls: {
636
+ type: 'array',
637
+ items: { type: 'string' },
638
+ description: 'Initial URLs to start crawling from'
639
+ },
640
+ maxDepth: {
641
+ type: 'number',
642
+ description: 'Maximum crawl depth (1 = only start URLs)',
643
+ default: 3
644
+ },
645
+ maxPages: {
646
+ type: 'number',
647
+ description: 'Maximum pages to crawl',
648
+ default: 50
649
+ },
650
+ concurrency: {
651
+ type: 'number',
652
+ description: 'Number of concurrent requests',
653
+ default: 3
654
+ },
655
+ rateLimit: {
656
+ type: 'number',
657
+ description: 'Maximum requests per second',
658
+ default: 2
659
+ },
660
+ crawlStrategy: {
661
+ type: 'string',
662
+ enum: ['breadth-first', 'depth-first'],
663
+ description: 'URL queue strategy',
664
+ default: 'breadth-first'
665
+ },
666
+ includePattern: {
667
+ type: 'string',
668
+ description: 'Regex pattern for URLs to include'
669
+ },
670
+ excludePattern: {
671
+ type: 'string',
672
+ description: 'Regex pattern for URLs to exclude'
673
+ },
674
+ extractSelectors: {
675
+ type: 'object',
676
+ description: 'CSS selectors for data extraction (e.g., {"title": "h1", "links": "a[href]"})'
677
+ },
678
+ followLinks: {
679
+ type: 'boolean',
680
+ description: 'Follow discovered links',
681
+ default: true
682
+ },
683
+ downloadMedia: {
684
+ type: 'boolean',
685
+ description: 'Download images/videos/files',
686
+ default: false
687
+ },
688
+ savePath: {
689
+ type: 'string',
690
+ description: 'Path to save downloaded files'
691
+ },
692
+ proxyList: {
693
+ type: 'array',
694
+ items: { type: 'string' },
695
+ description: 'Proxy URLs for rotation (format: protocol://host:port)'
696
+ },
697
+ retryCount: {
698
+ type: 'number',
699
+ description: 'Number of retries for failed requests',
700
+ default: 3
701
+ },
702
+ retryDelayMs: {
703
+ type: 'number',
704
+ description: 'Delay between retries in ms (exponential backoff)',
705
+ default: 1000
706
+ },
707
+ timeout: {
708
+ type: 'number',
709
+ description: 'Request timeout in ms',
710
+ default: 30000
711
+ },
712
+ mode: {
713
+ type: 'string',
714
+ enum: ['browser', 'http'],
715
+ description: 'Crawl mode (browser = Puppeteer, http = fast HTTP)',
716
+ default: 'browser'
717
+ },
718
+ respectRobotsTxt: {
719
+ type: 'boolean',
720
+ description: 'Respect robots.txt rules',
721
+ default: true
722
+ },
723
+ userAgent: {
724
+ type: 'string',
725
+ description: 'Custom User-Agent string'
726
+ },
727
+ headers: {
728
+ type: 'object',
729
+ description: 'Custom headers for all requests'
730
+ },
731
+ },
732
+ required: ['startUrls'],
733
+ },
734
+ },
625
735
  ];
626
736
  // Tool name constants for type safety
627
737
  export const TOOL_NAMES = {
@@ -659,6 +769,8 @@ export const TOOL_NAMES = {
659
769
  // Enhanced tools
660
770
  IFRAME_HANDLER: 'iframe_handler',
661
771
  STREAM_EXTRACTOR: 'stream_extractor',
772
+ // Crawler tool
773
+ WEB_CRAWLER: 'web_crawler',
662
774
  };
663
775
  // Tool categories for organization
664
776
  export const TOOL_CATEGORIES = {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "brave-real-browser-mcp-server",
3
- "version": "2.24.5",
3
+ "version": "2.25.0",
4
4
  "description": "🦁 MCP server for Brave Real Browser - NPM Workspaces Monorepo with anti-detection features, SSE streaming, and LSP compatibility",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",
@@ -50,7 +50,9 @@
50
50
  "dependencies": {
51
51
  "@modelcontextprotocol/sdk": "latest",
52
52
  "@types/turndown": "latest",
53
- "brave-real-browser": "^2.5.5",
53
+ "brave-real-browser": "^2.6.0",
54
+ "crawlee": "^3.15.3",
55
+ "puppeteer-core": "^24.35.0",
54
56
  "turndown": "latest",
55
57
  "vscode-languageserver": "^9.0.1",
56
58
  "vscode-languageserver-textdocument": "^1.0.12"