npm - mcpbrowser - Versions diffs - 0.2.25 → 0.2.27 - Mend

mcpbrowser 0.2.25 → 0.2.27

This diff compares the contents of two publicly available package versions, each released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (8)

package/extension/icon.png +0 -0
package/extension/icon.svg +49 -28
package/extension/package.json +1 -1
package/package.json +1 -1
package/src/mcp-browser.js +174 -40
package/tests/domain-tab-pooling.test.js +110 -0
package/tests/integration.test.js +28 -0
package/tests/prepare-html.test.js +206 -2

package/extension/icon.png CHANGED Viewed

Binary file

package/extension/icon.svg CHANGED Viewed

@@ -1,35 +1,56 @@
 <svg width="128" height="128" viewBox="0 0 128 128" xmlns="http://www.w3.org/2000/svg">
-  <!-- Background circle -->
-  <circle cx="64" cy="64" r="60" fill="#0078D4"/>
-  <!-- Browser window outline -->
-  <rect x="24" y="28" width="80" height="56" rx="4" fill="white"/>
-  <rect x="28" y="32" width="72" height="48" rx="2" fill="#E8F4FD"/>
-  <!-- Browser chrome bar -->
-  <rect x="28" y="32" width="72" height="8" fill="#0078D4"/>
-  <!-- Browser dots (window controls) -->
-  <circle cx="33" cy="36" r="1.5" fill="white"/>
-  <circle cx="38" cy="36" r="1.5" fill="white"/>
-  <circle cx="43" cy="36" r="1.5" fill="white"/>
-  <!-- Lock/Shield icon (authentication symbol) -->
-  <g transform="translate(48, 48)">
-    <!-- Shield -->
-    <path d="M 16 4 L 16 14 C 16 18 12 22 8 24 C 4 22 0 18 0 14 L 0 4 L 8 0 Z"
-          fill="#10B981" stroke="none"/>
-    <!-- Checkmark inside shield -->
-    <path d="M 5 12 L 7 14 L 12 8"
+  <!-- Modern gradient with MCP typography focus -->
+  <defs>
+    <linearGradient id="bg6" x1="0%" y1="0%" x2="100%" y2="100%">
+      <stop offset="0%" style="stop-color:#4F46E5;stop-opacity:1" />
+      <stop offset="100%" style="stop-color:#6366F1;stop-opacity:1" />
+    </linearGradient>
+  </defs>
+  <!-- Gradient background -->
+  <rect width="128" height="128" rx="24" fill="url(#bg6)"/>
+  <!-- Large white rounded square -->
+  <rect x="16" y="16" width="96" height="96" rx="16" fill="white" opacity="0.98"/>
+  <!-- Top section with browser chrome -->
+  <rect x="24" y="24" width="80" height="14" rx="7" fill="#F8FAFC"/>
+  <!-- Browser dots -->
+  <circle cx="32" cy="31" r="2" fill="#E5E7EB"/>
+  <circle cx="40" cy="31" r="2" fill="#E5E7EB"/>
+  <circle cx="48" cy="31" r="2" fill="#E5E7EB"/>
+  <!-- Lock with shield -->
+  <g transform="translate(72, 26)">
+    <circle cx="5" cy="5" r="5" fill="#10B981"/>
+    <path d="M 3.5 5 L 3.5 3.8 C 3.5 3 4 2.5 5 2.5 C 6 2.5 6.5 3 6.5 3.8 L 6.5 5"
+          stroke="white" stroke-width="0.8" fill="none"/>
+    <rect x="3" y="5" width="4" height="4" rx="0.5" fill="white"/>
+  </g>
+  <!-- Large MCP text - main focus -->
+  <text x="64" y="68" font-family="Arial, sans-serif" font-size="32" font-weight="bold" fill="#4F46E5" text-anchor="middle">MCP</text>
+  <!-- Subtitle -->
+  <text x="64" y="82" font-family="Arial, sans-serif" font-size="9" fill="#475569" text-anchor="middle">BROWSER</text>
+  <!-- Bottom decorative line -->
+  <rect x="36" y="92" width="56" height="2" rx="1" fill="#E0E7FF"/>
+  <!-- Connection dots -->
+  <circle cx="44" cy="100" r="2.5" fill="#10B981"/>
+  <circle cx="52" cy="100" r="2.5" fill="#10B981" opacity="0.6"/>
+  <circle cx="60" cy="100" r="2.5" fill="#10B981" opacity="0.3"/>
+  <!-- Security badge -->
+  <g transform="translate(66, 96)">
+    <circle cx="6" cy="4" r="6" fill="#10B981"/>
+    <path d="M 4 4 L 5.5 5.5 L 8.5 2.5"
           stroke="white"
-          stroke-width="2"
+          stroke-width="1.2"
           stroke-linecap="round"
           stroke-linejoin="round"
           fill="none"/>
   </g>
-  <!-- Connection indicator (small dots) -->
-  <circle cx="90" cy="48" r="3" fill="#10B981"/>
-  <circle cx="98" cy="48" r="2" fill="#10B981" opacity="0.7"/>
-  <circle cx="104" cy="48" r="1.5" fill="#10B981" opacity="0.4"/>
 </svg>

package/extension/package.json CHANGED Viewed

@@ -2,7 +2,7 @@
   "name": "mcpbrowser",
   "displayName": "MCP Browser",
   "description": "Lightweight MCP server-extension for in-browser web page fetching - handles login, SSO, and anti-crawler restrictions. Should be used when standard fetch_webpage fails",
-  "version": "0.2.25",
+  "version": "0.2.27",
   "publisher": "cherchyk",
   "icon": "icon.png",
   "engines": {

package/package.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {  "name": "mcpbrowser",
-  "version": "0.2.25",
+  "version": "0.2.27",
   "mcpName": "io.github.cherchyk/browser",
   "type": "module",
   "description": "MCP server for in-browser web page fetching using Chrome DevTools Protocol",

package/src/mcp-browser.js CHANGED Viewed

@@ -15,7 +15,11 @@ const chromeHost = process.env.CHROME_REMOTE_DEBUG_HOST || "127.0.0.1";
 const chromePort = Number(process.env.CHROME_REMOTE_DEBUG_PORT || 9222);
 const explicitWSEndpoint = process.env.CHROME_WS_ENDPOINT;
-// Use default Chrome profile if not explicitly set
+/**
+ * Get the default user data directory for Chrome debugging profile.
+ * Creates a dedicated profile directory to avoid conflicts with the user's main Chrome profile.
+ * @returns {string} The platform-specific path to the Chrome debug profile directory
+ */
 function getDefaultUserDataDir() {
   const platform = os.platform();
   const home = os.homedir();
@@ -33,6 +37,10 @@ function getDefaultUserDataDir() {
 const userDataDir = process.env.CHROME_USER_DATA_DIR || getDefaultUserDataDir();
 const chromePathEnv = process.env.CHROME_PATH;
+/**
+ * Get platform-specific default paths where Chrome/Edge browsers are typically installed.
+ * @returns {string[]} Array of possible browser executable paths for the current platform
+ */
 function getDefaultChromePaths() {
   const platform = os.platform();
@@ -63,6 +71,10 @@ let cachedBrowser = null;
 let domainPages = new Map(); // hostname -> page mapping for tab reuse across domains
 let chromeLaunchPromise = null; // prevent multiple simultaneous launches
+/**
+ * Check if Chrome DevTools Protocol endpoint is available and responding.
+ * @returns {Promise<boolean>} True if DevTools endpoint is accessible, false otherwise
+ */
 async function devtoolsAvailable() {
   try {
     const url = `http://${chromeHost}:${chromePort}/json/version`;
@@ -75,11 +87,22 @@ async function devtoolsAvailable() {
   }
 }
+/**
+ * Find the Chrome/Edge executable path, checking environment variable first, then default locations.
+ * @returns {string|undefined} Path to the browser executable, or undefined if not found
+ */
 function findChromePath() {
   if (chromePathEnv && existsSync(chromePathEnv)) return chromePathEnv;
   return defaultChromePaths.find((p) => existsSync(p));
 }
+/**
+ * Launch Chrome with remote debugging enabled if not already running.
+ * Uses a singleton pattern to prevent multiple simultaneous launches.
+ * Waits up to 20 seconds for Chrome to become available on the DevTools port.
+ * @returns {Promise<void>}
+ * @throws {Error} If Chrome cannot be found or fails to start within timeout
+ */
 async function launchChromeIfNeeded() {
   if (explicitWSEndpoint) return; // user provided explicit endpoint; assume managed externally
@@ -128,6 +151,12 @@ async function launchChromeIfNeeded() {
   return await chromeLaunchPromise;
 }
+/**
+ * Resolve the WebSocket endpoint URL for connecting to Chrome DevTools Protocol.
+ * Either returns the explicitly configured endpoint or queries it from the DevTools JSON API.
+ * @returns {Promise<string>} The WebSocket URL for connecting to Chrome
+ * @throws {Error} If unable to reach DevTools or no WebSocket URL is available
+ */
 async function resolveWSEndpoint() {
   if (explicitWSEndpoint) return explicitWSEndpoint;
   const url = `http://${chromeHost}:${chromePort}/json/version`;
@@ -142,6 +171,55 @@ async function resolveWSEndpoint() {
   return data.webSocketDebuggerUrl;
 }
+/**
+ * Rebuild the domain-to-page mapping from existing browser tabs.
+ * This enables tab reuse across reconnections by discovering tabs that are already open.
+ * Skips internal pages like about:blank and chrome:// URLs.
+ * @param {Browser} browser - The Puppeteer browser instance
+ * @returns {Promise<void>}
+ */
+async function rebuildDomainPagesMap(browser) {
+  try {
+    const pages = await browser.pages();
+    console.error(`[MCPBrowser] Reconnected to browser with ${pages.length} existing tabs`);
+    for (const page of pages) {
+      try {
+        const pageUrl = page.url();
+        // Skip chrome:// pages, about:blank, and other internal pages
+        if (!pageUrl ||
+            pageUrl === 'about:blank' ||
+            pageUrl.startsWith('chrome://') ||
+            pageUrl.startsWith('chrome-extension://') ||
+            pageUrl.startsWith('devtools://')) {
+          continue;
+        }
+        const hostname = new URL(pageUrl).hostname;
+        if (hostname && !domainPages.has(hostname)) {
+          domainPages.set(hostname, page);
+          console.error(`[MCPBrowser] Mapped existing tab for domain: ${hostname} (${pageUrl})`);
+        }
+      } catch (err) {
+        // Skip pages that are inaccessible or have invalid URLs
+        continue;
+      }
+    }
+    if (domainPages.size > 0) {
+      console.error(`[MCPBrowser] Restored ${domainPages.size} domain-to-tab mappings`);
+    }
+  } catch (err) {
+    console.error(`[MCPBrowser] Warning: Could not rebuild domain pages map: ${err.message}`);
+  }
+}
+/**
+ * Get or create a connection to the Chrome browser.
+ * Returns cached browser if still connected, otherwise establishes a new connection.
+ * Rebuilds domain-to-page mapping on reconnection to enable tab reuse.
+ * @returns {Promise<Browser>} Connected Puppeteer browser instance
+ */
 async function getBrowser() {
   await launchChromeIfNeeded();
   if (cachedBrowser && cachedBrowser.isConnected()) return cachedBrowser;
@@ -154,10 +232,24 @@ async function getBrowser() {
     cachedBrowser = null;
     domainPages.clear(); // Clear all domain page mappings
   });
+  // Rebuild domainPages map from existing tabs to enable reuse across reconnections
+  await rebuildDomainPagesMap(cachedBrowser);
   return cachedBrowser;
 }
-async function fetchPage({ url }) {
+/**
+ * Fetch a web page using Chrome browser, with support for authentication flows and tab reuse.
+ * Reuses existing tabs per domain when possible. Handles authentication redirects by waiting
+ * for user to complete login (up to 10 minutes). Processes HTML to remove unnecessary elements
+ * and convert relative URLs to absolute.
+ * @param {Object} params - Fetch parameters
+ * @param {string} params.url - The URL to fetch
+ * @param {boolean} [params.removeUnnecessaryHTML=true] - Whether to clean HTML (removes scripts, styles, etc.)
+ * @returns {Promise<Object>} Result object with success status, URL, HTML content, or error details
+ */
+async function fetchPage({ url, removeUnnecessaryHTML = true }) {
   // Hardcoded smart defaults
   const waitUntil = "networkidle0";
   const navigationTimeout = 60000; // Initial navigation timeout
@@ -280,11 +372,20 @@ async function fetchPage({ url }) {
     // Extract HTML content
     const html = await page.evaluate(() => document.documentElement?.outerHTML || "");
-    const preparedHtml = prepareHtml(html, page.url());
+    // Process HTML based on removeUnnecessaryHTML parameter
+    let processedHtml;
+    if (removeUnnecessaryHTML) {
+      const cleaned = cleanHtml(html);
+      processedHtml = enrichHtml(cleaned, page.url());
+    } else {
+      processedHtml = enrichHtml(html, page.url());
+    }
     const result = {
       success: true,
       url: page.url(),
-      html: preparedHtml
+      html: processedHtml
     };
     wasSuccess = true;
@@ -297,20 +398,25 @@ async function fetchPage({ url }) {
   }
 }
+/**
+ * Truncate a string to a maximum length, adding "... [truncated]" if truncated.
+ * @param {string} str - The string to truncate
+ * @param {number} max - Maximum length
+ * @returns {string} The original or truncated string
+ */
 function truncate(str, max) {
   if (!str) return "";
   return str.length > max ? `${str.slice(0, max)}... [truncated]` : str;
 }
 /**
- * Prepares HTML for consumption by:
- * 1. Converting relative URLs to absolute URLs
- * 2. Removing non-content elements (scripts, styles, meta tags, comments)
- * 3. Removing code-related attributes (class, id, style, data-*, event handlers)
- * 4. Removing SVG graphics and other non-text elements
- * 5. Collapsing excessive whitespace
+ * Removes non-content elements and attributes from HTML:
+ * 1. Removing non-content elements (scripts, styles, meta tags, comments)
+ * 2. Removing code-related attributes (class, id, style, data-*, event handlers)
+ * 3. Removing SVG graphics and other non-text elements
+ * 4. Collapsing excessive whitespace
  */
-function prepareHtml(html, baseUrl) {
+function cleanHtml(html) {
   if (!html) return "";
   let cleaned = html;
@@ -336,32 +442,6 @@ function prepareHtml(html, baseUrl) {
   // Remove link tags (stylesheets, preload, etc.)
   cleaned = cleaned.replace(/<link\b[^>]*>/gi, '');
-  // Convert relative URLs to absolute in href attributes
-  cleaned = cleaned.replace(/href=["']([^"']+)["']/gi, (match, url) => {
-    if (!url || url.startsWith('http://') || url.startsWith('https://') || url.startsWith('//') || url.startsWith('#') || url.startsWith('mailto:') || url.startsWith('tel:')) {
-      return match;
-    }
-    try {
-      const absoluteUrl = new URL(url, baseUrl).href;
-      return `href="${absoluteUrl}"`;
-    } catch {
-      return match;
-    }
-  });
-  // Convert relative URLs to absolute in src attributes
-  cleaned = cleaned.replace(/src=["']([^"']+)["']/gi, (match, url) => {
-    if (!url || url.startsWith('http://') || url.startsWith('https://') || url.startsWith('//') || url.startsWith('data:')) {
-      return match;
-    }
-    try {
-      const absoluteUrl = new URL(url, baseUrl).href;
-      return `src="${absoluteUrl}"`;
-    } catch {
-      return match;
-    }
-  });
   // Remove inline style attributes
   cleaned = cleaned.replace(/\s+style=["'][^"']*["']/gi, '');
@@ -392,17 +472,71 @@ function prepareHtml(html, baseUrl) {
   return cleaned;
 }
+/**
+ * Enriches HTML by converting relative URLs to absolute URLs
+ */
+function enrichHtml(html, baseUrl) {
+  if (!html) return "";
+  let enriched = html;
+  // Convert relative URLs to absolute in href attributes
+  enriched = enriched.replace(/href=["']([^"']+)["']/gi, (match, url) => {
+    if (!url || url.startsWith('http://') || url.startsWith('https://') || url.startsWith('//') || url.startsWith('#') || url.startsWith('mailto:') || url.startsWith('tel:')) {
+      return match;
+    }
+    try {
+      const absoluteUrl = new URL(url, baseUrl).href;
+      return `href="${absoluteUrl}"`;
+    } catch {
+      return match;
+    }
+  });
+  // Convert relative URLs to absolute in src attributes
+  enriched = enriched.replace(/src=["']([^"']+)["']/gi, (match, url) => {
+    if (!url || url.startsWith('http://') || url.startsWith('https://') || url.startsWith('//') || url.startsWith('data:')) {
+      return match;
+    }
+    try {
+      const absoluteUrl = new URL(url, baseUrl).href;
+      return `src="${absoluteUrl}"`;
+    } catch {
+      return match;
+    }
+  });
+  return enriched;
+}
+/**
+ * Prepares HTML for consumption by cleaning and enriching it.
+ * @deprecated Use cleanHtml and enrichHtml separately for better control
+ */
+function prepareHtml(html, baseUrl) {
+  if (!html) return "";
+  const cleaned = cleanHtml(html);
+  return enrichHtml(cleaned, baseUrl);
+}
+/**
+ * Main entry point for the MCP server.
+ * Sets up the Model Context Protocol server with fetch_webpage_protected tool,
+ * configures request handlers, and starts the stdio transport.
+ * @returns {Promise<void>}
+ */
 async function main() {
-  const server = new Server({ name: "MCPBrowser", version: "0.2.25" }, { capabilities: { tools: {} } });
+  const server = new Server({ name: "MCPBrowser", version: "0.2.26" }, { capabilities: { tools: {} } });
   const tools = [
     {
       name: "fetch_webpage_protected",
-      description: "Fetches web pages by loading them in Chrome/Edge browser. Use for: (1) auth-required pages (401/403, login, SSO, corporate intranets), (2) anti-bot/crawler blocks, CAPTCHA/human verification, (3) JavaScript-heavy sites (SPAs, dynamic content).\n\nAUTH FLOW: If page requires authentication, browser opens and WAITS (up to 10 min) for user to log in, then automatically returns content once loaded. Single call returns correct content, no retry needed.\n\nRULES: (1) ONE URL at a time, never parallel. (2) Wait for full response - may take minutes for auth. (3) Skip only if 404. (4) Returns HTML with clickable links for subpage navigation.",
+      description: "Fetches web pages using Chrome/Edge browser. Handles auth-required pages, CAPTCHA, SSO, anti-bot protection, and JavaScript-heavy sites.\n\nWaits for user interaction (login, CAPTCHA) if needed, then returns content automatically.\n\nIMPORTANT: Call ONE URL at a time only. Never parallel - causes conflicts. Wait for completion before next URL.",
       inputSchema: {
         type: "object",
         properties: {
           url: { type: "string", description: "The URL to fetch" },
+          removeUnnecessaryHTML: { type: "boolean", description: "Remove Unnecessary HTML for size reduction by 90%.", default: true }
         },
         required: ["url"],
         additionalProperties: false,
@@ -453,7 +587,7 @@ async function main() {
 }
 // Export for testing
-export { fetchPage, getBrowser, prepareHtml };
+export { fetchPage, getBrowser, prepareHtml, cleanHtml, enrichHtml };
 // Run the MCP server
 main().catch((err) => {

package/tests/domain-tab-pooling.test.js CHANGED Viewed

@@ -310,6 +310,116 @@ async function runTests() {
     assert(domainPages.size === 1, 'Should still have only 1 domain (eng.ms) in map after all loads');
   });
+  await test('Should rebuild domain pages map on reconnection', async () => {
+    const domainPages = new Map();
+    const browser = new MockBrowser();
+    // Simulate having tabs already open from previous session
+    const page1 = await browser.newPage();
+    await page1.goto('https://github.com/user/repo');
+    const page2 = await browser.newPage();
+    await page2.goto('https://microsoft.com/docs');
+    const page3 = await browser.newPage();
+    await page3.goto('https://eng.ms/docs/products');
+    const page4 = await browser.newPage();
+    await page4.goto('about:blank');
+    // Verify pages exist but map is empty (simulating disconnection)
+    assert(domainPages.size === 0, 'Domain pages map should be empty before rebuild');
+    // Simulate rebuildDomainPagesMap function
+    const pages = await browser.pages();
+    assert(pages.length === 4, `Should have 4 tabs open, got ${pages.length}`);
+    for (const page of pages) {
+      try {
+        const pageUrl = page.url();
+        // Skip internal pages
+        if (!pageUrl ||
+            pageUrl === 'about:blank' ||
+            pageUrl.startsWith('chrome://') ||
+            pageUrl.startsWith('chrome-extension://') ||
+            pageUrl.startsWith('devtools://')) {
+          continue;
+        }
+        const hostname = new URL(pageUrl).hostname;
+        if (hostname && !domainPages.has(hostname)) {
+          domainPages.set(hostname, page);
+        }
+      } catch (err) {
+        // Skip pages with invalid URLs
+        continue;
+      }
+    }
+    // Verify map was rebuilt correctly
+    assert(domainPages.size === 3, `Should have 3 domains in map (excluding about:blank), got ${domainPages.size}`);
+    assert(domainPages.has('github.com'), 'Should have github.com in map');
+    assert(domainPages.has('microsoft.com'), 'Should have microsoft.com in map');
+    assert(domainPages.has('eng.ms'), 'Should have eng.ms in map');
+    assert(!domainPages.has('about:blank'), 'Should not have about:blank in map');
+    // Verify correct pages are mapped
+    assert(domainPages.get('github.com').url() === 'https://github.com/user/repo', 'github.com should map to correct page');
+    assert(domainPages.get('microsoft.com').url() === 'https://microsoft.com/docs', 'microsoft.com should map to correct page');
+    assert(domainPages.get('eng.ms').url() === 'https://eng.ms/docs/products', 'eng.ms should map to correct page');
+    // Verify tabs can be reused after rebuild
+    const githubPage = domainPages.get('github.com');
+    assert(!githubPage.isClosed(), 'Rebuilt github.com page should still be open');
+    await githubPage.goto('https://github.com/another/repo');
+    assert(githubPage.url() === 'https://github.com/another/repo', 'Rebuilt page should be navigable');
+  });
+  await test('Should skip chrome:// and internal pages during rebuild', async () => {
+    const domainPages = new Map();
+    const browser = new MockBrowser();
+    // Create pages with various internal URLs
+    const page1 = await browser.newPage();
+    await page1.goto('chrome://settings');
+    const page2 = await browser.newPage();
+    await page2.goto('chrome-extension://abc123/popup.html');
+    const page3 = await browser.newPage();
+    await page3.goto('devtools://devtools/bundled/devtools_app.html');
+    const page4 = await browser.newPage();
+    await page4.goto('https://example.com/page');
+    // Rebuild domain pages map
+    const pages = await browser.pages();
+    for (const page of pages) {
+      try {
+        const pageUrl = page.url();
+        if (!pageUrl ||
+            pageUrl === 'about:blank' ||
+            pageUrl.startsWith('chrome://') ||
+            pageUrl.startsWith('chrome-extension://') ||
+            pageUrl.startsWith('devtools://')) {
+          continue;
+        }
+        const hostname = new URL(pageUrl).hostname;
+        if (hostname && !domainPages.has(hostname)) {
+          domainPages.set(hostname, page);
+        }
+      } catch (err) {
+        continue;
+      }
+    }
+    // Only example.com should be in the map
+    assert(domainPages.size === 1, `Should only have 1 domain (example.com), got ${domainPages.size}`);
+    assert(domainPages.has('example.com'), 'Should have example.com in map');
+    assert(!domainPages.has('chrome'), 'Should not have chrome:// pages in map');
+  });
   // Summary
   console.log('\n' + '='.repeat(50));

package/tests/integration.test.js CHANGED Viewed

@@ -130,6 +130,34 @@ async function runIntegrationTests() {
         assert(linkResult.html && linkResult.html.length > 0, `Link ${i+1} should return HTML content`);
       }
     });
+    await test('Should support removeUnnecessaryHTML parameter', async () => {
+      const url = 'https://eng.ms/docs/products/geneva';
+      console.log(`   📄 Fetching with removeUnnecessaryHTML=true (default)`);
+      const cleanResult = await fetchPage({ url, removeUnnecessaryHTML: true });
+      assert(cleanResult.success, 'Should successfully fetch with removeUnnecessaryHTML=true');
+      assert(cleanResult.html && cleanResult.html.length > 0, 'Should return cleaned HTML');
+      assert(!cleanResult.html.includes('<script'), 'Cleaned HTML should not contain script tags');
+      assert(!cleanResult.html.includes('<style'), 'Cleaned HTML should not contain style tags');
+      assert(!cleanResult.html.includes('class='), 'Cleaned HTML should not contain class attributes');
+      console.log(`   ✅ Cleaned HTML length: ${cleanResult.html.length} chars`);
+      console.log(`   📄 Fetching with removeUnnecessaryHTML=false`);
+      const rawResult = await fetchPage({ url, removeUnnecessaryHTML: false });
+      assert(rawResult.success, 'Should successfully fetch with removeUnnecessaryHTML=false');
+      assert(rawResult.html && rawResult.html.length > 0, 'Should return raw HTML');
+      console.log(`   ✅ Raw HTML length: ${rawResult.html.length} chars`);
+      // Raw HTML should be larger than cleaned HTML
+      assert(rawResult.html.length > cleanResult.html.length,
+        `Raw HTML (${rawResult.html.length}) should be larger than cleaned (${cleanResult.html.length})`);
+      const reductionPercent = ((rawResult.html.length - cleanResult.html.length) / rawResult.html.length * 100).toFixed(1);
+      console.log(`   📊 Size reduction: ${reductionPercent}% (${rawResult.html.length} → ${cleanResult.html.length} chars)`);
+    });
   } catch (error) {
     console.error('\n❌ Test suite error:', error.message);

package/tests/prepare-html.test.js CHANGED Viewed

@@ -1,7 +1,7 @@
 import assert from 'assert';
-import { prepareHtml } from '../src/mcp-browser.js';
+import { prepareHtml, cleanHtml, enrichHtml } from '../src/mcp-browser.js';
-console.log('🧪 Testing prepareHtml function\n');
+console.log('🧪 Testing HTML processing functions\n');
 let testsPassed = 0;
 let testsFailed = 0;
@@ -299,6 +299,210 @@ test('Should handle HTML with all types of removals', () => {
   assert(result.includes('Text content'), 'Should preserve text');
 });
+// ==================================================
+// cleanHtml Function Tests
+// ==================================================
+console.log('\n🧹 Testing cleanHtml function\n');
+// Test cleanHtml 1: Remove HTML comments
+test('cleanHtml: Should remove HTML comments', () => {
+  const html = '<div>Content<!-- This is a comment --></div>';
+  const result = cleanHtml(html);
+  assert(!result.includes('<!--'), 'Should not contain comment start');
+  assert(!result.includes('-->'), 'Should not contain comment end');
+  assert(result.includes('Content'), 'Should preserve content');
+});
+// Test cleanHtml 2: Remove script tags
+test('cleanHtml: Should remove script tags and their content', () => {
+  const html = '<div>Keep this</div><script>alert("remove");</script><div>And this</div>';
+  const result = cleanHtml(html);
+  assert(!result.includes('<script'), 'Should not contain script tag');
+  assert(!result.includes('alert'), 'Should not contain script content');
+  assert(result.includes('Keep this'), 'Should preserve content');
+});
+// Test cleanHtml 3: Remove style tags
+test('cleanHtml: Should remove style tags and their content', () => {
+  const html = '<div>Content</div><style>.class { color: red; }</style>';
+  const result = cleanHtml(html);
+  assert(!result.includes('<style'), 'Should not contain style tag');
+  assert(!result.includes('color: red'), 'Should not contain style content');
+  assert(result.includes('Content'), 'Should preserve content');
+});
+// Test cleanHtml 4: Remove meta tags
+test('cleanHtml: Should remove meta tags', () => {
+  const html = '<head><meta charset="utf-8"><meta name="viewport" content="width=device-width"></head><body>Content</body>';
+  const result = cleanHtml(html);
+  assert(!result.includes('<meta'), 'Should not contain meta tags');
+  assert(result.includes('Content'), 'Should preserve content');
+});
+// Test cleanHtml 5: Remove inline style attributes
+test('cleanHtml: Should remove inline style attributes', () => {
+  const html = '<div style="color: red; font-size: 14px;">Content</div>';
+  const result = cleanHtml(html);
+  assert(!result.includes('style='), 'Should remove style attribute');
+  assert(result.includes('Content'), 'Should preserve content');
+});
+// Test cleanHtml 6: Remove class attributes
+test('cleanHtml: Should remove class attributes', () => {
+  const html = '<div class="container main-content">Text</div>';
+  const result = cleanHtml(html);
+  assert(!result.includes('class='), 'Should remove class attribute');
+  assert(result.includes('Text'), 'Should preserve content');
+});
+// Test cleanHtml 7: Remove id attributes
+test('cleanHtml: Should remove id attributes', () => {
+  const html = '<div id="main-section">Content</div>';
+  const result = cleanHtml(html);
+  assert(!result.includes('id='), 'Should remove id attribute');
+  assert(result.includes('Content'), 'Should preserve content');
+});
+// Test cleanHtml 8: Remove SVG tags
+test('cleanHtml: Should remove SVG tags and content', () => {
+  const html = '<div>Text</div><svg width="100" height="100"><circle cx="50" cy="50" r="40"/></svg>';
+  const result = cleanHtml(html);
+  assert(!result.includes('<svg'), 'Should remove svg tag');
+  assert(!result.includes('circle'), 'Should remove svg content');
+  assert(result.includes('Text'), 'Should preserve content');
+});
+// Test cleanHtml 9: Collapse whitespace
+test('cleanHtml: Should collapse multiple whitespace into single space', () => {
+  const html = '<div>Line 1\n\n\n   Line 2\t\t\tLine 3</div>';
+  const result = cleanHtml(html);
+  assert(!result.includes('\n\n'), 'Should remove multiple newlines');
+  assert(!result.includes('   '), 'Should remove multiple spaces');
+  assert(result.includes('Line 1'), 'Should preserve content');
+});
+// Test cleanHtml 10: Does NOT modify URLs (that's enrichHtml's job)
+test('cleanHtml: Should NOT modify relative URLs', () => {
+  const html = '<a href="/docs/page">Link</a><img src="/images/logo.png">';
+  const result = cleanHtml(html);
+  assert(result.includes('href="/docs/page"'), 'Should keep relative href unchanged');
+  assert(result.includes('src="/images/logo.png"'), 'Should keep relative src unchanged');
+});
+// ==================================================
+// enrichHtml Function Tests
+// ==================================================
+console.log('\n🔗 Testing enrichHtml function\n');
+// Test enrichHtml 1: Convert relative href URLs
+test('enrichHtml: Should convert relative href URLs to absolute', () => {
+  const html = '<a href="/docs/page">Link</a>';
+  const result = enrichHtml(html, 'https://example.com');
+  assert(result.includes('href="https://example.com/docs/page"'), 'Should convert relative href to absolute');
+});
+// Test enrichHtml 2: Keep absolute href URLs unchanged
+test('enrichHtml: Should keep absolute href URLs unchanged', () => {
+  const html = '<a href="https://other.com/page">Link</a>';
+  const result = enrichHtml(html, 'https://example.com');
+  assert(result.includes('href="https://other.com/page"'), 'Should keep absolute href unchanged');
+});
+// Test enrichHtml 3: Convert relative src URLs
+test('enrichHtml: Should convert relative src URLs to absolute', () => {
+  const html = '<img src="/images/logo.png">';
+  const result = enrichHtml(html, 'https://example.com');
+  assert(result.includes('src="https://example.com/images/logo.png"'), 'Should convert relative src to absolute');
+});
+// Test enrichHtml 4: Keep absolute src URLs unchanged
+test('enrichHtml: Should keep absolute src URLs unchanged', () => {
+  const html = '<img src="https://cdn.example.com/logo.png">';
+  const result = enrichHtml(html, 'https://example.com');
+  assert(result.includes('src="https://cdn.example.com/logo.png"'), 'Should keep absolute src unchanged');
+});
+// Test enrichHtml 5: Handle anchor links
+test('enrichHtml: Should not modify anchor links', () => {
+  const html = '<a href="#section">Jump</a>';
+  const result = enrichHtml(html, 'https://example.com');
+  assert(result.includes('href="#section"'), 'Should keep anchor links unchanged');
+});
+// Test enrichHtml 6: Handle mailto and tel links
+test('enrichHtml: Should not modify mailto and tel links', () => {
+  const html = '<a href="mailto:test@example.com">Email</a><a href="tel:+1234567890">Call</a>';
+  const result = enrichHtml(html, 'https://example.com');
+  assert(result.includes('href="mailto:test@example.com"'), 'Should keep mailto unchanged');
+  assert(result.includes('href="tel:+1234567890"'), 'Should keep tel unchanged');
+});
+// Test enrichHtml 7: Handle data URIs
+test('enrichHtml: Should not modify data URIs', () => {
+  const html = '<img src="data:image/png;base64,iVBORw0KGg==">';
+  const result = enrichHtml(html, 'https://example.com');
+  assert(result.includes('src="data:image/png;base64,iVBORw0KGg=="'), 'Should keep data URI unchanged');
+});
+// Test enrichHtml 8: Handle protocol-relative URLs
+test('enrichHtml: Should not modify protocol-relative URLs', () => {
+  const html = '<img src="//cdn.example.com/image.png">';
+  const result = enrichHtml(html, 'https://example.com');
+  assert(result.includes('src="//cdn.example.com/image.png"'), 'Should keep protocol-relative URL unchanged');
+});
+// Test enrichHtml 9: Does NOT remove elements (that's cleanHtml's job)
+test('enrichHtml: Should NOT remove script or style tags', () => {
+  const html = '<script>console.log("test");</script><style>.test{}</style>';
+  const result = enrichHtml(html, 'https://example.com');
+  assert(result.includes('<script'), 'Should keep script tag');
+  assert(result.includes('<style'), 'Should keep style tag');
+});
+// ==================================================
+// Combined cleanHtml + enrichHtml Tests
+// ==================================================
+console.log('\n🔄 Testing cleanHtml + enrichHtml combination\n');
+// Test Combined 1: Clean then enrich
+test('Combined: Should clean HTML then enrich URLs', () => {
+  const html = '<div class="test" style="color:red"><a href="/page">Link</a><script>alert();</script></div>';
+  const cleaned = cleanHtml(html);
+  const enriched = enrichHtml(cleaned, 'https://example.com');
+  // Should not have cleaned elements
+  assert(!enriched.includes('class='), 'Should not have class');
+  assert(!enriched.includes('style='), 'Should not have style');
+  assert(!enriched.includes('<script'), 'Should not have script');
+  // Should have enriched URL
+  assert(enriched.includes('href="https://example.com/page"'), 'Should have absolute URL');
+  assert(enriched.includes('Link'), 'Should preserve content');
+});
+// Test Combined 2: Verify prepareHtml still works (backward compatibility)
+test('Combined: prepareHtml should still work as before', () => {
+  const html = '<div class="test"><a href="/page">Link</a><script>alert();</script></div>';
+  const result = prepareHtml(html, 'https://example.com');
+  // Should clean
+  assert(!result.includes('class='), 'Should clean attributes');
+  assert(!result.includes('<script'), 'Should remove script');
+  // Should enrich
+  assert(result.includes('href="https://example.com/page"'), 'Should convert URL');
+  assert(result.includes('Link'), 'Should preserve content');
+});
+// ==================================================
+// Original prepareHtml Tests (for backward compatibility)
+// ==================================================
+console.log('\n📦 Testing prepareHtml (backward compatibility)\n');
 console.log('\n==================================================');
 console.log(`✅ Tests Passed: ${testsPassed}`);
 console.log(`❌ Tests Failed: ${testsFailed}`);