mcpbrowser 0.2.25 → 0.2.27

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Binary file
@@ -1,35 +1,56 @@
1
1
  <svg width="128" height="128" viewBox="0 0 128 128" xmlns="http://www.w3.org/2000/svg">
2
- <!-- Background circle -->
3
- <circle cx="64" cy="64" r="60" fill="#0078D4"/>
4
-
5
- <!-- Browser window outline -->
6
- <rect x="24" y="28" width="80" height="56" rx="4" fill="white"/>
7
- <rect x="28" y="32" width="72" height="48" rx="2" fill="#E8F4FD"/>
8
-
9
- <!-- Browser chrome bar -->
10
- <rect x="28" y="32" width="72" height="8" fill="#0078D4"/>
11
-
12
- <!-- Browser dots (window controls) -->
13
- <circle cx="33" cy="36" r="1.5" fill="white"/>
14
- <circle cx="38" cy="36" r="1.5" fill="white"/>
15
- <circle cx="43" cy="36" r="1.5" fill="white"/>
16
-
17
- <!-- Lock/Shield icon (authentication symbol) -->
18
- <g transform="translate(48, 48)">
19
- <!-- Shield -->
20
- <path d="M 16 4 L 16 14 C 16 18 12 22 8 24 C 4 22 0 18 0 14 L 0 4 L 8 0 Z"
21
- fill="#10B981" stroke="none"/>
22
- <!-- Checkmark inside shield -->
23
- <path d="M 5 12 L 7 14 L 12 8"
2
+ <!-- Modern gradient with MCP typography focus -->
3
+ <defs>
4
+ <linearGradient id="bg6" x1="0%" y1="0%" x2="100%" y2="100%">
5
+ <stop offset="0%" style="stop-color:#4F46E5;stop-opacity:1" />
6
+ <stop offset="100%" style="stop-color:#6366F1;stop-opacity:1" />
7
+ </linearGradient>
8
+ </defs>
9
+
10
+ <!-- Gradient background -->
11
+ <rect width="128" height="128" rx="24" fill="url(#bg6)"/>
12
+
13
+ <!-- Large white rounded square -->
14
+ <rect x="16" y="16" width="96" height="96" rx="16" fill="white" opacity="0.98"/>
15
+
16
+ <!-- Top section with browser chrome -->
17
+ <rect x="24" y="24" width="80" height="14" rx="7" fill="#F8FAFC"/>
18
+
19
+ <!-- Browser dots -->
20
+ <circle cx="32" cy="31" r="2" fill="#E5E7EB"/>
21
+ <circle cx="40" cy="31" r="2" fill="#E5E7EB"/>
22
+ <circle cx="48" cy="31" r="2" fill="#E5E7EB"/>
23
+
24
+ <!-- Lock with shield -->
25
+ <g transform="translate(72, 26)">
26
+ <circle cx="5" cy="5" r="5" fill="#10B981"/>
27
+ <path d="M 3.5 5 L 3.5 3.8 C 3.5 3 4 2.5 5 2.5 C 6 2.5 6.5 3 6.5 3.8 L 6.5 5"
28
+ stroke="white" stroke-width="0.8" fill="none"/>
29
+ <rect x="3" y="5" width="4" height="4" rx="0.5" fill="white"/>
30
+ </g>
31
+
32
+ <!-- Large MCP text - main focus -->
33
+ <text x="64" y="68" font-family="Arial, sans-serif" font-size="32" font-weight="bold" fill="#4F46E5" text-anchor="middle">MCP</text>
34
+
35
+ <!-- Subtitle -->
36
+ <text x="64" y="82" font-family="Arial, sans-serif" font-size="9" fill="#475569" text-anchor="middle">BROWSER</text>
37
+
38
+ <!-- Bottom decorative line -->
39
+ <rect x="36" y="92" width="56" height="2" rx="1" fill="#E0E7FF"/>
40
+
41
+ <!-- Connection dots -->
42
+ <circle cx="44" cy="100" r="2.5" fill="#10B981"/>
43
+ <circle cx="52" cy="100" r="2.5" fill="#10B981" opacity="0.6"/>
44
+ <circle cx="60" cy="100" r="2.5" fill="#10B981" opacity="0.3"/>
45
+
46
+ <!-- Security badge -->
47
+ <g transform="translate(66, 96)">
48
+ <circle cx="6" cy="4" r="6" fill="#10B981"/>
49
+ <path d="M 4 4 L 5.5 5.5 L 8.5 2.5"
24
50
  stroke="white"
25
- stroke-width="2"
51
+ stroke-width="1.2"
26
52
  stroke-linecap="round"
27
53
  stroke-linejoin="round"
28
54
  fill="none"/>
29
55
  </g>
30
-
31
- <!-- Connection indicator (small dots) -->
32
- <circle cx="90" cy="48" r="3" fill="#10B981"/>
33
- <circle cx="98" cy="48" r="2" fill="#10B981" opacity="0.7"/>
34
- <circle cx="104" cy="48" r="1.5" fill="#10B981" opacity="0.4"/>
35
56
  </svg>
@@ -2,7 +2,7 @@
2
2
  "name": "mcpbrowser",
3
3
  "displayName": "MCP Browser",
4
4
  "description": "Lightweight MCP server-extension for in-browser web page fetching - handles login, SSO, and anti-crawler restrictions. Should be used when standard fetch_webpage fails",
5
- "version": "0.2.25",
5
+ "version": "0.2.27",
6
6
  "publisher": "cherchyk",
7
7
  "icon": "icon.png",
8
8
  "engines": {
package/package.json CHANGED
@@ -1,5 +1,5 @@
1
1
  { "name": "mcpbrowser",
2
- "version": "0.2.25",
2
+ "version": "0.2.27",
3
3
  "mcpName": "io.github.cherchyk/browser",
4
4
  "type": "module",
5
5
  "description": "MCP server for in-browser web page fetching using Chrome DevTools Protocol",
@@ -15,7 +15,11 @@ const chromeHost = process.env.CHROME_REMOTE_DEBUG_HOST || "127.0.0.1";
15
15
  const chromePort = Number(process.env.CHROME_REMOTE_DEBUG_PORT || 9222);
16
16
  const explicitWSEndpoint = process.env.CHROME_WS_ENDPOINT;
17
17
 
18
- // Use default Chrome profile if not explicitly set
18
+ /**
19
+ * Get the default user data directory for Chrome debugging profile.
20
+ * Creates a dedicated profile directory to avoid conflicts with the user's main Chrome profile.
21
+ * @returns {string} The platform-specific path to the Chrome debug profile directory
22
+ */
19
23
  function getDefaultUserDataDir() {
20
24
  const platform = os.platform();
21
25
  const home = os.homedir();
@@ -33,6 +37,10 @@ function getDefaultUserDataDir() {
33
37
  const userDataDir = process.env.CHROME_USER_DATA_DIR || getDefaultUserDataDir();
34
38
  const chromePathEnv = process.env.CHROME_PATH;
35
39
 
40
+ /**
41
+ * Get platform-specific default paths where Chrome/Edge browsers are typically installed.
42
+ * @returns {string[]} Array of possible browser executable paths for the current platform
43
+ */
36
44
  function getDefaultChromePaths() {
37
45
  const platform = os.platform();
38
46
 
@@ -63,6 +71,10 @@ let cachedBrowser = null;
63
71
  let domainPages = new Map(); // hostname -> page mapping for tab reuse across domains
64
72
  let chromeLaunchPromise = null; // prevent multiple simultaneous launches
65
73
 
74
+ /**
75
+ * Check if Chrome DevTools Protocol endpoint is available and responding.
76
+ * @returns {Promise<boolean>} True if DevTools endpoint is accessible, false otherwise
77
+ */
66
78
  async function devtoolsAvailable() {
67
79
  try {
68
80
  const url = `http://${chromeHost}:${chromePort}/json/version`;
@@ -75,11 +87,22 @@ async function devtoolsAvailable() {
75
87
  }
76
88
  }
77
89
 
90
+ /**
91
+ * Find the Chrome/Edge executable path, checking environment variable first, then default locations.
92
+ * @returns {string|undefined} Path to the browser executable, or undefined if not found
93
+ */
78
94
  function findChromePath() {
79
95
  if (chromePathEnv && existsSync(chromePathEnv)) return chromePathEnv;
80
96
  return defaultChromePaths.find((p) => existsSync(p));
81
97
  }
82
98
 
99
+ /**
100
+ * Launch Chrome with remote debugging enabled if not already running.
101
+ * Uses a singleton pattern to prevent multiple simultaneous launches.
102
+ * Waits up to 20 seconds for Chrome to become available on the DevTools port.
103
+ * @returns {Promise<void>}
104
+ * @throws {Error} If Chrome cannot be found or fails to start within timeout
105
+ */
83
106
  async function launchChromeIfNeeded() {
84
107
  if (explicitWSEndpoint) return; // user provided explicit endpoint; assume managed externally
85
108
 
@@ -128,6 +151,12 @@ async function launchChromeIfNeeded() {
128
151
  return await chromeLaunchPromise;
129
152
  }
130
153
 
154
+ /**
155
+ * Resolve the WebSocket endpoint URL for connecting to Chrome DevTools Protocol.
156
+ * Either returns the explicitly configured endpoint or queries it from the DevTools JSON API.
157
+ * @returns {Promise<string>} The WebSocket URL for connecting to Chrome
158
+ * @throws {Error} If unable to reach DevTools or no WebSocket URL is available
159
+ */
131
160
  async function resolveWSEndpoint() {
132
161
  if (explicitWSEndpoint) return explicitWSEndpoint;
133
162
  const url = `http://${chromeHost}:${chromePort}/json/version`;
@@ -142,6 +171,55 @@ async function resolveWSEndpoint() {
142
171
  return data.webSocketDebuggerUrl;
143
172
  }
144
173
 
174
+ /**
175
+ * Rebuild the domain-to-page mapping from existing browser tabs.
176
+ * This enables tab reuse across reconnections by discovering tabs that are already open.
177
+ * Skips internal pages like about:blank and chrome:// URLs.
178
+ * @param {Browser} browser - The Puppeteer browser instance
179
+ * @returns {Promise<void>}
180
+ */
181
+ async function rebuildDomainPagesMap(browser) {
182
+ try {
183
+ const pages = await browser.pages();
184
+ console.error(`[MCPBrowser] Reconnected to browser with ${pages.length} existing tabs`);
185
+
186
+ for (const page of pages) {
187
+ try {
188
+ const pageUrl = page.url();
189
+ // Skip chrome:// pages, about:blank, and other internal pages
190
+ if (!pageUrl ||
191
+ pageUrl === 'about:blank' ||
192
+ pageUrl.startsWith('chrome://') ||
193
+ pageUrl.startsWith('chrome-extension://') ||
194
+ pageUrl.startsWith('devtools://')) {
195
+ continue;
196
+ }
197
+
198
+ const hostname = new URL(pageUrl).hostname;
199
+ if (hostname && !domainPages.has(hostname)) {
200
+ domainPages.set(hostname, page);
201
+ console.error(`[MCPBrowser] Mapped existing tab for domain: ${hostname} (${pageUrl})`);
202
+ }
203
+ } catch (err) {
204
+ // Skip pages that are inaccessible or have invalid URLs
205
+ continue;
206
+ }
207
+ }
208
+
209
+ if (domainPages.size > 0) {
210
+ console.error(`[MCPBrowser] Restored ${domainPages.size} domain-to-tab mappings`);
211
+ }
212
+ } catch (err) {
213
+ console.error(`[MCPBrowser] Warning: Could not rebuild domain pages map: ${err.message}`);
214
+ }
215
+ }
216
+
217
+ /**
218
+ * Get or create a connection to the Chrome browser.
219
+ * Returns cached browser if still connected, otherwise establishes a new connection.
220
+ * Rebuilds domain-to-page mapping on reconnection to enable tab reuse.
221
+ * @returns {Promise<Browser>} Connected Puppeteer browser instance
222
+ */
145
223
  async function getBrowser() {
146
224
  await launchChromeIfNeeded();
147
225
  if (cachedBrowser && cachedBrowser.isConnected()) return cachedBrowser;
@@ -154,10 +232,24 @@ async function getBrowser() {
154
232
  cachedBrowser = null;
155
233
  domainPages.clear(); // Clear all domain page mappings
156
234
  });
235
+
236
+ // Rebuild domainPages map from existing tabs to enable reuse across reconnections
237
+ await rebuildDomainPagesMap(cachedBrowser);
238
+
157
239
  return cachedBrowser;
158
240
  }
159
241
 
160
- async function fetchPage({ url }) {
242
+ /**
243
+ * Fetch a web page using Chrome browser, with support for authentication flows and tab reuse.
244
+ * Reuses existing tabs per domain when possible. Handles authentication redirects by waiting
245
+ * for user to complete login (up to 10 minutes). Processes HTML to remove unnecessary elements
246
+ * and convert relative URLs to absolute.
247
+ * @param {Object} params - Fetch parameters
248
+ * @param {string} params.url - The URL to fetch
249
+ * @param {boolean} [params.removeUnnecessaryHTML=true] - Whether to clean HTML (removes scripts, styles, etc.)
250
+ * @returns {Promise<Object>} Result object with success status, URL, HTML content, or error details
251
+ */
252
+ async function fetchPage({ url, removeUnnecessaryHTML = true }) {
161
253
  // Hardcoded smart defaults
162
254
  const waitUntil = "networkidle0";
163
255
  const navigationTimeout = 60000; // Initial navigation timeout
@@ -280,11 +372,20 @@ async function fetchPage({ url }) {
280
372
 
281
373
  // Extract HTML content
282
374
  const html = await page.evaluate(() => document.documentElement?.outerHTML || "");
283
- const preparedHtml = prepareHtml(html, page.url());
375
+
376
+ // Process HTML based on removeUnnecessaryHTML parameter
377
+ let processedHtml;
378
+ if (removeUnnecessaryHTML) {
379
+ const cleaned = cleanHtml(html);
380
+ processedHtml = enrichHtml(cleaned, page.url());
381
+ } else {
382
+ processedHtml = enrichHtml(html, page.url());
383
+ }
384
+
284
385
  const result = {
285
386
  success: true,
286
387
  url: page.url(),
287
- html: preparedHtml
388
+ html: processedHtml
288
389
  };
289
390
 
290
391
  wasSuccess = true;
@@ -297,20 +398,25 @@ async function fetchPage({ url }) {
297
398
  }
298
399
  }
299
400
 
401
+ /**
402
+ * Truncate a string to a maximum length, adding "... [truncated]" if truncated.
403
+ * @param {string} str - The string to truncate
404
+ * @param {number} max - Maximum length
405
+ * @returns {string} The original or truncated string
406
+ */
300
407
  function truncate(str, max) {
301
408
  if (!str) return "";
302
409
  return str.length > max ? `${str.slice(0, max)}... [truncated]` : str;
303
410
  }
304
411
 
305
412
  /**
306
- * Prepares HTML for consumption by:
307
- * 1. Converting relative URLs to absolute URLs
308
- * 2. Removing non-content elements (scripts, styles, meta tags, comments)
309
- * 3. Removing code-related attributes (class, id, style, data-*, event handlers)
310
- * 4. Removing SVG graphics and other non-text elements
311
- * 5. Collapsing excessive whitespace
413
+ * Removes non-content elements and attributes from HTML:
414
+ * 1. Removing non-content elements (scripts, styles, meta tags, comments)
415
+ * 2. Removing code-related attributes (class, id, style, data-*, event handlers)
416
+ * 3. Removing SVG graphics and other non-text elements
417
+ * 4. Collapsing excessive whitespace
312
418
  */
313
- function prepareHtml(html, baseUrl) {
419
+ function cleanHtml(html) {
314
420
  if (!html) return "";
315
421
 
316
422
  let cleaned = html;
@@ -336,32 +442,6 @@ function prepareHtml(html, baseUrl) {
336
442
  // Remove link tags (stylesheets, preload, etc.)
337
443
  cleaned = cleaned.replace(/<link\b[^>]*>/gi, '');
338
444
 
339
- // Convert relative URLs to absolute in href attributes
340
- cleaned = cleaned.replace(/href=["']([^"']+)["']/gi, (match, url) => {
341
- if (!url || url.startsWith('http://') || url.startsWith('https://') || url.startsWith('//') || url.startsWith('#') || url.startsWith('mailto:') || url.startsWith('tel:')) {
342
- return match;
343
- }
344
- try {
345
- const absoluteUrl = new URL(url, baseUrl).href;
346
- return `href="${absoluteUrl}"`;
347
- } catch {
348
- return match;
349
- }
350
- });
351
-
352
- // Convert relative URLs to absolute in src attributes
353
- cleaned = cleaned.replace(/src=["']([^"']+)["']/gi, (match, url) => {
354
- if (!url || url.startsWith('http://') || url.startsWith('https://') || url.startsWith('//') || url.startsWith('data:')) {
355
- return match;
356
- }
357
- try {
358
- const absoluteUrl = new URL(url, baseUrl).href;
359
- return `src="${absoluteUrl}"`;
360
- } catch {
361
- return match;
362
- }
363
- });
364
-
365
445
  // Remove inline style attributes
366
446
  cleaned = cleaned.replace(/\s+style=["'][^"']*["']/gi, '');
367
447
 
@@ -392,17 +472,71 @@ function prepareHtml(html, baseUrl) {
392
472
  return cleaned;
393
473
  }
394
474
 
475
+ /**
476
+ * Enriches HTML by converting relative URLs to absolute URLs
477
+ */
478
+ function enrichHtml(html, baseUrl) {
479
+ if (!html) return "";
480
+
481
+ let enriched = html;
482
+
483
+ // Convert relative URLs to absolute in href attributes
484
+ enriched = enriched.replace(/href=["']([^"']+)["']/gi, (match, url) => {
485
+ if (!url || url.startsWith('http://') || url.startsWith('https://') || url.startsWith('//') || url.startsWith('#') || url.startsWith('mailto:') || url.startsWith('tel:')) {
486
+ return match;
487
+ }
488
+ try {
489
+ const absoluteUrl = new URL(url, baseUrl).href;
490
+ return `href="${absoluteUrl}"`;
491
+ } catch {
492
+ return match;
493
+ }
494
+ });
495
+
496
+ // Convert relative URLs to absolute in src attributes
497
+ enriched = enriched.replace(/src=["']([^"']+)["']/gi, (match, url) => {
498
+ if (!url || url.startsWith('http://') || url.startsWith('https://') || url.startsWith('//') || url.startsWith('data:')) {
499
+ return match;
500
+ }
501
+ try {
502
+ const absoluteUrl = new URL(url, baseUrl).href;
503
+ return `src="${absoluteUrl}"`;
504
+ } catch {
505
+ return match;
506
+ }
507
+ });
508
+
509
+ return enriched;
510
+ }
511
+
512
+ /**
513
+ * Prepares HTML for consumption by cleaning and enriching it.
514
+ * @deprecated Use cleanHtml and enrichHtml separately for better control
515
+ */
516
+ function prepareHtml(html, baseUrl) {
517
+ if (!html) return "";
518
+ const cleaned = cleanHtml(html);
519
+ return enrichHtml(cleaned, baseUrl);
520
+ }
521
+
522
+ /**
523
+ * Main entry point for the MCP server.
524
+ * Sets up the Model Context Protocol server with fetch_webpage_protected tool,
525
+ * configures request handlers, and starts the stdio transport.
526
+ * @returns {Promise<void>}
527
+ */
395
528
  async function main() {
396
- const server = new Server({ name: "MCPBrowser", version: "0.2.25" }, { capabilities: { tools: {} } });
529
+ const server = new Server({ name: "MCPBrowser", version: "0.2.26" }, { capabilities: { tools: {} } });
397
530
 
398
531
  const tools = [
399
532
  {
400
533
  name: "fetch_webpage_protected",
401
- description: "Fetches web pages by loading them in Chrome/Edge browser. Use for: (1) auth-required pages (401/403, login, SSO, corporate intranets), (2) anti-bot/crawler blocks, CAPTCHA/human verification, (3) JavaScript-heavy sites (SPAs, dynamic content).\n\nAUTH FLOW: If page requires authentication, browser opens and WAITS (up to 10 min) for user to log in, then automatically returns content once loaded. Single call returns correct content, no retry needed.\n\nRULES: (1) ONE URL at a time, never parallel. (2) Wait for full response - may take minutes for auth. (3) Skip only if 404. (4) Returns HTML with clickable links for subpage navigation.",
534
+ description: "Fetches web pages using Chrome/Edge browser. Handles auth-required pages, CAPTCHA, SSO, anti-bot protection, and JavaScript-heavy sites.\n\nWaits for user interaction (login, CAPTCHA) if needed, then returns content automatically.\n\nIMPORTANT: Call ONE URL at a time only. Never parallel - causes conflicts. Wait for completion before next URL.",
402
535
  inputSchema: {
403
536
  type: "object",
404
537
  properties: {
405
538
  url: { type: "string", description: "The URL to fetch" },
539
+ removeUnnecessaryHTML: { type: "boolean", description: "Remove Unnecessary HTML for size reduction by 90%.", default: true }
406
540
  },
407
541
  required: ["url"],
408
542
  additionalProperties: false,
@@ -453,7 +587,7 @@ async function main() {
453
587
  }
454
588
 
455
589
  // Export for testing
456
- export { fetchPage, getBrowser, prepareHtml };
590
+ export { fetchPage, getBrowser, prepareHtml, cleanHtml, enrichHtml };
457
591
 
458
592
  // Run the MCP server
459
593
  main().catch((err) => {
@@ -310,6 +310,116 @@ async function runTests() {
310
310
 
311
311
  assert(domainPages.size === 1, 'Should still have only 1 domain (eng.ms) in map after all loads');
312
312
  });
313
+
314
+ await test('Should rebuild domain pages map on reconnection', async () => {
315
+ const domainPages = new Map();
316
+ const browser = new MockBrowser();
317
+
318
+ // Simulate having tabs already open from previous session
319
+ const page1 = await browser.newPage();
320
+ await page1.goto('https://github.com/user/repo');
321
+
322
+ const page2 = await browser.newPage();
323
+ await page2.goto('https://microsoft.com/docs');
324
+
325
+ const page3 = await browser.newPage();
326
+ await page3.goto('https://eng.ms/docs/products');
327
+
328
+ const page4 = await browser.newPage();
329
+ await page4.goto('about:blank');
330
+
331
+ // Verify pages exist but map is empty (simulating disconnection)
332
+ assert(domainPages.size === 0, 'Domain pages map should be empty before rebuild');
333
+
334
+ // Simulate rebuildDomainPagesMap function
335
+ const pages = await browser.pages();
336
+ assert(pages.length === 4, `Should have 4 tabs open, got ${pages.length}`);
337
+
338
+ for (const page of pages) {
339
+ try {
340
+ const pageUrl = page.url();
341
+ // Skip internal pages
342
+ if (!pageUrl ||
343
+ pageUrl === 'about:blank' ||
344
+ pageUrl.startsWith('chrome://') ||
345
+ pageUrl.startsWith('chrome-extension://') ||
346
+ pageUrl.startsWith('devtools://')) {
347
+ continue;
348
+ }
349
+
350
+ const hostname = new URL(pageUrl).hostname;
351
+ if (hostname && !domainPages.has(hostname)) {
352
+ domainPages.set(hostname, page);
353
+ }
354
+ } catch (err) {
355
+ // Skip pages with invalid URLs
356
+ continue;
357
+ }
358
+ }
359
+
360
+ // Verify map was rebuilt correctly
361
+ assert(domainPages.size === 3, `Should have 3 domains in map (excluding about:blank), got ${domainPages.size}`);
362
+ assert(domainPages.has('github.com'), 'Should have github.com in map');
363
+ assert(domainPages.has('microsoft.com'), 'Should have microsoft.com in map');
364
+ assert(domainPages.has('eng.ms'), 'Should have eng.ms in map');
365
+ assert(!domainPages.has('about:blank'), 'Should not have about:blank in map');
366
+
367
+ // Verify correct pages are mapped
368
+ assert(domainPages.get('github.com').url() === 'https://github.com/user/repo', 'github.com should map to correct page');
369
+ assert(domainPages.get('microsoft.com').url() === 'https://microsoft.com/docs', 'microsoft.com should map to correct page');
370
+ assert(domainPages.get('eng.ms').url() === 'https://eng.ms/docs/products', 'eng.ms should map to correct page');
371
+
372
+ // Verify tabs can be reused after rebuild
373
+ const githubPage = domainPages.get('github.com');
374
+ assert(!githubPage.isClosed(), 'Rebuilt github.com page should still be open');
375
+ await githubPage.goto('https://github.com/another/repo');
376
+ assert(githubPage.url() === 'https://github.com/another/repo', 'Rebuilt page should be navigable');
377
+ });
378
+
379
+ await test('Should skip chrome:// and internal pages during rebuild', async () => {
380
+ const domainPages = new Map();
381
+ const browser = new MockBrowser();
382
+
383
+ // Create pages with various internal URLs
384
+ const page1 = await browser.newPage();
385
+ await page1.goto('chrome://settings');
386
+
387
+ const page2 = await browser.newPage();
388
+ await page2.goto('chrome-extension://abc123/popup.html');
389
+
390
+ const page3 = await browser.newPage();
391
+ await page3.goto('devtools://devtools/bundled/devtools_app.html');
392
+
393
+ const page4 = await browser.newPage();
394
+ await page4.goto('https://example.com/page');
395
+
396
+ // Rebuild domain pages map
397
+ const pages = await browser.pages();
398
+ for (const page of pages) {
399
+ try {
400
+ const pageUrl = page.url();
401
+ if (!pageUrl ||
402
+ pageUrl === 'about:blank' ||
403
+ pageUrl.startsWith('chrome://') ||
404
+ pageUrl.startsWith('chrome-extension://') ||
405
+ pageUrl.startsWith('devtools://')) {
406
+ continue;
407
+ }
408
+
409
+ const hostname = new URL(pageUrl).hostname;
410
+ if (hostname && !domainPages.has(hostname)) {
411
+ domainPages.set(hostname, page);
412
+ }
413
+ } catch (err) {
414
+ continue;
415
+ }
416
+ }
417
+
418
+ // Only example.com should be in the map
419
+ assert(domainPages.size === 1, `Should only have 1 domain (example.com), got ${domainPages.size}`);
420
+ assert(domainPages.has('example.com'), 'Should have example.com in map');
421
+ assert(!domainPages.has('chrome'), 'Should not have chrome:// pages in map');
422
+ });
313
423
 
314
424
  // Summary
315
425
  console.log('\n' + '='.repeat(50));
@@ -130,6 +130,34 @@ async function runIntegrationTests() {
130
130
  assert(linkResult.html && linkResult.html.length > 0, `Link ${i+1} should return HTML content`);
131
131
  }
132
132
  });
133
+
134
+ await test('Should support removeUnnecessaryHTML parameter', async () => {
135
+ const url = 'https://eng.ms/docs/products/geneva';
136
+
137
+ console.log(` 📄 Fetching with removeUnnecessaryHTML=true (default)`);
138
+ const cleanResult = await fetchPage({ url, removeUnnecessaryHTML: true });
139
+
140
+ assert(cleanResult.success, 'Should successfully fetch with removeUnnecessaryHTML=true');
141
+ assert(cleanResult.html && cleanResult.html.length > 0, 'Should return cleaned HTML');
142
+ assert(!cleanResult.html.includes('<script'), 'Cleaned HTML should not contain script tags');
143
+ assert(!cleanResult.html.includes('<style'), 'Cleaned HTML should not contain style tags');
144
+ assert(!cleanResult.html.includes('class='), 'Cleaned HTML should not contain class attributes');
145
+ console.log(` ✅ Cleaned HTML length: ${cleanResult.html.length} chars`);
146
+
147
+ console.log(` 📄 Fetching with removeUnnecessaryHTML=false`);
148
+ const rawResult = await fetchPage({ url, removeUnnecessaryHTML: false });
149
+
150
+ assert(rawResult.success, 'Should successfully fetch with removeUnnecessaryHTML=false');
151
+ assert(rawResult.html && rawResult.html.length > 0, 'Should return raw HTML');
152
+ console.log(` ✅ Raw HTML length: ${rawResult.html.length} chars`);
153
+
154
+ // Raw HTML should be larger than cleaned HTML
155
+ assert(rawResult.html.length > cleanResult.html.length,
156
+ `Raw HTML (${rawResult.html.length}) should be larger than cleaned (${cleanResult.html.length})`);
157
+
158
+ const reductionPercent = ((rawResult.html.length - cleanResult.html.length) / rawResult.html.length * 100).toFixed(1);
159
+ console.log(` 📊 Size reduction: ${reductionPercent}% (${rawResult.html.length} → ${cleanResult.html.length} chars)`);
160
+ });
133
161
 
134
162
  } catch (error) {
135
163
  console.error('\n❌ Test suite error:', error.message);
@@ -1,7 +1,7 @@
1
1
  import assert from 'assert';
2
- import { prepareHtml } from '../src/mcp-browser.js';
2
+ import { prepareHtml, cleanHtml, enrichHtml } from '../src/mcp-browser.js';
3
3
 
4
- console.log('🧪 Testing prepareHtml function\n');
4
+ console.log('🧪 Testing HTML processing functions\n');
5
5
 
6
6
  let testsPassed = 0;
7
7
  let testsFailed = 0;
@@ -299,6 +299,210 @@ test('Should handle HTML with all types of removals', () => {
299
299
  assert(result.includes('Text content'), 'Should preserve text');
300
300
  });
301
301
 
302
+ // ==================================================
303
+ // cleanHtml Function Tests
304
+ // ==================================================
305
+
306
+ console.log('\n🧹 Testing cleanHtml function\n');
307
+
308
+ // Test cleanHtml 1: Remove HTML comments
309
+ test('cleanHtml: Should remove HTML comments', () => {
310
+ const html = '<div>Content<!-- This is a comment --></div>';
311
+ const result = cleanHtml(html);
312
+ assert(!result.includes('<!--'), 'Should not contain comment start');
313
+ assert(!result.includes('-->'), 'Should not contain comment end');
314
+ assert(result.includes('Content'), 'Should preserve content');
315
+ });
316
+
317
+ // Test cleanHtml 2: Remove script tags
318
+ test('cleanHtml: Should remove script tags and their content', () => {
319
+ const html = '<div>Keep this</div><script>alert("remove");</script><div>And this</div>';
320
+ const result = cleanHtml(html);
321
+ assert(!result.includes('<script'), 'Should not contain script tag');
322
+ assert(!result.includes('alert'), 'Should not contain script content');
323
+ assert(result.includes('Keep this'), 'Should preserve content');
324
+ });
325
+
326
+ // Test cleanHtml 3: Remove style tags
327
+ test('cleanHtml: Should remove style tags and their content', () => {
328
+ const html = '<div>Content</div><style>.class { color: red; }</style>';
329
+ const result = cleanHtml(html);
330
+ assert(!result.includes('<style'), 'Should not contain style tag');
331
+ assert(!result.includes('color: red'), 'Should not contain style content');
332
+ assert(result.includes('Content'), 'Should preserve content');
333
+ });
334
+
335
+ // Test cleanHtml 4: Remove meta tags
336
+ test('cleanHtml: Should remove meta tags', () => {
337
+ const html = '<head><meta charset="utf-8"><meta name="viewport" content="width=device-width"></head><body>Content</body>';
338
+ const result = cleanHtml(html);
339
+ assert(!result.includes('<meta'), 'Should not contain meta tags');
340
+ assert(result.includes('Content'), 'Should preserve content');
341
+ });
342
+
343
+ // Test cleanHtml 5: Remove inline style attributes
344
+ test('cleanHtml: Should remove inline style attributes', () => {
345
+ const html = '<div style="color: red; font-size: 14px;">Content</div>';
346
+ const result = cleanHtml(html);
347
+ assert(!result.includes('style='), 'Should remove style attribute');
348
+ assert(result.includes('Content'), 'Should preserve content');
349
+ });
350
+
351
+ // Test cleanHtml 6: Remove class attributes
352
+ test('cleanHtml: Should remove class attributes', () => {
353
+ const html = '<div class="container main-content">Text</div>';
354
+ const result = cleanHtml(html);
355
+ assert(!result.includes('class='), 'Should remove class attribute');
356
+ assert(result.includes('Text'), 'Should preserve content');
357
+ });
358
+
359
+ // Test cleanHtml 7: Remove id attributes
360
+ test('cleanHtml: Should remove id attributes', () => {
361
+ const html = '<div id="main-section">Content</div>';
362
+ const result = cleanHtml(html);
363
+ assert(!result.includes('id='), 'Should remove id attribute');
364
+ assert(result.includes('Content'), 'Should preserve content');
365
+ });
366
+
367
+ // Test cleanHtml 8: Remove SVG tags
368
+ test('cleanHtml: Should remove SVG tags and content', () => {
369
+ const html = '<div>Text</div><svg width="100" height="100"><circle cx="50" cy="50" r="40"/></svg>';
370
+ const result = cleanHtml(html);
371
+ assert(!result.includes('<svg'), 'Should remove svg tag');
372
+ assert(!result.includes('circle'), 'Should remove svg content');
373
+ assert(result.includes('Text'), 'Should preserve content');
374
+ });
375
+
376
+ // Test cleanHtml 9: Collapse whitespace
377
+ test('cleanHtml: Should collapse multiple whitespace into single space', () => {
378
+ const html = '<div>Line 1\n\n\n Line 2\t\t\tLine 3</div>';
379
+ const result = cleanHtml(html);
380
+ assert(!result.includes('\n\n'), 'Should remove multiple newlines');
381
+ assert(!result.includes('  '), 'Should remove multiple spaces');
382
+ assert(result.includes('Line 1'), 'Should preserve content');
383
+ });
384
+
385
+ // Test cleanHtml 10: Does NOT modify URLs (that's enrichHtml's job)
386
+ test('cleanHtml: Should NOT modify relative URLs', () => {
387
+ const html = '<a href="/docs/page">Link</a><img src="/images/logo.png">';
388
+ const result = cleanHtml(html);
389
+ assert(result.includes('href="/docs/page"'), 'Should keep relative href unchanged');
390
+ assert(result.includes('src="/images/logo.png"'), 'Should keep relative src unchanged');
391
+ });
392
+
393
+ // ==================================================
394
+ // enrichHtml Function Tests
395
+ // ==================================================
396
+
397
+ console.log('\n🔗 Testing enrichHtml function\n');
398
+
399
+ // Test enrichHtml 1: Convert relative href URLs
400
+ test('enrichHtml: Should convert relative href URLs to absolute', () => {
401
+ const html = '<a href="/docs/page">Link</a>';
402
+ const result = enrichHtml(html, 'https://example.com');
403
+ assert(result.includes('href="https://example.com/docs/page"'), 'Should convert relative href to absolute');
404
+ });
405
+
406
+ // Test enrichHtml 2: Keep absolute href URLs unchanged
407
+ test('enrichHtml: Should keep absolute href URLs unchanged', () => {
408
+ const html = '<a href="https://other.com/page">Link</a>';
409
+ const result = enrichHtml(html, 'https://example.com');
410
+ assert(result.includes('href="https://other.com/page"'), 'Should keep absolute href unchanged');
411
+ });
412
+
413
+ // Test enrichHtml 3: Convert relative src URLs
414
+ test('enrichHtml: Should convert relative src URLs to absolute', () => {
415
+ const html = '<img src="/images/logo.png">';
416
+ const result = enrichHtml(html, 'https://example.com');
417
+ assert(result.includes('src="https://example.com/images/logo.png"'), 'Should convert relative src to absolute');
418
+ });
419
+
420
+ // Test enrichHtml 4: Keep absolute src URLs unchanged
421
+ test('enrichHtml: Should keep absolute src URLs unchanged', () => {
422
+ const html = '<img src="https://cdn.example.com/logo.png">';
423
+ const result = enrichHtml(html, 'https://example.com');
424
+ assert(result.includes('src="https://cdn.example.com/logo.png"'), 'Should keep absolute src unchanged');
425
+ });
426
+
427
+ // Test enrichHtml 5: Handle anchor links
428
+ test('enrichHtml: Should not modify anchor links', () => {
429
+ const html = '<a href="#section">Jump</a>';
430
+ const result = enrichHtml(html, 'https://example.com');
431
+ assert(result.includes('href="#section"'), 'Should keep anchor links unchanged');
432
+ });
433
+
434
+ // Test enrichHtml 6: Handle mailto and tel links
435
+ test('enrichHtml: Should not modify mailto and tel links', () => {
436
+ const html = '<a href="mailto:test@example.com">Email</a><a href="tel:+1234567890">Call</a>';
437
+ const result = enrichHtml(html, 'https://example.com');
438
+ assert(result.includes('href="mailto:test@example.com"'), 'Should keep mailto unchanged');
439
+ assert(result.includes('href="tel:+1234567890"'), 'Should keep tel unchanged');
440
+ });
441
+
442
+ // Test enrichHtml 7: Handle data URIs
443
+ test('enrichHtml: Should not modify data URIs', () => {
444
+ const html = '<img src="data:image/png;base64,iVBORw0KGg==">';
445
+ const result = enrichHtml(html, 'https://example.com');
446
+ assert(result.includes('src="data:image/png;base64,iVBORw0KGg=="'), 'Should keep data URI unchanged');
447
+ });
448
+
449
+ // Test enrichHtml 8: Handle protocol-relative URLs
450
+ test('enrichHtml: Should not modify protocol-relative URLs', () => {
451
+ const html = '<img src="//cdn.example.com/image.png">';
452
+ const result = enrichHtml(html, 'https://example.com');
453
+ assert(result.includes('src="//cdn.example.com/image.png"'), 'Should keep protocol-relative URL unchanged');
454
+ });
455
+
456
+ // Test enrichHtml 9: Does NOT remove elements (that's cleanHtml's job)
457
+ test('enrichHtml: Should NOT remove script or style tags', () => {
458
+ const html = '<script>console.log("test");</script><style>.test{}</style>';
459
+ const result = enrichHtml(html, 'https://example.com');
460
+ assert(result.includes('<script'), 'Should keep script tag');
461
+ assert(result.includes('<style'), 'Should keep style tag');
462
+ });
463
+
464
+ // ==================================================
465
+ // Combined cleanHtml + enrichHtml Tests
466
+ // ==================================================
467
+
468
+ console.log('\n🔄 Testing cleanHtml + enrichHtml combination\n');
469
+
470
+ // Test Combined 1: Clean then enrich
471
+ test('Combined: Should clean HTML then enrich URLs', () => {
472
+ const html = '<div class="test" style="color:red"><a href="/page">Link</a><script>alert();</script></div>';
473
+ const cleaned = cleanHtml(html);
474
+ const enriched = enrichHtml(cleaned, 'https://example.com');
475
+
476
+ // Should not have cleaned elements
477
+ assert(!enriched.includes('class='), 'Should not have class');
478
+ assert(!enriched.includes('style='), 'Should not have style');
479
+ assert(!enriched.includes('<script'), 'Should not have script');
480
+
481
+ // Should have enriched URL
482
+ assert(enriched.includes('href="https://example.com/page"'), 'Should have absolute URL');
483
+ assert(enriched.includes('Link'), 'Should preserve content');
484
+ });
485
+
486
+ // Test Combined 2: Verify prepareHtml still works (backward compatibility)
487
+ test('Combined: prepareHtml should still work as before', () => {
488
+ const html = '<div class="test"><a href="/page">Link</a><script>alert();</script></div>';
489
+ const result = prepareHtml(html, 'https://example.com');
490
+
491
+ // Should clean
492
+ assert(!result.includes('class='), 'Should clean attributes');
493
+ assert(!result.includes('<script'), 'Should remove script');
494
+
495
+ // Should enrich
496
+ assert(result.includes('href="https://example.com/page"'), 'Should convert URL');
497
+ assert(result.includes('Link'), 'Should preserve content');
498
+ });
499
+
500
+ // ==================================================
501
+ // Original prepareHtml Tests (for backward compatibility)
502
+ // ==================================================
503
+
504
+ console.log('\n📦 Testing prepareHtml (backward compatibility)\n');
505
+
302
506
  console.log('\n==================================================');
303
507
  console.log(`✅ Tests Passed: ${testsPassed}`);
304
508
  console.log(`❌ Tests Failed: ${testsFailed}`);