mcpbrowser 0.2.18 → 0.2.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -117,7 +117,7 @@ Restart VS Code or reload the window for the changes to take effect.
117
117
  In Copilot Chat, you should see the `MCPBrowser` server listed. Ask it to fetch an authenticated URL and it will drive your signed-in Chrome session.
118
118
 
119
119
  ## How it works
120
- - Tool `fetch_webpage_protected` (inside the MCP server) drives your live Chrome (DevTools Protocol) so it inherits your auth cookies, returning `text` and `html` (truncated up to 2M chars per field) for analysis.
120
+ - Tool `fetch_webpage_protected` (inside the MCP server) drives your live Chrome (DevTools Protocol) so it inherits your auth cookies, returning `html` (truncated up to 2M chars) for analysis.
121
121
  - **Smart confirmation**: Copilot asks for confirmation ONLY on first request to a new domain - explains browser will open for authentication. Subsequent requests to same domain work automatically (session preserved).
122
122
  - **Domain-aware tab reuse**: Automatically reuses the same tab for URLs on the same domain, preserving authentication session. Different domains open new tabs.
123
123
  - **Automatic web page fetching**: Waits for network idle (`networkidle0`) by default, ensuring JavaScript-heavy web pages (SPAs, dashboards) fully load before returning content.
@@ -2,7 +2,7 @@
2
2
  "name": "mcpbrowser",
3
3
  "displayName": "MCP Browser",
4
4
  "description": "Lightweight MCP server-extension that allows Copilot to fetch protected web pages you can authenticate to via browser - handles login, SSO, and anti-crawler restrictions. Should be used when standard fetch_webpage fails",
5
- "version": "0.2.18",
5
+ "version": "0.2.19",
6
6
  "publisher": "cherchyk",
7
7
  "icon": "icon.png",
8
8
  "engines": {
@@ -92,7 +92,7 @@ async function configureMcpBrowser() {
92
92
  type: "stdio",
93
93
  command: "npx",
94
94
  args: ["-y", "mcpbrowser@latest"],
95
- description: "**USE AUTOMATICALLY** when normal URL fetch fails due to: 401/403 errors, login/auth requirements, SSO prompts, anti-crawler/bot detection. **FIRST FETCH**: Ask confirmation ONLY for first request to a domain - explain browser will open for authentication. **SUBSEQUENT REQUESTS**: Use automatically (session preserved). Fetches web pages using authenticated Chrome session via DevTools Protocol (works for Microsoft, GitHub, AWS, Google, corporate sites, etc.). Waits for web page fetch automatically (up to 60s, handles redirects)."
95
+ description: "Use AUTOMATICALLY on 401/403 errors, login pages, SSO prompts, or anti-bot blocks. First domain request: ask user confirmation (browser opens for auth). Subsequent same-domain: use automatically (session preserved). Returns HTML from authenticated Chrome session. Handles Microsoft, GitHub, AWS, Google, corporate sites."
96
96
  };
97
97
 
98
98
  // Write back to file with pretty formatting
package/package.json CHANGED
@@ -1,5 +1,5 @@
1
1
  { "name": "mcpbrowser",
2
- "version": "0.2.18",
2
+ "version": "0.2.19",
3
3
  "mcpName": "io.github.cherchyk/browser",
4
4
  "type": "module",
5
5
  "description": "MCP server that fetches protected web pages using Chrome DevTools Protocol",
package/server.json CHANGED
@@ -6,7 +6,7 @@
6
6
  "url": "https://github.com/cherchyk/MCPBrowser",
7
7
  "source": "github"
8
8
  },
9
- "version": "0.2.18",
9
+ "version": "0.2.19",
10
10
  "packages": [
11
11
  {
12
12
  "registryType": "npm",
@@ -60,7 +60,7 @@ function getDefaultChromePaths() {
60
60
  const defaultChromePaths = getDefaultChromePaths();
61
61
 
62
62
  let cachedBrowser = null;
63
- let lastKeptPage = null; // reuse the same tab when requested
63
+ let domainPages = new Map(); // hostname -> page mapping for tab reuse across domains
64
64
  let chromeLaunchPromise = null; // prevent multiple simultaneous launches
65
65
 
66
66
  async function devtoolsAvailable() {
@@ -152,19 +152,16 @@ async function getBrowser() {
152
152
  });
153
153
  cachedBrowser.on("disconnected", () => {
154
154
  cachedBrowser = null;
155
- lastKeptPage = null;
155
+ domainPages.clear(); // Clear all domain page mappings
156
156
  });
157
157
  return cachedBrowser;
158
158
  }
159
159
 
160
- async function fetchPage({
161
- url,
162
- keepPageOpen = true,
163
- outputFormat = "HTML",
164
- }) {
160
+ async function fetchPage({ url }) {
165
161
  // Hardcoded smart defaults
166
162
  const waitUntil = "networkidle0";
167
- const timeoutMs = 60000;
163
+ const navigationTimeout = 60000; // Initial navigation timeout
164
+ const authCompletionTimeout = 600000; // 10 minutes for user to complete authentication
168
165
  const reuseLastKeptPage = true;
169
166
 
170
167
  if (!url) {
@@ -173,36 +170,29 @@ async function fetchPage({
173
170
 
174
171
  const browser = await getBrowser();
175
172
  let page = null;
173
+ let hostname;
176
174
 
177
- // Smart tab reuse: only reuse if same domain (preserves auth within domain)
178
- if (reuseLastKeptPage && lastKeptPage && !lastKeptPage.isClosed()) {
179
- let newHostname;
180
- try {
181
- newHostname = new URL(url).hostname;
182
- } catch {
183
- throw new Error(`Invalid URL: ${url}`);
184
- }
185
- const currentUrl = lastKeptPage.url();
186
-
187
- if (currentUrl) {
188
- try {
189
- const currentHostname = new URL(currentUrl).hostname;
190
- // Reuse tab only if same domain (keeps auth session alive)
191
- if (currentHostname === newHostname) {
192
- page = lastKeptPage;
193
- await page.bringToFront().catch(() => {});
194
- } else {
195
- // Different domain - close old tab and create new one
196
- await lastKeptPage.close().catch(() => {});
197
- lastKeptPage = null;
198
- }
199
- } catch {
200
- // If URL parsing fails, create new tab
201
- }
175
+ // Parse hostname for domain-based tab reuse
176
+ try {
177
+ hostname = new URL(url).hostname;
178
+ } catch {
179
+ throw new Error(`Invalid URL: ${url}`);
180
+ }
181
+
182
+ // Check if we have an existing page for this domain
183
+ if (reuseLastKeptPage && domainPages.has(hostname)) {
184
+ const existingPage = domainPages.get(hostname);
185
+ if (!existingPage.isClosed()) {
186
+ page = existingPage;
187
+ await page.bringToFront().catch(() => {});
188
+ console.error(`[MCPBrowser] Reusing existing tab for domain: ${hostname}`);
189
+ } else {
190
+ // Page was closed externally, remove from map
191
+ domainPages.delete(hostname);
202
192
  }
203
193
  }
204
194
 
205
- // Create new tab if no reuse
195
+ // Create new tab if no existing page for this domain
206
196
  if (!page) {
207
197
  try {
208
198
  page = await browser.newPage();
@@ -225,50 +215,85 @@ async function fetchPage({
225
215
  throw new Error('Unable to create or find a controllable page');
226
216
  }
227
217
  }
218
+ // Add new page to domain map
219
+ domainPages.set(hostname, page);
220
+ console.error(`[MCPBrowser] Created new tab for domain: ${hostname}`);
228
221
  }
229
222
 
230
- let shouldKeepOpen = keepPageOpen || page === lastKeptPage;
223
+ let shouldKeepOpen = true;
231
224
  let wasSuccess = false;
232
225
  try {
233
226
  console.error(`[MCPBrowser] Navigating to: ${url}`);
234
- await page.goto(url, { waitUntil, timeout: timeoutMs });
235
- console.error(`[MCPBrowser] Navigation completed: ${page.url()}`);
227
+ await page.goto(url, { waitUntil, timeout: navigationTimeout });
236
228
 
237
- // Extract content based on outputFormat
238
- const result = { success: true, url: page.url() };
229
+ const currentUrl = page.url();
230
+ const currentHostname = new URL(currentUrl).hostname;
239
231
 
240
- if (outputFormat === "HTML" || outputFormat === "BOTH") {
241
- const html = await page.evaluate(() => document.documentElement?.outerHTML || "");
242
- result.html = truncate(html, 2000000);
243
- }
232
+ console.error(`[MCPBrowser] Navigation completed: ${currentUrl}`);
244
233
 
245
- if (outputFormat === "TEXT" || outputFormat === "BOTH") {
246
- const text = await page.evaluate(() => document.body?.innerText || "");
247
- result.text = truncate(text, 2000000);
234
+ // Check if we were redirected to a different domain (likely authentication)
235
+ if (currentHostname !== hostname) {
236
+ console.error(`[MCPBrowser] Detected redirect to authentication domain: ${currentHostname}`);
237
+ console.error(`[MCPBrowser] Waiting for user to complete authentication...`);
238
+ console.error(`[MCPBrowser] Will wait up to ${authCompletionTimeout / 1000} seconds for return to ${hostname}`);
239
+
240
+ // Wait for navigation back to the original domain
241
+ const authDeadline = Date.now() + authCompletionTimeout;
242
+ let authCompleted = false;
243
+
244
+ while (Date.now() < authDeadline) {
245
+ try {
246
+ // Check current URL
247
+ const checkUrl = page.url();
248
+ const checkHostname = new URL(checkUrl).hostname;
249
+
250
+ if (checkHostname === hostname) {
251
+ console.error(`[MCPBrowser] Authentication completed! Returned to: ${checkUrl}`);
252
+ authCompleted = true;
253
+ break;
254
+ }
255
+
256
+ // Wait a bit before checking again
257
+ await new Promise(resolve => setTimeout(resolve, 2000));
258
+ } catch (error) {
259
+ // Page might be navigating, continue waiting
260
+ await new Promise(resolve => setTimeout(resolve, 2000));
261
+ }
262
+ }
263
+
264
+ if (!authCompleted) {
265
+ const hint = `Authentication timeout. Tab is left open at ${page.url()}. Complete authentication and retry the same URL.`;
266
+ return { success: false, error: "Authentication timeout - user did not complete login", pageKeptOpen: true, hint };
267
+ }
268
+
269
+ // Wait for page to fully stabilize after auth redirect
270
+ console.error(`[MCPBrowser] Waiting for page to stabilize after authentication...`);
271
+ await new Promise(resolve => setTimeout(resolve, 3000)); // Give page time to settle
272
+
273
+ // Ensure page is ready
274
+ try {
275
+ await page.waitForFunction(() => document.readyState === 'complete', { timeout: 10000 });
276
+ } catch {
277
+ // Ignore timeout - page might already be ready
278
+ }
248
279
  }
249
280
 
281
+ // Extract HTML content
282
+ const html = await page.evaluate(() => document.documentElement?.outerHTML || "");
283
+ const preparedHtml = prepareHtml(html, page.url());
284
+ const result = {
285
+ success: true,
286
+ url: page.url(),
287
+ html: preparedHtml
288
+ };
289
+
250
290
  wasSuccess = true;
251
- if (keepPageOpen && lastKeptPage !== page) {
252
- // Close old kept page if we're keeping a different one
253
- if (lastKeptPage && !lastKeptPage.isClosed()) {
254
- await lastKeptPage.close().catch(() => {});
255
- }
256
- lastKeptPage = page;
257
- }
258
291
  return result;
259
292
  } catch (err) {
260
- shouldKeepOpen = shouldKeepOpen || keepPageOpen;
261
- const hint = shouldKeepOpen
262
- ? "Tab is left open. Complete sign-in there, then call fetch_webpage_protected again with just the URL."
263
- : undefined;
264
- return { success: false, error: err.message || String(err), pageKeptOpen: shouldKeepOpen, hint };
293
+ const hint = "Tab is left open. Complete sign-in there, then call fetch_webpage_protected again with just the URL.";
294
+ return { success: false, error: err.message || String(err), pageKeptOpen: true, hint };
265
295
  } finally {
266
- if (!shouldKeepOpen && lastKeptPage === page) {
267
- lastKeptPage = null;
268
- }
269
- if (!shouldKeepOpen) {
270
- await page.close().catch(() => {});
271
- }
296
+ // Tab always stays open - domain-aware reuse handles cleanup
272
297
  }
273
298
  }
274
299
 
@@ -277,19 +302,107 @@ function truncate(str, max) {
277
302
  return str.length > max ? `${str.slice(0, max)}... [truncated]` : str;
278
303
  }
279
304
 
305
/**
 * Reduces serialized page HTML to the parts useful for reading and link-following.
 *
 * Steps, in order:
 *   1. Remove HTML comments and <script>, <style>, <noscript>, <svg> elements with their content.
 *   2. Remove <meta> and <link> tags.
 *   3. Resolve relative `href` and `src` values against `baseUrl`.
 *   4. Strip style, class, id, data-*, on* (event handler), role and aria-* attributes.
 *   5. Collapse whitespace runs to a single space and drop whitespace between tags.
 *
 * This is regex-based clean-up, not an HTML parser: it works on the text of the markup.
 *
 * @param {string} html - Serialized markup. Any falsy value yields "".
 * @param {string} baseUrl - URL that relative links are resolved against. When a link
 *   cannot be resolved (for example because `baseUrl` is not a valid URL) the attribute
 *   is left exactly as it was.
 * @returns {string} The cleaned markup.
 */
function prepareHtml(html, baseUrl) {
  if (!html) return "";

  // A quoted attribute value whose closing quote must match its opening quote.
  // Matching "either quote" on both ends would cut a value such as
  // style="font-family: 'Arial'" at the first apostrophe and leave the tail behind.
  const quotedValue = `(?:"[^"]*"|'[^']*')`;

  // Remove HTML comments
  let cleaned = html.replace(/<!--[\s\S]*?-->/g, '');

  // Remove non-content elements together with everything up to their first closing tag
  for (const tag of ['script', 'style', 'noscript', 'svg']) {
    cleaned = cleaned.replace(new RegExp(`<${tag}\\b[\\s\\S]*?<\\/${tag}>`, 'gi'), '');
  }

  // Remove meta and link tags (stylesheets, preload, etc.)
  cleaned = cleaned.replace(/<(?:meta|link)\b[^>]*>/gi, '');

  // Rewrites every `attribute="..."` whose value is relative into an absolute URL.
  // Values starting with one of `passthroughPrefixes` are already absolute or are not
  // resolvable locations, so they are kept untouched.
  const absolutize = (attribute, passthroughPrefixes) => {
    const pattern = new RegExp(`${attribute}=(?:"([^"]*)"|'([^']*)')`, 'gi');
    cleaned = cleaned.replace(pattern, (match, doubleQuoted, singleQuoted) => {
      const url = doubleQuoted ?? singleQuoted;
      if (!url || passthroughPrefixes.some((prefix) => url.startsWith(prefix))) {
        return match;
      }
      try {
        return `${attribute}="${new URL(url, baseUrl).href}"`;
      } catch {
        // Unresolvable value or invalid base: keep the attribute as written.
        return match;
      }
    });
  };

  absolutize('href', ['http://', 'https://', '//', '#', 'mailto:', 'tel:']);
  absolutize('src', ['http://', 'https://', '//', 'data:']);

  // Strip presentational, scripting and accessibility attributes
  const removableAttributes = [
    `\\s+style=${quotedValue}`,
    `\\s+class=${quotedValue}`,
    `\\s+id=${quotedValue}`,
    `\\s+data-[a-z0-9-]+=${quotedValue}`,
    `\\s+on[a-z]+\\s*=\\s*${quotedValue}`,
    `\\s+role=${quotedValue}`,
    `\\s+aria-[a-z0-9-]+=${quotedValue}`,
  ];
  for (const source of removableAttributes) {
    cleaned = cleaned.replace(new RegExp(source, 'gi'), '');
  }

  // Collapse multiple whitespace/newlines into single space
  cleaned = cleaned.replace(/\s+/g, ' ');

  // Remove spaces between tags
  cleaned = cleaned.replace(/>\s+</g, '><');

  return cleaned;
}
394
+
280
395
  async function main() {
281
- const server = new Server({ name: "MCPBrowser", version: "0.2.18" }, { capabilities: { tools: {} } });
396
+ const server = new Server({ name: "MCPBrowser", version: "0.2.19" }, { capabilities: { tools: {} } });
282
397
 
283
398
  const tools = [
284
399
  {
285
400
  name: "fetch_webpage_protected",
286
- description: "BROWSER ACCESS FOR AUTHENTICATED WEB PAGES: Fetches web pages in Chrome browser using real user profile (with saved passwords, active sessions, extensions). Automatically handles login redirects, SSO, OAuth, and anti-crawler protections.\n\nWHEN TO USE: Login-required web pages, SSO/OAuth content, paywalled sites, 401/403 errors, or any auth-protected resources.\n\nAUTHENTICATION BEHAVIOR: First request to authenticated site returns login web page - this is EXPECTED. User authenticates in browser (10-30s). Retry same URL to get content. Keep retrying until success - authentication completes in background. Tab stays open to preserve session.\n\nCRITICAL RULES: (1) Process ONE URL at a time - never parallel requests. (2) Never give up or suggest alternatives - keep retrying until content retrieved. (3) Requested web page is critical - skipping not acceptable. (4) Skip and continue if web page returns 404 - web page doesn't exist.\n\nReturns: Configurable output format (HTML, TEXT, or BOTH). Tab reuse preserves sessions across same-domain requests.",
401
+ description: "Fetches protected web pages using Chrome browser with user's authenticated session (saved passwords, active logins, cookies). Use for: login-required pages, SSO/OAuth, paywalled sites, 401/403 errors, anti-crawler blocks.\n\nAUTH FLOW: If page redirects to login, function WAITS (up to 10 min) for user to authenticate in browser, then automatically returns actual content once auth completes. Single call returns correct content, no retry needed.\n\nRULES: (1) ONE URL at a time, never parallel. (2) Wait for full response - may take minutes for auth. (3) Skip only if 404. (4) Returns HTML with clickable links for subpage navigation.",
287
402
  inputSchema: {
288
403
  type: "object",
289
404
  properties: {
290
405
  url: { type: "string", description: "The URL to fetch" },
291
- keepPageOpen: { type: "boolean", description: "Keep tab open to reuse for subsequent same-domain requests - preserves auth session (default: true)" },
292
- outputFormat: { type: "string", enum: ["HTML", "TEXT", "BOTH"], description: "Output format: HTML for full markup with links/structure, TEXT for clean readable content (more token-efficient), BOTH for complete data (default: HTML)" },
293
406
  },
294
407
  required: ["url"],
295
408
  additionalProperties: false,
@@ -339,7 +452,13 @@ async function main() {
339
452
  await server.connect(transport);
340
453
  }
341
454
 
342
- main().catch((err) => {
343
- console.error(err);
344
- process.exit(1);
345
- });
455
+ // Export for testing
456
+ export { fetchPage, getBrowser, prepareHtml };
457
+
458
// Only run main if this is the entry point (importing the module, e.g. from tests, must not start the server).
// The two imports below are declared next to their only use so this guard is self-contained;
// ES module imports are hoisted, so their position in the file does not change evaluation order.
import { realpathSync as resolveRealPath } from 'node:fs';
import { pathToFileURL as toFileUrl } from 'node:url';

/**
 * Reports whether this module is the script the process was started with.
 *
 * `process.argv[1]` is compared as a proper file URL of its real path rather than
 * by gluing "file://" in front of it. The plain string comparison fails when the
 * script is started through a symlink (such as a package-manager bin shim), when the
 * path is a Windows path, or when it contains characters that are percent-encoded
 * in URLs - and in each of those cases the server would silently never start.
 *
 * @returns {boolean} true when the module should start the server itself.
 */
function isEntryPoint() {
  const scriptPath = process.argv[1];
  if (!scriptPath) return false;
  try {
    return import.meta.url === toFileUrl(resolveRealPath(scriptPath)).href;
  } catch {
    // Real path could not be resolved; fall back to the previous literal comparison.
    return import.meta.url === `file://${scriptPath}`;
  }
}

if (isEntryPoint()) {
  main().catch((err) => {
    console.error(err);
    process.exit(1);
  });
}
@@ -0,0 +1,329 @@
1
+ /**
2
+ * UNIT TESTS - Automated tests using mock objects (NO browser required)
3
+ * These tests validate domain pooling logic without opening Chrome
4
+ * Run with: node tests/domain-tab-pooling.test.js
5
+ */
6
+
7
+ // Mock domain pages map and browser
8
+ class MockPage {
9
+ constructor(url) {
10
+ this._url = url;
11
+ this._closed = false;
12
+ this._content = '';
13
+ }
14
+
15
+ url() { return this._url; }
16
+ isClosed() { return this._closed; }
17
+ close() { this._closed = true; }
18
+ async bringToFront() {}
19
+ async goto(url) {
20
+ this._url = url;
21
+ // Simulate eng.ms page with multiple same-domain links
22
+ if (url.includes('eng.ms/docs/products/geneva')) {
23
+ this._content = `
24
+ <html>
25
+ <body>
26
+ <h1>Geneva Documentation</h1>
27
+ <a href="https://eng.ms/docs/products/geneva/getting-started">Getting Started</a>
28
+ <a href="https://eng.ms/docs/products/geneva/configuration">Configuration</a>
29
+ <a href="https://eng.ms/docs/products/geneva/monitoring">Monitoring</a>
30
+ <a href="https://eng.ms/docs/products/geneva/alerts">Alerts</a>
31
+ <a href="https://eng.ms/docs/products/geneva/best-practices">Best Practices</a>
32
+ <a href="https://external.com/link">External Link</a>
33
+ </body>
34
+ </html>
35
+ `;
36
+ }
37
+ }
38
+ async evaluate(fn) {
39
+ if (this._content) {
40
+ return fn.toString().includes('outerHTML') ? this._content : fn();
41
+ }
42
+ return fn();
43
+ }
44
+ }
45
+
46
+ class MockBrowser {
47
+ constructor() {
48
+ this._pages = [];
49
+ }
50
+
51
+ async newPage() {
52
+ const page = new MockPage('about:blank');
53
+ this._pages.push(page);
54
+ return page;
55
+ }
56
+
57
+ async pages() {
58
+ return this._pages;
59
+ }
60
+ }
61
+
62
+ // Test framework
63
+ let testsPassed = 0;
64
+ let testsFailed = 0;
65
+
66
+ function assert(condition, message) {
67
+ if (!condition) {
68
+ console.error(`❌ FAILED: ${message}`);
69
+ testsFailed++;
70
+ throw new Error(message);
71
+ } else {
72
+ console.log(`✅ PASSED: ${message}`);
73
+ testsPassed++;
74
+ }
75
+ }
76
+
77
/**
 * Runs one named test case and logs its outcome.
 *
 * A failing assert() call counts itself in `testsFailed` before throwing. Any other
 * exception raised by the test body is counted here, so a crashed test cannot leave
 * the failure counter at zero and let the suite exit successfully.
 *
 * @param {string} name - Label printed before the test runs.
 * @param {() => (void | Promise<void>)} fn - Test body; may be async.
 * @returns {Promise<void>} Resolves after the test has finished; never rejects.
 */
async function test(name, fn) {
  console.log(`\n🧪 Test: ${name}`);
  const failuresBefore = testsFailed;
  try {
    await fn();
  } catch (error) {
    // Only count the failure if assert() has not already done so for this test.
    if (testsFailed === failuresBefore) {
      testsFailed++;
    }
    console.error(` Error: ${error?.message ?? error}`);
  }
}
85
+
86
+ // Tests
87
+ async function runTests() {
88
+ console.log('🚀 Starting Domain Tab Pooling Tests\n');
89
+
90
+ await test('Should create new tab for first domain', async () => {
91
+ const domainPages = new Map();
92
+ const browser = new MockBrowser();
93
+ const url = 'https://github.com/user/repo';
94
+ const hostname = new URL(url).hostname;
95
+
96
+ // No existing page for this domain
97
+ assert(!domainPages.has(hostname), 'Domain should not exist in map initially');
98
+
99
+ // Create new page
100
+ const page = await browser.newPage();
101
+ domainPages.set(hostname, page);
102
+
103
+ assert(domainPages.has(hostname), 'Domain should be added to map');
104
+ assert(domainPages.get(hostname) === page, 'Correct page should be stored');
105
+ });
106
+
107
+ await test('Should reuse tab for same domain', async () => {
108
+ const domainPages = new Map();
109
+ const browser = new MockBrowser();
110
+ const hostname = 'github.com';
111
+
112
+ // Create first page for domain
113
+ const page1 = await browser.newPage();
114
+ await page1.goto('https://github.com/repo1');
115
+ domainPages.set(hostname, page1);
116
+
117
+ // Try to fetch another URL from same domain
118
+ const existingPage = domainPages.get(hostname);
119
+ assert(existingPage === page1, 'Should return same page for same domain');
120
+ assert(!existingPage.isClosed(), 'Page should still be open');
121
+ });
122
+
123
+ await test('Should create new tab for different domain', async () => {
124
+ const domainPages = new Map();
125
+ const browser = new MockBrowser();
126
+
127
+ // First domain
128
+ const page1 = await browser.newPage();
129
+ await page1.goto('https://github.com/repo');
130
+ domainPages.set('github.com', page1);
131
+
132
+ // Second domain - should create new tab
133
+ const hostname2 = 'microsoft.com';
134
+ assert(!domainPages.has(hostname2), 'Second domain should not exist yet');
135
+
136
+ const page2 = await browser.newPage();
137
+ await page2.goto('https://microsoft.com/docs');
138
+ domainPages.set(hostname2, page2);
139
+
140
+ assert(domainPages.has('github.com'), 'First domain should still exist');
141
+ assert(domainPages.has('microsoft.com'), 'Second domain should now exist');
142
+ assert(page1 !== page2, 'Should be different page objects');
143
+ assert(!page1.isClosed(), 'First page should still be open');
144
+ });
145
+
146
+ await test('Should reuse tab when returning to previous domain', async () => {
147
+ const domainPages = new Map();
148
+ const browser = new MockBrowser();
149
+
150
+ // Domain 1
151
+ const page1 = await browser.newPage();
152
+ domainPages.set('github.com', page1);
153
+
154
+ // Domain 2
155
+ const page2 = await browser.newPage();
156
+ domainPages.set('microsoft.com', page2);
157
+
158
+ // Back to domain 1
159
+ const reusedPage = domainPages.get('github.com');
160
+ assert(reusedPage === page1, 'Should reuse original page for domain 1');
161
+ assert(!reusedPage.isClosed(), 'Reused page should still be open');
162
+ assert(domainPages.size === 2, 'Should have 2 domains in map');
163
+ });
164
+
165
+ await test('Should handle closed tabs gracefully', async () => {
166
+ const domainPages = new Map();
167
+ const browser = new MockBrowser();
168
+ const hostname = 'github.com';
169
+
170
+ // Create and store page
171
+ const page = await browser.newPage();
172
+ domainPages.set(hostname, page);
173
+
174
+ // Simulate user closing the tab
175
+ page.close();
176
+
177
+ // Check if page is closed
178
+ const existingPage = domainPages.get(hostname);
179
+ if (existingPage && existingPage.isClosed()) {
180
+ domainPages.delete(hostname);
181
+ }
182
+
183
+ assert(!domainPages.has(hostname), 'Closed page should be removed from map');
184
+ });
185
+
186
+ await test('Should extract hostname correctly from URLs', async () => {
187
+ const testCases = [
188
+ { url: 'https://github.com/user/repo', expected: 'github.com' },
189
+ { url: 'https://microsoft.com/docs/page', expected: 'microsoft.com' },
190
+ { url: 'https://subdomain.example.com/path', expected: 'subdomain.example.com' },
191
+ { url: 'http://localhost:3000/test', expected: 'localhost' },
192
+ ];
193
+
194
+ for (const { url, expected } of testCases) {
195
+ const hostname = new URL(url).hostname;
196
+ assert(hostname === expected, `Hostname for ${url} should be ${expected}, got ${hostname}`);
197
+ }
198
+ });
199
+
200
+ await test('Should handle invalid URLs', async () => {
201
+ let errorThrown = false;
202
+ try {
203
+ new URL('not-a-valid-url');
204
+ } catch (error) {
205
+ errorThrown = true;
206
+ }
207
+ assert(errorThrown, 'Invalid URL should throw error');
208
+ });
209
+
210
+ await test('Should clear all pages on browser disconnect', async () => {
211
+ const domainPages = new Map();
212
+ const browser = new MockBrowser();
213
+
214
+ // Add multiple domains
215
+ const page1 = await browser.newPage();
216
+ domainPages.set('github.com', page1);
217
+
218
+ const page2 = await browser.newPage();
219
+ domainPages.set('microsoft.com', page2);
220
+
221
+ const page3 = await browser.newPage();
222
+ domainPages.set('google.com', page3);
223
+
224
+ assert(domainPages.size === 3, 'Should have 3 domains before disconnect');
225
+
226
+ // Simulate browser disconnect
227
+ domainPages.clear();
228
+
229
+ assert(domainPages.size === 0, 'All domains should be cleared after disconnect');
230
+ });
231
+
232
+ await test('Should handle multiple requests to same domain', async () => {
233
+ const domainPages = new Map();
234
+ const browser = new MockBrowser();
235
+ const hostname = 'github.com';
236
+
237
+ // First request
238
+ const page = await browser.newPage();
239
+ await page.goto('https://github.com/repo1');
240
+ domainPages.set(hostname, page);
241
+
242
+ // Multiple subsequent requests to same domain
243
+ for (let i = 2; i <= 5; i++) {
244
+ const existingPage = domainPages.get(hostname);
245
+ assert(existingPage === page, `Request ${i} should reuse same page`);
246
+ await existingPage.goto(`https://github.com/repo${i}`);
247
+ }
248
+
249
+ assert(domainPages.size === 1, 'Should still have only 1 domain in map');
250
+ });
251
+
252
+ await test('Should open internal eng.ms page', async () => {
253
+ const domainPages = new Map();
254
+ const browser = new MockBrowser();
255
+ const url = 'https://eng.ms/docs/products/geneva';
256
+ const hostname = new URL(url).hostname;
257
+
258
+ // First request to eng.ms domain
259
+ assert(!domainPages.has(hostname), 'eng.ms domain should not exist initially');
260
+
261
+ const page = await browser.newPage();
262
+ await page.goto(url);
263
+ domainPages.set(hostname, page);
264
+
265
+ assert(domainPages.has(hostname), 'eng.ms domain should be added to map');
266
+ assert(page.url() === url, 'Page URL should match requested URL');
267
+ assert(!page.isClosed(), 'Page should remain open');
268
+ });
269
+
270
+ await test('Should extract and load 5 URLs from same domain', async () => {
271
+ const domainPages = new Map();
272
+ const browser = new MockBrowser();
273
+ const initialUrl = 'https://eng.ms/docs/products/geneva';
274
+ const hostname = new URL(initialUrl).hostname;
275
+
276
+ // First: Load the initial page
277
+ const page = await browser.newPage();
278
+ await page.goto(initialUrl);
279
+ domainPages.set(hostname, page);
280
+
281
+ // Extract HTML content
282
+ const html = await page.evaluate(() => document.documentElement.outerHTML);
283
+ assert(html.includes('Geneva Documentation'), 'Page should contain Geneva content');
284
+
285
+ // Extract URLs from the same domain
286
+ const urlPattern = /href="(https:\/\/eng\.ms\/[^"]+)"/g;
287
+ const extractedUrls = [];
288
+ let match;
289
+ while ((match = urlPattern.exec(html)) !== null && extractedUrls.length < 5) {
290
+ extractedUrls.push(match[1]);
291
+ }
292
+
293
+ assert(extractedUrls.length === 5, `Should extract 5 URLs, got ${extractedUrls.length}`);
294
+
295
+ // Verify all URLs are from eng.ms domain
296
+ for (const url of extractedUrls) {
297
+ const urlHostname = new URL(url).hostname;
298
+ assert(urlHostname === hostname, `All URLs should be from ${hostname}, got ${urlHostname}`);
299
+ }
300
+
301
+ // Load each of the 5 URLs and verify tab reuse
302
+ const reusedPage = domainPages.get(hostname);
303
+ assert(reusedPage === page, 'Should reuse same page for same domain');
304
+
305
+ for (let i = 0; i < extractedUrls.length; i++) {
306
+ await reusedPage.goto(extractedUrls[i]);
307
+ assert(reusedPage.url() === extractedUrls[i], `URL ${i+1} should be loaded: ${extractedUrls[i]}`);
308
+ assert(!reusedPage.isClosed(), `Page should remain open after loading URL ${i+1}`);
309
+ }
310
+
311
+ assert(domainPages.size === 1, 'Should still have only 1 domain (eng.ms) in map after all loads');
312
+ });
313
+
314
+ // Summary
315
+ console.log('\n' + '='.repeat(50));
316
+ console.log(`✅ Tests Passed: ${testsPassed}`);
317
+ console.log(`❌ Tests Failed: ${testsFailed}`);
318
+ console.log('='.repeat(50));
319
+
320
+ if (testsFailed > 0) {
321
+ process.exit(1);
322
+ }
323
+ }
324
+
325
+ // Run tests
326
+ runTests().catch(error => {
327
+ console.error('Test suite failed:', error);
328
+ process.exit(1);
329
+ });
@@ -0,0 +1,158 @@
1
+ /**
2
+ * Integration tests - REQUIRES REAL CHROME AND USER AUTHENTICATION
3
+ * These tests will actually open Chrome browser and require manual login
4
+ * Run with: node tests/integration.test.js
5
+ */
6
+
7
+ import { fileURLToPath } from 'url';
8
+ import path from 'path';
9
+ import { fetchPage } from '../src/mcp-browser.js';
10
+
11
// ES modules have no built-in __filename/__dirname; derive them from the module URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// Minimal test framework: running totals shared by assert() and the final summary.
let testsPassed = 0;
let testsFailed = 0;

/**
 * Records the outcome of a single check.
 *
 * A truthy `condition` is logged as passed and counted. A falsy one is logged
 * as failed, counted, and then raised as an Error so the current test stops.
 *
 * @param {*} condition - Value expected to be truthy.
 * @param {string} message - Description used in the log line and the thrown Error.
 * @throws {Error} When `condition` is falsy.
 */
function assert(condition, message) {
  if (condition) {
    console.log(`✅ PASSED: ${message}`);
    testsPassed += 1;
    return;
  }

  console.error(`❌ FAILED: ${message}`);
  testsFailed += 1;
  throw new Error(message);
}
28
+
29
/**
 * Runs one named test case and reports any error it throws.
 *
 * Failures raised through assert() are counted there before the throw. Any
 * other error escaping `fn` (for example a rejected call inside the test
 * body) was previously logged but never counted, so the suite could finish
 * with zero recorded failures. The failure counter is therefore compared
 * before and after the run, and an uncounted error is recorded exactly once.
 *
 * @param {string} name - Test title printed before the body runs.
 * @param {() => (void|Promise<void>)} fn - Test body; may be async.
 * @returns {Promise<void>} Settles after the body has finished; never rejects.
 */
async function test(name, fn) {
  console.log(`\n🧪 Test: ${name}`);
  const failuresBefore = testsFailed;
  try {
    await fn();
  } catch (error) {
    // Only count the error here if assert() has not already done so.
    if (testsFailed === failuresBefore) {
      testsFailed++;
    }
    // Thrown values are not guaranteed to be Error objects (e.g. `throw null`).
    console.error(` Error: ${error?.message ?? String(error)}`);
  }
}
37
+
38
+ // Integration Tests
39
/**
 * Reports whether `hostname` is `domain` itself or one of its subdomains.
 *
 * @param {string} hostname - Host part of a parsed URL.
 * @param {string} domain - Domain to match against, e.g. 'eng.ms'.
 * @returns {boolean}
 */
function isOnDomain(hostname, domain) {
  return hostname === domain || hostname.endsWith(`.${domain}`);
}

/**
 * Collects links to pages on `domain` from raw HTML.
 *
 * Each href is resolved against `baseUrl` with the URL parser, so relative and
 * protocol-relative links get their real host. The host is then compared
 * exactly, because a substring match also accepts foreign URLs that merely
 * mention the domain. Duplicates are kept so the caller can report both the
 * total and the unique count.
 *
 * @param {string} html - Page markup to scan.
 * @param {URL} baseUrl - URL the markup was loaded from.
 * @param {string} domain - Domain the links must belong to.
 * @returns {string[]} Absolute URLs in document order.
 */
function extractPageUrls(html, baseUrl, domain) {
  const hrefPattern = /href=["']([^"']+)["']/g;
  // Static asset extensions to skip
  const skipExtensions = ['.css', '.js', '.ico', '.png', '.jpg', '.jpeg', '.gif', '.svg', '.woff', '.woff2', '.ttf', '.eot'];
  const pageUrls = [];

  for (const [, href] of html.matchAll(hrefPattern)) {
    // Skip anchor links
    if (href.includes('#')) continue;

    let resolved;
    try {
      resolved = new URL(href, baseUrl);
    } catch {
      continue; // Not a parseable URL
    }

    // Drops mailto:, javascript:, tel: and similar non-page schemes
    if (resolved.protocol !== 'http:' && resolved.protocol !== 'https:') continue;

    // Skip static assets (pathname excludes the query string)
    const pathname = resolved.pathname.toLowerCase();
    if (skipExtensions.some((ext) => pathname.endsWith(ext))) continue;

    if (isOnDomain(resolved.hostname, domain)) {
      pageUrls.push(resolved.href);
    }
  }

  return pageUrls;
}

/**
 * Picks up to `count` items uniformly at random without mutating `items`.
 *
 * Uses a Fisher–Yates shuffle on a copy; sorting with a random comparator
 * gives a biased order.
 *
 * @param {Array} items - Candidates to choose from.
 * @param {number} count - Maximum number of items to return.
 * @returns {Array} A new array with at most `count` items.
 */
function pickRandom(items, count) {
  const pool = [...items];
  for (let i = pool.length - 1; i > 0; i--) {
    const j = Math.floor(Math.random() * (i + 1));
    [pool[i], pool[j]] = [pool[j], pool[i]];
  }
  return pool.slice(0, count);
}

/**
 * Runs the integration scenario: fetch a start page through fetchPage(),
 * extract same-domain links from the returned HTML, then fetch up to five of
 * them chosen at random.
 *
 * Prints a pass/fail summary and ALWAYS ends the process from the `finally`
 * block (exit code 1 if any failure was recorded, otherwise 0), so nothing
 * after a call to this function runs.
 *
 * @returns {Promise<void>}
 */
async function runIntegrationTests() {
  const targetDomain = 'eng.ms';
  const maxLinks = 5;

  console.log('🚀 Starting Integration Tests (REAL CHROME)\n');
  console.log('⚠️ This will open Chrome browser and may require authentication');
  console.log('⚠️ fetchPage function will WAIT for you to complete authentication\n');

  try {
    await test('Should fetch eng.ms page, extract links, and load them (full Copilot workflow)', async () => {
      const url = 'https://eng.ms/docs/products/geneva';

      // Step 1: Fetch initial page (with auth waiting)
      console.log(` 📄 Step 1: Fetching ${url}`);
      console.log(` ⏳ Function will wait up to 10 minutes for authentication...`);
      console.log(` 💡 Complete login in the browser that opens`);

      const result = await fetchPage({ url });

      console.log(` ✅ Result: ${result.success ? 'SUCCESS' : 'FAILED'}`);
      if (result.success) {
        console.log(` 🔗 Final URL: ${result.url}`);
        console.log(` 📄 HTML length: ${result.html?.length ?? 0} chars`);
      } else {
        console.log(` ❌ Error: ${result.error}`);
        console.log(` 💡 Hint: ${result.hint}`);
      }

      assert(result.success, 'Should successfully fetch page after authentication');
      const baseUrl = new URL(result.url);
      assert(isOnDomain(baseUrl.hostname, targetDomain), `URL should be from eng.ms domain, got: ${result.url}`);
      assert(result.html && result.html.length > 0, 'Should return HTML content');

      // Step 2: Extract ALL links from HTML, then pick 5 randomly
      console.log(`\n 📋 Step 2: Extracting all links from HTML...`);

      const allUrls = extractPageUrls(result.html, baseUrl, targetDomain);
      console.log(` 📊 Total page URLs found: ${allUrls.length}`);

      // Remove duplicates
      const uniqueUrls = [...new Set(allUrls)];
      console.log(` 🔗 Unique page URLs: ${uniqueUrls.length}`);

      // Randomly pick 5 URLs
      const extractedUrls = pickRandom(uniqueUrls, maxLinks);

      console.log(` 🎲 Randomly selected ${extractedUrls.length} URLs to test:`);
      extractedUrls.forEach((link, i) => console.log(` ${i + 1}. ${link}`));

      assert(extractedUrls.length > 0, `Should extract at least one eng.ms URL, found ${extractedUrls.length}`);

      // Step 3: Load each extracted URL (tab reuse)
      console.log(`\n 🔄 Step 3: Loading extracted links (using same tab)...`);

      // Sequential on purpose: every link goes through the same fetchPage() flow, one after another.
      for (const [i, link] of extractedUrls.entries()) {
        console.log(` 📄 Loading link ${i + 1}/${extractedUrls.length}: ${link}`);

        const linkResult = await fetchPage({ url: link });

        console.log(` ✅ Loaded: ${linkResult.url}`);
        assert(linkResult.success, `Should successfully load link ${i + 1}: ${link}`);
        assert(linkResult.html && linkResult.html.length > 0, `Link ${i + 1} should return HTML content`);
      }
    });
  } catch (error) {
    console.error('\n❌ Test suite error:', error.message);
    testsFailed++;
  } finally {
    // Summary
    console.log('\n' + '='.repeat(50));
    console.log(`✅ Tests Passed: ${testsPassed}`);
    console.log(`❌ Tests Failed: ${testsFailed}`);
    console.log('='.repeat(50));
    console.log('\n💡 Browser left open for manual inspection');

    // Exit immediately without waiting for browser
    process.exit(testsFailed > 0 ? 1 : 0);
  }
}
153
+
154
+ // Run tests
155
// Entry point: launch the integration run. Any rejection that escapes
// runIntegrationTests() is reported and turned into a non-zero exit code.
runIntegrationTests().then(undefined, (failure) => {
  console.error('Test suite failed:', failure);
  process.exit(1);
});
@@ -0,0 +1,307 @@
1
+ import assert from 'assert';
2
+ import { prepareHtml } from '../src/mcp-browser.js';
3
+
4
console.log('🧪 Testing prepareHtml function\n');

// Running totals updated by test().
let testsPassed = 0;
let testsFailed = 0;

/**
 * Runs one synchronous test case and records the outcome.
 *
 * The body counts as passed when it returns normally and as failed when it
 * throws. The thrown value is not assumed to be an Error: reading `.message`
 * from `null`/`undefined` would itself throw inside the handler and abort the
 * whole run, so the message lookup is guarded.
 *
 * @param {string} description - Test title printed next to the result marker.
 * @param {() => void} fn - Synchronous test body; throw to signal failure.
 * @returns {void}
 */
function test(description, fn) {
  try {
    fn();
    console.log(`✅ ${description}`);
    testsPassed++;
  } catch (err) {
    console.log(`❌ ${description}`);
    console.log(` Error: ${err?.message ?? String(err)}`);
    testsFailed++;
  }
}
20
+
21
// Test 1: HTML comments are stripped while surrounding text stays
test('Should remove HTML comments', () => {
  const markup = '<div>Content<!-- This is a comment --></div>';
  const cleaned = prepareHtml(markup, 'https://example.com');
  assert.ok(!cleaned.includes('<!--'), 'Should not contain comment start');
  assert.ok(!cleaned.includes('-->'), 'Should not contain comment end');
  assert.ok(cleaned.includes('Content'), 'Should preserve content');
});

// Test 2: <script> elements disappear together with their code
test('Should remove script tags and their content', () => {
  const markup = '<div>Keep this</div><script>alert("remove");</script><div>And this</div>';
  const cleaned = prepareHtml(markup, 'https://example.com');
  assert.ok(!cleaned.includes('<script'), 'Should not contain script tag');
  assert.ok(!cleaned.includes('alert'), 'Should not contain script content');
  assert.ok(cleaned.includes('Keep this'), 'Should preserve content');
});

// Test 3: <style> elements disappear together with their rules
test('Should remove style tags and their content', () => {
  const markup = '<div>Content</div><style>.class { color: red; }</style>';
  const cleaned = prepareHtml(markup, 'https://example.com');
  assert.ok(!cleaned.includes('<style'), 'Should not contain style tag');
  assert.ok(!cleaned.includes('color: red'), 'Should not contain style content');
  assert.ok(cleaned.includes('Content'), 'Should preserve content');
});

// Test 4: <meta> tags are dropped from the head
test('Should remove meta tags', () => {
  const markup = '<head><meta charset="utf-8"><meta name="viewport" content="width=device-width"></head><body>Content</body>';
  const cleaned = prepareHtml(markup, 'https://example.com');
  assert.ok(!cleaned.includes('<meta'), 'Should not contain meta tags');
  assert.ok(cleaned.includes('Content'), 'Should preserve content');
});
55
+
56
// Test 5: root-relative href values are resolved against the page origin
test('Should convert relative href URLs to absolute', () => {
  const markup = '<a href="/docs/page">Link</a>';
  const cleaned = prepareHtml(markup, 'https://example.com');
  assert.ok(cleaned.includes('href="https://example.com/docs/page"'), 'Should convert relative href to absolute');
});

// Test 6: absolute href values pass through untouched
test('Should keep absolute href URLs unchanged', () => {
  const markup = '<a href="https://other.com/page">Link</a>';
  const cleaned = prepareHtml(markup, 'https://example.com');
  assert.ok(cleaned.includes('href="https://other.com/page"'), 'Should keep absolute href unchanged');
});

// Test 7: root-relative src values are resolved against the page origin
test('Should convert relative src URLs to absolute', () => {
  const markup = '<img src="/images/logo.png">';
  const cleaned = prepareHtml(markup, 'https://example.com');
  assert.ok(cleaned.includes('src="https://example.com/images/logo.png"'), 'Should convert relative src to absolute');
});

// Test 8: absolute src values pass through untouched
test('Should keep absolute src URLs unchanged', () => {
  const markup = '<img src="https://cdn.example.com/logo.png">';
  const cleaned = prepareHtml(markup, 'https://example.com');
  assert.ok(cleaned.includes('src="https://cdn.example.com/logo.png"'), 'Should keep absolute src unchanged');
});

// Test 9: in-page anchors are left exactly as written
test('Should not modify anchor links', () => {
  const markup = '<a href="#section">Jump</a>';
  const cleaned = prepareHtml(markup, 'https://example.com');
  assert.ok(cleaned.includes('href="#section"'), 'Should keep anchor links unchanged');
});

// Test 10: mailto: and tel: links are left exactly as written
test('Should not modify mailto and tel links', () => {
  const markup = '<a href="mailto:test@example.com">Email</a><a href="tel:+1234567890">Call</a>';
  const cleaned = prepareHtml(markup, 'https://example.com');
  assert.ok(cleaned.includes('href="mailto:test@example.com"'), 'Should keep mailto unchanged');
  assert.ok(cleaned.includes('href="tel:+1234567890"'), 'Should keep tel unchanged');
});
98
+
99
// Test 11: Handle data URIs in src (should not modify)
// The fixture previously had an empty src attribute, so no data URI was ever
// passed through prepareHtml; use a real `data:` value so the check means
// what the test title says.
test('Should not modify data URIs', () => {
  const dataUri = 'data:image/png;base64,iVBORw0KGgo=';
  const html = `<img src="${dataUri}">`;
  const result = prepareHtml(html, 'https://example.com');
  assert(result.includes(`src="${dataUri}"`), 'Should keep data URI unchanged');
});
105
+
106
// Test 12: protocol-relative URLs (//host/path) are left exactly as written
test('Should not modify protocol-relative URLs', () => {
  const markup = '<img src="//cdn.example.com/image.png">';
  const cleaned = prepareHtml(markup, 'https://example.com');
  assert.ok(cleaned.includes('src="//cdn.example.com/image.png"'), 'Should keep protocol-relative URL unchanged');
});

// Test 13: empty and null input both yield an empty string
test('Should handle empty HTML', () => {
  const cleaned = prepareHtml('', 'https://example.com');
  assert.strictEqual(cleaned, '', 'Should return empty string');
});

test('Should handle null HTML', () => {
  const cleaned = prepareHtml(null, 'https://example.com');
  assert.strictEqual(cleaned, '', 'Should return empty string for null');
});

// Test 14: a full document exercising several rules at once
test('Should handle complex HTML with multiple elements', () => {
  const markup = `
    <!DOCTYPE html>
    <html>
    <head>
    <meta charset="utf-8">
    <title>Test Page</title>
    <style>.test { color: blue; }</style>
    <script>console.log("test");</script>
    </head>
    <body>
    <!-- Main content -->
    <div>
    <a href="/page1">Page 1</a>
    <a href="https://external.com">External</a>
    <img src="/images/pic.jpg">
    <script>alert("inline");</script>
    </div>
    </body>
    </html>
  `;
  const cleaned = prepareHtml(markup, 'https://example.com/test/');

  // Removed elements must be gone
  const mustBeAbsent = [
    ['<meta', 'Should remove meta'],
    ['<style', 'Should remove style'],
    ['<script', 'Should remove script'],
    ['<!--', 'Should remove comments'],
  ];
  for (const [needle, message] of mustBeAbsent) {
    assert.ok(!cleaned.includes(needle), message);
  }

  // Relative URLs resolved, absolute URLs untouched, text preserved
  const mustBePresent = [
    ['href="https://example.com/page1"', 'Should convert relative href'],
    ['src="https://example.com/images/pic.jpg"', 'Should convert relative src'],
    ['href="https://external.com"', 'Should keep absolute href'],
    ['Page 1', 'Should preserve content'],
  ];
  for (const [needle, message] of mustBePresent) {
    assert.ok(cleaned.includes(needle), message);
  }
});

// Test 15: a <script> carrying attributes is removed as well
test('Should remove script tags with various attributes', () => {
  const markup = '<script type="text/javascript" async defer src="/app.js">console.log("test");</script>';
  const cleaned = prepareHtml(markup, 'https://example.com');
  assert.ok(!cleaned.includes('<script'), 'Should remove script with attributes');
  assert.ok(!cleaned.includes('app.js'), 'Should remove script content');
});
172
+
173
// Test 16: inline style attributes are stripped
test('Should remove inline style attributes', () => {
  const markup = '<div style="color: red; font-size: 14px;">Content</div>';
  const cleaned = prepareHtml(markup, 'https://example.com');
  assert.ok(!cleaned.includes('style='), 'Should remove style attribute');
  assert.ok(cleaned.includes('Content'), 'Should preserve content');
});

// Test 17: class attributes are stripped
test('Should remove class attributes', () => {
  const markup = '<div class="container main-content">Text</div>';
  const cleaned = prepareHtml(markup, 'https://example.com');
  assert.ok(!cleaned.includes('class='), 'Should remove class attribute');
  assert.ok(cleaned.includes('Text'), 'Should preserve content');
});

// Test 18: id attributes are stripped
test('Should remove id attributes', () => {
  const markup = '<div id="main-section">Content</div>';
  const cleaned = prepareHtml(markup, 'https://example.com');
  assert.ok(!cleaned.includes('id='), 'Should remove id attribute');
  assert.ok(cleaned.includes('Content'), 'Should preserve content');
});

// Test 19: data-* attributes are stripped
test('Should remove data-* attributes', () => {
  const markup = '<div data-id="123" data-value="test">Content</div>';
  const cleaned = prepareHtml(markup, 'https://example.com');
  assert.ok(!cleaned.includes('data-'), 'Should remove data attributes');
  assert.ok(cleaned.includes('Content'), 'Should preserve content');
});

// Test 20: inline event handlers are stripped
test('Should remove event handler attributes', () => {
  const markup = '<button onclick="handleClick()" onmouseover="hover()">Click</button>';
  const cleaned = prepareHtml(markup, 'https://example.com');
  assert.ok(!cleaned.includes('onclick='), 'Should remove onclick');
  assert.ok(!cleaned.includes('onmouseover='), 'Should remove onmouseover');
  assert.ok(cleaned.includes('Click'), 'Should preserve content');
});
213
+
214
// Test 21: <svg> elements disappear together with their children
test('Should remove SVG tags and content', () => {
  const markup = '<div>Text</div><svg width="100" height="100"><circle cx="50" cy="50" r="40"/></svg>';
  const cleaned = prepareHtml(markup, 'https://example.com');
  assert.ok(!cleaned.includes('<svg'), 'Should remove svg tag');
  assert.ok(!cleaned.includes('circle'), 'Should remove svg content');
  assert.ok(cleaned.includes('Text'), 'Should preserve content');
});

// Test 22: <noscript> elements disappear together with their fallback text
test('Should remove noscript tags and content', () => {
  const markup = '<div>Content</div><noscript>JavaScript is disabled</noscript>';
  const cleaned = prepareHtml(markup, 'https://example.com');
  assert.ok(!cleaned.includes('<noscript'), 'Should remove noscript tag');
  assert.ok(!cleaned.includes('JavaScript is disabled'), 'Should remove noscript content');
  assert.ok(cleaned.includes('Content'), 'Should preserve content');
});

// Test 23: <link> tags are dropped
test('Should remove link tags', () => {
  const markup = '<head><link rel="stylesheet" href="/style.css"><link rel="preload" as="script"></head>';
  const cleaned = prepareHtml(markup, 'https://example.com');
  assert.ok(!cleaned.includes('<link'), 'Should remove link tags');
});

// Test 24: role attributes are stripped
test('Should remove role attributes', () => {
  const markup = '<nav role="navigation">Menu</nav>';
  const cleaned = prepareHtml(markup, 'https://example.com');
  assert.ok(!cleaned.includes('role='), 'Should remove role attribute');
  assert.ok(cleaned.includes('Menu'), 'Should preserve content');
});

// Test 25: aria-* attributes are stripped
test('Should remove aria-* attributes', () => {
  const markup = '<button aria-label="Close" aria-pressed="false">X</button>';
  const cleaned = prepareHtml(markup, 'https://example.com');
  assert.ok(!cleaned.includes('aria-'), 'Should remove aria attributes');
  assert.ok(cleaned.includes('X'), 'Should preserve content');
});
254
+
255
// Test 26: Collapse whitespace
// The "multiple spaces" check must look for TWO consecutive spaces. Checking
// for a single space contradicts the 'Line 1' assertion below and could never
// pass. The fixture also contains a run of spaces so the check is exercised.
test('Should collapse multiple whitespace into single space', () => {
  const html = '<div>Line 1\n\n\n    Line 2\t\t\tLine 3</div>';
  const result = prepareHtml(html, 'https://example.com');
  assert(!result.includes('\n\n'), 'Should remove multiple newlines');
  assert(!result.includes('  '), 'Should remove multiple spaces');
  assert(result.includes('Line 1'), 'Should preserve content');
});
263
+
264
// Test 27: one fragment that triggers every removal rule at once
test('Should handle HTML with all types of removals', () => {
  const markup = `
    <div class="container" id="main" style="color: blue;" data-test="value" onclick="alert()">
    <svg width="100"><circle/></svg>
    <script>console.log("test");</script>
    <style>.test { color: red; }</style>
    <noscript>Enable JS</noscript>
    <link rel="stylesheet" href="/style.css">
    <div role="main" aria-label="content">
    <a href="/page">Link</a>
    <p>Text content</p>
    </div>
    </div>
  `;
  const cleaned = prepareHtml(markup, 'https://example.com/test/');

  // Attributes first, then non-content elements: none may survive
  const mustBeAbsent = [
    ['class=', 'Should remove class'],
    ['id=', 'Should remove id'],
    ['style=', 'Should remove style'],
    ['data-', 'Should remove data attributes'],
    ['onclick=', 'Should remove onclick'],
    ['role=', 'Should remove role'],
    ['aria-', 'Should remove aria'],
    ['<svg', 'Should remove svg'],
    ['<script', 'Should remove script'],
    ['<style', 'Should remove style'],
    ['<noscript', 'Should remove noscript'],
    ['<link', 'Should remove link'],
  ];
  for (const [needle, message] of mustBeAbsent) {
    assert.ok(!cleaned.includes(needle), message);
  }

  // Content is kept and the relative link is resolved
  assert.ok(cleaned.includes('href="https://example.com/page"'), 'Should convert relative URL');
  assert.ok(cleaned.includes('Text content'), 'Should preserve text');
});
301
+
302
// Final report: totals framed by divider lines, written in one call.
console.log(
  [
    '\n==================================================',
    `✅ Tests Passed: ${testsPassed}`,
    `❌ Tests Failed: ${testsFailed}`,
    '==================================================\n',
  ].join('\n'),
);

// Non-zero exit code when any test failed.
if (testsFailed > 0) {
  process.exit(1);
}
process.exit(0);