@doppelgangerdev/doppelganger 0.5.7 → 0.5.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (7)
  1. package/LICENSE +2 -2
  2. package/README.md +9 -29
  3. package/agent.js +200 -101
  4. package/headful.js +126 -126
  5. package/package.json +2 -2
  6. package/scrape.js +249 -284
  7. package/server.js +469 -359
package/LICENSE CHANGED
@@ -1,6 +1,6 @@
1
- Notice & Attribution License v1.0
2
- Version 1.0, January 2026
1
+ Notice & Attribution License v1.0
3
2
 
3
+ © 2026 Mnemosyne
4
4
 
5
5
  1. Definitions
6
6
  1.1. "Software" means all source code, binaries, scripts, libraries, components, build configurations, and other software artifacts provided under this license.
package/README.md CHANGED
@@ -1,31 +1,10 @@
1
- ![Doppelganger Banner](banner.png)
1
+ ![Doppelganger Banner](https://raw.githubusercontent.com/mnemosyne-artificial-intelligence/doppelganger/main/banner.png)
2
2
 
3
3
  # Doppelganger — Browser Automation for Everyone
4
4
 
5
- <div align="center">
6
- <a href="https://doppelgangerdev.com">
7
- <img src="https://img.shields.io/badge/Website-doppelgangerdev.com-0056ff?style=for-the-badge&logo=googlechrome&rounded=true" alt="Website" />
8
- </a>
9
- <a href="https://doppelgangerdev.com/docs">
10
- <img src="https://img.shields.io/badge/Docs-doppelgangerdev.com%2Fdocs-00c2ff?style=for-the-badge&logo=readthedocs&rounded=true" alt="Docs" />
11
- </a>
12
- <a href="https://forum.doppelgangerdev.com">
13
- <img src="https://img.shields.io/badge/Forum-forum.doppelgangerdev.com-ff9900?style=for-the-badge&logo=discourse&rounded=true" alt="Forum" />
14
- </a>
15
- <a href="https://opensource.org/">
16
- <img src="https://img.shields.io/badge/Open_Source-Yes-0056ff?style=for-the-badge&logo=opensourceinitiative&logoColor=white" alt="Open Source" />
17
- </a>
18
- <a href="https://www.npmjs.com/package/@doppelgangerdev/doppelganger">
19
- <img src="https://img.shields.io/badge/Version-0.5.5-6a8cff?style=for-the-badge&logo=npm&rounded=true" alt="Version" />
20
- </a>
21
- <a href="https://hub.docker.com/r/mnemosyneai/doppelganger">
22
- <img src="https://img.shields.io/badge/Docker-mnemosyneai%2Fdoppelganger-0db7ed?style=for-the-badge&logo=docker&rounded=true" alt="Docker" />
23
- </a>
24
- </div>
25
-
26
5
  Doppelganger is a self‑hosted, block-first automation control plane built for teams that want predictable, auditable browser workflows without pushing sensitive data to third‑party SaaS. It bundles a React/Vite frontend, an Express/Playwright backend, helper scripts, and optional CLI tooling so you can sketch blocks, inject JavaScript, rotate proxies, and run everything locally.
27
6
 
28
- ![Demo run](demo-run.gif)
7
+ ![Demo run](https://raw.githubusercontent.com/mnemosyne-artificial-intelligence/doppelganger/main/demo-run.gif)
29
8
 
30
9
  # What You Get
31
10
 
@@ -236,7 +215,7 @@ Authentication enforces sessions (`/api/auth/login`, `/api/auth/logout`, `/api/a
236
215
 
237
216
  # Maintenance
238
217
 
239
- - The project is governed by the **[Sustainable Use License (SUL 1.0)](https://github.com/mnemosyne-artificial-intelligence/doppelganger/blob/main/LICENSE)**; hosting it as a competing service is prohibited.
218
+ - The project is governed by the **[Notice & Attribution License v1.0](https://github.com/mnemosyne-artificial-intelligence/doppelganger/blob/main/LICENSE)**, which grants royalty-free internal/private rights while requiring notice, attribution, and source delivery when the software is deployed for external end users; hosting it as a competing service is prohibited.
240
219
  - Keep `data/` and `storage_state.json` backed up if you rely on historical cookies or proxies.
241
220
  - Release updates by pulling `mnemosyneai/doppelganger` (Docker) or `npm i @doppelgangerdev/doppelganger` (npm). The Settings view always displays the current package version.
242
221
  - Contributions: follow `.github/` templates, respect `CONTRIBUTING.md`, and run available lint/test scripts if you touch critical areas.
@@ -250,15 +229,16 @@ Authentication enforces sessions (`/api/auth/login`, `/api/auth/logout`, `/api/a
250
229
  - [x] **Task proxy rotation toggle** — the “Rotate Proxies” option in each task ties into the Settings rotation controls, enabling rotation per execution.
251
230
  - [ ] **Action key combos** — add modifier shortcuts (e.g., Ctrl+Click, Shift+Scroll) so tasks can more closely mirror real user interactions.
252
231
  - [ ] **Click-and-drag block** — add an action that does drag gestures (selecting text, moving items) so tasks can simulate click-and-drag flows.
253
- - [x] **Recording controls** — Task editor now exposes a “Disable automated recording” switch in the general settings panel so workflows can skip video capture on a per-task basis.
232
+ - [x] **Recording controls** — Task editor now exposes a “Disable automated recording” switch in the general settings panel so workflows can skip video capture on a per-task basis.
254
233
  - [ ] **File downloads** — add explicit support for agent tasks to download files (PDFs, CSVs, etc.) directly from target pages, then surface those downloads in the UI so users can preview or export them without sifting through captures.
255
- - [x] **Stateless mode** — Tasks now have a “Stateless execution” toggle alongside the recording controls so each run can skip `storage_state.json`, ensuring no cookies or local storage persist between executions for that workflow.
256
- - [ ] **Adblocking filters** — add controls so execution contexts can enable built-in ad/malware filtering (e.g., via hosts file overrides or request blocking) to reduce noise on sensitive sites.
257
- - [ ] **Extraction response mode** — add a Settings switch so users can choose whether the UI returns HTML+data (for debugging) or data-only payloads when extraction scripts run.
234
+ - [x] **Stateless mode** — Tasks now have a “Stateless execution” toggle alongside the recording controls so each run can skip `storage_state.json`, ensuring no cookies or local storage persist between executions for that workflow.
235
+ - [ ] **Adblocking filters** — add controls so execution contexts can enable built-in ad/malware filtering (e.g., via hosts file overrides or request blocking) to reduce noise on sensitive sites.
236
+ - [ ] **Extraction response mode** — add a Settings switch so users can choose whether the UI returns HTML+data (for debugging) or data-only payloads when extraction scripts run.
258
237
  - [ ] **Folder organization** — group tasks, assets, and captures into named folders so operators can browse, filter, and download collections per workflow.
259
238
  - [ ] **Stable capture retention** — add filtering, pinning, and archiving in captures tab so teams can keep compliance records.
260
239
  - [ ] **Workspace templates** — allow saving and sharing workspace presets (layout + default proxies/agents) so new team members can onboard with pre-configured setups.
261
240
  - [ ] **Geo-targeted exits** — allow choosing proxy regions for tasks so you can pin the apparent location before running a job.
241
+ - [ ] **Complete anti-detection coverage** — follow browserscan.net's anti-detection checklist (fingerprints, headers, fonts, WebRTC, etc.) so automated runs mimic real browsers across task executions.
262
242
  - [ ] **Session recording redaction** — add toggles to redact sensitive fields (passwords, credit cards) from recordings/logs before storing them.
263
243
  - [ ] **Two-factor authentication** — add optional TOTP/second-factor support to Settings/Auth so operators can lock down the UI with 2FA.
264
244
  - [ ] **AI-assisted fixing** — add an “AI auto-fix” helper that suggests layout, selector, and proxy tweaks after failed runs, letting teams approve or discard the proposed changes without switching contexts.
@@ -276,4 +256,4 @@ Authentication enforces sessions (`/api/auth/login`, `/api/auth/logout`, `/api/a
276
256
 
277
257
  - Report issues or request features via the GitHub repo issue tracker.
278
258
  - Follow the authors on `https://github.com/mnemosyne-artificial-intelligence` for releases.
279
- - Share automation recipes with other self-hosted users in your org, but respect the license for sharing infrastructure.
259
+ - Share automation recipes with other self-hosted users in your org, but respect the license for sharing infrastructure.
package/agent.js CHANGED
@@ -2,8 +2,10 @@ const { chromium } = require('playwright');
2
2
  const { JSDOM } = require('jsdom');
3
3
  const fs = require('fs');
4
4
  const path = require('path');
5
+ const vm = require('vm');
5
6
  const { getProxySelection } = require('./proxy-rotation');
6
7
  const { selectUserAgent } = require('./user-agent-settings');
8
+ const { formatHTML, safeFormatHTML } = require('./html-utils');
7
9
 
8
10
  const STORAGE_STATE_PATH = path.join(__dirname, 'storage_state.json');
9
11
  const STORAGE_STATE_FILE = (() => {
@@ -20,10 +22,10 @@ const STORAGE_STATE_FILE = (() => {
20
22
 
21
23
  const API_KEY_FILE = path.join(__dirname, 'data', 'api_key.json');
22
24
 
23
- const loadApiKey = () => {
24
- if (!fs.existsSync(API_KEY_FILE)) return null;
25
+ const loadApiKey = async () => {
25
26
  try {
26
- const data = JSON.parse(fs.readFileSync(API_KEY_FILE, 'utf8'));
27
+ const raw = await fs.promises.readFile(API_KEY_FILE, 'utf8');
28
+ const data = JSON.parse(raw);
27
29
  return data && data.apiKey ? data.apiKey : null;
28
30
  } catch {
29
31
  return null;
@@ -110,17 +112,17 @@ async function overshootScroll(page, targetY) {
110
112
  }
111
113
  }
112
114
 
113
- const punctuationPause = /[.,!?;:]/;
114
-
115
- const randomBetween = (min, max) => min + Math.random() * (max - min);
116
- const parseBooleanFlag = (value) => {
117
- if (typeof value === 'boolean') return value;
118
- if (value === undefined || value === null) return false;
119
- const normalized = String(value).toLowerCase();
120
- return normalized === 'true' || normalized === '1';
121
- };
122
-
123
- async function humanType(page, selector, text, options = {}) {
115
+ const punctuationPause = /[.,!?;:]/;
116
+
117
+ const randomBetween = (min, max) => min + Math.random() * (max - min);
118
/**
 * Interprets a loosely-typed request flag as a boolean.
 *
 * Real booleans are returned unchanged and `undefined`/`null` are false.
 * Anything else is stringified, trimmed and lower-cased; only `'true'` and
 * `'1'` count as true. Trimming matters because values taken from query
 * strings or form posts can carry stray whitespace or a trailing newline,
 * which would otherwise silently turn the flag off.
 *
 * @param {*} value - Raw flag value (boolean, string, number, null/undefined).
 * @returns {boolean} The parsed flag.
 */
const parseBooleanFlag = (value) => {
    if (typeof value === 'boolean') return value;
    if (value === undefined || value === null) return false;
    const normalized = String(value).trim().toLowerCase();
    return normalized === 'true' || normalized === '1';
};
124
+
125
+ async function humanType(page, selector, text, options = {}) {
124
126
  const { allowTypos = false, naturalTyping = false, fatigue = false } = options;
125
127
  if (selector) await page.focus(selector);
126
128
  const chars = text.split('');
@@ -167,21 +169,108 @@ async function humanType(page, selector, text, options = {}) {
167
169
  }
168
170
  }
169
171
 
172
// Private key under which a safe proxy hands back the object it wraps.
// The symbol is never stored on any object, so sandboxed code cannot discover it.
const REAL_TARGET = Symbol('REAL_TARGET');

// Property names that lead directly to host-realm constructors.
const BLOCKED_PROPS = new Set(['constructor', '__proto__']);

// real target -> its proxy, so the same host object always yields the same proxy.
const proxyByTarget = new WeakMap();
// proxy -> real target, used to unwrap proxies that are passed back to host code.
const targetByProxy = new WeakMap();
// Callback guards already produced, so a callback is never guarded twice.
const guardedCallbacks = new WeakSet();

/** Returns the wrapped host object for a safe proxy; any other value is returned unchanged. */
function unwrapSafeProxy(value) {
    return targetByProxy.has(value) ? targetByProxy.get(value) : value;
}

/**
 * Wraps a function that is about to be handed to host code so that everything
 * the host later passes to it (arguments and `this`) is proxied first.
 * Non-functions are returned unchanged.
 */
function guardCallback(arg) {
    if (typeof arg !== 'function' || guardedCallbacks.has(arg)) return arg;
    const guarded = function (...cbArgs) {
        // A sloppy-mode call without a receiver would expose the host global; drop it instead.
        const self = this === globalThis ? undefined : createSafeProxy(this);
        return arg.apply(self, cbArgs.map((a) => createSafeProxy(a)));
    };
    guardedCallbacks.add(guarded);
    return guarded;
}

/**
 * Builds the callable stand-in used as the Proxy target for functions, so the
 * proxy supports [[Call]]/[[Construct]] without exposing the real function's
 * own properties.
 */
function makeShadowFunction(fn) {
    const shadow = function () { };
    for (const key of ['name', 'length']) {
        // Cosmetic only; ignore host functions with hostile or odd metadata.
        try { Object.defineProperty(shadow, key, { value: fn[key], configurable: true }); } catch { }
    }
    // The default `prototype` object carries a `constructor` link into the host realm.
    shadow.prototype = null;
    return shadow;
}

/**
 * Returns a safe proxy around `method` that always runs against `owner`, so
 * detached calls (`const q = document.querySelector; q('a')`) keep working.
 * It also supports `new obj.SomeClass()`.
 */
function bindMethod(method, owner) {
    const bound = function (...args) {
        const hostArgs = args.map((arg) => guardCallback(unwrapSafeProxy(arg)));
        return new.target
            ? Reflect.construct(method, hostArgs)
            : Reflect.apply(method, owner, hostArgs);
    };
    return createSafeProxy(bound);
}

/**
 * Wraps a host object or function in a Proxy before it is exposed to user
 * extraction scripts.
 *
 * - Primitives and `null`/`undefined` are returned unchanged.
 * - `constructor` and `__proto__` read as `undefined`; the prototype is
 *   reported as `null`, and the prototype/extensibility cannot be changed
 *   through the proxy.
 * - Every value read from, returned by, or passed to callbacks by the wrapped
 *   object is itself wrapped. Method properties come back as proxies too, so
 *   `obj.method.constructor` is blocked as well.
 * - Proxies passed back as arguments or `this` are unwrapped before host code
 *   sees them.
 * - Wrapping is idempotent and identity-stable: the same host object always
 *   yields the same proxy, and wrapping a proxy returns it unchanged.
 *
 * NOTE(security): this is defence in depth, not a security boundary. The
 * Node.js `vm` documentation states that `vm` is not a security mechanism,
 * and properties of frozen host objects must still be reported verbatim by
 * the Proxy invariants. Do not run untrusted scripts relying on this alone.
 *
 * @param {*} target - Value to expose to sandboxed code.
 * @returns {*} `target` itself for primitives, otherwise its safe proxy.
 */
function createSafeProxy(target) {
    if (target === null || (typeof target !== 'object' && typeof target !== 'function')) {
        return target;
    }
    if (targetByProxy.has(target)) return target;
    const cached = proxyByTarget.get(target);
    if (cached !== undefined) return cached;

    const shadowTarget = typeof target === 'function' ? makeShadowFunction(target) : target;
    const toHostArgs = (args) => args.map((arg) => guardCallback(unwrapSafeProxy(arg)));

    const proxy = new Proxy(shadowTarget, {
        get(_shadow, prop) {
            if (prop === REAL_TARGET) return target;
            if (BLOCKED_PROPS.has(prop)) return undefined;
            const value = Reflect.get(target, prop, target);
            return typeof value === 'function' ? bindMethod(value, target) : createSafeProxy(value);
        },
        getOwnPropertyDescriptor(shadow, prop) {
            const desc = Reflect.getOwnPropertyDescriptor(shadow, prop);
            if (desc === undefined) return desc;
            // Non-configurable, non-writable slots must be reported verbatim (Proxy invariant).
            if (!desc.configurable && desc.writable !== true) return desc;
            if ('value' in desc) {
                desc.value = createSafeProxy(desc.value);
            } else {
                desc.get = createSafeProxy(desc.get);
                desc.set = createSafeProxy(desc.set);
            }
            return desc;
        },
        getPrototypeOf(shadow) {
            // Hiding the prototype is only permitted while the target is extensible.
            return Reflect.isExtensible(shadow) ? null : Reflect.getPrototypeOf(shadow);
        },
        setPrototypeOf() {
            return false;
        },
        preventExtensions() {
            // Refusing keeps the target extensible, so getPrototypeOf can keep answering null.
            return false;
        },
        apply(_shadow, thisArg, argList) {
            const result = Reflect.apply(target, unwrapSafeProxy(thisArg), toHostArgs(argList));
            return createSafeProxy(result);
        },
        construct(_shadow, argumentsList) {
            const result = Reflect.construct(target, toHostArgs(argumentsList), target);
            return createSafeProxy(result);
        }
    });

    proxyByTarget.set(target, proxy);
    targetByProxy.set(proxy, target);
    return proxy;
}
258
+
170
259
  async function handleAgent(req, res) {
171
260
  const data = (req.method === 'POST') ? req.body : req.query;
172
261
  let { url, actions, wait: globalWait, rotateUserAgents, rotateProxies, humanTyping, stealth = {} } = data;
173
262
  const runId = data.runId ? String(data.runId) : null;
174
263
  const captureRunId = runId || `run_${Date.now()}_unknown`;
175
- const includeShadowDomRaw = data.includeShadowDom ?? req.query.includeShadowDom;
176
- const includeShadowDom = includeShadowDomRaw === undefined
177
- ? true
178
- : !(String(includeShadowDomRaw).toLowerCase() === 'false' || includeShadowDomRaw === false);
179
- const disableRecordingRaw = data.disableRecording ?? req.query.disableRecording;
180
- const disableRecording = parseBooleanFlag(disableRecordingRaw);
181
- const statelessExecutionRaw = data.statelessExecution ?? req.query.statelessExecution;
182
- const statelessExecution = parseBooleanFlag(statelessExecutionRaw);
183
- const {
184
- allowTypos = false,
264
+ const includeShadowDomRaw = data.includeShadowDom ?? req.query.includeShadowDom;
265
+ const includeShadowDom = includeShadowDomRaw === undefined
266
+ ? true
267
+ : !(String(includeShadowDomRaw).toLowerCase() === 'false' || includeShadowDomRaw === false);
268
+ const disableRecordingRaw = data.disableRecording ?? req.query.disableRecording;
269
+ const disableRecording = parseBooleanFlag(disableRecordingRaw);
270
+ const statelessExecutionRaw = data.statelessExecution ?? req.query.statelessExecution;
271
+ const statelessExecution = parseBooleanFlag(statelessExecutionRaw);
272
+ const {
273
+ allowTypos = false,
185
274
  idleMovements = false,
186
275
  overscroll = false,
187
276
  deadClicks = false,
@@ -204,10 +293,10 @@ async function handleAgent(req, res) {
204
293
  });
205
294
  }
206
295
 
207
- const localPort = req.socket && req.socket.localPort;
208
- const configuredPort = process.env.PORT || process.env.VITE_BACKEND_PORT;
209
- const basePort = localPort || configuredPort || '11345';
210
- const baseUrl = `${req.protocol || 'http'}://127.0.0.1:${basePort}`;
296
+ const localPort = req.socket && req.socket.localPort;
297
+ const configuredPort = process.env.PORT || process.env.VITE_BACKEND_PORT;
298
+ const basePort = localPort || configuredPort || '11345';
299
+ const baseUrl = `${req.protocol || 'http'}://127.0.0.1:${basePort}`;
211
300
  const runtimeVars = { ...(data.taskVariables || data.variables || {}) };
212
301
  let lastBlockOutput = null;
213
302
  runtimeVars['block.output'] = lastBlockOutput;
@@ -268,39 +357,58 @@ async function handleAgent(req, res) {
268
357
 
269
358
  const parseCsv = (input) => {
270
359
  const text = typeof input === 'string' ? input : String(input || '');
360
+ const len = text.length;
271
361
  const rows = [];
272
362
  let row = [];
273
363
  let current = '';
274
364
  let inQuotes = false;
365
+ const specialChar = /[",\n\r]/g;
275
366
 
276
- for (let i = 0; i < text.length; i += 1) {
277
- const char = text[i];
367
+ let i = 0;
368
+ while (i < len) {
278
369
  if (inQuotes) {
279
- if (char === '"') {
280
- if (text[i + 1] === '"') {
281
- current += '"';
282
- i += 1;
283
- } else {
284
- inQuotes = false;
285
- }
370
+ const nextQuote = text.indexOf('"', i);
371
+ if (nextQuote === -1) {
372
+ current += text.slice(i);
373
+ i = len;
374
+ break;
375
+ }
376
+ current += text.slice(i, nextQuote);
377
+ i = nextQuote;
378
+ if (i + 1 < len && text[i + 1] === '"') {
379
+ current += '"';
380
+ i += 2;
286
381
  } else {
287
- current += char;
382
+ inQuotes = false;
383
+ i += 1;
288
384
  }
289
385
  } else {
386
+ specialChar.lastIndex = i;
387
+ const match = specialChar.exec(text);
388
+ if (!match) {
389
+ current += text.slice(i);
390
+ i = len;
391
+ break;
392
+ }
393
+ const idx = match.index;
394
+ const char = match[0];
395
+ current += text.slice(i, idx);
396
+ i = idx;
290
397
  if (char === '"') {
291
398
  inQuotes = true;
399
+ i += 1;
292
400
  } else if (char === ',') {
293
401
  row.push(current);
294
402
  current = '';
403
+ i += 1;
295
404
  } else if (char === '\n') {
296
405
  row.push(current);
297
406
  rows.push(row);
298
407
  row = [];
299
408
  current = '';
409
+ i += 1;
300
410
  } else if (char === '\r') {
301
- // ignore CR (handle CRLF)
302
- } else {
303
- current += char;
411
+ i += 1;
304
412
  }
305
413
  }
306
414
  }
@@ -408,7 +516,7 @@ async function handleAgent(req, res) {
408
516
  return { startToEnd, startToElse, elseToEnd, endToStart };
409
517
  };
410
518
 
411
- const selectedUA = selectUserAgent(rotateUserAgents);
519
+ const selectedUA = await selectUserAgent(rotateUserAgents);
412
520
 
413
521
  let browser;
414
522
  let context;
@@ -435,33 +543,31 @@ async function handleAgent(req, res) {
435
543
  browser = await chromium.launch(launchOptions);
436
544
 
437
545
  const recordingsDir = path.join(__dirname, 'data', 'recordings');
438
- if (!fs.existsSync(recordingsDir)) {
439
- fs.mkdirSync(recordingsDir, { recursive: true });
440
- }
546
+ await fs.promises.mkdir(recordingsDir, { recursive: true });
441
547
 
442
548
  const rotateViewport = String(data.rotateViewport).toLowerCase() === 'true' || data.rotateViewport === true;
443
549
  const viewport = rotateViewport
444
550
  ? { width: 1280 + Math.floor(Math.random() * 640), height: 720 + Math.floor(Math.random() * 360) }
445
551
  : { width: 1366, height: 768 };
446
552
 
447
- const contextOptions = {
448
- userAgent: selectedUA,
449
- viewport,
450
- deviceScaleFactor: 1,
451
- locale: 'en-US',
452
- timezoneId: 'America/New_York',
453
- colorScheme: 'dark',
454
- permissions: ['geolocation'],
455
- };
456
-
457
- const shouldUseStorageState = !statelessExecution && fs.existsSync(STORAGE_STATE_FILE);
458
- if (shouldUseStorageState) {
459
- contextOptions.storageState = STORAGE_STATE_FILE;
460
- }
461
-
462
- if (!disableRecording) {
463
- contextOptions.recordVideo = { dir: recordingsDir, size: viewport };
464
- }
553
+ const contextOptions = {
554
+ userAgent: selectedUA,
555
+ viewport,
556
+ deviceScaleFactor: 1,
557
+ locale: 'en-US',
558
+ timezoneId: 'America/New_York',
559
+ colorScheme: 'dark',
560
+ permissions: ['geolocation'],
561
+ };
562
+
563
+ const shouldUseStorageState = !statelessExecution && fs.existsSync(STORAGE_STATE_FILE);
564
+ if (shouldUseStorageState) {
565
+ contextOptions.storageState = STORAGE_STATE_FILE;
566
+ }
567
+
568
+ if (!disableRecording) {
569
+ contextOptions.recordVideo = { dir: recordingsDir, size: viewport };
570
+ }
465
571
  context = await browser.newContext(contextOptions);
466
572
 
467
573
  await context.addInitScript(() => {
@@ -1022,7 +1128,7 @@ async function handleAgent(req, res) {
1022
1128
  case 'start': {
1023
1129
  const taskId = resolveMaybe(act.value);
1024
1130
  if (!taskId) throw new Error('Missing task id.');
1025
- const apiKey = loadApiKey() || data.apiKey || data.key;
1131
+ const apiKey = (await loadApiKey()) || data.apiKey || data.key;
1026
1132
  if (!apiKey) {
1027
1133
  logs.push('No API key available; attempting internal start.');
1028
1134
  }
@@ -1363,16 +1469,6 @@ async function handleAgent(req, res) {
1363
1469
  return { shadowQueryAll, shadowText };
1364
1470
  })();
1365
1471
 
1366
- // CodeQL alerts on dynamic eval, but extraction scripts intentionally run inside the browser sandbox,
1367
- // so we expose only the helpers needed (window, document, DOMParser, console) and keep the evaluation confined there.
1368
- const executor = new Function(
1369
- '$$data',
1370
- 'window',
1371
- 'document',
1372
- 'DOMParser',
1373
- 'console',
1374
- `"use strict"; return (async () => { ${script}\n})();`
1375
- );
1376
1472
  const $$data = {
1377
1473
  html: () => html || '',
1378
1474
  url: () => pageUrl || '',
@@ -1381,7 +1477,33 @@ async function handleAgent(req, res) {
1381
1477
  shadowQueryAll: includeShadowDom ? shadowHelpers.shadowQueryAll : undefined,
1382
1478
  shadowText: includeShadowDom ? shadowHelpers.shadowText : undefined
1383
1479
  };
1384
- const result = await executor($$data, window, window.document, window.DOMParser, consoleProxy);
1480
+
1481
+ // Use vm for sandboxed execution
1482
+ const sandbox = Object.create(null);
1483
+ sandbox.window = createSafeProxy(window);
1484
+ sandbox.document = createSafeProxy(window.document);
1485
+ sandbox.DOMParser = createSafeProxy(window.DOMParser);
1486
+ sandbox.console = createSafeProxy(consoleProxy);
1487
+ sandbox.$$data = createSafeProxy($$data);
1488
+
1489
+ // Pass the script as a variable to avoid string interpolation (CodeQL: Code Injection)
1490
+ sandbox.$$userScript = script;
1491
+
1492
+ const context = vm.createContext(sandbox);
1493
+
1494
+ // We use a static wrapper to execute the user script.
1495
+ // This ensures that the code passed to vm.runInContext is constant and safe.
1496
+ // The user script is retrieved from the sandbox environment and executed as an AsyncFunction.
1497
+ const scriptCode = `
1498
+ "use strict";
1499
+ (async () => {
1500
+ const AsyncFunction = Object.getPrototypeOf(async function(){}).constructor;
1501
+ const fn = new AsyncFunction('$$data', 'window', 'document', 'DOMParser', 'console', $$userScript);
1502
+ return fn($$data, window, document, DOMParser, console);
1503
+ })();
1504
+ `;
1505
+
1506
+ const result = await vm.runInContext(scriptCode, context);
1385
1507
  return { result, logs: logBuffer };
1386
1508
  } catch (e) {
1387
1509
  return { result: `Extraction script error: ${e.message}`, logs: [] };
@@ -1394,29 +1516,6 @@ async function handleAgent(req, res) {
1394
1516
  const extractionScript = extractionScriptRaw ? resolveTemplate(extractionScriptRaw) : undefined;
1395
1517
  const extraction = await runExtractionScript(extractionScript, cleanedHtml, page.url());
1396
1518
 
1397
- // Simple HTML Formatter (fallback to raw if formatting collapses content)
1398
- const formatHTML = (html) => {
1399
- let indent = 0;
1400
- return html.replace(/<(\/?)([a-z0-9]+)([^>]*?)(\/?)>/gi, (match, slash, tag, attrs, selfClose) => {
1401
- if (slash) indent--;
1402
- const result = ' '.repeat(Math.max(0, indent)) + match;
1403
- if (!slash && !selfClose && !['img', 'br', 'hr', 'input', 'link', 'meta'].includes(tag.toLowerCase())) indent++;
1404
- return '\n' + result;
1405
- }).trim();
1406
- };
1407
-
1408
- const safeFormatHTML = (html) => {
1409
- if (typeof html !== 'string') return '';
1410
- try {
1411
- const formatted = formatHTML(html);
1412
- if (!formatted) return html;
1413
- if (formatted.length < Math.max(200, Math.floor(html.length * 0.5))) return html;
1414
- return formatted;
1415
- } catch {
1416
- return html;
1417
- }
1418
- };
1419
-
1420
1519
  // Ensure the public/screenshots directory exists
1421
1520
  const capturesDir = path.join(__dirname, 'public', 'captures');
1422
1521
  if (!fs.existsSync(capturesDir)) {
@@ -1447,9 +1546,9 @@ async function handleAgent(req, res) {
1447
1546
  };
1448
1547
 
1449
1548
  const video = page.video();
1450
- if (!statelessExecution) {
1451
- try { await context.storageState({ path: STORAGE_STATE_FILE }); } catch {}
1452
- }
1549
+ if (!statelessExecution) {
1550
+ try { await context.storageState({ path: STORAGE_STATE_FILE }); } catch {}
1551
+ }
1453
1552
  try { await context.close(); } catch {}
1454
1553
  if (video) {
1455
1554
  try {