@doppelgangerdev/doppelganger 0.5.6 → 0.5.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +177 -157
- package/README.md +259 -274
- package/agent.js +176 -62
- package/dist/assets/index-Cwmqk52G.js +19 -0
- package/dist/assets/{index-isZw-0dm.css → index-CxzMazJO.css} +1 -1
- package/dist/captures/run_1769734411613_783_scrape_1769734425256.png +0 -0
- package/dist/captures/run_1769734411613_783_scrape_1769734428068.webm +0 -0
- package/dist/captures/run_1769734522774_unknown_scrape_1769734535501.png +0 -0
- package/dist/captures/run_1769734522774_unknown_scrape_1769734538775.webm +0 -0
- package/dist/index.html +133 -22
- package/headful.js +92 -82
- package/package.json +2 -2
- package/public/captures/run_1770084709375_263_scrape_1770084720880.png +0 -0
- package/public/captures/run_1770084753714_765_agent_1770084772039.png +0 -0
- package/public/captures/run_1770084753714_765_agent_1770084774318.webm +0 -0
- package/public/captures/run_1770084826401_32_scrape_1770084832653.png +0 -0
- package/public/captures/run_1770084826401_32_scrape_1770084835345.webm +0 -0
- package/public/captures/run_1770084861758_434_scrape_1770084869777.png +0 -0
- package/public/captures/run_1770084861758_434_scrape_1770084875604.webm +0 -0
- package/public/captures/run_1770084870793_97_scrape_1770084879360.png +0 -0
- package/public/captures/run_1770084870793_97_scrape_1770084882219.webm +0 -0
- package/scrape.js +235 -253
- package/server.js +442 -312
- package/dist/assets/index-BKB-zmAO.js +0 -19
package/agent.js
CHANGED
|
@@ -2,8 +2,10 @@ const { chromium } = require('playwright');
|
|
|
2
2
|
const { JSDOM } = require('jsdom');
|
|
3
3
|
const fs = require('fs');
|
|
4
4
|
const path = require('path');
|
|
5
|
+
const vm = require('vm');
|
|
5
6
|
const { getProxySelection } = require('./proxy-rotation');
|
|
6
7
|
const { selectUserAgent } = require('./user-agent-settings');
|
|
8
|
+
const { formatHTML, safeFormatHTML } = require('./html-utils');
|
|
7
9
|
|
|
8
10
|
const STORAGE_STATE_PATH = path.join(__dirname, 'storage_state.json');
|
|
9
11
|
const STORAGE_STATE_FILE = (() => {
|
|
@@ -20,10 +22,10 @@ const STORAGE_STATE_FILE = (() => {
|
|
|
20
22
|
|
|
21
23
|
const API_KEY_FILE = path.join(__dirname, 'data', 'api_key.json');
|
|
22
24
|
|
|
23
|
-
const loadApiKey = () => {
|
|
24
|
-
if (!fs.existsSync(API_KEY_FILE)) return null;
|
|
25
|
+
const loadApiKey = async () => {
|
|
25
26
|
try {
|
|
26
|
-
const
|
|
27
|
+
const raw = await fs.promises.readFile(API_KEY_FILE, 'utf8');
|
|
28
|
+
const data = JSON.parse(raw);
|
|
27
29
|
return data && data.apiKey ? data.apiKey : null;
|
|
28
30
|
} catch {
|
|
29
31
|
return null;
|
|
@@ -113,6 +115,12 @@ async function overshootScroll(page, targetY) {
|
|
|
113
115
|
const punctuationPause = /[.,!?;:]/;
|
|
114
116
|
|
|
115
117
|
const randomBetween = (min, max) => min + Math.random() * (max - min);
|
|
118
|
+
const parseBooleanFlag = (value) => {
|
|
119
|
+
if (typeof value === 'boolean') return value;
|
|
120
|
+
if (value === undefined || value === null) return false;
|
|
121
|
+
const normalized = String(value).toLowerCase();
|
|
122
|
+
return normalized === 'true' || normalized === '1';
|
|
123
|
+
};
|
|
116
124
|
|
|
117
125
|
async function humanType(page, selector, text, options = {}) {
|
|
118
126
|
const { allowTypos = false, naturalTyping = false, fatigue = false } = options;
|
|
@@ -161,6 +169,93 @@ async function humanType(page, selector, text, options = {}) {
|
|
|
161
169
|
}
|
|
162
170
|
}
|
|
163
171
|
|
|
172
|
+
const REAL_TARGET = Symbol('REAL_TARGET');
|
|
173
|
+
|
|
174
|
+
function createSafeProxy(target) {
|
|
175
|
+
if (target === null || (typeof target !== 'object' && typeof target !== 'function')) {
|
|
176
|
+
return target;
|
|
177
|
+
}
|
|
178
|
+
|
|
179
|
+
let shadowTarget = target;
|
|
180
|
+
if (typeof target === 'function') {
|
|
181
|
+
shadowTarget = function (...args) { };
|
|
182
|
+
try { Object.defineProperty(shadowTarget, 'name', { value: target.name, configurable: true }); } catch {}
|
|
183
|
+
try { Object.defineProperty(shadowTarget, 'length', { value: target.length, configurable: true }); } catch {}
|
|
184
|
+
shadowTarget[REAL_TARGET] = target;
|
|
185
|
+
}
|
|
186
|
+
|
|
187
|
+
return new Proxy(shadowTarget, {
|
|
188
|
+
get(target, prop, receiver) {
|
|
189
|
+
const realTarget = target[REAL_TARGET] || target;
|
|
190
|
+
if (prop === 'constructor' || prop === '__proto__') {
|
|
191
|
+
return undefined;
|
|
192
|
+
}
|
|
193
|
+
if (prop === REAL_TARGET) return realTarget;
|
|
194
|
+
|
|
195
|
+
const value = Reflect.get(realTarget, prop, realTarget);
|
|
196
|
+
|
|
197
|
+
if (typeof value === 'function') {
|
|
198
|
+
return function (...args) {
|
|
199
|
+
const realArgs = args.map(arg => {
|
|
200
|
+
return (arg && arg[REAL_TARGET]) ? arg[REAL_TARGET] : arg;
|
|
201
|
+
});
|
|
202
|
+
const wrappedArgs = realArgs.map(arg => {
|
|
203
|
+
if (typeof arg === 'function') {
|
|
204
|
+
return function (...cbArgs) {
|
|
205
|
+
const wrappedCbArgs = cbArgs.map(a => createSafeProxy(a));
|
|
206
|
+
return arg.apply(this, wrappedCbArgs);
|
|
207
|
+
}
|
|
208
|
+
}
|
|
209
|
+
return arg;
|
|
210
|
+
});
|
|
211
|
+
try {
|
|
212
|
+
const result = value.apply(realTarget, wrappedArgs);
|
|
213
|
+
return createSafeProxy(result);
|
|
214
|
+
} catch (e) {
|
|
215
|
+
throw e;
|
|
216
|
+
}
|
|
217
|
+
};
|
|
218
|
+
}
|
|
219
|
+
return createSafeProxy(value);
|
|
220
|
+
},
|
|
221
|
+
apply(target, thisArg, argList) {
|
|
222
|
+
const realTarget = target[REAL_TARGET] || target;
|
|
223
|
+
const realThis = (thisArg && thisArg[REAL_TARGET]) ? thisArg[REAL_TARGET] : thisArg;
|
|
224
|
+
const realArgs = argList.map(arg => {
|
|
225
|
+
return (arg && arg[REAL_TARGET]) ? arg[REAL_TARGET] : arg;
|
|
226
|
+
});
|
|
227
|
+
const wrappedArgs = realArgs.map(arg => {
|
|
228
|
+
if (typeof arg === 'function') {
|
|
229
|
+
return function (...cbArgs) {
|
|
230
|
+
const wrappedCbArgs = cbArgs.map(a => createSafeProxy(a));
|
|
231
|
+
return arg.apply(this, wrappedCbArgs);
|
|
232
|
+
}
|
|
233
|
+
}
|
|
234
|
+
return arg;
|
|
235
|
+
});
|
|
236
|
+
|
|
237
|
+
try {
|
|
238
|
+
const result = Reflect.apply(realTarget, realThis, wrappedArgs);
|
|
239
|
+
return createSafeProxy(result);
|
|
240
|
+
} catch (e) {
|
|
241
|
+
throw e;
|
|
242
|
+
}
|
|
243
|
+
},
|
|
244
|
+
construct(target, argumentsList, newTarget) {
|
|
245
|
+
const realTarget = target[REAL_TARGET] || target;
|
|
246
|
+
const realArgs = argumentsList.map(arg => {
|
|
247
|
+
return (arg && arg[REAL_TARGET]) ? arg[REAL_TARGET] : arg;
|
|
248
|
+
});
|
|
249
|
+
try {
|
|
250
|
+
const result = Reflect.construct(realTarget, realArgs, realTarget);
|
|
251
|
+
return createSafeProxy(result);
|
|
252
|
+
} catch (e) {
|
|
253
|
+
throw e;
|
|
254
|
+
}
|
|
255
|
+
}
|
|
256
|
+
});
|
|
257
|
+
}
|
|
258
|
+
|
|
164
259
|
async function handleAgent(req, res) {
|
|
165
260
|
const data = (req.method === 'POST') ? req.body : req.query;
|
|
166
261
|
let { url, actions, wait: globalWait, rotateUserAgents, rotateProxies, humanTyping, stealth = {} } = data;
|
|
@@ -170,6 +265,10 @@ async function handleAgent(req, res) {
|
|
|
170
265
|
const includeShadowDom = includeShadowDomRaw === undefined
|
|
171
266
|
? true
|
|
172
267
|
: !(String(includeShadowDomRaw).toLowerCase() === 'false' || includeShadowDomRaw === false);
|
|
268
|
+
const disableRecordingRaw = data.disableRecording ?? req.query.disableRecording;
|
|
269
|
+
const disableRecording = parseBooleanFlag(disableRecordingRaw);
|
|
270
|
+
const statelessExecutionRaw = data.statelessExecution ?? req.query.statelessExecution;
|
|
271
|
+
const statelessExecution = parseBooleanFlag(statelessExecutionRaw);
|
|
173
272
|
const {
|
|
174
273
|
allowTypos = false,
|
|
175
274
|
idleMovements = false,
|
|
@@ -194,10 +293,10 @@ async function handleAgent(req, res) {
|
|
|
194
293
|
});
|
|
195
294
|
}
|
|
196
295
|
|
|
197
|
-
const localPort = req.socket && req.socket.localPort;
|
|
198
|
-
const configuredPort = process.env.PORT || process.env.VITE_BACKEND_PORT;
|
|
199
|
-
const basePort = localPort || configuredPort || '11345';
|
|
200
|
-
const baseUrl = `${req.protocol || 'http'}://127.0.0.1:${basePort}`;
|
|
296
|
+
const localPort = req.socket && req.socket.localPort;
|
|
297
|
+
const configuredPort = process.env.PORT || process.env.VITE_BACKEND_PORT;
|
|
298
|
+
const basePort = localPort || configuredPort || '11345';
|
|
299
|
+
const baseUrl = `${req.protocol || 'http'}://127.0.0.1:${basePort}`;
|
|
201
300
|
const runtimeVars = { ...(data.taskVariables || data.variables || {}) };
|
|
202
301
|
let lastBlockOutput = null;
|
|
203
302
|
runtimeVars['block.output'] = lastBlockOutput;
|
|
@@ -258,39 +357,58 @@ async function handleAgent(req, res) {
|
|
|
258
357
|
|
|
259
358
|
const parseCsv = (input) => {
|
|
260
359
|
const text = typeof input === 'string' ? input : String(input || '');
|
|
360
|
+
const len = text.length;
|
|
261
361
|
const rows = [];
|
|
262
362
|
let row = [];
|
|
263
363
|
let current = '';
|
|
264
364
|
let inQuotes = false;
|
|
365
|
+
const specialChar = /[",\n\r]/g;
|
|
265
366
|
|
|
266
|
-
|
|
267
|
-
|
|
367
|
+
let i = 0;
|
|
368
|
+
while (i < len) {
|
|
268
369
|
if (inQuotes) {
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
370
|
+
const nextQuote = text.indexOf('"', i);
|
|
371
|
+
if (nextQuote === -1) {
|
|
372
|
+
current += text.slice(i);
|
|
373
|
+
i = len;
|
|
374
|
+
break;
|
|
375
|
+
}
|
|
376
|
+
current += text.slice(i, nextQuote);
|
|
377
|
+
i = nextQuote;
|
|
378
|
+
if (i + 1 < len && text[i + 1] === '"') {
|
|
379
|
+
current += '"';
|
|
380
|
+
i += 2;
|
|
276
381
|
} else {
|
|
277
|
-
|
|
382
|
+
inQuotes = false;
|
|
383
|
+
i += 1;
|
|
278
384
|
}
|
|
279
385
|
} else {
|
|
386
|
+
specialChar.lastIndex = i;
|
|
387
|
+
const match = specialChar.exec(text);
|
|
388
|
+
if (!match) {
|
|
389
|
+
current += text.slice(i);
|
|
390
|
+
i = len;
|
|
391
|
+
break;
|
|
392
|
+
}
|
|
393
|
+
const idx = match.index;
|
|
394
|
+
const char = match[0];
|
|
395
|
+
current += text.slice(i, idx);
|
|
396
|
+
i = idx;
|
|
280
397
|
if (char === '"') {
|
|
281
398
|
inQuotes = true;
|
|
399
|
+
i += 1;
|
|
282
400
|
} else if (char === ',') {
|
|
283
401
|
row.push(current);
|
|
284
402
|
current = '';
|
|
403
|
+
i += 1;
|
|
285
404
|
} else if (char === '\n') {
|
|
286
405
|
row.push(current);
|
|
287
406
|
rows.push(row);
|
|
288
407
|
row = [];
|
|
289
408
|
current = '';
|
|
409
|
+
i += 1;
|
|
290
410
|
} else if (char === '\r') {
|
|
291
|
-
|
|
292
|
-
} else {
|
|
293
|
-
current += char;
|
|
411
|
+
i += 1;
|
|
294
412
|
}
|
|
295
413
|
}
|
|
296
414
|
}
|
|
@@ -398,7 +516,7 @@ async function handleAgent(req, res) {
|
|
|
398
516
|
return { startToEnd, startToElse, elseToEnd, endToStart };
|
|
399
517
|
};
|
|
400
518
|
|
|
401
|
-
const selectedUA = selectUserAgent(rotateUserAgents);
|
|
519
|
+
const selectedUA = await selectUserAgent(rotateUserAgents);
|
|
402
520
|
|
|
403
521
|
let browser;
|
|
404
522
|
let context;
|
|
@@ -425,9 +543,7 @@ async function handleAgent(req, res) {
|
|
|
425
543
|
browser = await chromium.launch(launchOptions);
|
|
426
544
|
|
|
427
545
|
const recordingsDir = path.join(__dirname, 'data', 'recordings');
|
|
428
|
-
|
|
429
|
-
fs.mkdirSync(recordingsDir, { recursive: true });
|
|
430
|
-
}
|
|
546
|
+
await fs.promises.mkdir(recordingsDir, { recursive: true });
|
|
431
547
|
|
|
432
548
|
const rotateViewport = String(data.rotateViewport).toLowerCase() === 'true' || data.rotateViewport === true;
|
|
433
549
|
const viewport = rotateViewport
|
|
@@ -442,13 +558,16 @@ async function handleAgent(req, res) {
|
|
|
442
558
|
timezoneId: 'America/New_York',
|
|
443
559
|
colorScheme: 'dark',
|
|
444
560
|
permissions: ['geolocation'],
|
|
445
|
-
recordVideo: { dir: recordingsDir, size: viewport }
|
|
446
561
|
};
|
|
447
562
|
|
|
448
|
-
|
|
563
|
+
const shouldUseStorageState = !statelessExecution && fs.existsSync(STORAGE_STATE_FILE);
|
|
564
|
+
if (shouldUseStorageState) {
|
|
449
565
|
contextOptions.storageState = STORAGE_STATE_FILE;
|
|
450
566
|
}
|
|
451
567
|
|
|
568
|
+
if (!disableRecording) {
|
|
569
|
+
contextOptions.recordVideo = { dir: recordingsDir, size: viewport };
|
|
570
|
+
}
|
|
452
571
|
context = await browser.newContext(contextOptions);
|
|
453
572
|
|
|
454
573
|
await context.addInitScript(() => {
|
|
@@ -1009,7 +1128,7 @@ async function handleAgent(req, res) {
|
|
|
1009
1128
|
case 'start': {
|
|
1010
1129
|
const taskId = resolveMaybe(act.value);
|
|
1011
1130
|
if (!taskId) throw new Error('Missing task id.');
|
|
1012
|
-
const apiKey = loadApiKey() || data.apiKey || data.key;
|
|
1131
|
+
const apiKey = (await loadApiKey()) || data.apiKey || data.key;
|
|
1013
1132
|
if (!apiKey) {
|
|
1014
1133
|
logs.push('No API key available; attempting internal start.');
|
|
1015
1134
|
}
|
|
@@ -1350,16 +1469,6 @@ async function handleAgent(req, res) {
|
|
|
1350
1469
|
return { shadowQueryAll, shadowText };
|
|
1351
1470
|
})();
|
|
1352
1471
|
|
|
1353
|
-
// CodeQL alerts on dynamic eval, but extraction scripts intentionally run inside the browser sandbox,
|
|
1354
|
-
// so we expose only the helpers needed (window, document, DOMParser, console) and keep the evaluation confined there.
|
|
1355
|
-
const executor = new Function(
|
|
1356
|
-
'$$data',
|
|
1357
|
-
'window',
|
|
1358
|
-
'document',
|
|
1359
|
-
'DOMParser',
|
|
1360
|
-
'console',
|
|
1361
|
-
`"use strict"; return (async () => { ${script}\n})();`
|
|
1362
|
-
);
|
|
1363
1472
|
const $$data = {
|
|
1364
1473
|
html: () => html || '',
|
|
1365
1474
|
url: () => pageUrl || '',
|
|
@@ -1368,7 +1477,33 @@ async function handleAgent(req, res) {
|
|
|
1368
1477
|
shadowQueryAll: includeShadowDom ? shadowHelpers.shadowQueryAll : undefined,
|
|
1369
1478
|
shadowText: includeShadowDom ? shadowHelpers.shadowText : undefined
|
|
1370
1479
|
};
|
|
1371
|
-
|
|
1480
|
+
|
|
1481
|
+
// Use vm for sandboxed execution
|
|
1482
|
+
const sandbox = Object.create(null);
|
|
1483
|
+
sandbox.window = createSafeProxy(window);
|
|
1484
|
+
sandbox.document = createSafeProxy(window.document);
|
|
1485
|
+
sandbox.DOMParser = createSafeProxy(window.DOMParser);
|
|
1486
|
+
sandbox.console = createSafeProxy(consoleProxy);
|
|
1487
|
+
sandbox.$$data = createSafeProxy($$data);
|
|
1488
|
+
|
|
1489
|
+
// Pass the script as a variable to avoid string interpolation (CodeQL: Code Injection)
|
|
1490
|
+
sandbox.$$userScript = script;
|
|
1491
|
+
|
|
1492
|
+
const context = vm.createContext(sandbox);
|
|
1493
|
+
|
|
1494
|
+
// We use a static wrapper to execute the user script.
|
|
1495
|
+
// This ensures that the code passed to vm.runInContext is constant and safe.
|
|
1496
|
+
// The user script is retrieved from the sandbox environment and executed as an AsyncFunction.
|
|
1497
|
+
const scriptCode = `
|
|
1498
|
+
"use strict";
|
|
1499
|
+
(async () => {
|
|
1500
|
+
const AsyncFunction = Object.getPrototypeOf(async function(){}).constructor;
|
|
1501
|
+
const fn = new AsyncFunction('$$data', 'window', 'document', 'DOMParser', 'console', $$userScript);
|
|
1502
|
+
return fn($$data, window, document, DOMParser, console);
|
|
1503
|
+
})();
|
|
1504
|
+
`;
|
|
1505
|
+
|
|
1506
|
+
const result = await vm.runInContext(scriptCode, context);
|
|
1372
1507
|
return { result, logs: logBuffer };
|
|
1373
1508
|
} catch (e) {
|
|
1374
1509
|
return { result: `Extraction script error: ${e.message}`, logs: [] };
|
|
@@ -1381,29 +1516,6 @@ async function handleAgent(req, res) {
|
|
|
1381
1516
|
const extractionScript = extractionScriptRaw ? resolveTemplate(extractionScriptRaw) : undefined;
|
|
1382
1517
|
const extraction = await runExtractionScript(extractionScript, cleanedHtml, page.url());
|
|
1383
1518
|
|
|
1384
|
-
// Simple HTML Formatter (fallback to raw if formatting collapses content)
|
|
1385
|
-
const formatHTML = (html) => {
|
|
1386
|
-
let indent = 0;
|
|
1387
|
-
return html.replace(/<(\/?)([a-z0-9]+)([^>]*?)(\/?)>/gi, (match, slash, tag, attrs, selfClose) => {
|
|
1388
|
-
if (slash) indent--;
|
|
1389
|
-
const result = ' '.repeat(Math.max(0, indent)) + match;
|
|
1390
|
-
if (!slash && !selfClose && !['img', 'br', 'hr', 'input', 'link', 'meta'].includes(tag.toLowerCase())) indent++;
|
|
1391
|
-
return '\n' + result;
|
|
1392
|
-
}).trim();
|
|
1393
|
-
};
|
|
1394
|
-
|
|
1395
|
-
const safeFormatHTML = (html) => {
|
|
1396
|
-
if (typeof html !== 'string') return '';
|
|
1397
|
-
try {
|
|
1398
|
-
const formatted = formatHTML(html);
|
|
1399
|
-
if (!formatted) return html;
|
|
1400
|
-
if (formatted.length < Math.max(200, Math.floor(html.length * 0.5))) return html;
|
|
1401
|
-
return formatted;
|
|
1402
|
-
} catch {
|
|
1403
|
-
return html;
|
|
1404
|
-
}
|
|
1405
|
-
};
|
|
1406
|
-
|
|
1407
1519
|
// Ensure the public/screenshots directory exists
|
|
1408
1520
|
const capturesDir = path.join(__dirname, 'public', 'captures');
|
|
1409
1521
|
if (!fs.existsSync(capturesDir)) {
|
|
@@ -1434,7 +1546,9 @@ async function handleAgent(req, res) {
|
|
|
1434
1546
|
};
|
|
1435
1547
|
|
|
1436
1548
|
const video = page.video();
|
|
1437
|
-
|
|
1549
|
+
if (!statelessExecution) {
|
|
1550
|
+
try { await context.storageState({ path: STORAGE_STATE_FILE }); } catch {}
|
|
1551
|
+
}
|
|
1438
1552
|
try { await context.close(); } catch {}
|
|
1439
1553
|
if (video) {
|
|
1440
1554
|
try {
|