@doppelgangerdev/doppelganger 0.5.6 → 0.5.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/scrape.js CHANGED
@@ -1,9 +1,10 @@
1
1
  const { chromium } = require('playwright');
2
- const { JSDOM } = require('jsdom');
3
2
  const fs = require('fs');
4
3
  const path = require('path');
5
- const { getProxySelection } = require('./proxy-rotation');
6
- const { selectUserAgent } = require('./user-agent-settings');
4
+ const { spawn } = require('child_process');
5
+ const { getProxySelection } = require('./proxy-rotation');
6
+ const { selectUserAgent } = require('./user-agent-settings');
7
+ const { formatHTML } = require('./html-utils');
7
8
 
8
9
  const STORAGE_STATE_PATH = path.join(__dirname, 'storage_state.json');
9
10
  const STORAGE_STATE_FILE = (() => {
@@ -67,23 +68,34 @@ const toCsvString = (raw) => {
67
68
  return [headerLine, ...lines].join('\n');
68
69
  };
69
70
 
70
- async function handleScrape(req, res) {
71
+ const parseBooleanFlag = (value) => {
72
+ if (typeof value === 'boolean') return value;
73
+ if (value === undefined || value === null) return false;
74
+ const normalized = String(value).toLowerCase();
75
+ return normalized === 'true' || normalized === '1';
76
+ };
77
+
78
+ async function handleScrape(req, res) {
71
79
  const url = req.body.url || req.query.url;
72
80
  const customHeaders = req.body.headers || {};
73
81
  const userSelector = req.body.selector || req.query.selector;
74
82
  const waitInput = req.body.wait || req.query.wait;
75
83
  const waitTime = waitInput ? parseFloat(waitInput) * 1000 : 2000;
76
- const rotateUserAgents = req.body.rotateUserAgents || req.query.rotateUserAgents || false;
77
- const rotateViewportRaw = req.body.rotateViewport ?? req.query.rotateViewport;
78
- const rotateViewport = String(rotateViewportRaw).toLowerCase() === 'true' || rotateViewportRaw === true;
79
- const runId = req.body.runId || req.query.runId || null;
80
- const captureRunId = runId ? String(runId) : `run_${Date.now()}_unknown`;
84
+ const rotateUserAgents = req.body.rotateUserAgents || req.query.rotateUserAgents || false;
85
+ const rotateViewportRaw = req.body.rotateViewport ?? req.query.rotateViewport;
86
+ const rotateViewport = String(rotateViewportRaw).toLowerCase() === 'true' || rotateViewportRaw === true;
87
+ const runId = req.body.runId || req.query.runId || null;
88
+ const captureRunId = runId ? String(runId) : `run_${Date.now()}_unknown`;
81
89
  const rotateProxiesRaw = req.body.rotateProxies ?? req.query.rotateProxies;
82
90
  const rotateProxies = String(rotateProxiesRaw).toLowerCase() === 'true' || rotateProxiesRaw === true;
83
91
  const includeShadowDomRaw = req.body.includeShadowDom ?? req.query.includeShadowDom;
84
92
  const includeShadowDom = includeShadowDomRaw === undefined
85
93
  ? true
86
94
  : !(String(includeShadowDomRaw).toLowerCase() === 'false' || includeShadowDomRaw === false);
95
+ const disableRecordingRaw = req.body.disableRecording ?? req.query.disableRecording;
96
+ const disableRecording = parseBooleanFlag(disableRecordingRaw);
97
+ const statelessExecutionRaw = req.body.statelessExecution ?? req.query.statelessExecution;
98
+ const statelessExecution = parseBooleanFlag(statelessExecutionRaw);
87
99
  const extractionScript = req.body.extractionScript || req.query.extractionScript;
88
100
  const extractionFormat = (req.body.extractionFormat || req.query.extractionFormat) === 'csv' ? 'csv' : 'json';
89
101
 
@@ -93,24 +105,24 @@ async function handleScrape(req, res) {
93
105
 
94
106
  console.log(`Scraping: ${url}`);
95
107
 
96
- const selectedUA = selectUserAgent(rotateUserAgents);
97
-
98
- let browser;
99
- let context;
100
- let page;
101
- try {
108
+ const selectedUA = await selectUserAgent(rotateUserAgents);
109
+
110
+ let browser;
111
+ let context;
112
+ let page;
113
+ try {
102
114
  // Use 'chrome' channel to use a real installed browser instead of default Chromium
103
115
  const launchOptions = {
104
116
  headless: true,
105
117
  channel: 'chrome',
106
- args: [
107
- '--no-sandbox',
108
- '--disable-setuid-sandbox',
109
- '--disable-dev-shm-usage',
110
- '--disable-blink-features=AutomationControlled',
111
- '--hide-scrollbars',
112
- '--mute-audio'
113
- ]
118
+ args: [
119
+ '--no-sandbox',
120
+ '--disable-setuid-sandbox',
121
+ '--disable-dev-shm-usage',
122
+ '--disable-blink-features=AutomationControlled',
123
+ '--hide-scrollbars',
124
+ '--mute-audio'
125
+ ]
114
126
  };
115
127
  const selection = getProxySelection(rotateProxies);
116
128
  if (selection.proxy) {
@@ -119,98 +131,102 @@ async function handleScrape(req, res) {
119
131
  console.log(`[PROXY] Mode: ${selection.mode}; Target: ${selection.proxy ? selection.proxy.server : 'host_ip'}`);
120
132
  browser = await chromium.launch(launchOptions);
121
133
 
122
- const recordingsDir = path.join(__dirname, 'data', 'recordings');
123
- if (!fs.existsSync(recordingsDir)) {
124
- fs.mkdirSync(recordingsDir, { recursive: true });
125
- }
126
-
127
- const viewport = rotateViewport
128
- ? { width: 1280 + Math.floor(Math.random() * 640), height: 720 + Math.floor(Math.random() * 360) }
129
- : { width: 1366, height: 768 };
130
-
131
- const contextOptions = {
132
- userAgent: selectedUA,
133
- extraHTTPHeaders: customHeaders,
134
- viewport,
135
- deviceScaleFactor: 1,
136
- locale: 'en-US',
137
- timezoneId: 'America/New_York',
138
- colorScheme: 'dark',
139
- permissions: ['geolocation'],
140
- recordVideo: { dir: recordingsDir, size: viewport }
141
- };
142
-
143
- if (fs.existsSync(STORAGE_STATE_FILE)) {
134
+ const recordingsDir = path.join(__dirname, 'data', 'recordings');
135
+ if (!fs.existsSync(recordingsDir)) {
136
+ fs.mkdirSync(recordingsDir, { recursive: true });
137
+ }
138
+
139
+ const viewport = rotateViewport
140
+ ? { width: 1280 + Math.floor(Math.random() * 640), height: 720 + Math.floor(Math.random() * 360) }
141
+ : { width: 1366, height: 768 };
142
+
143
+ const contextOptions = {
144
+ userAgent: selectedUA,
145
+ extraHTTPHeaders: customHeaders,
146
+ viewport,
147
+ deviceScaleFactor: 1,
148
+ locale: 'en-US',
149
+ timezoneId: 'America/New_York',
150
+ colorScheme: 'dark',
151
+ permissions: ['geolocation']
152
+ };
153
+
154
+ const shouldUseStorageState = !statelessExecution && fs.existsSync(STORAGE_STATE_FILE);
155
+ if (shouldUseStorageState) {
144
156
  contextOptions.storageState = STORAGE_STATE_FILE;
145
157
  }
146
158
 
147
- context = await browser.newContext(contextOptions);
148
-
149
- // Manual WebDriver Patch
150
- await context.addInitScript(() => {
151
- Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
152
- });
153
- await context.addInitScript(() => {
154
- const cursorId = 'dg-cursor-overlay';
155
- const dotId = 'dg-click-dot';
156
- if (document.getElementById(cursorId)) return;
157
- const cursor = document.createElement('div');
158
- cursor.id = cursorId;
159
- cursor.style.cssText = [
160
- 'position:fixed',
161
- 'top:0',
162
- 'left:0',
163
- 'width:18px',
164
- 'height:18px',
165
- 'margin-left:-9px',
166
- 'margin-top:-9px',
167
- 'border:2px solid rgba(56,189,248,0.7)',
168
- 'background:rgba(56,189,248,0.25)',
169
- 'border-radius:50%',
170
- 'box-shadow:0 0 10px rgba(56,189,248,0.6)',
171
- 'pointer-events:none',
172
- 'z-index:2147483647',
173
- 'transform:translate3d(0,0,0)',
174
- 'transition:transform 60ms ease-out'
175
- ].join(';');
176
- const dot = document.createElement('div');
177
- dot.id = dotId;
178
- dot.style.cssText = [
179
- 'position:fixed',
180
- 'top:0',
181
- 'left:0',
182
- 'width:10px',
183
- 'height:10px',
184
- 'margin-left:-5px',
185
- 'margin-top:-5px',
186
- 'background:rgba(239,68,68,0.9)',
187
- 'border-radius:50%',
188
- 'box-shadow:0 0 12px rgba(239,68,68,0.8)',
189
- 'pointer-events:none',
190
- 'z-index:2147483647',
191
- 'opacity:0',
192
- 'transform:translate3d(0,0,0) scale(0.6)',
193
- 'transition:opacity 120ms ease, transform 120ms ease'
194
- ].join(';');
195
- document.documentElement.appendChild(cursor);
196
- document.documentElement.appendChild(dot);
197
- const move = (x, y) => {
198
- cursor.style.transform = `translate3d(${x}px, ${y}px, 0)`;
199
- };
200
- window.addEventListener('mousemove', (e) => move(e.clientX, e.clientY), { passive: true });
201
- window.addEventListener('click', (e) => {
202
- dot.style.left = `${e.clientX}px`;
203
- dot.style.top = `${e.clientY}px`;
204
- dot.style.opacity = '1';
205
- dot.style.transform = 'translate3d(0,0,0) scale(1)';
206
- cursor.style.transform = `translate3d(${e.clientX}px, ${e.clientY}px, 0) scale(0.65)`;
207
- setTimeout(() => {
208
- dot.style.opacity = '0';
209
- dot.style.transform = 'translate3d(0,0,0) scale(0.6)';
210
- cursor.style.transform = `translate3d(${e.clientX}px, ${e.clientY}px, 0) scale(1)`;
211
- }, 180);
212
- }, true);
213
- });
159
+ if (!disableRecording) {
160
+ contextOptions.recordVideo = { dir: recordingsDir, size: viewport };
161
+ }
162
+
163
+ context = await browser.newContext(contextOptions);
164
+
165
+ // Manual WebDriver Patch
166
+ await context.addInitScript(() => {
167
+ Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
168
+ });
169
+ await context.addInitScript(() => {
170
+ const cursorId = 'dg-cursor-overlay';
171
+ const dotId = 'dg-click-dot';
172
+ if (document.getElementById(cursorId)) return;
173
+ const cursor = document.createElement('div');
174
+ cursor.id = cursorId;
175
+ cursor.style.cssText = [
176
+ 'position:fixed',
177
+ 'top:0',
178
+ 'left:0',
179
+ 'width:18px',
180
+ 'height:18px',
181
+ 'margin-left:-9px',
182
+ 'margin-top:-9px',
183
+ 'border:2px solid rgba(56,189,248,0.7)',
184
+ 'background:rgba(56,189,248,0.25)',
185
+ 'border-radius:50%',
186
+ 'box-shadow:0 0 10px rgba(56,189,248,0.6)',
187
+ 'pointer-events:none',
188
+ 'z-index:2147483647',
189
+ 'transform:translate3d(0,0,0)',
190
+ 'transition:transform 60ms ease-out'
191
+ ].join(';');
192
+ const dot = document.createElement('div');
193
+ dot.id = dotId;
194
+ dot.style.cssText = [
195
+ 'position:fixed',
196
+ 'top:0',
197
+ 'left:0',
198
+ 'width:10px',
199
+ 'height:10px',
200
+ 'margin-left:-5px',
201
+ 'margin-top:-5px',
202
+ 'background:rgba(239,68,68,0.9)',
203
+ 'border-radius:50%',
204
+ 'box-shadow:0 0 12px rgba(239,68,68,0.8)',
205
+ 'pointer-events:none',
206
+ 'z-index:2147483647',
207
+ 'opacity:0',
208
+ 'transform:translate3d(0,0,0) scale(0.6)',
209
+ 'transition:opacity 120ms ease, transform 120ms ease'
210
+ ].join(';');
211
+ document.documentElement.appendChild(cursor);
212
+ document.documentElement.appendChild(dot);
213
+ const move = (x, y) => {
214
+ cursor.style.transform = `translate3d(${x}px, ${y}px, 0)`;
215
+ };
216
+ window.addEventListener('mousemove', (e) => move(e.clientX, e.clientY), { passive: true });
217
+ window.addEventListener('click', (e) => {
218
+ dot.style.left = `${e.clientX}px`;
219
+ dot.style.top = `${e.clientY}px`;
220
+ dot.style.opacity = '1';
221
+ dot.style.transform = 'translate3d(0,0,0) scale(1)';
222
+ cursor.style.transform = `translate3d(${e.clientX}px, ${e.clientY}px, 0) scale(0.65)`;
223
+ setTimeout(() => {
224
+ dot.style.opacity = '0';
225
+ dot.style.transform = 'translate3d(0,0,0) scale(0.6)';
226
+ cursor.style.transform = `translate3d(${e.clientX}px, ${e.clientY}px, 0) scale(1)`;
227
+ }, 180);
228
+ }, true);
229
+ });
214
230
  if (includeShadowDom) {
215
231
  await context.addInitScript(() => {
216
232
  if (!Element.prototype.attachShadow) return;
@@ -222,7 +238,7 @@ async function handleScrape(req, res) {
222
238
  });
223
239
  }
224
240
 
225
- page = await context.newPage();
241
+ page = await context.newPage();
226
242
 
227
243
  await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 60000 });
228
244
 
@@ -330,112 +346,76 @@ async function handleScrape(req, res) {
330
346
 
331
347
  const runExtractionScript = async (script, html, pageUrl) => {
332
348
  if (!script || typeof script !== 'string') return { result: undefined, logs: [] };
333
- try {
334
- const dom = new JSDOM(html || '');
335
- const { window } = dom;
336
- const logBuffer = [];
337
- const consoleProxy = {
338
- log: (...args) => logBuffer.push(args.join(' ')),
339
- warn: (...args) => logBuffer.push(args.join(' ')),
340
- error: (...args) => logBuffer.push(args.join(' '))
341
- };
342
- const shadowHelpers = (() => {
343
- const shadowQueryAll = (selector, root = window.document) => {
344
- const results = [];
345
- const walk = (node) => {
346
- if (!node) return;
347
- if (node.nodeType === 1) {
348
- const el = node;
349
- if (selector && el.matches && el.matches(selector)) results.push(el);
350
- if (el.tagName === 'TEMPLATE' && el.hasAttribute('data-shadowroot')) {
351
- walk(el.content);
352
- }
353
- } else if (node.nodeType === 11) {
354
- // DocumentFragment
355
- }
356
- if (node.childNodes) {
357
- node.childNodes.forEach((child) => walk(child));
358
- }
359
- };
360
- walk(root);
361
- return results;
362
- };
363
349
 
364
- const shadowText = (root = window.document) => {
365
- const texts = [];
366
- const walk = (node) => {
367
- if (!node) return;
368
- if (node.nodeType === 3) {
369
- const text = node.nodeValue ? node.nodeValue.trim() : '';
370
- if (text) texts.push(text);
371
- return;
372
- }
373
- if (node.nodeType === 1) {
374
- const el = node;
375
- if (el.tagName === 'TEMPLATE' && el.hasAttribute('data-shadowroot')) {
376
- walk(el.content);
377
- }
378
- }
379
- if (node.childNodes) {
380
- node.childNodes.forEach((child) => walk(child));
381
- }
382
- };
383
- walk(root);
384
- return texts;
385
- };
350
+ return new Promise((resolve) => {
351
+ const worker = spawn('node', [path.join(__dirname, 'extraction-worker.js')], {
352
+ stdio: ['pipe', 'pipe', 'pipe'],
353
+ env: { ...process.env, NODE_ENV: 'production' } // Minimal env
354
+ });
386
355
 
387
- return { shadowQueryAll, shadowText };
388
- })();
389
-
390
- const executor = new Function(
391
- '$$data',
392
- 'window',
393
- 'document',
394
- 'DOMParser',
395
- 'console',
396
- `"use strict"; return (async () => { ${script}\n})();`
397
- );
398
- const $$data = {
399
- html: () => html || '',
400
- url: () => pageUrl || '',
401
- window,
402
- document: window.document,
403
- shadowQueryAll: includeShadowDom ? shadowHelpers.shadowQueryAll : undefined,
404
- shadowText: includeShadowDom ? shadowHelpers.shadowText : undefined
405
- };
406
- const result = await executor($$data, window, window.document, window.DOMParser, consoleProxy);
407
- return { result, logs: logBuffer };
408
- } catch (e) {
409
- return { result: `Extraction script error: ${e.message}`, logs: [] };
410
- }
356
+ let stdout = '';
357
+ let stderr = '';
358
+
359
+ const workerTimeout = 5000;
360
+ const timer = setTimeout(() => {
361
+ worker.kill();
362
+ resolve({ result: 'Worker timed out', logs: [] });
363
+ }, workerTimeout);
364
+
365
+ worker.stdout.on('data', (data) => {
366
+ stdout += data.toString();
367
+ });
368
+
369
+ worker.stderr.on('data', (data) => {
370
+ stderr += data.toString();
371
+ });
372
+
373
+ worker.on('close', (code) => {
374
+ clearTimeout(timer);
375
+ if (code !== 0) {
376
+ resolve({ result: `Worker exited with code ${code}: ${stderr}`, logs: [] });
377
+ return;
378
+ }
379
+ try {
380
+ const output = JSON.parse(stdout);
381
+ resolve(output);
382
+ } catch (e) {
383
+ resolve({ result: `Worker output parse error: ${e.message}. Stdout: ${stdout}`, logs: [] });
384
+ }
385
+ });
386
+
387
+ worker.on('error', (err) => {
388
+ clearTimeout(timer);
389
+ resolve({ result: `Worker spawn error: ${err.message}`, logs: [] });
390
+ });
391
+
392
+ const input = JSON.stringify({
393
+ script,
394
+ html,
395
+ url: pageUrl,
396
+ includeShadowDom
397
+ });
398
+
399
+ worker.stdin.write(input);
400
+ worker.stdin.end();
401
+ });
411
402
  };
412
403
 
413
404
  const extraction = await runExtractionScript(extractionScript, productHtml, page.url());
414
405
 
415
406
  // Ensure the public/screenshots directory exists
416
- const capturesDir = path.join(__dirname, 'public', 'captures');
417
- if (!fs.existsSync(capturesDir)) {
418
- fs.mkdirSync(capturesDir, { recursive: true });
419
- }
420
-
421
- const screenshotName = `${captureRunId}_scrape_${Date.now()}.png`;
422
- const screenshotPath = path.join(capturesDir, screenshotName);
423
- try {
424
- await page.screenshot({ path: screenshotPath, fullPage: false });
425
- } catch (e) {
426
- console.error('Screenshot failed:', e.message);
427
- }
428
-
429
- // Simple HTML Formatter
430
- const formatHTML = (html) => {
431
- let indent = 0;
432
- return html.replace(/<(\/?)([a-z0-9]+)([^>]*?)(\/?)>/gi, (match, slash, tag, attrs, selfClose) => {
433
- if (slash) indent--;
434
- const result = ' '.repeat(Math.max(0, indent)) + match;
435
- if (!slash && !selfClose && !['img', 'br', 'hr', 'input', 'link', 'meta'].includes(tag.toLowerCase())) indent++;
436
- return '\n' + result;
437
- }).trim();
438
- };
407
+ const capturesDir = path.join(__dirname, 'public', 'captures');
408
+ if (!fs.existsSync(capturesDir)) {
409
+ fs.mkdirSync(capturesDir, { recursive: true });
410
+ }
411
+
412
+ const screenshotName = `${captureRunId}_scrape_${Date.now()}.png`;
413
+ const screenshotPath = path.join(capturesDir, screenshotName);
414
+ try {
415
+ await page.screenshot({ path: screenshotPath, fullPage: false });
416
+ } catch (e) {
417
+ console.error('Screenshot failed:', e.message);
418
+ }
439
419
 
440
420
  const rawExtraction = extraction.result !== undefined ? extraction.result : (extraction.logs.length ? extraction.logs.join('\n') : undefined);
441
421
  const formattedExtraction = extractionFormat === 'csv' ? toCsvString(rawExtraction) : rawExtraction;
@@ -450,46 +430,48 @@ async function handleScrape(req, res) {
450
430
  links: await page.$$eval('a[href]', elements => {
451
431
  return elements.map(el => el.href).filter(href => href && href.startsWith('http'));
452
432
  }),
453
- screenshot_url: `/captures/${screenshotName}`
454
- };
455
-
456
- // Save session state
457
- await context.storageState({ path: STORAGE_STATE_FILE });
458
-
459
- const video = page.video();
460
- await context.close();
461
- if (video) {
462
- try {
463
- const videoPath = await video.path();
464
- if (videoPath && fs.existsSync(videoPath)) {
465
- const recordingName = `${captureRunId}_scrape_${Date.now()}.webm`;
466
- const recordingPath = path.join(capturesDir, recordingName);
467
- try {
468
- fs.renameSync(videoPath, recordingPath);
469
- } catch (err) {
470
- if (err && err.code === 'EXDEV') {
471
- fs.copyFileSync(videoPath, recordingPath);
472
- fs.unlinkSync(videoPath);
473
- } else {
474
- throw err;
475
- }
476
- }
477
- }
478
- } catch (e) {
479
- console.error('Recording save failed:', e.message);
480
- }
481
- }
482
-
483
- await browser.close();
484
- res.json(data);
485
- } catch (error) {
486
- console.error('Scrape Error:', error);
487
- try {
488
- if (context) await context.close();
489
- } catch {}
490
- if (browser) await browser.close();
491
- res.status(500).json({ error: 'Failed to scrape', details: error.message });
492
- }
493
- }
433
+ screenshot_url: `/captures/${screenshotName}`
434
+ };
435
+
436
+ // Save session state
437
+ if (!statelessExecution) {
438
+ await context.storageState({ path: STORAGE_STATE_FILE });
439
+ }
440
+
441
+ const video = page.video();
442
+ await context.close();
443
+ if (video) {
444
+ try {
445
+ const videoPath = await video.path();
446
+ if (videoPath && fs.existsSync(videoPath)) {
447
+ const recordingName = `${captureRunId}_scrape_${Date.now()}.webm`;
448
+ const recordingPath = path.join(capturesDir, recordingName);
449
+ try {
450
+ await fs.promises.rename(videoPath, recordingPath);
451
+ } catch (err) {
452
+ if (err && err.code === 'EXDEV') {
453
+ await fs.promises.copyFile(videoPath, recordingPath);
454
+ await fs.promises.unlink(videoPath);
455
+ } else {
456
+ throw err;
457
+ }
458
+ }
459
+ }
460
+ } catch (e) {
461
+ console.error('Recording save failed:', e.message);
462
+ }
463
+ }
464
+
465
+ await browser.close();
466
+ res.json(data);
467
+ } catch (error) {
468
+ console.error('Scrape Error:', error);
469
+ try {
470
+ if (context) await context.close();
471
+ } catch {}
472
+ if (browser) await browser.close();
473
+ res.status(500).json({ error: 'Failed to scrape', details: error.message });
474
+ }
475
+ }
494
476
 
495
477
  module.exports = { handleScrape };