@doppelgangerdev/doppelganger 0.5.7 → 0.5.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (7)
  1. package/LICENSE +2 -2
  2. package/README.md +9 -29
  3. package/agent.js +200 -101
  4. package/headful.js +126 -126
  5. package/package.json +2 -2
  6. package/scrape.js +249 -284
  7. package/server.js +469 -359
package/scrape.js CHANGED
@@ -1,9 +1,10 @@
1
1
  const { chromium } = require('playwright');
2
- const { JSDOM } = require('jsdom');
3
2
  const fs = require('fs');
4
3
  const path = require('path');
5
- const { getProxySelection } = require('./proxy-rotation');
6
- const { selectUserAgent } = require('./user-agent-settings');
4
+ const { spawn } = require('child_process');
5
+ const { getProxySelection } = require('./proxy-rotation');
6
+ const { selectUserAgent } = require('./user-agent-settings');
7
+ const { formatHTML } = require('./html-utils');
7
8
 
8
9
  const STORAGE_STATE_PATH = path.join(__dirname, 'storage_state.json');
9
10
  const STORAGE_STATE_FILE = (() => {
@@ -26,8 +27,8 @@ const csvEscape = (value) => {
26
27
  return text;
27
28
  };
28
29
 
29
- const toCsvString = (raw) => {
30
- if (raw === undefined || raw === null) return '';
30
+ const toCsvString = (raw) => {
31
+ if (raw === undefined || raw === null) return '';
31
32
  if (typeof raw === 'string') {
32
33
  const trimmed = raw.trim();
33
34
  if (trimmed.startsWith('{') || trimmed.startsWith('[')) {
@@ -64,39 +65,39 @@ const toCsvString = (raw) => {
64
65
  const obj = row && typeof row === 'object' ? row : {};
65
66
  return allKeys.map((key) => csvEscape(obj[key])).join(',');
66
67
  });
67
- return [headerLine, ...lines].join('\n');
68
- };
69
-
70
- const parseBooleanFlag = (value) => {
71
- if (typeof value === 'boolean') return value;
72
- if (value === undefined || value === null) return false;
73
- const normalized = String(value).toLowerCase();
74
- return normalized === 'true' || normalized === '1';
75
- };
76
-
77
- async function handleScrape(req, res) {
68
+ return [headerLine, ...lines].join('\n');
69
+ };
70
+
71
+ const parseBooleanFlag = (value) => {
72
+ if (typeof value === 'boolean') return value;
73
+ if (value === undefined || value === null) return false;
74
+ const normalized = String(value).toLowerCase();
75
+ return normalized === 'true' || normalized === '1';
76
+ };
77
+
78
+ async function handleScrape(req, res) {
78
79
  const url = req.body.url || req.query.url;
79
80
  const customHeaders = req.body.headers || {};
80
81
  const userSelector = req.body.selector || req.query.selector;
81
82
  const waitInput = req.body.wait || req.query.wait;
82
83
  const waitTime = waitInput ? parseFloat(waitInput) * 1000 : 2000;
83
- const rotateUserAgents = req.body.rotateUserAgents || req.query.rotateUserAgents || false;
84
- const rotateViewportRaw = req.body.rotateViewport ?? req.query.rotateViewport;
85
- const rotateViewport = String(rotateViewportRaw).toLowerCase() === 'true' || rotateViewportRaw === true;
86
- const runId = req.body.runId || req.query.runId || null;
87
- const captureRunId = runId ? String(runId) : `run_${Date.now()}_unknown`;
84
+ const rotateUserAgents = req.body.rotateUserAgents || req.query.rotateUserAgents || false;
85
+ const rotateViewportRaw = req.body.rotateViewport ?? req.query.rotateViewport;
86
+ const rotateViewport = String(rotateViewportRaw).toLowerCase() === 'true' || rotateViewportRaw === true;
87
+ const runId = req.body.runId || req.query.runId || null;
88
+ const captureRunId = runId ? String(runId) : `run_${Date.now()}_unknown`;
88
89
  const rotateProxiesRaw = req.body.rotateProxies ?? req.query.rotateProxies;
89
90
  const rotateProxies = String(rotateProxiesRaw).toLowerCase() === 'true' || rotateProxiesRaw === true;
90
- const includeShadowDomRaw = req.body.includeShadowDom ?? req.query.includeShadowDom;
91
- const includeShadowDom = includeShadowDomRaw === undefined
92
- ? true
93
- : !(String(includeShadowDomRaw).toLowerCase() === 'false' || includeShadowDomRaw === false);
94
- const disableRecordingRaw = req.body.disableRecording ?? req.query.disableRecording;
95
- const disableRecording = parseBooleanFlag(disableRecordingRaw);
96
- const statelessExecutionRaw = req.body.statelessExecution ?? req.query.statelessExecution;
97
- const statelessExecution = parseBooleanFlag(statelessExecutionRaw);
98
- const extractionScript = req.body.extractionScript || req.query.extractionScript;
99
- const extractionFormat = (req.body.extractionFormat || req.query.extractionFormat) === 'csv' ? 'csv' : 'json';
91
+ const includeShadowDomRaw = req.body.includeShadowDom ?? req.query.includeShadowDom;
92
+ const includeShadowDom = includeShadowDomRaw === undefined
93
+ ? true
94
+ : !(String(includeShadowDomRaw).toLowerCase() === 'false' || includeShadowDomRaw === false);
95
+ const disableRecordingRaw = req.body.disableRecording ?? req.query.disableRecording;
96
+ const disableRecording = parseBooleanFlag(disableRecordingRaw);
97
+ const statelessExecutionRaw = req.body.statelessExecution ?? req.query.statelessExecution;
98
+ const statelessExecution = parseBooleanFlag(statelessExecutionRaw);
99
+ const extractionScript = req.body.extractionScript || req.query.extractionScript;
100
+ const extractionFormat = (req.body.extractionFormat || req.query.extractionFormat) === 'csv' ? 'csv' : 'json';
100
101
 
101
102
  if (!url) {
102
103
  return res.status(400).json({ error: 'URL is required.' });
@@ -104,24 +105,24 @@ async function handleScrape(req, res) {
104
105
 
105
106
  console.log(`Scraping: ${url}`);
106
107
 
107
- const selectedUA = selectUserAgent(rotateUserAgents);
108
-
109
- let browser;
110
- let context;
111
- let page;
112
- try {
108
+ const selectedUA = await selectUserAgent(rotateUserAgents);
109
+
110
+ let browser;
111
+ let context;
112
+ let page;
113
+ try {
113
114
  // Use 'chrome' channel to use a real installed browser instead of default Chromium
114
115
  const launchOptions = {
115
116
  headless: true,
116
117
  channel: 'chrome',
117
- args: [
118
- '--no-sandbox',
119
- '--disable-setuid-sandbox',
120
- '--disable-dev-shm-usage',
121
- '--disable-blink-features=AutomationControlled',
122
- '--hide-scrollbars',
123
- '--mute-audio'
124
- ]
118
+ args: [
119
+ '--no-sandbox',
120
+ '--disable-setuid-sandbox',
121
+ '--disable-dev-shm-usage',
122
+ '--disable-blink-features=AutomationControlled',
123
+ '--hide-scrollbars',
124
+ '--mute-audio'
125
+ ]
125
126
  };
126
127
  const selection = getProxySelection(rotateProxies);
127
128
  if (selection.proxy) {
@@ -130,102 +131,102 @@ async function handleScrape(req, res) {
130
131
  console.log(`[PROXY] Mode: ${selection.mode}; Target: ${selection.proxy ? selection.proxy.server : 'host_ip'}`);
131
132
  browser = await chromium.launch(launchOptions);
132
133
 
133
- const recordingsDir = path.join(__dirname, 'data', 'recordings');
134
- if (!fs.existsSync(recordingsDir)) {
135
- fs.mkdirSync(recordingsDir, { recursive: true });
136
- }
137
-
138
- const viewport = rotateViewport
139
- ? { width: 1280 + Math.floor(Math.random() * 640), height: 720 + Math.floor(Math.random() * 360) }
140
- : { width: 1366, height: 768 };
141
-
142
- const contextOptions = {
143
- userAgent: selectedUA,
144
- extraHTTPHeaders: customHeaders,
145
- viewport,
146
- deviceScaleFactor: 1,
147
- locale: 'en-US',
148
- timezoneId: 'America/New_York',
149
- colorScheme: 'dark',
150
- permissions: ['geolocation']
151
- };
152
-
153
- const shouldUseStorageState = !statelessExecution && fs.existsSync(STORAGE_STATE_FILE);
154
- if (shouldUseStorageState) {
155
- contextOptions.storageState = STORAGE_STATE_FILE;
156
- }
157
-
158
- if (!disableRecording) {
159
- contextOptions.recordVideo = { dir: recordingsDir, size: viewport };
160
- }
161
-
162
- context = await browser.newContext(contextOptions);
163
-
164
- // Manual WebDriver Patch
165
- await context.addInitScript(() => {
166
- Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
167
- });
168
- await context.addInitScript(() => {
169
- const cursorId = 'dg-cursor-overlay';
170
- const dotId = 'dg-click-dot';
171
- if (document.getElementById(cursorId)) return;
172
- const cursor = document.createElement('div');
173
- cursor.id = cursorId;
174
- cursor.style.cssText = [
175
- 'position:fixed',
176
- 'top:0',
177
- 'left:0',
178
- 'width:18px',
179
- 'height:18px',
180
- 'margin-left:-9px',
181
- 'margin-top:-9px',
182
- 'border:2px solid rgba(56,189,248,0.7)',
183
- 'background:rgba(56,189,248,0.25)',
184
- 'border-radius:50%',
185
- 'box-shadow:0 0 10px rgba(56,189,248,0.6)',
186
- 'pointer-events:none',
187
- 'z-index:2147483647',
188
- 'transform:translate3d(0,0,0)',
189
- 'transition:transform 60ms ease-out'
190
- ].join(';');
191
- const dot = document.createElement('div');
192
- dot.id = dotId;
193
- dot.style.cssText = [
194
- 'position:fixed',
195
- 'top:0',
196
- 'left:0',
197
- 'width:10px',
198
- 'height:10px',
199
- 'margin-left:-5px',
200
- 'margin-top:-5px',
201
- 'background:rgba(239,68,68,0.9)',
202
- 'border-radius:50%',
203
- 'box-shadow:0 0 12px rgba(239,68,68,0.8)',
204
- 'pointer-events:none',
205
- 'z-index:2147483647',
206
- 'opacity:0',
207
- 'transform:translate3d(0,0,0) scale(0.6)',
208
- 'transition:opacity 120ms ease, transform 120ms ease'
209
- ].join(';');
210
- document.documentElement.appendChild(cursor);
211
- document.documentElement.appendChild(dot);
212
- const move = (x, y) => {
213
- cursor.style.transform = `translate3d(${x}px, ${y}px, 0)`;
214
- };
215
- window.addEventListener('mousemove', (e) => move(e.clientX, e.clientY), { passive: true });
216
- window.addEventListener('click', (e) => {
217
- dot.style.left = `${e.clientX}px`;
218
- dot.style.top = `${e.clientY}px`;
219
- dot.style.opacity = '1';
220
- dot.style.transform = 'translate3d(0,0,0) scale(1)';
221
- cursor.style.transform = `translate3d(${e.clientX}px, ${e.clientY}px, 0) scale(0.65)`;
222
- setTimeout(() => {
223
- dot.style.opacity = '0';
224
- dot.style.transform = 'translate3d(0,0,0) scale(0.6)';
225
- cursor.style.transform = `translate3d(${e.clientX}px, ${e.clientY}px, 0) scale(1)`;
226
- }, 180);
227
- }, true);
228
- });
134
+ const recordingsDir = path.join(__dirname, 'data', 'recordings');
135
+ if (!fs.existsSync(recordingsDir)) {
136
+ fs.mkdirSync(recordingsDir, { recursive: true });
137
+ }
138
+
139
+ const viewport = rotateViewport
140
+ ? { width: 1280 + Math.floor(Math.random() * 640), height: 720 + Math.floor(Math.random() * 360) }
141
+ : { width: 1366, height: 768 };
142
+
143
+ const contextOptions = {
144
+ userAgent: selectedUA,
145
+ extraHTTPHeaders: customHeaders,
146
+ viewport,
147
+ deviceScaleFactor: 1,
148
+ locale: 'en-US',
149
+ timezoneId: 'America/New_York',
150
+ colorScheme: 'dark',
151
+ permissions: ['geolocation']
152
+ };
153
+
154
+ const shouldUseStorageState = !statelessExecution && fs.existsSync(STORAGE_STATE_FILE);
155
+ if (shouldUseStorageState) {
156
+ contextOptions.storageState = STORAGE_STATE_FILE;
157
+ }
158
+
159
+ if (!disableRecording) {
160
+ contextOptions.recordVideo = { dir: recordingsDir, size: viewport };
161
+ }
162
+
163
+ context = await browser.newContext(contextOptions);
164
+
165
+ // Manual WebDriver Patch
166
+ await context.addInitScript(() => {
167
+ Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
168
+ });
169
+ await context.addInitScript(() => {
170
+ const cursorId = 'dg-cursor-overlay';
171
+ const dotId = 'dg-click-dot';
172
+ if (document.getElementById(cursorId)) return;
173
+ const cursor = document.createElement('div');
174
+ cursor.id = cursorId;
175
+ cursor.style.cssText = [
176
+ 'position:fixed',
177
+ 'top:0',
178
+ 'left:0',
179
+ 'width:18px',
180
+ 'height:18px',
181
+ 'margin-left:-9px',
182
+ 'margin-top:-9px',
183
+ 'border:2px solid rgba(56,189,248,0.7)',
184
+ 'background:rgba(56,189,248,0.25)',
185
+ 'border-radius:50%',
186
+ 'box-shadow:0 0 10px rgba(56,189,248,0.6)',
187
+ 'pointer-events:none',
188
+ 'z-index:2147483647',
189
+ 'transform:translate3d(0,0,0)',
190
+ 'transition:transform 60ms ease-out'
191
+ ].join(';');
192
+ const dot = document.createElement('div');
193
+ dot.id = dotId;
194
+ dot.style.cssText = [
195
+ 'position:fixed',
196
+ 'top:0',
197
+ 'left:0',
198
+ 'width:10px',
199
+ 'height:10px',
200
+ 'margin-left:-5px',
201
+ 'margin-top:-5px',
202
+ 'background:rgba(239,68,68,0.9)',
203
+ 'border-radius:50%',
204
+ 'box-shadow:0 0 12px rgba(239,68,68,0.8)',
205
+ 'pointer-events:none',
206
+ 'z-index:2147483647',
207
+ 'opacity:0',
208
+ 'transform:translate3d(0,0,0) scale(0.6)',
209
+ 'transition:opacity 120ms ease, transform 120ms ease'
210
+ ].join(';');
211
+ document.documentElement.appendChild(cursor);
212
+ document.documentElement.appendChild(dot);
213
+ const move = (x, y) => {
214
+ cursor.style.transform = `translate3d(${x}px, ${y}px, 0)`;
215
+ };
216
+ window.addEventListener('mousemove', (e) => move(e.clientX, e.clientY), { passive: true });
217
+ window.addEventListener('click', (e) => {
218
+ dot.style.left = `${e.clientX}px`;
219
+ dot.style.top = `${e.clientY}px`;
220
+ dot.style.opacity = '1';
221
+ dot.style.transform = 'translate3d(0,0,0) scale(1)';
222
+ cursor.style.transform = `translate3d(${e.clientX}px, ${e.clientY}px, 0) scale(0.65)`;
223
+ setTimeout(() => {
224
+ dot.style.opacity = '0';
225
+ dot.style.transform = 'translate3d(0,0,0) scale(0.6)';
226
+ cursor.style.transform = `translate3d(${e.clientX}px, ${e.clientY}px, 0) scale(1)`;
227
+ }, 180);
228
+ }, true);
229
+ });
229
230
  if (includeShadowDom) {
230
231
  await context.addInitScript(() => {
231
232
  if (!Element.prototype.attachShadow) return;
@@ -237,7 +238,7 @@ async function handleScrape(req, res) {
237
238
  });
238
239
  }
239
240
 
240
- page = await context.newPage();
241
+ page = await context.newPage();
241
242
 
242
243
  await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 60000 });
243
244
 
@@ -345,112 +346,76 @@ async function handleScrape(req, res) {
345
346
 
346
347
  const runExtractionScript = async (script, html, pageUrl) => {
347
348
  if (!script || typeof script !== 'string') return { result: undefined, logs: [] };
348
- try {
349
- const dom = new JSDOM(html || '');
350
- const { window } = dom;
351
- const logBuffer = [];
352
- const consoleProxy = {
353
- log: (...args) => logBuffer.push(args.join(' ')),
354
- warn: (...args) => logBuffer.push(args.join(' ')),
355
- error: (...args) => logBuffer.push(args.join(' '))
356
- };
357
- const shadowHelpers = (() => {
358
- const shadowQueryAll = (selector, root = window.document) => {
359
- const results = [];
360
- const walk = (node) => {
361
- if (!node) return;
362
- if (node.nodeType === 1) {
363
- const el = node;
364
- if (selector && el.matches && el.matches(selector)) results.push(el);
365
- if (el.tagName === 'TEMPLATE' && el.hasAttribute('data-shadowroot')) {
366
- walk(el.content);
367
- }
368
- } else if (node.nodeType === 11) {
369
- // DocumentFragment
370
- }
371
- if (node.childNodes) {
372
- node.childNodes.forEach((child) => walk(child));
373
- }
374
- };
375
- walk(root);
376
- return results;
377
- };
378
349
 
379
- const shadowText = (root = window.document) => {
380
- const texts = [];
381
- const walk = (node) => {
382
- if (!node) return;
383
- if (node.nodeType === 3) {
384
- const text = node.nodeValue ? node.nodeValue.trim() : '';
385
- if (text) texts.push(text);
386
- return;
387
- }
388
- if (node.nodeType === 1) {
389
- const el = node;
390
- if (el.tagName === 'TEMPLATE' && el.hasAttribute('data-shadowroot')) {
391
- walk(el.content);
392
- }
393
- }
394
- if (node.childNodes) {
395
- node.childNodes.forEach((child) => walk(child));
396
- }
397
- };
398
- walk(root);
399
- return texts;
400
- };
350
+ return new Promise((resolve) => {
351
+ const worker = spawn('node', [path.join(__dirname, 'extraction-worker.js')], {
352
+ stdio: ['pipe', 'pipe', 'pipe'],
353
+ env: { ...process.env, NODE_ENV: 'production' } // Minimal env
354
+ });
401
355
 
402
- return { shadowQueryAll, shadowText };
403
- })();
404
-
405
- const executor = new Function(
406
- '$$data',
407
- 'window',
408
- 'document',
409
- 'DOMParser',
410
- 'console',
411
- `"use strict"; return (async () => { ${script}\n})();`
412
- );
413
- const $$data = {
414
- html: () => html || '',
415
- url: () => pageUrl || '',
416
- window,
417
- document: window.document,
418
- shadowQueryAll: includeShadowDom ? shadowHelpers.shadowQueryAll : undefined,
419
- shadowText: includeShadowDom ? shadowHelpers.shadowText : undefined
420
- };
421
- const result = await executor($$data, window, window.document, window.DOMParser, consoleProxy);
422
- return { result, logs: logBuffer };
423
- } catch (e) {
424
- return { result: `Extraction script error: ${e.message}`, logs: [] };
425
- }
356
+ let stdout = '';
357
+ let stderr = '';
358
+
359
+ const workerTimeout = 5000;
360
+ const timer = setTimeout(() => {
361
+ worker.kill();
362
+ resolve({ result: 'Worker timed out', logs: [] });
363
+ }, workerTimeout);
364
+
365
+ worker.stdout.on('data', (data) => {
366
+ stdout += data.toString();
367
+ });
368
+
369
+ worker.stderr.on('data', (data) => {
370
+ stderr += data.toString();
371
+ });
372
+
373
+ worker.on('close', (code) => {
374
+ clearTimeout(timer);
375
+ if (code !== 0) {
376
+ resolve({ result: `Worker exited with code ${code}: ${stderr}`, logs: [] });
377
+ return;
378
+ }
379
+ try {
380
+ const output = JSON.parse(stdout);
381
+ resolve(output);
382
+ } catch (e) {
383
+ resolve({ result: `Worker output parse error: ${e.message}. Stdout: ${stdout}`, logs: [] });
384
+ }
385
+ });
386
+
387
+ worker.on('error', (err) => {
388
+ clearTimeout(timer);
389
+ resolve({ result: `Worker spawn error: ${err.message}`, logs: [] });
390
+ });
391
+
392
+ const input = JSON.stringify({
393
+ script,
394
+ html,
395
+ url: pageUrl,
396
+ includeShadowDom
397
+ });
398
+
399
+ worker.stdin.write(input);
400
+ worker.stdin.end();
401
+ });
426
402
  };
427
403
 
428
404
  const extraction = await runExtractionScript(extractionScript, productHtml, page.url());
429
405
 
430
406
  // Ensure the public/captures directory exists
431
- const capturesDir = path.join(__dirname, 'public', 'captures');
432
- if (!fs.existsSync(capturesDir)) {
433
- fs.mkdirSync(capturesDir, { recursive: true });
434
- }
435
-
436
- const screenshotName = `${captureRunId}_scrape_${Date.now()}.png`;
437
- const screenshotPath = path.join(capturesDir, screenshotName);
438
- try {
439
- await page.screenshot({ path: screenshotPath, fullPage: false });
440
- } catch (e) {
441
- console.error('Screenshot failed:', e.message);
442
- }
443
-
444
- // Simple HTML Formatter
445
- const formatHTML = (html) => {
446
- let indent = 0;
447
- return html.replace(/<(\/?)([a-z0-9]+)([^>]*?)(\/?)>/gi, (match, slash, tag, attrs, selfClose) => {
448
- if (slash) indent--;
449
- const result = ' '.repeat(Math.max(0, indent)) + match;
450
- if (!slash && !selfClose && !['img', 'br', 'hr', 'input', 'link', 'meta'].includes(tag.toLowerCase())) indent++;
451
- return '\n' + result;
452
- }).trim();
453
- };
407
+ const capturesDir = path.join(__dirname, 'public', 'captures');
408
+ if (!fs.existsSync(capturesDir)) {
409
+ fs.mkdirSync(capturesDir, { recursive: true });
410
+ }
411
+
412
+ const screenshotName = `${captureRunId}_scrape_${Date.now()}.png`;
413
+ const screenshotPath = path.join(capturesDir, screenshotName);
414
+ try {
415
+ await page.screenshot({ path: screenshotPath, fullPage: false });
416
+ } catch (e) {
417
+ console.error('Screenshot failed:', e.message);
418
+ }
454
419
 
455
420
  const rawExtraction = extraction.result !== undefined ? extraction.result : (extraction.logs.length ? extraction.logs.join('\n') : undefined);
456
421
  const formattedExtraction = extractionFormat === 'csv' ? toCsvString(rawExtraction) : rawExtraction;
@@ -465,48 +430,48 @@ async function handleScrape(req, res) {
465
430
  links: await page.$$eval('a[href]', elements => {
466
431
  return elements.map(el => el.href).filter(href => href && href.startsWith('http'));
467
432
  }),
468
- screenshot_url: `/captures/${screenshotName}`
469
- };
470
-
471
- // Save session state
472
- if (!statelessExecution) {
473
- await context.storageState({ path: STORAGE_STATE_FILE });
474
- }
475
-
476
- const video = page.video();
477
- await context.close();
478
- if (video) {
479
- try {
480
- const videoPath = await video.path();
481
- if (videoPath && fs.existsSync(videoPath)) {
482
- const recordingName = `${captureRunId}_scrape_${Date.now()}.webm`;
483
- const recordingPath = path.join(capturesDir, recordingName);
484
- try {
485
- fs.renameSync(videoPath, recordingPath);
486
- } catch (err) {
487
- if (err && err.code === 'EXDEV') {
488
- fs.copyFileSync(videoPath, recordingPath);
489
- fs.unlinkSync(videoPath);
490
- } else {
491
- throw err;
492
- }
493
- }
494
- }
495
- } catch (e) {
496
- console.error('Recording save failed:', e.message);
497
- }
498
- }
499
-
500
- await browser.close();
501
- res.json(data);
502
- } catch (error) {
503
- console.error('Scrape Error:', error);
504
- try {
505
- if (context) await context.close();
506
- } catch {}
507
- if (browser) await browser.close();
508
- res.status(500).json({ error: 'Failed to scrape', details: error.message });
509
- }
510
- }
433
+ screenshot_url: `/captures/${screenshotName}`
434
+ };
435
+
436
+ // Save session state
437
+ if (!statelessExecution) {
438
+ await context.storageState({ path: STORAGE_STATE_FILE });
439
+ }
440
+
441
+ const video = page.video();
442
+ await context.close();
443
+ if (video) {
444
+ try {
445
+ const videoPath = await video.path();
446
+ if (videoPath && fs.existsSync(videoPath)) {
447
+ const recordingName = `${captureRunId}_scrape_${Date.now()}.webm`;
448
+ const recordingPath = path.join(capturesDir, recordingName);
449
+ try {
450
+ await fs.promises.rename(videoPath, recordingPath);
451
+ } catch (err) {
452
+ if (err && err.code === 'EXDEV') {
453
+ await fs.promises.copyFile(videoPath, recordingPath);
454
+ await fs.promises.unlink(videoPath);
455
+ } else {
456
+ throw err;
457
+ }
458
+ }
459
+ }
460
+ } catch (e) {
461
+ console.error('Recording save failed:', e.message);
462
+ }
463
+ }
464
+
465
+ await browser.close();
466
+ res.json(data);
467
+ } catch (error) {
468
+ console.error('Scrape Error:', error);
469
+ try {
470
+ if (context) await context.close();
471
+ } catch {}
472
+ if (browser) await browser.close();
473
+ res.status(500).json({ error: 'Failed to scrape', details: error.message });
474
+ }
475
+ }
511
476
 
512
477
  module.exports = { handleScrape };