@doppelgangerdev/doppelganger 0.5.6 → 0.5.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +177 -157
- package/README.md +259 -274
- package/agent.js +176 -62
- package/dist/assets/index-Cwmqk52G.js +19 -0
- package/dist/assets/{index-isZw-0dm.css → index-CxzMazJO.css} +1 -1
- package/dist/captures/run_1769734411613_783_scrape_1769734425256.png +0 -0
- package/dist/captures/run_1769734411613_783_scrape_1769734428068.webm +0 -0
- package/dist/captures/run_1769734522774_unknown_scrape_1769734535501.png +0 -0
- package/dist/captures/run_1769734522774_unknown_scrape_1769734538775.webm +0 -0
- package/dist/index.html +133 -22
- package/headful.js +92 -82
- package/package.json +2 -2
- package/public/captures/run_1770084709375_263_scrape_1770084720880.png +0 -0
- package/public/captures/run_1770084753714_765_agent_1770084772039.png +0 -0
- package/public/captures/run_1770084753714_765_agent_1770084774318.webm +0 -0
- package/public/captures/run_1770084826401_32_scrape_1770084832653.png +0 -0
- package/public/captures/run_1770084826401_32_scrape_1770084835345.webm +0 -0
- package/public/captures/run_1770084861758_434_scrape_1770084869777.png +0 -0
- package/public/captures/run_1770084861758_434_scrape_1770084875604.webm +0 -0
- package/public/captures/run_1770084870793_97_scrape_1770084879360.png +0 -0
- package/public/captures/run_1770084870793_97_scrape_1770084882219.webm +0 -0
- package/scrape.js +235 -253
- package/server.js +442 -312
- package/dist/assets/index-BKB-zmAO.js +0 -19
package/scrape.js
CHANGED
|
@@ -1,9 +1,10 @@
|
|
|
1
1
|
const { chromium } = require('playwright');
|
|
2
|
-
const { JSDOM } = require('jsdom');
|
|
3
2
|
const fs = require('fs');
|
|
4
3
|
const path = require('path');
|
|
5
|
-
const {
|
|
6
|
-
const {
|
|
4
|
+
const { spawn } = require('child_process');
|
|
5
|
+
const { getProxySelection } = require('./proxy-rotation');
|
|
6
|
+
const { selectUserAgent } = require('./user-agent-settings');
|
|
7
|
+
const { formatHTML } = require('./html-utils');
|
|
7
8
|
|
|
8
9
|
const STORAGE_STATE_PATH = path.join(__dirname, 'storage_state.json');
|
|
9
10
|
const STORAGE_STATE_FILE = (() => {
|
|
@@ -67,23 +68,34 @@ const toCsvString = (raw) => {
|
|
|
67
68
|
return [headerLine, ...lines].join('\n');
|
|
68
69
|
};
|
|
69
70
|
|
|
70
|
-
|
|
71
|
+
const parseBooleanFlag = (value) => {
|
|
72
|
+
if (typeof value === 'boolean') return value;
|
|
73
|
+
if (value === undefined || value === null) return false;
|
|
74
|
+
const normalized = String(value).toLowerCase();
|
|
75
|
+
return normalized === 'true' || normalized === '1';
|
|
76
|
+
};
|
|
77
|
+
|
|
78
|
+
async function handleScrape(req, res) {
|
|
71
79
|
const url = req.body.url || req.query.url;
|
|
72
80
|
const customHeaders = req.body.headers || {};
|
|
73
81
|
const userSelector = req.body.selector || req.query.selector;
|
|
74
82
|
const waitInput = req.body.wait || req.query.wait;
|
|
75
83
|
const waitTime = waitInput ? parseFloat(waitInput) * 1000 : 2000;
|
|
76
|
-
const rotateUserAgents = req.body.rotateUserAgents || req.query.rotateUserAgents || false;
|
|
77
|
-
const rotateViewportRaw = req.body.rotateViewport ?? req.query.rotateViewport;
|
|
78
|
-
const rotateViewport = String(rotateViewportRaw).toLowerCase() === 'true' || rotateViewportRaw === true;
|
|
79
|
-
const runId = req.body.runId || req.query.runId || null;
|
|
80
|
-
const captureRunId = runId ? String(runId) : `run_${Date.now()}_unknown`;
|
|
84
|
+
const rotateUserAgents = req.body.rotateUserAgents || req.query.rotateUserAgents || false;
|
|
85
|
+
const rotateViewportRaw = req.body.rotateViewport ?? req.query.rotateViewport;
|
|
86
|
+
const rotateViewport = String(rotateViewportRaw).toLowerCase() === 'true' || rotateViewportRaw === true;
|
|
87
|
+
const runId = req.body.runId || req.query.runId || null;
|
|
88
|
+
const captureRunId = runId ? String(runId) : `run_${Date.now()}_unknown`;
|
|
81
89
|
const rotateProxiesRaw = req.body.rotateProxies ?? req.query.rotateProxies;
|
|
82
90
|
const rotateProxies = String(rotateProxiesRaw).toLowerCase() === 'true' || rotateProxiesRaw === true;
|
|
83
91
|
const includeShadowDomRaw = req.body.includeShadowDom ?? req.query.includeShadowDom;
|
|
84
92
|
const includeShadowDom = includeShadowDomRaw === undefined
|
|
85
93
|
? true
|
|
86
94
|
: !(String(includeShadowDomRaw).toLowerCase() === 'false' || includeShadowDomRaw === false);
|
|
95
|
+
const disableRecordingRaw = req.body.disableRecording ?? req.query.disableRecording;
|
|
96
|
+
const disableRecording = parseBooleanFlag(disableRecordingRaw);
|
|
97
|
+
const statelessExecutionRaw = req.body.statelessExecution ?? req.query.statelessExecution;
|
|
98
|
+
const statelessExecution = parseBooleanFlag(statelessExecutionRaw);
|
|
87
99
|
const extractionScript = req.body.extractionScript || req.query.extractionScript;
|
|
88
100
|
const extractionFormat = (req.body.extractionFormat || req.query.extractionFormat) === 'csv' ? 'csv' : 'json';
|
|
89
101
|
|
|
@@ -93,24 +105,24 @@ async function handleScrape(req, res) {
|
|
|
93
105
|
|
|
94
106
|
console.log(`Scraping: ${url}`);
|
|
95
107
|
|
|
96
|
-
const selectedUA = selectUserAgent(rotateUserAgents);
|
|
97
|
-
|
|
98
|
-
let browser;
|
|
99
|
-
let context;
|
|
100
|
-
let page;
|
|
101
|
-
try {
|
|
108
|
+
const selectedUA = await selectUserAgent(rotateUserAgents);
|
|
109
|
+
|
|
110
|
+
let browser;
|
|
111
|
+
let context;
|
|
112
|
+
let page;
|
|
113
|
+
try {
|
|
102
114
|
// Use 'chrome' channel to use a real installed browser instead of default Chromium
|
|
103
115
|
const launchOptions = {
|
|
104
116
|
headless: true,
|
|
105
117
|
channel: 'chrome',
|
|
106
|
-
args: [
|
|
107
|
-
'--no-sandbox',
|
|
108
|
-
'--disable-setuid-sandbox',
|
|
109
|
-
'--disable-dev-shm-usage',
|
|
110
|
-
'--disable-blink-features=AutomationControlled',
|
|
111
|
-
'--hide-scrollbars',
|
|
112
|
-
'--mute-audio'
|
|
113
|
-
]
|
|
118
|
+
args: [
|
|
119
|
+
'--no-sandbox',
|
|
120
|
+
'--disable-setuid-sandbox',
|
|
121
|
+
'--disable-dev-shm-usage',
|
|
122
|
+
'--disable-blink-features=AutomationControlled',
|
|
123
|
+
'--hide-scrollbars',
|
|
124
|
+
'--mute-audio'
|
|
125
|
+
]
|
|
114
126
|
};
|
|
115
127
|
const selection = getProxySelection(rotateProxies);
|
|
116
128
|
if (selection.proxy) {
|
|
@@ -119,98 +131,102 @@ async function handleScrape(req, res) {
|
|
|
119
131
|
console.log(`[PROXY] Mode: ${selection.mode}; Target: ${selection.proxy ? selection.proxy.server : 'host_ip'}`);
|
|
120
132
|
browser = await chromium.launch(launchOptions);
|
|
121
133
|
|
|
122
|
-
const recordingsDir = path.join(__dirname, 'data', 'recordings');
|
|
123
|
-
if (!fs.existsSync(recordingsDir)) {
|
|
124
|
-
fs.mkdirSync(recordingsDir, { recursive: true });
|
|
125
|
-
}
|
|
126
|
-
|
|
127
|
-
const viewport = rotateViewport
|
|
128
|
-
? { width: 1280 + Math.floor(Math.random() * 640), height: 720 + Math.floor(Math.random() * 360) }
|
|
129
|
-
: { width: 1366, height: 768 };
|
|
130
|
-
|
|
131
|
-
const contextOptions = {
|
|
132
|
-
userAgent: selectedUA,
|
|
133
|
-
extraHTTPHeaders: customHeaders,
|
|
134
|
-
viewport,
|
|
135
|
-
deviceScaleFactor: 1,
|
|
136
|
-
locale: 'en-US',
|
|
137
|
-
timezoneId: 'America/New_York',
|
|
138
|
-
colorScheme: 'dark',
|
|
139
|
-
permissions: ['geolocation']
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
if (
|
|
134
|
+
const recordingsDir = path.join(__dirname, 'data', 'recordings');
|
|
135
|
+
if (!fs.existsSync(recordingsDir)) {
|
|
136
|
+
fs.mkdirSync(recordingsDir, { recursive: true });
|
|
137
|
+
}
|
|
138
|
+
|
|
139
|
+
const viewport = rotateViewport
|
|
140
|
+
? { width: 1280 + Math.floor(Math.random() * 640), height: 720 + Math.floor(Math.random() * 360) }
|
|
141
|
+
: { width: 1366, height: 768 };
|
|
142
|
+
|
|
143
|
+
const contextOptions = {
|
|
144
|
+
userAgent: selectedUA,
|
|
145
|
+
extraHTTPHeaders: customHeaders,
|
|
146
|
+
viewport,
|
|
147
|
+
deviceScaleFactor: 1,
|
|
148
|
+
locale: 'en-US',
|
|
149
|
+
timezoneId: 'America/New_York',
|
|
150
|
+
colorScheme: 'dark',
|
|
151
|
+
permissions: ['geolocation']
|
|
152
|
+
};
|
|
153
|
+
|
|
154
|
+
const shouldUseStorageState = !statelessExecution && fs.existsSync(STORAGE_STATE_FILE);
|
|
155
|
+
if (shouldUseStorageState) {
|
|
144
156
|
contextOptions.storageState = STORAGE_STATE_FILE;
|
|
145
157
|
}
|
|
146
158
|
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
'
|
|
165
|
-
'
|
|
166
|
-
'
|
|
167
|
-
'
|
|
168
|
-
'
|
|
169
|
-
'
|
|
170
|
-
'
|
|
171
|
-
'
|
|
172
|
-
'
|
|
173
|
-
'
|
|
174
|
-
'
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
'
|
|
184
|
-
'
|
|
185
|
-
'
|
|
186
|
-
'
|
|
187
|
-
'
|
|
188
|
-
'
|
|
189
|
-
'
|
|
190
|
-
'
|
|
191
|
-
'
|
|
192
|
-
'
|
|
193
|
-
'
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
159
|
+
if (!disableRecording) {
|
|
160
|
+
contextOptions.recordVideo = { dir: recordingsDir, size: viewport };
|
|
161
|
+
}
|
|
162
|
+
|
|
163
|
+
context = await browser.newContext(contextOptions);
|
|
164
|
+
|
|
165
|
+
// Manual WebDriver Patch
|
|
166
|
+
await context.addInitScript(() => {
|
|
167
|
+
Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
|
|
168
|
+
});
|
|
169
|
+
await context.addInitScript(() => {
|
|
170
|
+
const cursorId = 'dg-cursor-overlay';
|
|
171
|
+
const dotId = 'dg-click-dot';
|
|
172
|
+
if (document.getElementById(cursorId)) return;
|
|
173
|
+
const cursor = document.createElement('div');
|
|
174
|
+
cursor.id = cursorId;
|
|
175
|
+
cursor.style.cssText = [
|
|
176
|
+
'position:fixed',
|
|
177
|
+
'top:0',
|
|
178
|
+
'left:0',
|
|
179
|
+
'width:18px',
|
|
180
|
+
'height:18px',
|
|
181
|
+
'margin-left:-9px',
|
|
182
|
+
'margin-top:-9px',
|
|
183
|
+
'border:2px solid rgba(56,189,248,0.7)',
|
|
184
|
+
'background:rgba(56,189,248,0.25)',
|
|
185
|
+
'border-radius:50%',
|
|
186
|
+
'box-shadow:0 0 10px rgba(56,189,248,0.6)',
|
|
187
|
+
'pointer-events:none',
|
|
188
|
+
'z-index:2147483647',
|
|
189
|
+
'transform:translate3d(0,0,0)',
|
|
190
|
+
'transition:transform 60ms ease-out'
|
|
191
|
+
].join(';');
|
|
192
|
+
const dot = document.createElement('div');
|
|
193
|
+
dot.id = dotId;
|
|
194
|
+
dot.style.cssText = [
|
|
195
|
+
'position:fixed',
|
|
196
|
+
'top:0',
|
|
197
|
+
'left:0',
|
|
198
|
+
'width:10px',
|
|
199
|
+
'height:10px',
|
|
200
|
+
'margin-left:-5px',
|
|
201
|
+
'margin-top:-5px',
|
|
202
|
+
'background:rgba(239,68,68,0.9)',
|
|
203
|
+
'border-radius:50%',
|
|
204
|
+
'box-shadow:0 0 12px rgba(239,68,68,0.8)',
|
|
205
|
+
'pointer-events:none',
|
|
206
|
+
'z-index:2147483647',
|
|
207
|
+
'opacity:0',
|
|
208
|
+
'transform:translate3d(0,0,0) scale(0.6)',
|
|
209
|
+
'transition:opacity 120ms ease, transform 120ms ease'
|
|
210
|
+
].join(';');
|
|
211
|
+
document.documentElement.appendChild(cursor);
|
|
212
|
+
document.documentElement.appendChild(dot);
|
|
213
|
+
const move = (x, y) => {
|
|
214
|
+
cursor.style.transform = `translate3d(${x}px, ${y}px, 0)`;
|
|
215
|
+
};
|
|
216
|
+
window.addEventListener('mousemove', (e) => move(e.clientX, e.clientY), { passive: true });
|
|
217
|
+
window.addEventListener('click', (e) => {
|
|
218
|
+
dot.style.left = `${e.clientX}px`;
|
|
219
|
+
dot.style.top = `${e.clientY}px`;
|
|
220
|
+
dot.style.opacity = '1';
|
|
221
|
+
dot.style.transform = 'translate3d(0,0,0) scale(1)';
|
|
222
|
+
cursor.style.transform = `translate3d(${e.clientX}px, ${e.clientY}px, 0) scale(0.65)`;
|
|
223
|
+
setTimeout(() => {
|
|
224
|
+
dot.style.opacity = '0';
|
|
225
|
+
dot.style.transform = 'translate3d(0,0,0) scale(0.6)';
|
|
226
|
+
cursor.style.transform = `translate3d(${e.clientX}px, ${e.clientY}px, 0) scale(1)`;
|
|
227
|
+
}, 180);
|
|
228
|
+
}, true);
|
|
229
|
+
});
|
|
214
230
|
if (includeShadowDom) {
|
|
215
231
|
await context.addInitScript(() => {
|
|
216
232
|
if (!Element.prototype.attachShadow) return;
|
|
@@ -222,7 +238,7 @@ async function handleScrape(req, res) {
|
|
|
222
238
|
});
|
|
223
239
|
}
|
|
224
240
|
|
|
225
|
-
page = await context.newPage();
|
|
241
|
+
page = await context.newPage();
|
|
226
242
|
|
|
227
243
|
await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 60000 });
|
|
228
244
|
|
|
@@ -330,112 +346,76 @@ async function handleScrape(req, res) {
|
|
|
330
346
|
|
|
331
347
|
const runExtractionScript = async (script, html, pageUrl) => {
|
|
332
348
|
if (!script || typeof script !== 'string') return { result: undefined, logs: [] };
|
|
333
|
-
try {
|
|
334
|
-
const dom = new JSDOM(html || '');
|
|
335
|
-
const { window } = dom;
|
|
336
|
-
const logBuffer = [];
|
|
337
|
-
const consoleProxy = {
|
|
338
|
-
log: (...args) => logBuffer.push(args.join(' ')),
|
|
339
|
-
warn: (...args) => logBuffer.push(args.join(' ')),
|
|
340
|
-
error: (...args) => logBuffer.push(args.join(' '))
|
|
341
|
-
};
|
|
342
|
-
const shadowHelpers = (() => {
|
|
343
|
-
const shadowQueryAll = (selector, root = window.document) => {
|
|
344
|
-
const results = [];
|
|
345
|
-
const walk = (node) => {
|
|
346
|
-
if (!node) return;
|
|
347
|
-
if (node.nodeType === 1) {
|
|
348
|
-
const el = node;
|
|
349
|
-
if (selector && el.matches && el.matches(selector)) results.push(el);
|
|
350
|
-
if (el.tagName === 'TEMPLATE' && el.hasAttribute('data-shadowroot')) {
|
|
351
|
-
walk(el.content);
|
|
352
|
-
}
|
|
353
|
-
} else if (node.nodeType === 11) {
|
|
354
|
-
// DocumentFragment
|
|
355
|
-
}
|
|
356
|
-
if (node.childNodes) {
|
|
357
|
-
node.childNodes.forEach((child) => walk(child));
|
|
358
|
-
}
|
|
359
|
-
};
|
|
360
|
-
walk(root);
|
|
361
|
-
return results;
|
|
362
|
-
};
|
|
363
349
|
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
const text = node.nodeValue ? node.nodeValue.trim() : '';
|
|
370
|
-
if (text) texts.push(text);
|
|
371
|
-
return;
|
|
372
|
-
}
|
|
373
|
-
if (node.nodeType === 1) {
|
|
374
|
-
const el = node;
|
|
375
|
-
if (el.tagName === 'TEMPLATE' && el.hasAttribute('data-shadowroot')) {
|
|
376
|
-
walk(el.content);
|
|
377
|
-
}
|
|
378
|
-
}
|
|
379
|
-
if (node.childNodes) {
|
|
380
|
-
node.childNodes.forEach((child) => walk(child));
|
|
381
|
-
}
|
|
382
|
-
};
|
|
383
|
-
walk(root);
|
|
384
|
-
return texts;
|
|
385
|
-
};
|
|
350
|
+
return new Promise((resolve) => {
|
|
351
|
+
const worker = spawn('node', [path.join(__dirname, 'extraction-worker.js')], {
|
|
352
|
+
stdio: ['pipe', 'pipe', 'pipe'],
|
|
353
|
+
env: { ...process.env, NODE_ENV: 'production' } // Minimal env
|
|
354
|
+
});
|
|
386
355
|
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
const
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
'
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
356
|
+
let stdout = '';
|
|
357
|
+
let stderr = '';
|
|
358
|
+
|
|
359
|
+
const workerTimeout = 5000;
|
|
360
|
+
const timer = setTimeout(() => {
|
|
361
|
+
worker.kill();
|
|
362
|
+
resolve({ result: 'Worker timed out', logs: [] });
|
|
363
|
+
}, workerTimeout);
|
|
364
|
+
|
|
365
|
+
worker.stdout.on('data', (data) => {
|
|
366
|
+
stdout += data.toString();
|
|
367
|
+
});
|
|
368
|
+
|
|
369
|
+
worker.stderr.on('data', (data) => {
|
|
370
|
+
stderr += data.toString();
|
|
371
|
+
});
|
|
372
|
+
|
|
373
|
+
worker.on('close', (code) => {
|
|
374
|
+
clearTimeout(timer);
|
|
375
|
+
if (code !== 0) {
|
|
376
|
+
resolve({ result: `Worker exited with code ${code}: ${stderr}`, logs: [] });
|
|
377
|
+
return;
|
|
378
|
+
}
|
|
379
|
+
try {
|
|
380
|
+
const output = JSON.parse(stdout);
|
|
381
|
+
resolve(output);
|
|
382
|
+
} catch (e) {
|
|
383
|
+
resolve({ result: `Worker output parse error: ${e.message}. Stdout: ${stdout}`, logs: [] });
|
|
384
|
+
}
|
|
385
|
+
});
|
|
386
|
+
|
|
387
|
+
worker.on('error', (err) => {
|
|
388
|
+
clearTimeout(timer);
|
|
389
|
+
resolve({ result: `Worker spawn error: ${err.message}`, logs: [] });
|
|
390
|
+
});
|
|
391
|
+
|
|
392
|
+
const input = JSON.stringify({
|
|
393
|
+
script,
|
|
394
|
+
html,
|
|
395
|
+
url: pageUrl,
|
|
396
|
+
includeShadowDom
|
|
397
|
+
});
|
|
398
|
+
|
|
399
|
+
worker.stdin.write(input);
|
|
400
|
+
worker.stdin.end();
|
|
401
|
+
});
|
|
411
402
|
};
|
|
412
403
|
|
|
413
404
|
const extraction = await runExtractionScript(extractionScript, productHtml, page.url());
|
|
414
405
|
|
|
415
406
|
// Ensure the public/screenshots directory exists
|
|
416
|
-
const capturesDir = path.join(__dirname, 'public', 'captures');
|
|
417
|
-
if (!fs.existsSync(capturesDir)) {
|
|
418
|
-
fs.mkdirSync(capturesDir, { recursive: true });
|
|
419
|
-
}
|
|
420
|
-
|
|
421
|
-
const screenshotName = `${captureRunId}_scrape_${Date.now()}.png`;
|
|
422
|
-
const screenshotPath = path.join(capturesDir, screenshotName);
|
|
423
|
-
try {
|
|
424
|
-
await page.screenshot({ path: screenshotPath, fullPage: false });
|
|
425
|
-
} catch (e) {
|
|
426
|
-
console.error('Screenshot failed:', e.message);
|
|
427
|
-
}
|
|
428
|
-
|
|
429
|
-
// Simple HTML Formatter
|
|
430
|
-
const formatHTML = (html) => {
|
|
431
|
-
let indent = 0;
|
|
432
|
-
return html.replace(/<(\/?)([a-z0-9]+)([^>]*?)(\/?)>/gi, (match, slash, tag, attrs, selfClose) => {
|
|
433
|
-
if (slash) indent--;
|
|
434
|
-
const result = ' '.repeat(Math.max(0, indent)) + match;
|
|
435
|
-
if (!slash && !selfClose && !['img', 'br', 'hr', 'input', 'link', 'meta'].includes(tag.toLowerCase())) indent++;
|
|
436
|
-
return '\n' + result;
|
|
437
|
-
}).trim();
|
|
438
|
-
};
|
|
407
|
+
const capturesDir = path.join(__dirname, 'public', 'captures');
|
|
408
|
+
if (!fs.existsSync(capturesDir)) {
|
|
409
|
+
fs.mkdirSync(capturesDir, { recursive: true });
|
|
410
|
+
}
|
|
411
|
+
|
|
412
|
+
const screenshotName = `${captureRunId}_scrape_${Date.now()}.png`;
|
|
413
|
+
const screenshotPath = path.join(capturesDir, screenshotName);
|
|
414
|
+
try {
|
|
415
|
+
await page.screenshot({ path: screenshotPath, fullPage: false });
|
|
416
|
+
} catch (e) {
|
|
417
|
+
console.error('Screenshot failed:', e.message);
|
|
418
|
+
}
|
|
439
419
|
|
|
440
420
|
const rawExtraction = extraction.result !== undefined ? extraction.result : (extraction.logs.length ? extraction.logs.join('\n') : undefined);
|
|
441
421
|
const formattedExtraction = extractionFormat === 'csv' ? toCsvString(rawExtraction) : rawExtraction;
|
|
@@ -450,46 +430,48 @@ async function handleScrape(req, res) {
|
|
|
450
430
|
links: await page.$$eval('a[href]', elements => {
|
|
451
431
|
return elements.map(el => el.href).filter(href => href && href.startsWith('http'));
|
|
452
432
|
}),
|
|
453
|
-
screenshot_url: `/captures/${screenshotName}`
|
|
454
|
-
};
|
|
455
|
-
|
|
456
|
-
// Save session state
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
}
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
}
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
}
|
|
433
|
+
screenshot_url: `/captures/${screenshotName}`
|
|
434
|
+
};
|
|
435
|
+
|
|
436
|
+
// Save session state
|
|
437
|
+
if (!statelessExecution) {
|
|
438
|
+
await context.storageState({ path: STORAGE_STATE_FILE });
|
|
439
|
+
}
|
|
440
|
+
|
|
441
|
+
const video = page.video();
|
|
442
|
+
await context.close();
|
|
443
|
+
if (video) {
|
|
444
|
+
try {
|
|
445
|
+
const videoPath = await video.path();
|
|
446
|
+
if (videoPath && fs.existsSync(videoPath)) {
|
|
447
|
+
const recordingName = `${captureRunId}_scrape_${Date.now()}.webm`;
|
|
448
|
+
const recordingPath = path.join(capturesDir, recordingName);
|
|
449
|
+
try {
|
|
450
|
+
await fs.promises.rename(videoPath, recordingPath);
|
|
451
|
+
} catch (err) {
|
|
452
|
+
if (err && err.code === 'EXDEV') {
|
|
453
|
+
await fs.promises.copyFile(videoPath, recordingPath);
|
|
454
|
+
await fs.promises.unlink(videoPath);
|
|
455
|
+
} else {
|
|
456
|
+
throw err;
|
|
457
|
+
}
|
|
458
|
+
}
|
|
459
|
+
}
|
|
460
|
+
} catch (e) {
|
|
461
|
+
console.error('Recording save failed:', e.message);
|
|
462
|
+
}
|
|
463
|
+
}
|
|
464
|
+
|
|
465
|
+
await browser.close();
|
|
466
|
+
res.json(data);
|
|
467
|
+
} catch (error) {
|
|
468
|
+
console.error('Scrape Error:', error);
|
|
469
|
+
try {
|
|
470
|
+
if (context) await context.close();
|
|
471
|
+
} catch {}
|
|
472
|
+
if (browser) await browser.close();
|
|
473
|
+
res.status(500).json({ error: 'Failed to scrape', details: error.message });
|
|
474
|
+
}
|
|
475
|
+
}
|
|
494
476
|
|
|
495
477
|
module.exports = { handleScrape };
|