@doppelgangerdev/doppelganger 0.5.7 → 0.5.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +2 -2
- package/README.md +9 -29
- package/agent.js +200 -101
- package/headful.js +126 -126
- package/package.json +2 -2
- package/scrape.js +249 -284
- package/server.js +469 -359
package/scrape.js
CHANGED
|
@@ -1,9 +1,10 @@
|
|
|
1
1
|
const { chromium } = require('playwright');
|
|
2
|
-
const { JSDOM } = require('jsdom');
|
|
3
2
|
const fs = require('fs');
|
|
4
3
|
const path = require('path');
|
|
5
|
-
const {
|
|
6
|
-
const {
|
|
4
|
+
const { spawn } = require('child_process');
|
|
5
|
+
const { getProxySelection } = require('./proxy-rotation');
|
|
6
|
+
const { selectUserAgent } = require('./user-agent-settings');
|
|
7
|
+
const { formatHTML } = require('./html-utils');
|
|
7
8
|
|
|
8
9
|
const STORAGE_STATE_PATH = path.join(__dirname, 'storage_state.json');
|
|
9
10
|
const STORAGE_STATE_FILE = (() => {
|
|
@@ -26,8 +27,8 @@ const csvEscape = (value) => {
|
|
|
26
27
|
return text;
|
|
27
28
|
};
|
|
28
29
|
|
|
29
|
-
const toCsvString = (raw) => {
|
|
30
|
-
if (raw === undefined || raw === null) return '';
|
|
30
|
+
const toCsvString = (raw) => {
|
|
31
|
+
if (raw === undefined || raw === null) return '';
|
|
31
32
|
if (typeof raw === 'string') {
|
|
32
33
|
const trimmed = raw.trim();
|
|
33
34
|
if (trimmed.startsWith('{') || trimmed.startsWith('[')) {
|
|
@@ -64,39 +65,39 @@ const toCsvString = (raw) => {
|
|
|
64
65
|
const obj = row && typeof row === 'object' ? row : {};
|
|
65
66
|
return allKeys.map((key) => csvEscape(obj[key])).join(',');
|
|
66
67
|
});
|
|
67
|
-
return [headerLine, ...lines].join('\n');
|
|
68
|
-
};
|
|
69
|
-
|
|
70
|
-
const parseBooleanFlag = (value) => {
|
|
71
|
-
if (typeof value === 'boolean') return value;
|
|
72
|
-
if (value === undefined || value === null) return false;
|
|
73
|
-
const normalized = String(value).toLowerCase();
|
|
74
|
-
return normalized === 'true' || normalized === '1';
|
|
75
|
-
};
|
|
76
|
-
|
|
77
|
-
async function handleScrape(req, res) {
|
|
68
|
+
return [headerLine, ...lines].join('\n');
|
|
69
|
+
};
|
|
70
|
+
|
|
71
|
+
const parseBooleanFlag = (value) => {
|
|
72
|
+
if (typeof value === 'boolean') return value;
|
|
73
|
+
if (value === undefined || value === null) return false;
|
|
74
|
+
const normalized = String(value).toLowerCase();
|
|
75
|
+
return normalized === 'true' || normalized === '1';
|
|
76
|
+
};
|
|
77
|
+
|
|
78
|
+
async function handleScrape(req, res) {
|
|
78
79
|
const url = req.body.url || req.query.url;
|
|
79
80
|
const customHeaders = req.body.headers || {};
|
|
80
81
|
const userSelector = req.body.selector || req.query.selector;
|
|
81
82
|
const waitInput = req.body.wait || req.query.wait;
|
|
82
83
|
const waitTime = waitInput ? parseFloat(waitInput) * 1000 : 2000;
|
|
83
|
-
const rotateUserAgents = req.body.rotateUserAgents || req.query.rotateUserAgents || false;
|
|
84
|
-
const rotateViewportRaw = req.body.rotateViewport ?? req.query.rotateViewport;
|
|
85
|
-
const rotateViewport = String(rotateViewportRaw).toLowerCase() === 'true' || rotateViewportRaw === true;
|
|
86
|
-
const runId = req.body.runId || req.query.runId || null;
|
|
87
|
-
const captureRunId = runId ? String(runId) : `run_${Date.now()}_unknown`;
|
|
84
|
+
const rotateUserAgents = req.body.rotateUserAgents || req.query.rotateUserAgents || false;
|
|
85
|
+
const rotateViewportRaw = req.body.rotateViewport ?? req.query.rotateViewport;
|
|
86
|
+
const rotateViewport = String(rotateViewportRaw).toLowerCase() === 'true' || rotateViewportRaw === true;
|
|
87
|
+
const runId = req.body.runId || req.query.runId || null;
|
|
88
|
+
const captureRunId = runId ? String(runId) : `run_${Date.now()}_unknown`;
|
|
88
89
|
const rotateProxiesRaw = req.body.rotateProxies ?? req.query.rotateProxies;
|
|
89
90
|
const rotateProxies = String(rotateProxiesRaw).toLowerCase() === 'true' || rotateProxiesRaw === true;
|
|
90
|
-
const includeShadowDomRaw = req.body.includeShadowDom ?? req.query.includeShadowDom;
|
|
91
|
-
const includeShadowDom = includeShadowDomRaw === undefined
|
|
92
|
-
? true
|
|
93
|
-
: !(String(includeShadowDomRaw).toLowerCase() === 'false' || includeShadowDomRaw === false);
|
|
94
|
-
const disableRecordingRaw = req.body.disableRecording ?? req.query.disableRecording;
|
|
95
|
-
const disableRecording = parseBooleanFlag(disableRecordingRaw);
|
|
96
|
-
const statelessExecutionRaw = req.body.statelessExecution ?? req.query.statelessExecution;
|
|
97
|
-
const statelessExecution = parseBooleanFlag(statelessExecutionRaw);
|
|
98
|
-
const extractionScript = req.body.extractionScript || req.query.extractionScript;
|
|
99
|
-
const extractionFormat = (req.body.extractionFormat || req.query.extractionFormat) === 'csv' ? 'csv' : 'json';
|
|
91
|
+
const includeShadowDomRaw = req.body.includeShadowDom ?? req.query.includeShadowDom;
|
|
92
|
+
const includeShadowDom = includeShadowDomRaw === undefined
|
|
93
|
+
? true
|
|
94
|
+
: !(String(includeShadowDomRaw).toLowerCase() === 'false' || includeShadowDomRaw === false);
|
|
95
|
+
const disableRecordingRaw = req.body.disableRecording ?? req.query.disableRecording;
|
|
96
|
+
const disableRecording = parseBooleanFlag(disableRecordingRaw);
|
|
97
|
+
const statelessExecutionRaw = req.body.statelessExecution ?? req.query.statelessExecution;
|
|
98
|
+
const statelessExecution = parseBooleanFlag(statelessExecutionRaw);
|
|
99
|
+
const extractionScript = req.body.extractionScript || req.query.extractionScript;
|
|
100
|
+
const extractionFormat = (req.body.extractionFormat || req.query.extractionFormat) === 'csv' ? 'csv' : 'json';
|
|
100
101
|
|
|
101
102
|
if (!url) {
|
|
102
103
|
return res.status(400).json({ error: 'URL is required.' });
|
|
@@ -104,24 +105,24 @@ async function handleScrape(req, res) {
|
|
|
104
105
|
|
|
105
106
|
console.log(`Scraping: ${url}`);
|
|
106
107
|
|
|
107
|
-
const selectedUA = selectUserAgent(rotateUserAgents);
|
|
108
|
-
|
|
109
|
-
let browser;
|
|
110
|
-
let context;
|
|
111
|
-
let page;
|
|
112
|
-
try {
|
|
108
|
+
const selectedUA = await selectUserAgent(rotateUserAgents);
|
|
109
|
+
|
|
110
|
+
let browser;
|
|
111
|
+
let context;
|
|
112
|
+
let page;
|
|
113
|
+
try {
|
|
113
114
|
// Use 'chrome' channel to use a real installed browser instead of default Chromium
|
|
114
115
|
const launchOptions = {
|
|
115
116
|
headless: true,
|
|
116
117
|
channel: 'chrome',
|
|
117
|
-
args: [
|
|
118
|
-
'--no-sandbox',
|
|
119
|
-
'--disable-setuid-sandbox',
|
|
120
|
-
'--disable-dev-shm-usage',
|
|
121
|
-
'--disable-blink-features=AutomationControlled',
|
|
122
|
-
'--hide-scrollbars',
|
|
123
|
-
'--mute-audio'
|
|
124
|
-
]
|
|
118
|
+
args: [
|
|
119
|
+
'--no-sandbox',
|
|
120
|
+
'--disable-setuid-sandbox',
|
|
121
|
+
'--disable-dev-shm-usage',
|
|
122
|
+
'--disable-blink-features=AutomationControlled',
|
|
123
|
+
'--hide-scrollbars',
|
|
124
|
+
'--mute-audio'
|
|
125
|
+
]
|
|
125
126
|
};
|
|
126
127
|
const selection = getProxySelection(rotateProxies);
|
|
127
128
|
if (selection.proxy) {
|
|
@@ -130,102 +131,102 @@ async function handleScrape(req, res) {
|
|
|
130
131
|
console.log(`[PROXY] Mode: ${selection.mode}; Target: ${selection.proxy ? selection.proxy.server : 'host_ip'}`);
|
|
131
132
|
browser = await chromium.launch(launchOptions);
|
|
132
133
|
|
|
133
|
-
const recordingsDir = path.join(__dirname, 'data', 'recordings');
|
|
134
|
-
if (!fs.existsSync(recordingsDir)) {
|
|
135
|
-
fs.mkdirSync(recordingsDir, { recursive: true });
|
|
136
|
-
}
|
|
137
|
-
|
|
138
|
-
const viewport = rotateViewport
|
|
139
|
-
? { width: 1280 + Math.floor(Math.random() * 640), height: 720 + Math.floor(Math.random() * 360) }
|
|
140
|
-
: { width: 1366, height: 768 };
|
|
141
|
-
|
|
142
|
-
const contextOptions = {
|
|
143
|
-
userAgent: selectedUA,
|
|
144
|
-
extraHTTPHeaders: customHeaders,
|
|
145
|
-
viewport,
|
|
146
|
-
deviceScaleFactor: 1,
|
|
147
|
-
locale: 'en-US',
|
|
148
|
-
timezoneId: 'America/New_York',
|
|
149
|
-
colorScheme: 'dark',
|
|
150
|
-
permissions: ['geolocation']
|
|
151
|
-
};
|
|
152
|
-
|
|
153
|
-
const shouldUseStorageState = !statelessExecution && fs.existsSync(STORAGE_STATE_FILE);
|
|
154
|
-
if (shouldUseStorageState) {
|
|
155
|
-
contextOptions.storageState = STORAGE_STATE_FILE;
|
|
156
|
-
}
|
|
157
|
-
|
|
158
|
-
if (!disableRecording) {
|
|
159
|
-
contextOptions.recordVideo = { dir: recordingsDir, size: viewport };
|
|
160
|
-
}
|
|
161
|
-
|
|
162
|
-
context = await browser.newContext(contextOptions);
|
|
163
|
-
|
|
164
|
-
// Manual WebDriver Patch
|
|
165
|
-
await context.addInitScript(() => {
|
|
166
|
-
Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
|
|
167
|
-
});
|
|
168
|
-
await context.addInitScript(() => {
|
|
169
|
-
const cursorId = 'dg-cursor-overlay';
|
|
170
|
-
const dotId = 'dg-click-dot';
|
|
171
|
-
if (document.getElementById(cursorId)) return;
|
|
172
|
-
const cursor = document.createElement('div');
|
|
173
|
-
cursor.id = cursorId;
|
|
174
|
-
cursor.style.cssText = [
|
|
175
|
-
'position:fixed',
|
|
176
|
-
'top:0',
|
|
177
|
-
'left:0',
|
|
178
|
-
'width:18px',
|
|
179
|
-
'height:18px',
|
|
180
|
-
'margin-left:-9px',
|
|
181
|
-
'margin-top:-9px',
|
|
182
|
-
'border:2px solid rgba(56,189,248,0.7)',
|
|
183
|
-
'background:rgba(56,189,248,0.25)',
|
|
184
|
-
'border-radius:50%',
|
|
185
|
-
'box-shadow:0 0 10px rgba(56,189,248,0.6)',
|
|
186
|
-
'pointer-events:none',
|
|
187
|
-
'z-index:2147483647',
|
|
188
|
-
'transform:translate3d(0,0,0)',
|
|
189
|
-
'transition:transform 60ms ease-out'
|
|
190
|
-
].join(';');
|
|
191
|
-
const dot = document.createElement('div');
|
|
192
|
-
dot.id = dotId;
|
|
193
|
-
dot.style.cssText = [
|
|
194
|
-
'position:fixed',
|
|
195
|
-
'top:0',
|
|
196
|
-
'left:0',
|
|
197
|
-
'width:10px',
|
|
198
|
-
'height:10px',
|
|
199
|
-
'margin-left:-5px',
|
|
200
|
-
'margin-top:-5px',
|
|
201
|
-
'background:rgba(239,68,68,0.9)',
|
|
202
|
-
'border-radius:50%',
|
|
203
|
-
'box-shadow:0 0 12px rgba(239,68,68,0.8)',
|
|
204
|
-
'pointer-events:none',
|
|
205
|
-
'z-index:2147483647',
|
|
206
|
-
'opacity:0',
|
|
207
|
-
'transform:translate3d(0,0,0) scale(0.6)',
|
|
208
|
-
'transition:opacity 120ms ease, transform 120ms ease'
|
|
209
|
-
].join(';');
|
|
210
|
-
document.documentElement.appendChild(cursor);
|
|
211
|
-
document.documentElement.appendChild(dot);
|
|
212
|
-
const move = (x, y) => {
|
|
213
|
-
cursor.style.transform = `translate3d(${x}px, ${y}px, 0)`;
|
|
214
|
-
};
|
|
215
|
-
window.addEventListener('mousemove', (e) => move(e.clientX, e.clientY), { passive: true });
|
|
216
|
-
window.addEventListener('click', (e) => {
|
|
217
|
-
dot.style.left = `${e.clientX}px`;
|
|
218
|
-
dot.style.top = `${e.clientY}px`;
|
|
219
|
-
dot.style.opacity = '1';
|
|
220
|
-
dot.style.transform = 'translate3d(0,0,0) scale(1)';
|
|
221
|
-
cursor.style.transform = `translate3d(${e.clientX}px, ${e.clientY}px, 0) scale(0.65)`;
|
|
222
|
-
setTimeout(() => {
|
|
223
|
-
dot.style.opacity = '0';
|
|
224
|
-
dot.style.transform = 'translate3d(0,0,0) scale(0.6)';
|
|
225
|
-
cursor.style.transform = `translate3d(${e.clientX}px, ${e.clientY}px, 0) scale(1)`;
|
|
226
|
-
}, 180);
|
|
227
|
-
}, true);
|
|
228
|
-
});
|
|
134
|
+
const recordingsDir = path.join(__dirname, 'data', 'recordings');
|
|
135
|
+
if (!fs.existsSync(recordingsDir)) {
|
|
136
|
+
fs.mkdirSync(recordingsDir, { recursive: true });
|
|
137
|
+
}
|
|
138
|
+
|
|
139
|
+
const viewport = rotateViewport
|
|
140
|
+
? { width: 1280 + Math.floor(Math.random() * 640), height: 720 + Math.floor(Math.random() * 360) }
|
|
141
|
+
: { width: 1366, height: 768 };
|
|
142
|
+
|
|
143
|
+
const contextOptions = {
|
|
144
|
+
userAgent: selectedUA,
|
|
145
|
+
extraHTTPHeaders: customHeaders,
|
|
146
|
+
viewport,
|
|
147
|
+
deviceScaleFactor: 1,
|
|
148
|
+
locale: 'en-US',
|
|
149
|
+
timezoneId: 'America/New_York',
|
|
150
|
+
colorScheme: 'dark',
|
|
151
|
+
permissions: ['geolocation']
|
|
152
|
+
};
|
|
153
|
+
|
|
154
|
+
const shouldUseStorageState = !statelessExecution && fs.existsSync(STORAGE_STATE_FILE);
|
|
155
|
+
if (shouldUseStorageState) {
|
|
156
|
+
contextOptions.storageState = STORAGE_STATE_FILE;
|
|
157
|
+
}
|
|
158
|
+
|
|
159
|
+
if (!disableRecording) {
|
|
160
|
+
contextOptions.recordVideo = { dir: recordingsDir, size: viewport };
|
|
161
|
+
}
|
|
162
|
+
|
|
163
|
+
context = await browser.newContext(contextOptions);
|
|
164
|
+
|
|
165
|
+
// Manual WebDriver Patch
|
|
166
|
+
await context.addInitScript(() => {
|
|
167
|
+
Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
|
|
168
|
+
});
|
|
169
|
+
await context.addInitScript(() => {
|
|
170
|
+
const cursorId = 'dg-cursor-overlay';
|
|
171
|
+
const dotId = 'dg-click-dot';
|
|
172
|
+
if (document.getElementById(cursorId)) return;
|
|
173
|
+
const cursor = document.createElement('div');
|
|
174
|
+
cursor.id = cursorId;
|
|
175
|
+
cursor.style.cssText = [
|
|
176
|
+
'position:fixed',
|
|
177
|
+
'top:0',
|
|
178
|
+
'left:0',
|
|
179
|
+
'width:18px',
|
|
180
|
+
'height:18px',
|
|
181
|
+
'margin-left:-9px',
|
|
182
|
+
'margin-top:-9px',
|
|
183
|
+
'border:2px solid rgba(56,189,248,0.7)',
|
|
184
|
+
'background:rgba(56,189,248,0.25)',
|
|
185
|
+
'border-radius:50%',
|
|
186
|
+
'box-shadow:0 0 10px rgba(56,189,248,0.6)',
|
|
187
|
+
'pointer-events:none',
|
|
188
|
+
'z-index:2147483647',
|
|
189
|
+
'transform:translate3d(0,0,0)',
|
|
190
|
+
'transition:transform 60ms ease-out'
|
|
191
|
+
].join(';');
|
|
192
|
+
const dot = document.createElement('div');
|
|
193
|
+
dot.id = dotId;
|
|
194
|
+
dot.style.cssText = [
|
|
195
|
+
'position:fixed',
|
|
196
|
+
'top:0',
|
|
197
|
+
'left:0',
|
|
198
|
+
'width:10px',
|
|
199
|
+
'height:10px',
|
|
200
|
+
'margin-left:-5px',
|
|
201
|
+
'margin-top:-5px',
|
|
202
|
+
'background:rgba(239,68,68,0.9)',
|
|
203
|
+
'border-radius:50%',
|
|
204
|
+
'box-shadow:0 0 12px rgba(239,68,68,0.8)',
|
|
205
|
+
'pointer-events:none',
|
|
206
|
+
'z-index:2147483647',
|
|
207
|
+
'opacity:0',
|
|
208
|
+
'transform:translate3d(0,0,0) scale(0.6)',
|
|
209
|
+
'transition:opacity 120ms ease, transform 120ms ease'
|
|
210
|
+
].join(';');
|
|
211
|
+
document.documentElement.appendChild(cursor);
|
|
212
|
+
document.documentElement.appendChild(dot);
|
|
213
|
+
const move = (x, y) => {
|
|
214
|
+
cursor.style.transform = `translate3d(${x}px, ${y}px, 0)`;
|
|
215
|
+
};
|
|
216
|
+
window.addEventListener('mousemove', (e) => move(e.clientX, e.clientY), { passive: true });
|
|
217
|
+
window.addEventListener('click', (e) => {
|
|
218
|
+
dot.style.left = `${e.clientX}px`;
|
|
219
|
+
dot.style.top = `${e.clientY}px`;
|
|
220
|
+
dot.style.opacity = '1';
|
|
221
|
+
dot.style.transform = 'translate3d(0,0,0) scale(1)';
|
|
222
|
+
cursor.style.transform = `translate3d(${e.clientX}px, ${e.clientY}px, 0) scale(0.65)`;
|
|
223
|
+
setTimeout(() => {
|
|
224
|
+
dot.style.opacity = '0';
|
|
225
|
+
dot.style.transform = 'translate3d(0,0,0) scale(0.6)';
|
|
226
|
+
cursor.style.transform = `translate3d(${e.clientX}px, ${e.clientY}px, 0) scale(1)`;
|
|
227
|
+
}, 180);
|
|
228
|
+
}, true);
|
|
229
|
+
});
|
|
229
230
|
if (includeShadowDom) {
|
|
230
231
|
await context.addInitScript(() => {
|
|
231
232
|
if (!Element.prototype.attachShadow) return;
|
|
@@ -237,7 +238,7 @@ async function handleScrape(req, res) {
|
|
|
237
238
|
});
|
|
238
239
|
}
|
|
239
240
|
|
|
240
|
-
page = await context.newPage();
|
|
241
|
+
page = await context.newPage();
|
|
241
242
|
|
|
242
243
|
await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 60000 });
|
|
243
244
|
|
|
@@ -345,112 +346,76 @@ async function handleScrape(req, res) {
|
|
|
345
346
|
|
|
346
347
|
const runExtractionScript = async (script, html, pageUrl) => {
|
|
347
348
|
if (!script || typeof script !== 'string') return { result: undefined, logs: [] };
|
|
348
|
-
try {
|
|
349
|
-
const dom = new JSDOM(html || '');
|
|
350
|
-
const { window } = dom;
|
|
351
|
-
const logBuffer = [];
|
|
352
|
-
const consoleProxy = {
|
|
353
|
-
log: (...args) => logBuffer.push(args.join(' ')),
|
|
354
|
-
warn: (...args) => logBuffer.push(args.join(' ')),
|
|
355
|
-
error: (...args) => logBuffer.push(args.join(' '))
|
|
356
|
-
};
|
|
357
|
-
const shadowHelpers = (() => {
|
|
358
|
-
const shadowQueryAll = (selector, root = window.document) => {
|
|
359
|
-
const results = [];
|
|
360
|
-
const walk = (node) => {
|
|
361
|
-
if (!node) return;
|
|
362
|
-
if (node.nodeType === 1) {
|
|
363
|
-
const el = node;
|
|
364
|
-
if (selector && el.matches && el.matches(selector)) results.push(el);
|
|
365
|
-
if (el.tagName === 'TEMPLATE' && el.hasAttribute('data-shadowroot')) {
|
|
366
|
-
walk(el.content);
|
|
367
|
-
}
|
|
368
|
-
} else if (node.nodeType === 11) {
|
|
369
|
-
// DocumentFragment
|
|
370
|
-
}
|
|
371
|
-
if (node.childNodes) {
|
|
372
|
-
node.childNodes.forEach((child) => walk(child));
|
|
373
|
-
}
|
|
374
|
-
};
|
|
375
|
-
walk(root);
|
|
376
|
-
return results;
|
|
377
|
-
};
|
|
378
349
|
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
const text = node.nodeValue ? node.nodeValue.trim() : '';
|
|
385
|
-
if (text) texts.push(text);
|
|
386
|
-
return;
|
|
387
|
-
}
|
|
388
|
-
if (node.nodeType === 1) {
|
|
389
|
-
const el = node;
|
|
390
|
-
if (el.tagName === 'TEMPLATE' && el.hasAttribute('data-shadowroot')) {
|
|
391
|
-
walk(el.content);
|
|
392
|
-
}
|
|
393
|
-
}
|
|
394
|
-
if (node.childNodes) {
|
|
395
|
-
node.childNodes.forEach((child) => walk(child));
|
|
396
|
-
}
|
|
397
|
-
};
|
|
398
|
-
walk(root);
|
|
399
|
-
return texts;
|
|
400
|
-
};
|
|
350
|
+
return new Promise((resolve) => {
|
|
351
|
+
const worker = spawn('node', [path.join(__dirname, 'extraction-worker.js')], {
|
|
352
|
+
stdio: ['pipe', 'pipe', 'pipe'],
|
|
353
|
+
env: { ...process.env, NODE_ENV: 'production' } // Inherits the full parent environment; only NODE_ENV is overridden
|
|
354
|
+
});
|
|
401
355
|
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
const
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
'
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
356
|
+
let stdout = '';
|
|
357
|
+
let stderr = '';
|
|
358
|
+
|
|
359
|
+
const workerTimeout = 5000;
|
|
360
|
+
const timer = setTimeout(() => {
|
|
361
|
+
worker.kill();
|
|
362
|
+
resolve({ result: 'Worker timed out', logs: [] });
|
|
363
|
+
}, workerTimeout);
|
|
364
|
+
|
|
365
|
+
worker.stdout.on('data', (data) => {
|
|
366
|
+
stdout += data.toString();
|
|
367
|
+
});
|
|
368
|
+
|
|
369
|
+
worker.stderr.on('data', (data) => {
|
|
370
|
+
stderr += data.toString();
|
|
371
|
+
});
|
|
372
|
+
|
|
373
|
+
worker.on('close', (code) => {
|
|
374
|
+
clearTimeout(timer);
|
|
375
|
+
if (code !== 0) {
|
|
376
|
+
resolve({ result: `Worker exited with code ${code}: ${stderr}`, logs: [] });
|
|
377
|
+
return;
|
|
378
|
+
}
|
|
379
|
+
try {
|
|
380
|
+
const output = JSON.parse(stdout);
|
|
381
|
+
resolve(output);
|
|
382
|
+
} catch (e) {
|
|
383
|
+
resolve({ result: `Worker output parse error: ${e.message}. Stdout: ${stdout}`, logs: [] });
|
|
384
|
+
}
|
|
385
|
+
});
|
|
386
|
+
|
|
387
|
+
worker.on('error', (err) => {
|
|
388
|
+
clearTimeout(timer);
|
|
389
|
+
resolve({ result: `Worker spawn error: ${err.message}`, logs: [] });
|
|
390
|
+
});
|
|
391
|
+
|
|
392
|
+
const input = JSON.stringify({
|
|
393
|
+
script,
|
|
394
|
+
html,
|
|
395
|
+
url: pageUrl,
|
|
396
|
+
includeShadowDom
|
|
397
|
+
});
|
|
398
|
+
|
|
399
|
+
worker.stdin.write(input);
|
|
400
|
+
worker.stdin.end();
|
|
401
|
+
});
|
|
426
402
|
};
|
|
427
403
|
|
|
428
404
|
const extraction = await runExtractionScript(extractionScript, productHtml, page.url());
|
|
429
405
|
|
|
430
406
|
// Ensure the public/captures directory exists
|
|
431
|
-
const capturesDir = path.join(__dirname, 'public', 'captures');
|
|
432
|
-
if (!fs.existsSync(capturesDir)) {
|
|
433
|
-
fs.mkdirSync(capturesDir, { recursive: true });
|
|
434
|
-
}
|
|
435
|
-
|
|
436
|
-
const screenshotName = `${captureRunId}_scrape_${Date.now()}.png`;
|
|
437
|
-
const screenshotPath = path.join(capturesDir, screenshotName);
|
|
438
|
-
try {
|
|
439
|
-
await page.screenshot({ path: screenshotPath, fullPage: false });
|
|
440
|
-
} catch (e) {
|
|
441
|
-
console.error('Screenshot failed:', e.message);
|
|
442
|
-
}
|
|
443
|
-
|
|
444
|
-
// Simple HTML Formatter
|
|
445
|
-
const formatHTML = (html) => {
|
|
446
|
-
let indent = 0;
|
|
447
|
-
return html.replace(/<(\/?)([a-z0-9]+)([^>]*?)(\/?)>/gi, (match, slash, tag, attrs, selfClose) => {
|
|
448
|
-
if (slash) indent--;
|
|
449
|
-
const result = ' '.repeat(Math.max(0, indent)) + match;
|
|
450
|
-
if (!slash && !selfClose && !['img', 'br', 'hr', 'input', 'link', 'meta'].includes(tag.toLowerCase())) indent++;
|
|
451
|
-
return '\n' + result;
|
|
452
|
-
}).trim();
|
|
453
|
-
};
|
|
407
|
+
const capturesDir = path.join(__dirname, 'public', 'captures');
|
|
408
|
+
if (!fs.existsSync(capturesDir)) {
|
|
409
|
+
fs.mkdirSync(capturesDir, { recursive: true });
|
|
410
|
+
}
|
|
411
|
+
|
|
412
|
+
const screenshotName = `${captureRunId}_scrape_${Date.now()}.png`;
|
|
413
|
+
const screenshotPath = path.join(capturesDir, screenshotName);
|
|
414
|
+
try {
|
|
415
|
+
await page.screenshot({ path: screenshotPath, fullPage: false });
|
|
416
|
+
} catch (e) {
|
|
417
|
+
console.error('Screenshot failed:', e.message);
|
|
418
|
+
}
|
|
454
419
|
|
|
455
420
|
const rawExtraction = extraction.result !== undefined ? extraction.result : (extraction.logs.length ? extraction.logs.join('\n') : undefined);
|
|
456
421
|
const formattedExtraction = extractionFormat === 'csv' ? toCsvString(rawExtraction) : rawExtraction;
|
|
@@ -465,48 +430,48 @@ async function handleScrape(req, res) {
|
|
|
465
430
|
links: await page.$$eval('a[href]', elements => {
|
|
466
431
|
return elements.map(el => el.href).filter(href => href && href.startsWith('http'));
|
|
467
432
|
}),
|
|
468
|
-
screenshot_url: `/captures/${screenshotName}`
|
|
469
|
-
};
|
|
470
|
-
|
|
471
|
-
// Save session state
|
|
472
|
-
if (!statelessExecution) {
|
|
473
|
-
await context.storageState({ path: STORAGE_STATE_FILE });
|
|
474
|
-
}
|
|
475
|
-
|
|
476
|
-
const video = page.video();
|
|
477
|
-
await context.close();
|
|
478
|
-
if (video) {
|
|
479
|
-
try {
|
|
480
|
-
const videoPath = await video.path();
|
|
481
|
-
if (videoPath && fs.existsSync(videoPath)) {
|
|
482
|
-
const recordingName = `${captureRunId}_scrape_${Date.now()}.webm`;
|
|
483
|
-
const recordingPath = path.join(capturesDir, recordingName);
|
|
484
|
-
try {
|
|
485
|
-
fs.
|
|
486
|
-
} catch (err) {
|
|
487
|
-
if (err && err.code === 'EXDEV') {
|
|
488
|
-
fs.
|
|
489
|
-
fs.
|
|
490
|
-
} else {
|
|
491
|
-
throw err;
|
|
492
|
-
}
|
|
493
|
-
}
|
|
494
|
-
}
|
|
495
|
-
} catch (e) {
|
|
496
|
-
console.error('Recording save failed:', e.message);
|
|
497
|
-
}
|
|
498
|
-
}
|
|
499
|
-
|
|
500
|
-
await browser.close();
|
|
501
|
-
res.json(data);
|
|
502
|
-
} catch (error) {
|
|
503
|
-
console.error('Scrape Error:', error);
|
|
504
|
-
try {
|
|
505
|
-
if (context) await context.close();
|
|
506
|
-
} catch {}
|
|
507
|
-
if (browser) await browser.close();
|
|
508
|
-
res.status(500).json({ error: 'Failed to scrape', details: error.message });
|
|
509
|
-
}
|
|
510
|
-
}
|
|
433
|
+
screenshot_url: `/captures/${screenshotName}`
|
|
434
|
+
};
|
|
435
|
+
|
|
436
|
+
// Save session state
|
|
437
|
+
if (!statelessExecution) {
|
|
438
|
+
await context.storageState({ path: STORAGE_STATE_FILE });
|
|
439
|
+
}
|
|
440
|
+
|
|
441
|
+
const video = page.video();
|
|
442
|
+
await context.close();
|
|
443
|
+
if (video) {
|
|
444
|
+
try {
|
|
445
|
+
const videoPath = await video.path();
|
|
446
|
+
if (videoPath && fs.existsSync(videoPath)) {
|
|
447
|
+
const recordingName = `${captureRunId}_scrape_${Date.now()}.webm`;
|
|
448
|
+
const recordingPath = path.join(capturesDir, recordingName);
|
|
449
|
+
try {
|
|
450
|
+
await fs.promises.rename(videoPath, recordingPath);
|
|
451
|
+
} catch (err) {
|
|
452
|
+
if (err && err.code === 'EXDEV') {
|
|
453
|
+
await fs.promises.copyFile(videoPath, recordingPath);
|
|
454
|
+
await fs.promises.unlink(videoPath);
|
|
455
|
+
} else {
|
|
456
|
+
throw err;
|
|
457
|
+
}
|
|
458
|
+
}
|
|
459
|
+
}
|
|
460
|
+
} catch (e) {
|
|
461
|
+
console.error('Recording save failed:', e.message);
|
|
462
|
+
}
|
|
463
|
+
}
|
|
464
|
+
|
|
465
|
+
await browser.close();
|
|
466
|
+
res.json(data);
|
|
467
|
+
} catch (error) {
|
|
468
|
+
console.error('Scrape Error:', error);
|
|
469
|
+
try {
|
|
470
|
+
if (context) await context.close();
|
|
471
|
+
} catch {}
|
|
472
|
+
if (browser) await browser.close();
|
|
473
|
+
res.status(500).json({ error: 'Failed to scrape', details: error.message });
|
|
474
|
+
}
|
|
475
|
+
}
|
|
511
476
|
|
|
512
477
|
module.exports = { handleScrape };
|