figranium 0.12.0 → 0.12.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +674 -674
- package/README.md +336 -336
- package/agent.js +1 -1
- package/bin/cli.js +149 -149
- package/common-utils.js +211 -211
- package/dist/assets/{favicon-DmUMR1rm.svg → favicon-DXDXzv5K.svg} +290 -290
- package/dist/assets/index-BaVlGc48.js +18 -0
- package/dist/assets/index-T2xxnq_A.css +1 -0
- package/dist/favicon.svg +290 -290
- package/dist/figranium_icon.svg +290 -290
- package/dist/figranium_logo.svg +60 -60
- package/dist/index.html +26 -26
- package/dist/novnc.html +108 -108
- package/dist/styles.css +86 -86
- package/extraction-worker.js +211 -204
- package/headful.js +584 -569
- package/html-utils.js +24 -24
- package/package.json +82 -82
- package/proxy-rotation.js +261 -261
- package/proxy-utils.js +84 -84
- package/public/favicon.svg +290 -290
- package/public/figranium_icon.svg +290 -290
- package/public/figranium_logo.svg +60 -60
- package/public/novnc.html +108 -108
- package/public/styles.css +86 -86
- package/scrape.js +389 -389
- package/scripts/postinstall.js +21 -21
- package/server.js +626 -625
- package/src/server/cron-parser.js +325 -316
- package/src/server/routes/schedules.js +171 -171
- package/src/server/scheduler.js +379 -381
- package/url-utils.js +339 -295
- package/user-agent-settings.js +76 -76
- package/dist/assets/index-B1CypY6C.css +0 -1
- package/dist/assets/index-B295GWry.js +0 -18
package/scrape.js
CHANGED
|
@@ -1,389 +1,389 @@
|
|
|
1
|
-
const { chromium } = require('./stealth-chromium');
|
|
2
|
-
const fs = require('fs');
|
|
3
|
-
const path = require('path');
|
|
4
|
-
const { spawn } = require('child_process');
|
|
5
|
-
const { getProxySelection } = require('./proxy-rotation');
|
|
6
|
-
const { selectUserAgent } = require('./user-agent-settings');
|
|
7
|
-
const { formatHTML } = require('./html-utils');
|
|
8
|
-
const { validateUrl, setupNavigationProtection } = require('./url-utils');
|
|
9
|
-
const { parseBooleanFlag, sanitizeRunId, toCsvString } = require('./common-utils');
|
|
10
|
-
const { installMouseHelper } = require('./src/agent/dom-utils');
|
|
11
|
-
|
|
12
|
-
// Chromium user-data directory handed to launchPersistentContext() for non-stateless runs.
const PROFILE_DIR = path.join(__dirname, 'data', 'browser-profile-scrape');
|
|
13
|
-
// Storage-state JSON file whose `cookies` array injectHeadfulCookies() reads and injects.
const HEADFUL_STATE_PATH = path.join(__dirname, 'data', 'headful-storage-state.json');
|
|
14
|
-
|
|
15
|
-
/**
 * Best-effort import of cookies saved in the headful storage-state file.
 *
 * Reads the JSON at HEADFUL_STATE_PATH, keeps cookies that have no expiry
 * (`expires` falsy or `-1`) or whose `expires` (compared against the current
 * time in seconds) is still in the future, and adds them to `context`.
 * Never throws: a missing file (ENOENT) is ignored silently, every other
 * failure is logged to stderr.
 *
 * @param {object} context - Browser context exposing `addCookies(cookies)`.
 * @returns {Promise<void>}
 */
async function injectHeadfulCookies(context) {
    const isUsable = (cookie, nowSeconds) => {
        if (!cookie.expires || cookie.expires === -1) return true;
        return cookie.expires > nowSeconds;
    };

    try {
        const stateJson = await fs.promises.readFile(HEADFUL_STATE_PATH, 'utf8');
        const savedState = JSON.parse(stateJson);
        const nowSeconds = Date.now() / 1000;
        const usableCookies = (savedState.cookies || []).filter((cookie) => isUsable(cookie, nowSeconds));

        if (usableCookies.length === 0) return;

        await context.addCookies(usableCookies);
        console.log(`[SCRAPE] Injected ${usableCookies.length} cookies from headful session`);
    } catch (e) {
        // No saved headful session is the normal case, not an error.
        if (e.code === 'ENOENT') return;
        console.error('[SCRAPE] Failed to inject headful cookies:', e.message);
    }
}
|
|
29
|
-
|
|
30
|
-
/** Delay (ms) applied after auto-scrolling when `data.wait` is missing or unusable. */
const SCRAPE_DEFAULT_WAIT_MS = 2000;

/** Time limit (ms) granted to the extraction worker before it is killed. */
const SCRAPE_WORKER_TIMEOUT_MS = 5000;

/** Elements removed from every HTML snapshot before it is returned. */
const SCRAPE_STRIP_SELECTOR = 'script, style, svg, link, noscript';

/** Auto-scroll tuning: 400 px every 100 ms, giving up after 600 steps (about 60 s). */
const SCRAPE_SCROLL_OPTIONS = { stepPx: 400, intervalMs: 100, maxSteps: 600 };

/**
 * Runs INSIDE the page (Playwright serializes it, so it must not reference
 * anything from the Node.js scope). Scrolls down in fixed steps until the
 * scrolled distance reaches the document's scroll height, then jumps back to
 * the top.
 *
 * Falls back to `document.documentElement` when the document has no body, and
 * stops after `maxSteps` ticks so a page that keeps growing cannot block the
 * scrape forever.
 *
 * @param {{stepPx: number, intervalMs: number, maxSteps: number}} options
 * @returns {Promise<void>}
 */
function scrapeAutoScrollInPage({ stepPx, intervalMs, maxSteps }) {
    return new Promise((resolve) => {
        let scrolled = 0;
        let steps = 0;
        const timer = setInterval(() => {
            const scroller = document.body || document.documentElement;
            const scrollHeight = scroller ? scroller.scrollHeight : 0;
            window.scrollBy(0, stepPx);
            scrolled += stepPx;
            steps += 1;
            if (scrolled >= scrollHeight || steps >= maxSteps) {
                clearInterval(timer);
                window.scrollTo(0, 0);
                resolve();
            }
        }, intervalMs);
    });
}

/**
 * Runs INSIDE the page (serialized by Playwright; self-contained on purpose).
 * Produces an HTML snapshot from detached clones so the live page is never
 * modified.
 *
 * - With `selector`: the `outerHTML` of every `document.querySelectorAll`
 *   match, joined by newlines.
 * - Without `selector`: the `innerHTML` of `document.body`.
 *
 * When `withShadow` is true, each element that has a `shadowRoot` (including
 * the snapshot root itself) gets a `<template data-shadowroot="open">` child in
 * the clone holding the shadow root's `innerHTML`. Nested shadow roots inside
 * that markup are not expanded.
 *
 * @param {{selector: (string|null), withShadow: boolean, stripSelector: string}} options
 * @returns {string}
 */
function scrapeSnapshotInPage({ selector, withShadow, stripSelector }) {
    const stripUseless = (root) => {
        root.querySelectorAll(stripSelector).forEach((node) => node.remove());
    };

    const cloneForExport = (root) => {
        const clone = root.cloneNode(true);

        if (withShadow) {
            // Pair every original element with its clone BEFORE touching the
            // clone: appending templates while walking would add elements that
            // exist only in the clone and push the two walkers out of step.
            // The root is paired explicitly because TreeWalker.nextNode()
            // starts at the first descendant.
            const pairs = [[root, clone]];
            const walkerOrig = document.createTreeWalker(root, NodeFilter.SHOW_ELEMENT);
            const walkerClone = document.createTreeWalker(clone, NodeFilter.SHOW_ELEMENT);
            while (walkerOrig.nextNode() && walkerClone.nextNode()) {
                pairs.push([walkerOrig.currentNode, walkerClone.currentNode]);
            }

            for (const [orig, cloned] of pairs) {
                if (!orig.shadowRoot) continue;
                const template = document.createElement('template');
                template.setAttribute('data-shadowroot', 'open');
                template.innerHTML = orig.shadowRoot.innerHTML;
                cloned.appendChild(template);
            }
        }

        stripUseless(clone);
        return clone;
    };

    if (selector) {
        return Array.from(document.querySelectorAll(selector))
            .map((el) => cloneForExport(el).outerHTML)
            .join('\n');
    }

    return cloneForExport(document.body).innerHTML;
}

/**
 * Executes a user extraction script in a separate `node extraction-worker.js`
 * process with a reduced environment. The request is sent as JSON on stdin and
 * the worker's stdout is parsed as JSON.
 *
 * Never rejects: timeouts, spawn failures, non-zero exits and unparsable output
 * are all reported as `{ result: <message>, logs: [] }`.
 *
 * @param {*} script - Extraction script source; anything but a non-empty string is skipped.
 * @param {string} html - HTML snapshot handed to the worker.
 * @param {string} pageUrl - URL of the scraped page.
 * @param {boolean} includeShadowDom - Forwarded to the worker unchanged.
 * @returns {Promise<*>} Parsed worker output, or the fallback object described above.
 */
function scrapeRunExtractionWorker(script, html, pageUrl, includeShadowDom) {
    if (!script || typeof script !== 'string') {
        return Promise.resolve({ result: undefined, logs: [] });
    }

    return new Promise((resolve) => {
        let timer = null;
        let settled = false;
        const finish = (payload) => {
            if (settled) return;
            settled = true;
            clearTimeout(timer);
            resolve(payload);
        };

        const safeEnv = {
            NODE_ENV: 'production',
            PATH: process.env.PATH,
            LANG: process.env.LANG,
            TZ: process.env.TZ
        };

        const worker = spawn('node', [path.join(__dirname, 'extraction-worker.js')], {
            stdio: ['pipe', 'pipe', 'pipe'],
            env: safeEnv
        });

        let stdout = '';
        let stderr = '';

        timer = setTimeout(() => {
            worker.kill();
            finish({ result: 'Worker timed out', logs: [] });
        }, SCRAPE_WORKER_TIMEOUT_MS);

        worker.on('close', (code) => {
            if (code !== 0) {
                finish({ result: `Worker exited with code ${code}: ${stderr}`, logs: [] });
                return;
            }
            let output;
            try {
                output = JSON.parse(stdout);
            } catch (e) {
                finish({ result: `Worker output parse error: ${e.message}. Stdout: ${stdout}`, logs: [] });
                return;
            }
            finish(output);
        });

        worker.on('error', (err) => {
            finish({ result: `Worker spawn error: ${err.message}`, logs: [] });
        });

        // The stdio streams can be missing when the process could not be
        // spawned; the 'error' handler above reports that case.
        if (worker.stdout) {
            worker.stdout.on('data', (chunk) => {
                stdout += chunk.toString();
            });
        }
        if (worker.stderr) {
            worker.stderr.on('data', (chunk) => {
                stderr += chunk.toString();
            });
        }
        if (worker.stdin) {
            // Without a listener, a write to an already-exited worker (EPIPE)
            // would surface as an uncaught exception. The outcome itself is
            // reported by the 'close' / 'error' handlers.
            worker.stdin.on('error', (err) => {
                stderr += `\nstdin error: ${err.message}`;
            });
            worker.stdin.write(JSON.stringify({
                script,
                html,
                url: pageUrl,
                includeShadowDom
            }));
            worker.stdin.end();
        }
    });
}

/**
 * Closes a Playwright browser or context without letting a close failure
 * escape, so cleanup can continue and the caller's original error survives.
 *
 * @param {{close: Function}|null|undefined} target - Object to close; falsy values are ignored.
 * @param {string} label - Name used in the log message.
 * @returns {Promise<void>}
 */
async function scrapeCloseQuietly(target, label) {
    if (!target) return;
    try {
        await target.close();
    } catch (err) {
        console.error(`[SCRAPE] Failed to close ${label}:`, err.message);
    }
}

/**
 * Loads a URL in headless Chromium, captures an HTML snapshot, optionally runs
 * a user extraction script on it, takes a screenshot and (unless disabled)
 * saves a video recording.
 *
 * Fields read from `data`:
 * - `url` (required), `headers`, `selector`, `wait` (seconds; default 2 s,
 *   also used when the value is not a non-negative number)
 * - `rotateUserAgents`, `rotateViewport`, `rotateProxies`
 * - `includeShadowDom` (defaults to true), `disableRecording`, `statelessExecution`
 * - `runId`, `extractionScript`, `extractionFormat` (`'csv'` or anything else for JSON)
 *
 * @param {object} data - Scrape request options (see above).
 * @returns {Promise<{title: string, url: string, html: *, data: *, is_partial: boolean,
 *   selector_used: (string|undefined), links: string[], screenshot_url: string}>}
 * @throws {Error} `'URL is required.'` when `data.url` is falsy; any error raised by
 *   URL validation, browser launch, navigation or page evaluation is rethrown after
 *   the browser resources have been closed.
 */
async function runScrape(data) {
    const url = data.url;
    const customHeaders = data.headers || {};
    const userSelector = data.selector;
    const waitInput = data.wait;
    const requestedWaitMs = waitInput ? parseFloat(waitInput) * 1000 : SCRAPE_DEFAULT_WAIT_MS;
    // A non-numeric or negative `wait` must not reach waitForTimeout as NaN / < 0.
    const waitTime = Number.isFinite(requestedWaitMs) && requestedWaitMs >= 0
        ? requestedWaitMs
        : SCRAPE_DEFAULT_WAIT_MS;
    const rotateUserAgents = data.rotateUserAgents || false;
    const rotateViewportRaw = data.rotateViewport;
    const rotateViewport = String(rotateViewportRaw).toLowerCase() === 'true' || rotateViewportRaw === true;
    const runId = data.runId || null;
    const captureRunId = sanitizeRunId(runId) || `run_${Date.now()}_unknown`;
    const rotateProxiesRaw = data.rotateProxies;
    const rotateProxies = String(rotateProxiesRaw).toLowerCase() === 'true' || rotateProxiesRaw === true;
    const includeShadowDomRaw = data.includeShadowDom;
    const includeShadowDom = includeShadowDomRaw === undefined
        ? true
        : !(String(includeShadowDomRaw).toLowerCase() === 'false' || includeShadowDomRaw === false);
    const disableRecording = parseBooleanFlag(data.disableRecording);
    const statelessExecution = parseBooleanFlag(data.statelessExecution);
    const extractionScript = data.extractionScript;
    const extractionFormat = data.extractionFormat === 'csv' ? 'csv' : 'json';

    if (!url) {
        throw new Error('URL is required.');
    }

    await validateUrl(url);

    const selectedUA = await selectUserAgent(rotateUserAgents);

    let browser;
    let context;
    let page;
    try {
        const selection = getProxySelection(rotateProxies);
        const hasProxy = !!selection.proxy;

        const args = [
            '--no-sandbox',
            '--disable-setuid-sandbox',
            '--disable-dev-shm-usage',
            '--disable-blink-features=AutomationControlled',
            '--hide-scrollbars',
            '--mute-audio',
            '--dns-prefetch-disable',
            '--force-webrtc-ip-handling-policy=disable_non_proxied_udp'
        ];
        if (!hasProxy) {
            // DNS-over-HTTPS flags are only added when no proxy is in use.
            args.push(
                '--enable-features=DnsOverHttps',
                '--dns-over-https-mode=secure',
                '--dns-over-https-templates=https://cloudflare-dns.com/dns-query'
            );
        }

        const recordingsDir = path.join(__dirname, 'data', 'recordings');
        await fs.promises.mkdir(recordingsDir, { recursive: true });
        await fs.promises.mkdir(PROFILE_DIR, { recursive: true });

        const viewport = rotateViewport
            ? { width: 1280 + Math.floor(Math.random() * 640), height: 720 + Math.floor(Math.random() * 360) }
            : { width: 1366, height: 768 };

        const contextOptions = {
            headless: true,
            args,
            userAgent: selectedUA,
            extraHTTPHeaders: customHeaders,
            viewport,
            deviceScaleFactor: 1,
            locale: 'en-US',
            timezoneId: 'America/New_York',
            colorScheme: 'dark',
            permissions: ['geolocation']
        };

        if (hasProxy) {
            contextOptions.proxy = selection.proxy;
        }

        if (!disableRecording) {
            contextOptions.recordVideo = { dir: recordingsDir, size: viewport };
        }

        if (statelessExecution) {
            const launchOpts = { headless: true, args, ...(hasProxy ? { proxy: selection.proxy } : {}) };
            browser = await chromium.launch(launchOpts);
            context = await browser.newContext(contextOptions);
        } else {
            context = await chromium.launchPersistentContext(PROFILE_DIR, { headless: true, args, ...contextOptions });
            browser = context.browser();
            await injectHeadfulCookies(context);
        }

        await setupNavigationProtection(context);
        await context.addInitScript(installMouseHelper);

        if (includeShadowDom) {
            // Force every shadow root to be created in 'open' mode so that
            // `element.shadowRoot` is readable when the snapshot is taken.
            await context.addInitScript(() => {
                if (!Element.prototype.attachShadow) return;
                const original = Element.prototype.attachShadow;
                Element.prototype.attachShadow = function (init) {
                    const options = init ? { ...init, mode: 'open' } : { mode: 'open' };
                    return original.call(this, options);
                };
            });
        }

        // Persistent context auto-creates a blank page; reuse it or open a new one
        const existingPages = context.pages();
        page = existingPages.length > 0 ? existingPages[0] : await context.newPage();

        await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 60000 });

        await page.evaluate(scrapeAutoScrollInPage, SCRAPE_SCROLL_OPTIONS);

        await page.waitForTimeout(waitTime);

        let productHtml = '';
        let usedFallback = false;

        if (userSelector) {
            if (includeShadowDom) {
                productHtml = await page.evaluate(scrapeSnapshotInPage, {
                    selector: userSelector,
                    withShadow: true,
                    stripSelector: SCRAPE_STRIP_SELECTOR
                });
            } else {
                // Strip a detached clone: removing nodes from the matched
                // elements themselves would alter the live page before the
                // screenshot and link collection below.
                productHtml = await page.$$eval(userSelector, (elements, stripSelector) => {
                    return elements.map((el) => {
                        const clone = el.cloneNode(true);
                        clone.querySelectorAll(stripSelector).forEach((node) => node.remove());
                        return clone.outerHTML;
                    }).join('\n');
                }, SCRAPE_STRIP_SELECTOR);
            }
            if (!productHtml || productHtml.trim() === '') usedFallback = true;
        } else {
            usedFallback = true;
        }

        if (usedFallback) {
            productHtml = await page.evaluate(scrapeSnapshotInPage, {
                selector: null,
                withShadow: includeShadowDom,
                stripSelector: SCRAPE_STRIP_SELECTOR
            });
        }

        const extraction = await scrapeRunExtractionWorker(extractionScript, productHtml, page.url(), includeShadowDom);

        const capturesDir = path.join(__dirname, 'public', 'captures');
        await fs.promises.mkdir(capturesDir, { recursive: true });

        const screenshotName = `${captureRunId}_scrape_${Date.now()}.png`;
        const screenshotPath = path.join(capturesDir, screenshotName);
        try {
            await page.screenshot({ path: screenshotPath, fullPage: false });
        } catch (e) {
            console.error('Screenshot failed:', e.message);
        }

        // The worker's output is whatever JSON it printed; tolerate a missing
        // `logs` array or a null payload instead of failing the whole scrape.
        const extractionResult = extraction ? extraction.result : undefined;
        const extractionLogs = extraction && Array.isArray(extraction.logs) ? extraction.logs : [];
        const rawExtraction = extractionResult !== undefined
            ? extractionResult
            : (extractionLogs.length ? extractionLogs.join('\n') : undefined);
        const formattedExtraction = extractionFormat === 'csv' ? toCsvString(rawExtraction) : rawExtraction;

        const resultData = {
            title: await page.title(),
            url: page.url(),
            html: formatHTML(productHtml),
            data: formattedExtraction,
            is_partial: !usedFallback,
            selector_used: usedFallback ? (userSelector ? `${userSelector} (not found, using body)` : 'body (default)') : userSelector,
            links: await page.$$eval('a[href]', elements => {
                return elements.map(el => el.href).filter(href => href && href.startsWith('http'));
            }),
            screenshot_url: `/captures/${screenshotName}`
        };

        // The context is closed before the video file is looked up and moved.
        const video = page.video();
        await context.close();
        if (video) {
            try {
                const videoPath = await video.path();
                const videoExists = videoPath && await fs.promises.access(videoPath).then(() => true).catch(() => false);
                if (videoExists) {
                    const recordingName = `${captureRunId}_scrape_${Date.now()}.webm`;
                    const recordingPath = path.join(capturesDir, recordingName);
                    try {
                        await fs.promises.rename(videoPath, recordingPath);
                    } catch (err) {
                        // rename() cannot cross filesystems; fall back to copy + delete.
                        if (err && err.code === 'EXDEV') {
                            await fs.promises.copyFile(videoPath, recordingPath);
                            await fs.promises.unlink(videoPath);
                        } else {
                            throw err;
                        }
                    }
                }
            } catch (e) {
                console.error('Recording save failed:', e.message);
            }
        }

        // A failure to close the browser must not discard a finished result.
        await scrapeCloseQuietly(browser, 'browser');
        return resultData;
    } catch (error) {
        // Close both handles independently so one failing close cannot leak
        // the other or replace the error that actually caused the failure.
        await scrapeCloseQuietly(context, 'browser context');
        await scrapeCloseQuietly(browser, 'browser');
        throw error;
    }
}
|
|
374
|
-
|
|
375
|
-
/**
 * HTTP handler for scrape requests.
 *
 * Merges the request body and query string into one options object (query
 * values win on key collisions because they are spread last), runs the scrape
 * and replies with the result as JSON. Any failure is answered with HTTP 500
 * and `{ error: 'Failed to scrape', details: <message> }`.
 *
 * @param {object} req - Request exposing `body` and `query`.
 * @param {object} res - Response exposing `json()` and `status()`.
 * @returns {Promise<void>}
 */
async function handleScrape(req, res) {
    const scrapeOptions = { ...req.body, ...req.query };

    const replyWithFailure = (failure) => {
        res.status(500).json({ error: 'Failed to scrape', details: failure.message });
    };

    try {
        const scrapeResult = await runScrape(scrapeOptions);
        res.json(scrapeResult);
    } catch (failure) {
        replyWithFailure(failure);
    }
}
|
|
388
|
-
|
|
389
|
-
// Public API: runScrape (programmatic entry point) and handleScrape (req/res HTTP handler).
module.exports = { runScrape, handleScrape };
|
|
1
|
+
const { chromium } = require('./stealth-chromium');
|
|
2
|
+
const fs = require('fs');
|
|
3
|
+
const path = require('path');
|
|
4
|
+
const { spawn } = require('child_process');
|
|
5
|
+
const { getProxySelection } = require('./proxy-rotation');
|
|
6
|
+
const { selectUserAgent } = require('./user-agent-settings');
|
|
7
|
+
const { formatHTML } = require('./html-utils');
|
|
8
|
+
const { validateUrl, setupNavigationProtection } = require('./url-utils');
|
|
9
|
+
const { parseBooleanFlag, sanitizeRunId, toCsvString } = require('./common-utils');
|
|
10
|
+
const { installMouseHelper } = require('./src/agent/dom-utils');
|
|
11
|
+
|
|
12
|
+
// Chromium user-data directory handed to launchPersistentContext() for non-stateless runs.
const PROFILE_DIR = path.join(__dirname, 'data', 'browser-profile-scrape');
|
|
13
|
+
// Storage-state JSON file whose `cookies` array injectHeadfulCookies() reads and injects.
const HEADFUL_STATE_PATH = path.join(__dirname, 'data', 'headful-storage-state.json');
|
|
14
|
+
|
|
15
|
+
/**
 * Best-effort import of cookies saved in the headful storage-state file.
 *
 * Reads the JSON at HEADFUL_STATE_PATH, keeps cookies that have no expiry
 * (`expires` falsy or `-1`) or whose `expires` (compared against the current
 * time in seconds) is still in the future, and adds them to `context`.
 * Never throws: a missing file (ENOENT) is ignored silently, every other
 * failure is logged to stderr.
 *
 * @param {object} context - Browser context exposing `addCookies(cookies)`.
 * @returns {Promise<void>}
 */
async function injectHeadfulCookies(context) {
    const isUsable = (cookie, nowSeconds) => {
        if (!cookie.expires || cookie.expires === -1) return true;
        return cookie.expires > nowSeconds;
    };

    try {
        const stateJson = await fs.promises.readFile(HEADFUL_STATE_PATH, 'utf8');
        const savedState = JSON.parse(stateJson);
        const nowSeconds = Date.now() / 1000;
        const usableCookies = (savedState.cookies || []).filter((cookie) => isUsable(cookie, nowSeconds));

        if (usableCookies.length === 0) return;

        await context.addCookies(usableCookies);
        console.log(`[SCRAPE] Injected ${usableCookies.length} cookies from headful session`);
    } catch (e) {
        // No saved headful session is the normal case, not an error.
        if (e.code === 'ENOENT') return;
        console.error('[SCRAPE] Failed to inject headful cookies:', e.message);
    }
}
|
|
29
|
+
|
|
30
|
+
/** Delay (ms) applied after auto-scrolling when `data.wait` is missing or unusable. */
const SCRAPE_DEFAULT_WAIT_MS = 2000;

/** Time limit (ms) granted to the extraction worker before it is killed. */
const SCRAPE_WORKER_TIMEOUT_MS = 5000;

/** Elements removed from every HTML snapshot before it is returned. */
const SCRAPE_STRIP_SELECTOR = 'script, style, svg, link, noscript';

/** Auto-scroll tuning: 400 px every 100 ms, giving up after 600 steps (about 60 s). */
const SCRAPE_SCROLL_OPTIONS = { stepPx: 400, intervalMs: 100, maxSteps: 600 };

/**
 * Runs INSIDE the page (Playwright serializes it, so it must not reference
 * anything from the Node.js scope). Scrolls down in fixed steps until the
 * scrolled distance reaches the document's scroll height, then jumps back to
 * the top.
 *
 * Falls back to `document.documentElement` when the document has no body, and
 * stops after `maxSteps` ticks so a page that keeps growing cannot block the
 * scrape forever.
 *
 * @param {{stepPx: number, intervalMs: number, maxSteps: number}} options
 * @returns {Promise<void>}
 */
function scrapeAutoScrollInPage({ stepPx, intervalMs, maxSteps }) {
    return new Promise((resolve) => {
        let scrolled = 0;
        let steps = 0;
        const timer = setInterval(() => {
            const scroller = document.body || document.documentElement;
            const scrollHeight = scroller ? scroller.scrollHeight : 0;
            window.scrollBy(0, stepPx);
            scrolled += stepPx;
            steps += 1;
            if (scrolled >= scrollHeight || steps >= maxSteps) {
                clearInterval(timer);
                window.scrollTo(0, 0);
                resolve();
            }
        }, intervalMs);
    });
}

/**
 * Runs INSIDE the page (serialized by Playwright; self-contained on purpose).
 * Produces an HTML snapshot from detached clones so the live page is never
 * modified.
 *
 * - With `selector`: the `outerHTML` of every `document.querySelectorAll`
 *   match, joined by newlines.
 * - Without `selector`: the `innerHTML` of `document.body`.
 *
 * When `withShadow` is true, each element that has a `shadowRoot` (including
 * the snapshot root itself) gets a `<template data-shadowroot="open">` child in
 * the clone holding the shadow root's `innerHTML`. Nested shadow roots inside
 * that markup are not expanded.
 *
 * @param {{selector: (string|null), withShadow: boolean, stripSelector: string}} options
 * @returns {string}
 */
function scrapeSnapshotInPage({ selector, withShadow, stripSelector }) {
    const stripUseless = (root) => {
        root.querySelectorAll(stripSelector).forEach((node) => node.remove());
    };

    const cloneForExport = (root) => {
        const clone = root.cloneNode(true);

        if (withShadow) {
            // Pair every original element with its clone BEFORE touching the
            // clone: appending templates while walking would add elements that
            // exist only in the clone and push the two walkers out of step.
            // The root is paired explicitly because TreeWalker.nextNode()
            // starts at the first descendant.
            const pairs = [[root, clone]];
            const walkerOrig = document.createTreeWalker(root, NodeFilter.SHOW_ELEMENT);
            const walkerClone = document.createTreeWalker(clone, NodeFilter.SHOW_ELEMENT);
            while (walkerOrig.nextNode() && walkerClone.nextNode()) {
                pairs.push([walkerOrig.currentNode, walkerClone.currentNode]);
            }

            for (const [orig, cloned] of pairs) {
                if (!orig.shadowRoot) continue;
                const template = document.createElement('template');
                template.setAttribute('data-shadowroot', 'open');
                template.innerHTML = orig.shadowRoot.innerHTML;
                cloned.appendChild(template);
            }
        }

        stripUseless(clone);
        return clone;
    };

    if (selector) {
        return Array.from(document.querySelectorAll(selector))
            .map((el) => cloneForExport(el).outerHTML)
            .join('\n');
    }

    return cloneForExport(document.body).innerHTML;
}

/**
 * Executes a user extraction script in a separate `node extraction-worker.js`
 * process with a reduced environment. The request is sent as JSON on stdin and
 * the worker's stdout is parsed as JSON.
 *
 * Never rejects: timeouts, spawn failures, non-zero exits and unparsable output
 * are all reported as `{ result: <message>, logs: [] }`.
 *
 * @param {*} script - Extraction script source; anything but a non-empty string is skipped.
 * @param {string} html - HTML snapshot handed to the worker.
 * @param {string} pageUrl - URL of the scraped page.
 * @param {boolean} includeShadowDom - Forwarded to the worker unchanged.
 * @returns {Promise<*>} Parsed worker output, or the fallback object described above.
 */
function scrapeRunExtractionWorker(script, html, pageUrl, includeShadowDom) {
    if (!script || typeof script !== 'string') {
        return Promise.resolve({ result: undefined, logs: [] });
    }

    return new Promise((resolve) => {
        let timer = null;
        let settled = false;
        const finish = (payload) => {
            if (settled) return;
            settled = true;
            clearTimeout(timer);
            resolve(payload);
        };

        const safeEnv = {
            NODE_ENV: 'production',
            PATH: process.env.PATH,
            LANG: process.env.LANG,
            TZ: process.env.TZ
        };

        const worker = spawn('node', [path.join(__dirname, 'extraction-worker.js')], {
            stdio: ['pipe', 'pipe', 'pipe'],
            env: safeEnv
        });

        let stdout = '';
        let stderr = '';

        timer = setTimeout(() => {
            worker.kill();
            finish({ result: 'Worker timed out', logs: [] });
        }, SCRAPE_WORKER_TIMEOUT_MS);

        worker.on('close', (code) => {
            if (code !== 0) {
                finish({ result: `Worker exited with code ${code}: ${stderr}`, logs: [] });
                return;
            }
            let output;
            try {
                output = JSON.parse(stdout);
            } catch (e) {
                finish({ result: `Worker output parse error: ${e.message}. Stdout: ${stdout}`, logs: [] });
                return;
            }
            finish(output);
        });

        worker.on('error', (err) => {
            finish({ result: `Worker spawn error: ${err.message}`, logs: [] });
        });

        // The stdio streams can be missing when the process could not be
        // spawned; the 'error' handler above reports that case.
        if (worker.stdout) {
            worker.stdout.on('data', (chunk) => {
                stdout += chunk.toString();
            });
        }
        if (worker.stderr) {
            worker.stderr.on('data', (chunk) => {
                stderr += chunk.toString();
            });
        }
        if (worker.stdin) {
            // Without a listener, a write to an already-exited worker (EPIPE)
            // would surface as an uncaught exception. The outcome itself is
            // reported by the 'close' / 'error' handlers.
            worker.stdin.on('error', (err) => {
                stderr += `\nstdin error: ${err.message}`;
            });
            worker.stdin.write(JSON.stringify({
                script,
                html,
                url: pageUrl,
                includeShadowDom
            }));
            worker.stdin.end();
        }
    });
}

/**
 * Closes a Playwright browser or context without letting a close failure
 * escape, so cleanup can continue and the caller's original error survives.
 *
 * @param {{close: Function}|null|undefined} target - Object to close; falsy values are ignored.
 * @param {string} label - Name used in the log message.
 * @returns {Promise<void>}
 */
async function scrapeCloseQuietly(target, label) {
    if (!target) return;
    try {
        await target.close();
    } catch (err) {
        console.error(`[SCRAPE] Failed to close ${label}:`, err.message);
    }
}

/**
 * Loads a URL in headless Chromium, captures an HTML snapshot, optionally runs
 * a user extraction script on it, takes a screenshot and (unless disabled)
 * saves a video recording.
 *
 * Fields read from `data`:
 * - `url` (required), `headers`, `selector`, `wait` (seconds; default 2 s,
 *   also used when the value is not a non-negative number)
 * - `rotateUserAgents`, `rotateViewport`, `rotateProxies`
 * - `includeShadowDom` (defaults to true), `disableRecording`, `statelessExecution`
 * - `runId`, `extractionScript`, `extractionFormat` (`'csv'` or anything else for JSON)
 *
 * @param {object} data - Scrape request options (see above).
 * @returns {Promise<{title: string, url: string, html: *, data: *, is_partial: boolean,
 *   selector_used: (string|undefined), links: string[], screenshot_url: string}>}
 * @throws {Error} `'URL is required.'` when `data.url` is falsy; any error raised by
 *   URL validation, browser launch, navigation or page evaluation is rethrown after
 *   the browser resources have been closed.
 */
async function runScrape(data) {
    const url = data.url;
    const customHeaders = data.headers || {};
    const userSelector = data.selector;
    const waitInput = data.wait;
    const requestedWaitMs = waitInput ? parseFloat(waitInput) * 1000 : SCRAPE_DEFAULT_WAIT_MS;
    // A non-numeric or negative `wait` must not reach waitForTimeout as NaN / < 0.
    const waitTime = Number.isFinite(requestedWaitMs) && requestedWaitMs >= 0
        ? requestedWaitMs
        : SCRAPE_DEFAULT_WAIT_MS;
    const rotateUserAgents = data.rotateUserAgents || false;
    const rotateViewportRaw = data.rotateViewport;
    const rotateViewport = String(rotateViewportRaw).toLowerCase() === 'true' || rotateViewportRaw === true;
    const runId = data.runId || null;
    const captureRunId = sanitizeRunId(runId) || `run_${Date.now()}_unknown`;
    const rotateProxiesRaw = data.rotateProxies;
    const rotateProxies = String(rotateProxiesRaw).toLowerCase() === 'true' || rotateProxiesRaw === true;
    const includeShadowDomRaw = data.includeShadowDom;
    const includeShadowDom = includeShadowDomRaw === undefined
        ? true
        : !(String(includeShadowDomRaw).toLowerCase() === 'false' || includeShadowDomRaw === false);
    const disableRecording = parseBooleanFlag(data.disableRecording);
    const statelessExecution = parseBooleanFlag(data.statelessExecution);
    const extractionScript = data.extractionScript;
    const extractionFormat = data.extractionFormat === 'csv' ? 'csv' : 'json';

    if (!url) {
        throw new Error('URL is required.');
    }

    await validateUrl(url);

    const selectedUA = await selectUserAgent(rotateUserAgents);

    let browser;
    let context;
    let page;
    try {
        const selection = getProxySelection(rotateProxies);
        const hasProxy = !!selection.proxy;

        const args = [
            '--no-sandbox',
            '--disable-setuid-sandbox',
            '--disable-dev-shm-usage',
            '--disable-blink-features=AutomationControlled',
            '--hide-scrollbars',
            '--mute-audio',
            '--dns-prefetch-disable',
            '--force-webrtc-ip-handling-policy=disable_non_proxied_udp'
        ];
        if (!hasProxy) {
            // DNS-over-HTTPS flags are only added when no proxy is in use.
            args.push(
                '--enable-features=DnsOverHttps',
                '--dns-over-https-mode=secure',
                '--dns-over-https-templates=https://cloudflare-dns.com/dns-query'
            );
        }

        const recordingsDir = path.join(__dirname, 'data', 'recordings');
        await fs.promises.mkdir(recordingsDir, { recursive: true });
        await fs.promises.mkdir(PROFILE_DIR, { recursive: true });

        const viewport = rotateViewport
            ? { width: 1280 + Math.floor(Math.random() * 640), height: 720 + Math.floor(Math.random() * 360) }
            : { width: 1366, height: 768 };

        const contextOptions = {
            headless: true,
            args,
            userAgent: selectedUA,
            extraHTTPHeaders: customHeaders,
            viewport,
            deviceScaleFactor: 1,
            locale: 'en-US',
            timezoneId: 'America/New_York',
            colorScheme: 'dark',
            permissions: ['geolocation']
        };

        if (hasProxy) {
            contextOptions.proxy = selection.proxy;
        }

        if (!disableRecording) {
            contextOptions.recordVideo = { dir: recordingsDir, size: viewport };
        }

        if (statelessExecution) {
            const launchOpts = { headless: true, args, ...(hasProxy ? { proxy: selection.proxy } : {}) };
            browser = await chromium.launch(launchOpts);
            context = await browser.newContext(contextOptions);
        } else {
            context = await chromium.launchPersistentContext(PROFILE_DIR, { headless: true, args, ...contextOptions });
            browser = context.browser();
            await injectHeadfulCookies(context);
        }

        await setupNavigationProtection(context);
        await context.addInitScript(installMouseHelper);

        if (includeShadowDom) {
            // Force every shadow root to be created in 'open' mode so that
            // `element.shadowRoot` is readable when the snapshot is taken.
            await context.addInitScript(() => {
                if (!Element.prototype.attachShadow) return;
                const original = Element.prototype.attachShadow;
                Element.prototype.attachShadow = function (init) {
                    const options = init ? { ...init, mode: 'open' } : { mode: 'open' };
                    return original.call(this, options);
                };
            });
        }

        // Persistent context auto-creates a blank page; reuse it or open a new one
        const existingPages = context.pages();
        page = existingPages.length > 0 ? existingPages[0] : await context.newPage();

        await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 60000 });

        await page.evaluate(scrapeAutoScrollInPage, SCRAPE_SCROLL_OPTIONS);

        await page.waitForTimeout(waitTime);

        let productHtml = '';
        let usedFallback = false;

        if (userSelector) {
            if (includeShadowDom) {
                productHtml = await page.evaluate(scrapeSnapshotInPage, {
                    selector: userSelector,
                    withShadow: true,
                    stripSelector: SCRAPE_STRIP_SELECTOR
                });
            } else {
                // Strip a detached clone: removing nodes from the matched
                // elements themselves would alter the live page before the
                // screenshot and link collection below.
                productHtml = await page.$$eval(userSelector, (elements, stripSelector) => {
                    return elements.map((el) => {
                        const clone = el.cloneNode(true);
                        clone.querySelectorAll(stripSelector).forEach((node) => node.remove());
                        return clone.outerHTML;
                    }).join('\n');
                }, SCRAPE_STRIP_SELECTOR);
            }
            if (!productHtml || productHtml.trim() === '') usedFallback = true;
        } else {
            usedFallback = true;
        }

        if (usedFallback) {
            productHtml = await page.evaluate(scrapeSnapshotInPage, {
                selector: null,
                withShadow: includeShadowDom,
                stripSelector: SCRAPE_STRIP_SELECTOR
            });
        }

        const extraction = await scrapeRunExtractionWorker(extractionScript, productHtml, page.url(), includeShadowDom);

        const capturesDir = path.join(__dirname, 'public', 'captures');
        await fs.promises.mkdir(capturesDir, { recursive: true });

        const screenshotName = `${captureRunId}_scrape_${Date.now()}.png`;
        const screenshotPath = path.join(capturesDir, screenshotName);
        try {
            await page.screenshot({ path: screenshotPath, fullPage: false });
        } catch (e) {
            console.error('Screenshot failed:', e.message);
        }

        // The worker's output is whatever JSON it printed; tolerate a missing
        // `logs` array or a null payload instead of failing the whole scrape.
        const extractionResult = extraction ? extraction.result : undefined;
        const extractionLogs = extraction && Array.isArray(extraction.logs) ? extraction.logs : [];
        const rawExtraction = extractionResult !== undefined
            ? extractionResult
            : (extractionLogs.length ? extractionLogs.join('\n') : undefined);
        const formattedExtraction = extractionFormat === 'csv' ? toCsvString(rawExtraction) : rawExtraction;

        const resultData = {
            title: await page.title(),
            url: page.url(),
            html: formatHTML(productHtml),
            data: formattedExtraction,
            is_partial: !usedFallback,
            selector_used: usedFallback ? (userSelector ? `${userSelector} (not found, using body)` : 'body (default)') : userSelector,
            links: await page.$$eval('a[href]', elements => {
                return elements.map(el => el.href).filter(href => href && href.startsWith('http'));
            }),
            screenshot_url: `/captures/${screenshotName}`
        };

        // The context is closed before the video file is looked up and moved.
        const video = page.video();
        await context.close();
        if (video) {
            try {
                const videoPath = await video.path();
                const videoExists = videoPath && await fs.promises.access(videoPath).then(() => true).catch(() => false);
                if (videoExists) {
                    const recordingName = `${captureRunId}_scrape_${Date.now()}.webm`;
                    const recordingPath = path.join(capturesDir, recordingName);
                    try {
                        await fs.promises.rename(videoPath, recordingPath);
                    } catch (err) {
                        // rename() cannot cross filesystems; fall back to copy + delete.
                        if (err && err.code === 'EXDEV') {
                            await fs.promises.copyFile(videoPath, recordingPath);
                            await fs.promises.unlink(videoPath);
                        } else {
                            throw err;
                        }
                    }
                }
            } catch (e) {
                console.error('Recording save failed:', e.message);
            }
        }

        // A failure to close the browser must not discard a finished result.
        await scrapeCloseQuietly(browser, 'browser');
        return resultData;
    } catch (error) {
        // Close both handles independently so one failing close cannot leak
        // the other or replace the error that actually caused the failure.
        await scrapeCloseQuietly(context, 'browser context');
        await scrapeCloseQuietly(browser, 'browser');
        throw error;
    }
}
|
|
374
|
+
|
|
375
|
+
/**
 * HTTP handler for scrape requests.
 *
 * Merges the request body and query string into one options object (query
 * values win on key collisions because they are spread last), runs the scrape
 * and replies with the result as JSON. Any failure is answered with HTTP 500
 * and `{ error: 'Failed to scrape', details: <message> }`.
 *
 * @param {object} req - Request exposing `body` and `query`.
 * @param {object} res - Response exposing `json()` and `status()`.
 * @returns {Promise<void>}
 */
async function handleScrape(req, res) {
    const scrapeOptions = { ...req.body, ...req.query };

    const replyWithFailure = (failure) => {
        res.status(500).json({ error: 'Failed to scrape', details: failure.message });
    };

    try {
        const scrapeResult = await runScrape(scrapeOptions);
        res.json(scrapeResult);
    } catch (failure) {
        replyWithFailure(failure);
    }
}
|
|
388
|
+
|
|
389
|
+
// Public API: runScrape (programmatic entry point) and handleScrape (req/res HTTP handler).
module.exports = { runScrape, handleScrape };
|