figranium 0.9.1 → 0.9.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/scrape.js CHANGED
@@ -1,374 +1,418 @@
1
- const { chromium } = require('playwright');
2
- const fs = require('fs');
3
- const path = require('path');
4
- const { spawn } = require('child_process');
5
- const { getProxySelection } = require('./proxy-rotation');
6
- const { selectUserAgent } = require('./user-agent-settings');
7
- const { formatHTML } = require('./html-utils');
8
- const { validateUrl } = require('./url-utils');
9
- const { parseBooleanFlag, toCsvString } = require('./common-utils');
10
- const { installMouseHelper } = require('./src/agent/dom-utils');
11
-
12
// Default location of the persisted browser storage state, next to this module.
const STORAGE_STATE_PATH = path.join(__dirname, 'storage_state.json');

// Resolved once at load time. When STORAGE_STATE_PATH turns out to be a
// directory, the state file is expected inside it; in every other case
// (missing path, regular file, or a failed stat) the path itself is used.
const STORAGE_STATE_FILE = (() => {
  let pointsAtDirectory = false;
  try {
    pointsAtDirectory = fs.existsSync(STORAGE_STATE_PATH)
      && fs.statSync(STORAGE_STATE_PATH).isDirectory();
  } catch {
    // Best effort: a failed stat falls back to the plain path below.
  }
  return pointsAtDirectory
    ? path.join(STORAGE_STATE_PATH, 'storage_state.json')
    : STORAGE_STATE_PATH;
})();
24
-
25
/** Milliseconds to wait after scrolling when `data.wait` is absent or not a number. */
const DEFAULT_WAIT_MS = 2000;
/** Upper bound on auto-scroll ticks so pages that keep growing cannot hang the run. */
const MAX_SCROLL_STEPS = 300;
/** How long the extraction worker may run before it is killed. */
const EXTRACTION_TIMEOUT_MS = 5000;

/**
 * Convert the `wait` option (seconds) into milliseconds.
 *
 * @param {string|number|undefined} waitInput - Raw option value.
 * @returns {number} Milliseconds; DEFAULT_WAIT_MS when the input is falsy or
 *   not numeric, never negative.
 */
function resolveWaitMs(waitInput) {
  if (!waitInput) return DEFAULT_WAIT_MS;
  const ms = parseFloat(waitInput) * 1000;
  if (!Number.isFinite(ms)) return DEFAULT_WAIT_MS;
  return Math.max(0, ms);
}

/**
 * Page function (runs in the browser): scroll down in fixed steps until the
 * bottom is reached or `maxSteps` ticks have elapsed, then return to the top.
 *
 * @param {{stepPx: number, intervalMs: number, maxSteps: number}} options
 * @returns {Promise<void>}
 */
function autoScroll({ stepPx, intervalMs, maxSteps }) {
  return new Promise((resolve) => {
    let scrolled = 0;
    let ticks = 0;
    const timer = setInterval(() => {
      // A document without <body> has nothing to scroll; stop on the first tick.
      const pageHeight = document.body ? document.body.scrollHeight : 0;
      window.scrollBy(0, stepPx);
      scrolled += stepPx;
      ticks += 1;
      if (scrolled >= pageHeight || ticks >= maxSteps) {
        clearInterval(timer);
        window.scrollTo(0, 0);
        resolve();
      }
    }, intervalMs);
  });
}

/**
 * Page function (runs in the browser): serialise either every element matching
 * `selector` (outerHTML, joined by newlines) or, when `selector` is falsy, the
 * contents of <body>. Script, style, svg, link and noscript descendants are
 * removed. All work happens on clones, so the live page is left untouched.
 *
 * @param {{selector: (string|null), withShadow: boolean}} options - When
 *   `withShadow` is true, each shadow root is embedded as a
 *   `<template data-shadowroot="open">` child of its cloned host.
 * @returns {string} Serialised HTML (empty string when nothing matched).
 */
function snapshotDom({ selector, withShadow }) {
  const stripUseless = (root) => {
    root.querySelectorAll('script, style, svg, link, noscript').forEach((node) => node.remove());
  };

  const cloneWithShadow = (root) => {
    const clone = root.cloneNode(true);
    const walkerOrig = document.createTreeWalker(root, NodeFilter.SHOW_ELEMENT);
    const walkerClone = document.createTreeWalker(clone, NodeFilter.SHOW_ELEMENT);

    // Pair hosts with their clones first and attach templates afterwards:
    // appending a <template> while walking adds an element to the clone tree
    // only, which would shift every later original/clone pairing by one.
    // The walkers never visit the root itself, so it is paired explicitly.
    const hosts = root.shadowRoot ? [[root, clone]] : [];
    while (walkerOrig.nextNode() && walkerClone.nextNode()) {
      if (walkerOrig.currentNode.shadowRoot) {
        hosts.push([walkerOrig.currentNode, walkerClone.currentNode]);
      }
    }

    hosts.forEach(([orig, cloned]) => {
      const template = document.createElement('template');
      template.setAttribute('data-shadowroot', 'open');
      template.innerHTML = orig.shadowRoot.innerHTML;
      cloned.appendChild(template);
    });

    stripUseless(clone);
    return clone;
  };

  const cleanClone = (root) => {
    if (withShadow) return cloneWithShadow(root);
    const clone = root.cloneNode(true);
    stripUseless(clone);
    return clone;
  };

  if (selector) {
    return Array.from(document.querySelectorAll(selector))
      .map((el) => cleanClone(el).outerHTML)
      .join('\n');
  }
  return cleanClone(document.body).innerHTML;
}

/**
 * Run the user's extraction script in a separate `node extraction-worker.js`
 * process with a reduced environment, feeding it JSON on stdin and parsing
 * JSON from its stdout. Never rejects: failures are reported in `result`.
 *
 * @param {string} script - Extraction script source; anything else is skipped.
 * @param {string} html - Captured HTML handed to the worker.
 * @param {string} pageUrl - Final page URL handed to the worker.
 * @param {boolean} includeShadowDom - Forwarded to the worker unchanged.
 * @returns {Promise<*>} The worker's parsed output, or `{ result, logs: [] }`
 *   describing a timeout, non-zero exit, spawn error or unparsable output.
 */
function runExtractionWorker(script, html, pageUrl, includeShadowDom) {
  if (!script || typeof script !== 'string') {
    return Promise.resolve({ result: undefined, logs: [] });
  }

  return new Promise((resolve) => {
    const worker = spawn('node', [path.join(__dirname, 'extraction-worker.js')], {
      stdio: ['pipe', 'pipe', 'pipe'],
      env: {
        NODE_ENV: 'production',
        PATH: process.env.PATH,
        LANG: process.env.LANG,
        TZ: process.env.TZ
      }
    });

    let stdout = '';
    let stderr = '';
    let settled = false;
    let timer = null;

    // Timeout, 'close' and 'error' can all fire for one run; only the first counts.
    const settle = (outcome) => {
      if (settled) return;
      settled = true;
      clearTimeout(timer);
      resolve(outcome);
    };

    timer = setTimeout(() => {
      worker.kill();
      settle({ result: 'Worker timed out', logs: [] });
    }, EXTRACTION_TIMEOUT_MS);

    // Decode at the stream level so a multi-byte character split across two
    // chunks is not corrupted.
    worker.stdout.setEncoding('utf8');
    worker.stderr.setEncoding('utf8');
    worker.stdout.on('data', (chunk) => { stdout += chunk; });
    worker.stderr.on('data', (chunk) => { stderr += chunk; });

    worker.on('close', (code) => {
      if (code !== 0) {
        settle({ result: `Worker exited with code ${code}: ${stderr}`, logs: [] });
        return;
      }
      try {
        settle(JSON.parse(stdout));
      } catch (e) {
        settle({ result: `Worker output parse error: ${e.message}. Stdout: ${stdout}`, logs: [] });
      }
    });

    worker.on('error', (err) => {
      settle({ result: `Worker spawn error: ${err.message}`, logs: [] });
    });

    // A worker that dies before reading its input makes the write fail (EPIPE);
    // without a listener that stream error would be thrown as uncaught. The
    // 'close' / 'error' handlers above already report the failure.
    worker.stdin.on('error', () => { });
    worker.stdin.end(JSON.stringify({ script, html, url: pageUrl, includeShadowDom }));
  });
}

/**
 * Close a Playwright browser or context without letting a close failure hide
 * the error that is already being handled.
 *
 * @param {{close: Function}|undefined} target - Object to close, if any.
 * @returns {Promise<void>}
 */
async function closeQuietly(target) {
  if (!target) return;
  try {
    await target.close();
  } catch (e) {
    console.error('Cleanup failed:', e.message);
  }
}

/**
 * Load a page in headless Chromium, capture its (optionally selector-scoped)
 * HTML, run the optional extraction script, take a screenshot and, unless
 * disabled, keep the session video.
 *
 * @param {object} data - Scrape options.
 * @param {string} data.url - Page to load (required).
 * @param {object} [data.headers] - Extra HTTP headers for the browser context.
 * @param {string} [data.selector] - CSS selector to capture; the whole body is
 *   used when it is absent or matches nothing.
 * @param {string|number} [data.wait] - Seconds to wait after scrolling (default 2).
 * @param {*} [data.rotateUserAgents] - Passed to selectUserAgent.
 * @param {boolean|string} [data.rotateViewport] - `true`/`'true'` picks a random viewport.
 * @param {boolean|string} [data.rotateProxies] - `true`/`'true'` is passed to getProxySelection.
 * @param {boolean|string} [data.includeShadowDom] - Anything but `false`/`'false'`
 *   embeds shadow roots in the captured HTML (default on).
 * @param {*} [data.disableRecording] - Parsed with parseBooleanFlag; skips video.
 * @param {*} [data.statelessExecution] - Parsed with parseBooleanFlag; neither
 *   loads nor saves the storage-state file.
 * @param {string} [data.runId] - Prefix for capture file names.
 * @param {string} [data.extractionScript] - Script run by the extraction worker.
 * @param {string} [data.extractionFormat] - `'csv'` converts the result with toCsvString.
 * @returns {Promise<object>} `{ title, url, html, data, is_partial,
 *   selector_used, links, screenshot_url }`.
 * @throws {Error} 'URL is required.' when `data.url` is falsy; otherwise
 *   whatever validateUrl or the browser session throws.
 */
async function runScrape(data) {
  const url = data.url;
  // Fail fast, before any option parsing or browser start-up.
  if (!url) {
    throw new Error('URL is required.');
  }

  const customHeaders = data.headers || {};
  const userSelector = data.selector;
  const waitTime = resolveWaitMs(data.wait);
  const rotateUserAgents = data.rotateUserAgents || false;
  const rotateViewport = String(data.rotateViewport).toLowerCase() === 'true';
  // runId ends up in file names under public/captures, so anything that could
  // act as a path separator or traversal sequence is replaced.
  const captureRunId = data.runId
    ? String(data.runId).replace(/[^A-Za-z0-9_-]/g, '_')
    : `run_${Date.now()}_unknown`;
  const rotateProxies = String(data.rotateProxies).toLowerCase() === 'true';
  const includeShadowDom = data.includeShadowDom === undefined
    || String(data.includeShadowDom).toLowerCase() !== 'false';
  const disableRecording = parseBooleanFlag(data.disableRecording);
  const statelessExecution = parseBooleanFlag(data.statelessExecution);
  const extractionScript = data.extractionScript;
  const extractionFormat = data.extractionFormat === 'csv' ? 'csv' : 'json';

  await validateUrl(url);

  const selectedUA = await selectUserAgent(rotateUserAgents);

  let browser;
  let context;
  try {
    const launchOptions = {
      headless: true,
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--disable-blink-features=AutomationControlled',
        '--hide-scrollbars',
        '--mute-audio'
      ]
    };
    const selection = getProxySelection(rotateProxies);
    if (selection.proxy) {
      launchOptions.proxy = selection.proxy;
    }

    browser = await chromium.launch(launchOptions);

    const recordingsDir = path.join(__dirname, 'data', 'recordings');
    await fs.promises.mkdir(recordingsDir, { recursive: true });

    const viewport = rotateViewport
      ? { width: 1280 + Math.floor(Math.random() * 640), height: 720 + Math.floor(Math.random() * 360) }
      : { width: 1366, height: 768 };

    const contextOptions = {
      userAgent: selectedUA,
      extraHTTPHeaders: customHeaders,
      viewport,
      deviceScaleFactor: 1,
      locale: 'en-US',
      timezoneId: 'America/New_York',
      colorScheme: 'dark',
      permissions: ['geolocation']
    };

    const shouldUseStorageState = !statelessExecution
      && await fs.promises.access(STORAGE_STATE_FILE).then(() => true).catch(() => false);
    if (shouldUseStorageState) {
      contextOptions.storageState = STORAGE_STATE_FILE;
    }

    if (!disableRecording) {
      contextOptions.recordVideo = { dir: recordingsDir, size: viewport };
    }

    context = await browser.newContext(contextOptions);

    await context.addInitScript(() => {
      Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
    });
    await context.addInitScript(installMouseHelper);

    if (includeShadowDom) {
      // Force every shadow root open so snapshotDom can read it later.
      await context.addInitScript(() => {
        if (!Element.prototype.attachShadow) return;
        const original = Element.prototype.attachShadow;
        Element.prototype.attachShadow = function (init) {
          const options = init ? { ...init, mode: 'open' } : { mode: 'open' };
          return original.call(this, options);
        };
      });
    }

    const page = await context.newPage();

    await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 60000 });

    await page.evaluate(autoScroll, { stepPx: 400, intervalMs: 100, maxSteps: MAX_SCROLL_STEPS });

    await page.waitForTimeout(waitTime);

    let productHtml = '';
    let usedFallback = !userSelector;

    if (userSelector) {
      productHtml = await page.evaluate(snapshotDom, { selector: userSelector, withShadow: includeShadowDom });
      usedFallback = !productHtml || productHtml.trim() === '';
    }

    if (usedFallback) {
      productHtml = await page.evaluate(snapshotDom, { selector: null, withShadow: includeShadowDom });
    }

    // The worker may print any JSON value; tolerate output without `logs`.
    const extraction = (await runExtractionWorker(extractionScript, productHtml, page.url(), includeShadowDom)) || {};
    const extractionLogs = Array.isArray(extraction.logs) ? extraction.logs : [];

    const capturesDir = path.join(__dirname, 'public', 'captures');
    await fs.promises.mkdir(capturesDir, { recursive: true });

    const screenshotName = `${captureRunId}_scrape_${Date.now()}.png`;
    const screenshotPath = path.join(capturesDir, screenshotName);
    try {
      await page.screenshot({ path: screenshotPath, fullPage: false });
    } catch (e) {
      console.error('Screenshot failed:', e.message);
    }

    const rawExtraction = extraction.result !== undefined
      ? extraction.result
      : (extractionLogs.length ? extractionLogs.join('\n') : undefined);
    const formattedExtraction = extractionFormat === 'csv' ? toCsvString(rawExtraction) : rawExtraction;

    const resultData = {
      title: await page.title(),
      url: page.url(),
      html: formatHTML(productHtml),
      data: formattedExtraction,
      is_partial: !usedFallback,
      selector_used: usedFallback ? (userSelector ? `${userSelector} (not found, using body)` : 'body (default)') : userSelector,
      links: await page.$$eval('a[href]', elements => {
        return elements.map(el => el.href).filter(href => href && href.startsWith('http'));
      }),
      screenshot_url: `/captures/${screenshotName}`
    };

    if (!statelessExecution) {
      // The scrape result is already complete; a failed save must not discard it.
      try {
        await context.storageState({ path: STORAGE_STATE_FILE });
      } catch (e) {
        console.error('Storage state save failed:', e.message);
      }
    }

    // The video handle must be taken before the context is closed; the file is
    // only moved into the captures directory after the close.
    const video = page.video();
    await context.close();
    if (video) {
      try {
        const videoPath = await video.path();
        const videoExists = videoPath && await fs.promises.access(videoPath).then(() => true).catch(() => false);
        if (videoExists) {
          const recordingName = `${captureRunId}_scrape_${Date.now()}.webm`;
          const recordingPath = path.join(capturesDir, recordingName);
          try {
            await fs.promises.rename(videoPath, recordingPath);
          } catch (err) {
            // rename cannot cross filesystems; copy and delete instead.
            if (err && err.code === 'EXDEV') {
              await fs.promises.copyFile(videoPath, recordingPath);
              await fs.promises.unlink(videoPath);
            } else {
              throw err;
            }
          }
        }
      } catch (e) {
        console.error('Recording save failed:', e.message);
      }
    }

    await browser.close();
    return resultData;
  } catch (error) {
    // Close both handles even if one close fails, then surface the original error.
    await closeQuietly(context);
    await closeQuietly(browser);
    throw error;
  }
}
359
-
360
/**
 * HTTP handler: run a scrape with options taken from the request and reply
 * with JSON.
 *
 * @param {object} req - Request; `req.body` and `req.query` supply the options.
 * @param {object} res - Response; receives the result via `res.json`, or a
 *   500 status with `{ error, details }` when the scrape fails.
 * @returns {Promise<void>}
 */
async function handleScrape(req, res) {
  // Query-string values are applied last, so they win over same-named body fields.
  const options = Object.assign({}, req.body, req.query);

  try {
    res.json(await runScrape(options));
  } catch (failure) {
    const body = { error: 'Failed to scrape', details: failure.message };
    res.status(500).json(body);
  }
}
373
-
374
- module.exports = { runScrape, handleScrape };
1
+ const { chromium } = require('playwright');
2
+ const fs = require('fs');
3
+ const path = require('path');
4
+ const { spawn } = require('child_process');
5
+ const { getProxySelection } = require('./proxy-rotation');
6
+ const { selectUserAgent } = require('./user-agent-settings');
7
+ const { formatHTML } = require('./html-utils');
8
+ const { validateUrl } = require('./url-utils');
9
+ const { parseBooleanFlag, sanitizeRunId, toCsvString, cookieMatches } = require('./common-utils');
10
+ const { installMouseHelper } = require('./src/agent/dom-utils');
11
+
12
// Default location of the persisted browser storage state, next to this module.
const STORAGE_STATE_PATH = path.join(__dirname, 'storage_state.json');

// Resolved once at load time. When STORAGE_STATE_PATH turns out to be a
// directory, the state file is expected inside it; in every other case
// (missing path, regular file, or a failed stat) the path itself is used.
const STORAGE_STATE_FILE = (() => {
  let pointsAtDirectory = false;
  try {
    pointsAtDirectory = fs.existsSync(STORAGE_STATE_PATH)
      && fs.statSync(STORAGE_STATE_PATH).isDirectory();
  } catch {
    // Best effort: a failed stat falls back to the plain path below.
  }
  return pointsAtDirectory
    ? path.join(STORAGE_STATE_PATH, 'storage_state.json')
    : STORAGE_STATE_PATH;
})();
24
+
25
/** Milliseconds to wait after scrolling when `data.wait` is absent or not a number. */
const DEFAULT_WAIT_MS = 2000;
/** Upper bound on auto-scroll ticks so pages that keep growing cannot hang the run. */
const MAX_SCROLL_STEPS = 300;
/** How long the extraction worker may run before it is killed. */
const EXTRACTION_TIMEOUT_MS = 5000;

/**
 * Convert the `wait` option (seconds) into milliseconds.
 *
 * @param {string|number|undefined} waitInput - Raw option value.
 * @returns {number} Milliseconds; DEFAULT_WAIT_MS when the input is falsy or
 *   not numeric, never negative.
 */
function resolveWaitMs(waitInput) {
  if (!waitInput) return DEFAULT_WAIT_MS;
  const ms = parseFloat(waitInput) * 1000;
  if (!Number.isFinite(ms)) return DEFAULT_WAIT_MS;
  return Math.max(0, ms);
}

/**
 * Build a Cookie header from the cookies stored on disk plus the cookies the
 * browser was already going to send. A stored cookie wins over a browser
 * cookie of the same name.
 *
 * @param {string|undefined} existingHeader - Cookie header of the outgoing request.
 * @param {Array<{name: string, value: string}>} fileCookies - Stored cookies
 *   that match the request URL.
 * @returns {string} `name=value` pairs joined with `'; '`.
 */
function mergeCookieHeader(existingHeader, fileCookies) {
  const merged = new Map();
  fileCookies.forEach((cookie) => merged.set(cookie.name, cookie.value));

  (existingHeader || '')
    .split(';')
    .filter(Boolean)
    .map((part) => part.trim())
    .forEach((part) => {
      // Only the first '=' separates name from value; values may contain '='.
      const [name, ...valueParts] = part.split('=');
      if (!merged.has(name)) {
        merged.set(name, valueParts.join('='));
      }
    });

  return Array.from(merged.entries()).map(([name, value]) => `${name}=${value}`).join('; ');
}

/**
 * Page function (runs in the browser): scroll down in fixed steps until the
 * bottom is reached or `maxSteps` ticks have elapsed, then return to the top.
 *
 * @param {{stepPx: number, intervalMs: number, maxSteps: number}} options
 * @returns {Promise<void>}
 */
function autoScroll({ stepPx, intervalMs, maxSteps }) {
  return new Promise((resolve) => {
    let scrolled = 0;
    let ticks = 0;
    const timer = setInterval(() => {
      // A document without <body> has nothing to scroll; stop on the first tick.
      const pageHeight = document.body ? document.body.scrollHeight : 0;
      window.scrollBy(0, stepPx);
      scrolled += stepPx;
      ticks += 1;
      if (scrolled >= pageHeight || ticks >= maxSteps) {
        clearInterval(timer);
        window.scrollTo(0, 0);
        resolve();
      }
    }, intervalMs);
  });
}

/**
 * Page function (runs in the browser): serialise either every element matching
 * `selector` (outerHTML, joined by newlines) or, when `selector` is falsy, the
 * contents of <body>. Script, style, svg, link and noscript descendants are
 * removed. All work happens on clones, so the live page is left untouched.
 *
 * @param {{selector: (string|null), withShadow: boolean}} options - When
 *   `withShadow` is true, each shadow root is embedded as a
 *   `<template data-shadowroot="open">` child of its cloned host.
 * @returns {string} Serialised HTML (empty string when nothing matched).
 */
function snapshotDom({ selector, withShadow }) {
  const stripUseless = (root) => {
    root.querySelectorAll('script, style, svg, link, noscript').forEach((node) => node.remove());
  };

  const cloneWithShadow = (root) => {
    const clone = root.cloneNode(true);
    const walkerOrig = document.createTreeWalker(root, NodeFilter.SHOW_ELEMENT);
    const walkerClone = document.createTreeWalker(clone, NodeFilter.SHOW_ELEMENT);

    // Pair hosts with their clones first and attach templates afterwards:
    // appending a <template> while walking adds an element to the clone tree
    // only, which would shift every later original/clone pairing by one.
    // The walkers never visit the root itself, so it is paired explicitly.
    const hosts = root.shadowRoot ? [[root, clone]] : [];
    while (walkerOrig.nextNode() && walkerClone.nextNode()) {
      if (walkerOrig.currentNode.shadowRoot) {
        hosts.push([walkerOrig.currentNode, walkerClone.currentNode]);
      }
    }

    hosts.forEach(([orig, cloned]) => {
      const template = document.createElement('template');
      template.setAttribute('data-shadowroot', 'open');
      template.innerHTML = orig.shadowRoot.innerHTML;
      cloned.appendChild(template);
    });

    stripUseless(clone);
    return clone;
  };

  const cleanClone = (root) => {
    if (withShadow) return cloneWithShadow(root);
    const clone = root.cloneNode(true);
    stripUseless(clone);
    return clone;
  };

  if (selector) {
    return Array.from(document.querySelectorAll(selector))
      .map((el) => cleanClone(el).outerHTML)
      .join('\n');
  }
  return cleanClone(document.body).innerHTML;
}

/**
 * Run the user's extraction script in a separate `node extraction-worker.js`
 * process with a reduced environment, feeding it JSON on stdin and parsing
 * JSON from its stdout. Never rejects: failures are reported in `result`.
 *
 * @param {string} script - Extraction script source; anything else is skipped.
 * @param {string} html - Captured HTML handed to the worker.
 * @param {string} pageUrl - Final page URL handed to the worker.
 * @param {boolean} includeShadowDom - Forwarded to the worker unchanged.
 * @returns {Promise<*>} The worker's parsed output, or `{ result, logs: [] }`
 *   describing a timeout, non-zero exit, spawn error or unparsable output.
 */
function runExtractionWorker(script, html, pageUrl, includeShadowDom) {
  if (!script || typeof script !== 'string') {
    return Promise.resolve({ result: undefined, logs: [] });
  }

  return new Promise((resolve) => {
    const worker = spawn('node', [path.join(__dirname, 'extraction-worker.js')], {
      stdio: ['pipe', 'pipe', 'pipe'],
      env: {
        NODE_ENV: 'production',
        PATH: process.env.PATH,
        LANG: process.env.LANG,
        TZ: process.env.TZ
      }
    });

    let stdout = '';
    let stderr = '';
    let settled = false;
    let timer = null;

    // Timeout, 'close' and 'error' can all fire for one run; only the first counts.
    const settle = (outcome) => {
      if (settled) return;
      settled = true;
      clearTimeout(timer);
      resolve(outcome);
    };

    timer = setTimeout(() => {
      worker.kill();
      settle({ result: 'Worker timed out', logs: [] });
    }, EXTRACTION_TIMEOUT_MS);

    // Decode at the stream level so a multi-byte character split across two
    // chunks is not corrupted.
    worker.stdout.setEncoding('utf8');
    worker.stderr.setEncoding('utf8');
    worker.stdout.on('data', (chunk) => { stdout += chunk; });
    worker.stderr.on('data', (chunk) => { stderr += chunk; });

    worker.on('close', (code) => {
      if (code !== 0) {
        settle({ result: `Worker exited with code ${code}: ${stderr}`, logs: [] });
        return;
      }
      try {
        settle(JSON.parse(stdout));
      } catch (e) {
        settle({ result: `Worker output parse error: ${e.message}. Stdout: ${stdout}`, logs: [] });
      }
    });

    worker.on('error', (err) => {
      settle({ result: `Worker spawn error: ${err.message}`, logs: [] });
    });

    // A worker that dies before reading its input makes the write fail (EPIPE);
    // without a listener that stream error would be thrown as uncaught. The
    // 'close' / 'error' handlers above already report the failure.
    worker.stdin.on('error', () => { });
    worker.stdin.end(JSON.stringify({ script, html, url: pageUrl, includeShadowDom }));
  });
}

/**
 * Close a Playwright browser or context without letting a close failure hide
 * the error that is already being handled.
 *
 * @param {{close: Function}|undefined} target - Object to close, if any.
 * @returns {Promise<void>}
 */
async function closeQuietly(target) {
  if (!target) return;
  try {
    await target.close();
  } catch (e) {
    console.error('Cleanup failed:', e.message);
  }
}

/**
 * Load a page in headless Chromium, capture its (optionally selector-scoped)
 * HTML, run the optional extraction script, take a screenshot and, unless
 * disabled, keep the session video. Unless the run is stateless, cookies from
 * the storage-state file are also merged into the Cookie header of document,
 * script, xhr and fetch requests, and the state is saved again afterwards.
 *
 * @param {object} data - Scrape options.
 * @param {string} data.url - Page to load (required).
 * @param {object} [data.headers] - Extra HTTP headers for the browser context.
 * @param {string} [data.selector] - CSS selector to capture; the whole body is
 *   used when it is absent or matches nothing.
 * @param {string|number} [data.wait] - Seconds to wait after scrolling (default 2).
 * @param {*} [data.rotateUserAgents] - Passed to selectUserAgent.
 * @param {boolean|string} [data.rotateViewport] - `true`/`'true'` picks a random viewport.
 * @param {boolean|string} [data.rotateProxies] - `true`/`'true'` is passed to getProxySelection.
 * @param {boolean|string} [data.includeShadowDom] - Anything but `false`/`'false'`
 *   embeds shadow roots in the captured HTML (default on).
 * @param {*} [data.disableRecording] - Parsed with parseBooleanFlag; skips video.
 * @param {*} [data.statelessExecution] - Parsed with parseBooleanFlag; neither
 *   loads nor saves the storage-state file.
 * @param {string} [data.runId] - Passed through sanitizeRunId and used as the
 *   prefix for capture file names.
 * @param {string} [data.extractionScript] - Script run by the extraction worker.
 * @param {string} [data.extractionFormat] - `'csv'` converts the result with toCsvString.
 * @returns {Promise<object>} `{ title, url, html, data, is_partial,
 *   selector_used, links, screenshot_url }`.
 * @throws {Error} 'URL is required.' when `data.url` is falsy; otherwise
 *   whatever validateUrl or the browser session throws.
 */
async function runScrape(data) {
  const url = data.url;
  // Fail fast, before any option parsing or browser start-up.
  if (!url) {
    throw new Error('URL is required.');
  }

  const customHeaders = data.headers || {};
  const userSelector = data.selector;
  const waitTime = resolveWaitMs(data.wait);
  const rotateUserAgents = data.rotateUserAgents || false;
  const rotateViewport = String(data.rotateViewport).toLowerCase() === 'true';
  const captureRunId = sanitizeRunId(data.runId || null) || `run_${Date.now()}_unknown`;
  const rotateProxies = String(data.rotateProxies).toLowerCase() === 'true';
  const includeShadowDom = data.includeShadowDom === undefined
    || String(data.includeShadowDom).toLowerCase() !== 'false';
  const disableRecording = parseBooleanFlag(data.disableRecording);
  const statelessExecution = parseBooleanFlag(data.statelessExecution);
  const extractionScript = data.extractionScript;
  const extractionFormat = data.extractionFormat === 'csv' ? 'csv' : 'json';

  await validateUrl(url);

  const selectedUA = await selectUserAgent(rotateUserAgents);

  let browser;
  let context;
  try {
    const launchOptions = {
      headless: true,
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--disable-blink-features=AutomationControlled',
        '--hide-scrollbars',
        '--mute-audio'
      ]
    };
    const selection = getProxySelection(rotateProxies);
    if (selection.proxy) {
      launchOptions.proxy = selection.proxy;
    }

    browser = await chromium.launch(launchOptions);

    const recordingsDir = path.join(__dirname, 'data', 'recordings');
    await fs.promises.mkdir(recordingsDir, { recursive: true });

    const viewport = rotateViewport
      ? { width: 1280 + Math.floor(Math.random() * 640), height: 720 + Math.floor(Math.random() * 360) }
      : { width: 1366, height: 768 };

    const contextOptions = {
      userAgent: selectedUA,
      extraHTTPHeaders: customHeaders,
      viewport,
      deviceScaleFactor: 1,
      locale: 'en-US',
      timezoneId: 'America/New_York',
      colorScheme: 'dark',
      permissions: ['geolocation']
    };

    const shouldUseStorageState = !statelessExecution
      && await fs.promises.access(STORAGE_STATE_FILE).then(() => true).catch(() => false);
    if (shouldUseStorageState) {
      contextOptions.storageState = STORAGE_STATE_FILE;
    }

    if (!disableRecording) {
      contextOptions.recordVideo = { dir: recordingsDir, size: viewport };
    }

    context = await browser.newContext(contextOptions);

    // Cookies from the storage-state file, used by the route handler below.
    let preloadedCookies = [];
    if (shouldUseStorageState) {
      try {
        const state = JSON.parse(await fs.promises.readFile(STORAGE_STATE_FILE, 'utf8'));
        if (state && Array.isArray(state.cookies)) {
          preloadedCookies = state.cookies;
        }
      } catch (e) {
        console.error('Could not read stored cookies:', e.message);
      }
    }

    await context.route('**/*', async (route) => {
      const request = route.request();
      let overrides;
      try {
        const carriesData = ['document', 'script', 'xhr', 'fetch'].includes(request.resourceType());
        if (carriesData && preloadedCookies.length > 0) {
          // Parse the URL once rather than once per stored cookie.
          const urlObj = new URL(request.url());
          const matching = preloadedCookies.filter((cookie) => cookieMatches(cookie, urlObj));
          if (matching.length > 0) {
            const headers = request.headers();
            overrides = { headers: { ...headers, cookie: mergeCookieHeader(headers.cookie, matching) } };
          }
        }
      } catch (e) {
        // A request that cannot be inspected is still sent, just without the
        // stored cookies; leaving it unhandled would stall the page.
        console.error('Cookie injection skipped:', e.message);
      }

      try {
        await route.continue(overrides);
      } catch (e) {
        // The page or context can close while a request is still in flight;
        // continuing then fails and there is nothing left to do for it.
      }
    });

    await context.addInitScript(() => {
      Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
    });
    await context.addInitScript(installMouseHelper);

    if (includeShadowDom) {
      // Force every shadow root open so snapshotDom can read it later.
      await context.addInitScript(() => {
        if (!Element.prototype.attachShadow) return;
        const original = Element.prototype.attachShadow;
        Element.prototype.attachShadow = function (init) {
          const options = init ? { ...init, mode: 'open' } : { mode: 'open' };
          return original.call(this, options);
        };
      });
    }

    const page = await context.newPage();

    await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 60000 });

    await page.evaluate(autoScroll, { stepPx: 400, intervalMs: 100, maxSteps: MAX_SCROLL_STEPS });

    await page.waitForTimeout(waitTime);

    let productHtml = '';
    let usedFallback = !userSelector;

    if (userSelector) {
      productHtml = await page.evaluate(snapshotDom, { selector: userSelector, withShadow: includeShadowDom });
      usedFallback = !productHtml || productHtml.trim() === '';
    }

    if (usedFallback) {
      productHtml = await page.evaluate(snapshotDom, { selector: null, withShadow: includeShadowDom });
    }

    // The worker may print any JSON value; tolerate output without `logs`.
    const extraction = (await runExtractionWorker(extractionScript, productHtml, page.url(), includeShadowDom)) || {};
    const extractionLogs = Array.isArray(extraction.logs) ? extraction.logs : [];

    const capturesDir = path.join(__dirname, 'public', 'captures');
    await fs.promises.mkdir(capturesDir, { recursive: true });

    const screenshotName = `${captureRunId}_scrape_${Date.now()}.png`;
    const screenshotPath = path.join(capturesDir, screenshotName);
    try {
      await page.screenshot({ path: screenshotPath, fullPage: false });
    } catch (e) {
      console.error('Screenshot failed:', e.message);
    }

    const rawExtraction = extraction.result !== undefined
      ? extraction.result
      : (extractionLogs.length ? extractionLogs.join('\n') : undefined);
    const formattedExtraction = extractionFormat === 'csv' ? toCsvString(rawExtraction) : rawExtraction;

    const resultData = {
      title: await page.title(),
      url: page.url(),
      html: formatHTML(productHtml),
      data: formattedExtraction,
      is_partial: !usedFallback,
      selector_used: usedFallback ? (userSelector ? `${userSelector} (not found, using body)` : 'body (default)') : userSelector,
      links: await page.$$eval('a[href]', elements => {
        return elements.map(el => el.href).filter(href => href && href.startsWith('http'));
      }),
      screenshot_url: `/captures/${screenshotName}`
    };

    if (!statelessExecution) {
      // The scrape result is already complete; a failed save must not discard it.
      try {
        await context.storageState({ path: STORAGE_STATE_FILE });
      } catch (e) {
        console.error('Storage state save failed:', e.message);
      }
    }

    // The video handle must be taken before the context is closed; the file is
    // only moved into the captures directory after the close.
    const video = page.video();
    await context.close();
    if (video) {
      try {
        const videoPath = await video.path();
        const videoExists = videoPath && await fs.promises.access(videoPath).then(() => true).catch(() => false);
        if (videoExists) {
          const recordingName = `${captureRunId}_scrape_${Date.now()}.webm`;
          const recordingPath = path.join(capturesDir, recordingName);
          try {
            await fs.promises.rename(videoPath, recordingPath);
          } catch (err) {
            // rename cannot cross filesystems; copy and delete instead.
            if (err && err.code === 'EXDEV') {
              await fs.promises.copyFile(videoPath, recordingPath);
              await fs.promises.unlink(videoPath);
            } else {
              throw err;
            }
          }
        }
      } catch (e) {
        console.error('Recording save failed:', e.message);
      }
    }

    await browser.close();
    return resultData;
  } catch (error) {
    // Keep whatever session state was gathered before the failure.
    if (context && !statelessExecution) {
      try {
        await context.storageState({ path: STORAGE_STATE_FILE });
      } catch (e) {
        // The context may already be closed or broken; the original error matters more.
      }
    }
    // Close both handles even if one close fails, then surface the original error.
    await closeQuietly(context);
    await closeQuietly(browser);
    throw error;
  }
}
403
+
404
/**
 * HTTP handler: run a scrape with options taken from the request and reply
 * with JSON.
 *
 * @param {object} req - Request; `req.body` and `req.query` supply the options.
 * @param {object} res - Response; receives the result via `res.json`, or a
 *   500 status with `{ error, details }` when the scrape fails.
 * @returns {Promise<void>}
 */
async function handleScrape(req, res) {
  // Query-string values are applied last, so they win over same-named body fields.
  const options = Object.assign({}, req.body, req.query);

  try {
    res.json(await runScrape(options));
  } catch (failure) {
    const body = { error: 'Failed to scrape', details: failure.message };
    res.status(500).json(body);
  }
}
417
+
418
+ module.exports = { runScrape, handleScrape };