figranium 0.9.1 → 0.9.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +674 -674
- package/README.md +336 -318
- package/agent.js +1 -1
- package/common-utils.js +211 -166
- package/dist/assets/index--OZi5-p_.css +1 -0
- package/dist/assets/index-Bkr74C53.js +15 -0
- package/dist/index.html +26 -26
- package/dist/novnc.html +108 -108
- package/extraction-worker.js +204 -197
- package/headful.js +583 -219
- package/html-utils.js +24 -24
- package/package.json +81 -78
- package/proxy-rotation.js +261 -261
- package/proxy-utils.js +84 -84
- package/public/novnc.html +108 -108
- package/scrape.js +418 -374
- package/server.js +501 -404
- package/src/server/cron-parser.js +316 -0
- package/src/server/routes/schedules.js +171 -0
- package/src/server/scheduler.js +381 -0
- package/url-utils.js +137 -116
- package/user-agent-settings.js +76 -76
- package/dist/assets/index-ALim18cn.css +0 -1
- package/dist/assets/index-D8YbCWRx.js +0 -15
package/scrape.js
CHANGED
|
@@ -1,374 +1,418 @@
|
|
|
1
|
-
const { chromium } = require('playwright');
|
|
2
|
-
const fs = require('fs');
|
|
3
|
-
const path = require('path');
|
|
4
|
-
const { spawn } = require('child_process');
|
|
5
|
-
const { getProxySelection } = require('./proxy-rotation');
|
|
6
|
-
const { selectUserAgent } = require('./user-agent-settings');
|
|
7
|
-
const { formatHTML } = require('./html-utils');
|
|
8
|
-
const { validateUrl } = require('./url-utils');
|
|
9
|
-
const { parseBooleanFlag, toCsvString } = require('./common-utils');
|
|
10
|
-
const { installMouseHelper } = require('./src/agent/dom-utils');
|
|
11
|
-
|
|
12
|
-
const STORAGE_STATE_PATH = path.join(__dirname, 'storage_state.json');
|
|
13
|
-
const STORAGE_STATE_FILE = (() => {
|
|
14
|
-
try {
|
|
15
|
-
if (fs.existsSync(STORAGE_STATE_PATH)) {
|
|
16
|
-
const stat = fs.statSync(STORAGE_STATE_PATH);
|
|
17
|
-
if (stat.isDirectory()) {
|
|
18
|
-
return path.join(STORAGE_STATE_PATH, 'storage_state.json');
|
|
19
|
-
}
|
|
20
|
-
}
|
|
21
|
-
} catch { }
|
|
22
|
-
return STORAGE_STATE_PATH;
|
|
23
|
-
})();
|
|
24
|
-
|
|
25
|
-
async function runScrape(data) {
|
|
26
|
-
const url = data.url;
|
|
27
|
-
const customHeaders = data.headers || {};
|
|
28
|
-
const userSelector = data.selector;
|
|
29
|
-
const waitInput = data.wait;
|
|
30
|
-
const waitTime = waitInput ? parseFloat(waitInput) * 1000 : 2000;
|
|
31
|
-
const rotateUserAgents = data.rotateUserAgents || false;
|
|
32
|
-
const rotateViewportRaw = data.rotateViewport;
|
|
33
|
-
const rotateViewport = String(rotateViewportRaw).toLowerCase() === 'true' || rotateViewportRaw === true;
|
|
34
|
-
const runId = data.runId || null;
|
|
35
|
-
const captureRunId =
|
|
36
|
-
const rotateProxiesRaw = data.rotateProxies;
|
|
37
|
-
const rotateProxies = String(rotateProxiesRaw).toLowerCase() === 'true' || rotateProxiesRaw === true;
|
|
38
|
-
const includeShadowDomRaw = data.includeShadowDom;
|
|
39
|
-
const includeShadowDom = includeShadowDomRaw === undefined
|
|
40
|
-
? true
|
|
41
|
-
: !(String(includeShadowDomRaw).toLowerCase() === 'false' || includeShadowDomRaw === false);
|
|
42
|
-
const disableRecordingRaw = data.disableRecording;
|
|
43
|
-
const disableRecording = parseBooleanFlag(disableRecordingRaw);
|
|
44
|
-
const statelessExecutionRaw = data.statelessExecution;
|
|
45
|
-
const statelessExecution = parseBooleanFlag(statelessExecutionRaw);
|
|
46
|
-
const extractionScript = data.extractionScript;
|
|
47
|
-
const extractionFormat = data.extractionFormat === 'csv' ? 'csv' : 'json';
|
|
48
|
-
|
|
49
|
-
if (!url) {
|
|
50
|
-
throw new Error('URL is required.');
|
|
51
|
-
}
|
|
52
|
-
|
|
53
|
-
await validateUrl(url);
|
|
54
|
-
|
|
55
|
-
const selectedUA = await selectUserAgent(rotateUserAgents);
|
|
56
|
-
|
|
57
|
-
let browser;
|
|
58
|
-
let context;
|
|
59
|
-
let page;
|
|
60
|
-
try {
|
|
61
|
-
const launchOptions = {
|
|
62
|
-
headless: true,
|
|
63
|
-
args: [
|
|
64
|
-
'--no-sandbox',
|
|
65
|
-
'--disable-setuid-sandbox',
|
|
66
|
-
'--disable-dev-shm-usage',
|
|
67
|
-
'--disable-blink-features=AutomationControlled',
|
|
68
|
-
'--hide-scrollbars',
|
|
69
|
-
'--mute-audio'
|
|
70
|
-
]
|
|
71
|
-
};
|
|
72
|
-
const selection = getProxySelection(rotateProxies);
|
|
73
|
-
if (selection.proxy) {
|
|
74
|
-
launchOptions.proxy = selection.proxy;
|
|
75
|
-
}
|
|
76
|
-
|
|
77
|
-
browser = await chromium.launch(launchOptions);
|
|
78
|
-
|
|
79
|
-
const recordingsDir = path.join(__dirname, 'data', 'recordings');
|
|
80
|
-
await fs.promises.mkdir(recordingsDir, { recursive: true });
|
|
81
|
-
|
|
82
|
-
const viewport = rotateViewport
|
|
83
|
-
? { width: 1280 + Math.floor(Math.random() * 640), height: 720 + Math.floor(Math.random() * 360) }
|
|
84
|
-
: { width: 1366, height: 768 };
|
|
85
|
-
|
|
86
|
-
const contextOptions = {
|
|
87
|
-
userAgent: selectedUA,
|
|
88
|
-
extraHTTPHeaders: customHeaders,
|
|
89
|
-
viewport,
|
|
90
|
-
deviceScaleFactor: 1,
|
|
91
|
-
locale: 'en-US',
|
|
92
|
-
timezoneId: 'America/New_York',
|
|
93
|
-
colorScheme: 'dark',
|
|
94
|
-
permissions: ['geolocation']
|
|
95
|
-
};
|
|
96
|
-
|
|
97
|
-
const shouldUseStorageState = !statelessExecution && await fs.promises.access(STORAGE_STATE_FILE).then(() => true).catch(() => false);
|
|
98
|
-
if (shouldUseStorageState) {
|
|
99
|
-
contextOptions.storageState = STORAGE_STATE_FILE;
|
|
100
|
-
}
|
|
101
|
-
|
|
102
|
-
if (!disableRecording) {
|
|
103
|
-
contextOptions.recordVideo = { dir: recordingsDir, size: viewport };
|
|
104
|
-
}
|
|
105
|
-
|
|
106
|
-
context = await browser.newContext(contextOptions);
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
};
|
|
239
|
-
|
|
240
|
-
const
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
}
|
|
280
|
-
|
|
281
|
-
const
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
const
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
1
|
+
const { chromium } = require('playwright');
|
|
2
|
+
const fs = require('fs');
|
|
3
|
+
const path = require('path');
|
|
4
|
+
const { spawn } = require('child_process');
|
|
5
|
+
const { getProxySelection } = require('./proxy-rotation');
|
|
6
|
+
const { selectUserAgent } = require('./user-agent-settings');
|
|
7
|
+
const { formatHTML } = require('./html-utils');
|
|
8
|
+
const { validateUrl } = require('./url-utils');
|
|
9
|
+
const { parseBooleanFlag, sanitizeRunId, toCsvString, cookieMatches } = require('./common-utils');
|
|
10
|
+
const { installMouseHelper } = require('./src/agent/dom-utils');
|
|
11
|
+
|
|
12
|
+
// Default location of the persisted Playwright storage state, next to this module.
const STORAGE_STATE_PATH = path.join(__dirname, 'storage_state.json');

/**
 * Resolved storage-state file path.
 *
 * If STORAGE_STATE_PATH exists and is a directory, the state file is expected
 * inside it under the same file name; otherwise STORAGE_STATE_PATH itself is
 * used (whether or not it exists yet).
 */
const STORAGE_STATE_FILE = (() => {
    try {
        // A single stat call replaces the existsSync + statSync pair.
        if (fs.statSync(STORAGE_STATE_PATH).isDirectory()) {
            return path.join(STORAGE_STATE_PATH, 'storage_state.json');
        }
    } catch (err) {
        // A missing path is the normal first-run case; anything else
        // (e.g. EACCES) is worth surfacing, but must not break module load.
        if (!err || err.code !== 'ENOENT') {
            console.error('Could not inspect storage state path:', err && err.message);
        }
    }
    return STORAGE_STATE_PATH;
})();
|
|
24
|
+
|
|
25
|
+
/**
 * Load a URL in headless Chromium, capture its HTML, optionally run an
 * extraction script over that HTML in a child process, and save a screenshot
 * (and, unless disabled, a video recording) under `public/captures`.
 *
 * @param {object} data - Scrape options.
 * @param {string} data.url - Page to load (required; checked with validateUrl).
 * @param {object} [data.headers] - Extra HTTP headers for the browser context.
 * @param {string} [data.selector] - Selector whose matches are captured. When it
 *   is absent or matches nothing, the whole <body> is captured instead.
 * @param {string|number} [data.wait] - Seconds to wait after scrolling (default 2).
 * @param {boolean} [data.rotateUserAgents] - Passed to selectUserAgent.
 * @param {boolean|string} [data.rotateViewport] - Randomize the viewport size.
 * @param {string} [data.runId] - Used (after sanitizeRunId) as capture file prefix.
 * @param {boolean|string} [data.rotateProxies] - Passed to getProxySelection.
 * @param {boolean|string} [data.includeShadowDom] - Defaults to true; only
 *   `false` / `'false'` disables shadow-DOM capture.
 * @param {boolean|string} [data.disableRecording] - Skip video recording.
 * @param {boolean|string} [data.statelessExecution] - Neither load nor save the
 *   persisted storage state.
 * @param {string} [data.extractionScript] - Script handed to extraction-worker.js.
 * @param {string} [data.extractionFormat] - `'csv'` or (default) `'json'`.
 * @returns {Promise<object>} `{ title, url, html, data, is_partial,
 *   selector_used, links, screenshot_url }`.
 * @throws {Error} When `url` is missing, when validateUrl rejects, or when any
 *   browser step fails (the browser is closed before the error is rethrown).
 */
async function runScrape(data) {
    const url = data.url;
    const customHeaders = data.headers || {};
    const userSelector = data.selector;
    const waitInput = data.wait;
    // Non-numeric input would otherwise produce a NaN timeout; fall back to
    // the default, and clamp negatives to "no wait".
    const requestedWaitMs = waitInput ? parseFloat(waitInput) * 1000 : 2000;
    const waitTime = Number.isFinite(requestedWaitMs) ? Math.max(0, requestedWaitMs) : 2000;
    const rotateUserAgents = data.rotateUserAgents || false;
    const rotateViewportRaw = data.rotateViewport;
    const rotateViewport = String(rotateViewportRaw).toLowerCase() === 'true' || rotateViewportRaw === true;
    const runId = data.runId || null;
    const captureRunId = sanitizeRunId(runId) || `run_${Date.now()}_unknown`;
    const rotateProxiesRaw = data.rotateProxies;
    const rotateProxies = String(rotateProxiesRaw).toLowerCase() === 'true' || rotateProxiesRaw === true;
    const includeShadowDomRaw = data.includeShadowDom;
    const includeShadowDom = includeShadowDomRaw === undefined
        ? true
        : !(String(includeShadowDomRaw).toLowerCase() === 'false' || includeShadowDomRaw === false);
    const disableRecording = parseBooleanFlag(data.disableRecording);
    const statelessExecution = parseBooleanFlag(data.statelessExecution);
    const extractionScript = data.extractionScript;
    const extractionFormat = data.extractionFormat === 'csv' ? 'csv' : 'json';

    if (!url) {
        throw new Error('URL is required.');
    }

    await validateUrl(url);

    const selectedUA = await selectUserAgent(rotateUserAgents);

    // Upper bound for the auto-scroll loop (100 ms per step) so that pages
    // which keep growing while scrolled cannot hang the scrape forever.
    const MAX_SCROLL_STEPS = 600;

    // Close a browser/context without letting a close failure mask the
    // original error or prevent the remaining cleanup.
    const closeQuietly = async (target, label) => {
        if (!target) return;
        try {
            await target.close();
        } catch (e) {
            console.error(`${label} close failed:`, e.message);
        }
    };

    // Runs inside the page. Playwright serializes this function, so it must
    // not reference anything from the surrounding Node scope.
    const captureHtmlInPage = ({ selector, withShadow }) => {
        const stripUseless = (root) => {
            root.querySelectorAll('script, style, svg, link, noscript').forEach(node => node.remove());
        };

        const cloneWithShadow = (root) => {
            const clone = root.cloneNode(true);
            const walkerOrig = document.createTreeWalker(root, NodeFilter.SHOW_ELEMENT);
            const walkerClone = document.createTreeWalker(clone, NodeFilter.SHOW_ELEMENT);

            // Pair up shadow hosts first and attach templates afterwards:
            // appending to the clone while walking it would insert extra
            // elements into the live walk and desynchronize the two walkers.
            // The roots are paired explicitly because nextNode() skips them.
            const hostPairs = [];
            let orig = root;
            let cloned = clone;
            while (orig && cloned) {
                if (orig.shadowRoot) hostPairs.push([orig, cloned]);
                orig = walkerOrig.nextNode();
                cloned = walkerClone.nextNode();
            }

            hostPairs.forEach(([host, target]) => {
                const template = document.createElement('template');
                template.setAttribute('data-shadowroot', 'open');
                template.innerHTML = host.shadowRoot.innerHTML;
                target.appendChild(template);
            });

            stripUseless(clone);
            return clone;
        };

        const snapshot = (el) => {
            if (withShadow) return cloneWithShadow(el);
            const copy = el.cloneNode(true);
            stripUseless(copy);
            return copy;
        };

        if (selector) {
            return Array.from(document.querySelectorAll(selector))
                .map(el => snapshot(el).outerHTML)
                .join('\n');
        }
        return snapshot(document.body).innerHTML;
    };

    let browser;
    let context;
    let page;
    try {
        const launchOptions = {
            headless: true,
            args: [
                '--no-sandbox',
                '--disable-setuid-sandbox',
                '--disable-dev-shm-usage',
                '--disable-blink-features=AutomationControlled',
                '--hide-scrollbars',
                '--mute-audio'
            ]
        };
        const selection = getProxySelection(rotateProxies);
        if (selection.proxy) {
            launchOptions.proxy = selection.proxy;
        }

        browser = await chromium.launch(launchOptions);

        const recordingsDir = path.join(__dirname, 'data', 'recordings');
        await fs.promises.mkdir(recordingsDir, { recursive: true });

        const viewport = rotateViewport
            ? { width: 1280 + Math.floor(Math.random() * 640), height: 720 + Math.floor(Math.random() * 360) }
            : { width: 1366, height: 768 };

        const contextOptions = {
            userAgent: selectedUA,
            extraHTTPHeaders: customHeaders,
            viewport,
            deviceScaleFactor: 1,
            locale: 'en-US',
            timezoneId: 'America/New_York',
            colorScheme: 'dark',
            permissions: ['geolocation']
        };

        const shouldUseStorageState = !statelessExecution
            && await fs.promises.access(STORAGE_STATE_FILE).then(() => true).catch(() => false);
        if (shouldUseStorageState) {
            contextOptions.storageState = STORAGE_STATE_FILE;
        }

        if (!disableRecording) {
            contextOptions.recordVideo = { dir: recordingsDir, size: viewport };
        }

        context = await browser.newContext(contextOptions);

        // Cookies from the storage-state file, re-applied per request below.
        let preloadedCookies = [];
        if (shouldUseStorageState) {
            try {
                const state = JSON.parse(await fs.promises.readFile(STORAGE_STATE_FILE, 'utf8'));
                if (Array.isArray(state.cookies)) {
                    preloadedCookies = state.cookies;
                }
            } catch (e) {
                console.error('Failed to read stored cookies:', e.message);
            }
        }

        // Build the `continue` overrides for a request, or undefined when the
        // request should pass through untouched. Stored cookies win over
        // cookies already on the request when names collide.
        const buildCookieOverrides = (request) => {
            const isDataRequest = ['document', 'script', 'xhr', 'fetch'].includes(request.resourceType());
            if (!isDataRequest) return undefined;

            // Parse the URL once instead of once per cookie.
            const urlObj = new URL(request.url());
            const matching = preloadedCookies.filter(cookie => cookieMatches(cookie, urlObj));
            if (matching.length === 0) return undefined;

            const cookieMap = new Map();
            matching.forEach(c => cookieMap.set(c.name, c.value));

            const requestHeaders = request.headers();
            (requestHeaders['cookie'] || '')
                .split(';')
                .map(part => part.trim())
                .filter(Boolean) // trim first so whitespace-only fragments are dropped
                .forEach((part) => {
                    const [name, ...valParts] = part.split('=');
                    if (!cookieMap.has(name)) {
                        cookieMap.set(name, valParts.join('='));
                    }
                });

            const cookieHeader = Array.from(cookieMap.entries()).map(([n, v]) => `${n}=${v}`).join('; ');
            return { headers: { ...requestHeaders, 'cookie': cookieHeader } };
        };

        // Only intercept traffic when there is something to inject.
        if (preloadedCookies.length > 0) {
            await context.route('**/*', async (route) => {
                let overrides;
                try {
                    overrides = buildCookieOverrides(route.request());
                } catch (e) {
                    // An unparsable URL or malformed cookie entry must not
                    // stall the request; let it through unmodified.
                    overrides = undefined;
                }
                try {
                    await route.continue(overrides);
                } catch (e) {
                    // The page or context was closed while the request was in
                    // flight; there is nothing left to continue.
                }
            });
        }

        await context.addInitScript(() => {
            Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
        });
        await context.addInitScript(installMouseHelper);

        if (includeShadowDom) {
            // Force every shadow root open so its content can be read later.
            await context.addInitScript(() => {
                if (!Element.prototype.attachShadow) return;
                const original = Element.prototype.attachShadow;
                Element.prototype.attachShadow = function (init) {
                    const options = init ? { ...init, mode: 'open' } : { mode: 'open' };
                    return original.call(this, options);
                };
            });
        }

        page = await context.newPage();

        await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 60000 });

        // Scroll to the bottom in steps (to trigger lazy content), then back up.
        await page.evaluate(async (maxSteps) => {
            await new Promise((resolve) => {
                let totalHeight = 0;
                let steps = 0;
                const distance = 400;
                const timer = setInterval(() => {
                    const scroller = document.body || document.documentElement;
                    const scrollHeight = scroller ? scroller.scrollHeight : 0;
                    window.scrollBy(0, distance);
                    totalHeight += distance;
                    steps += 1;
                    if (totalHeight >= scrollHeight || steps >= maxSteps) {
                        clearInterval(timer);
                        resolve();
                    }
                }, 100);
            });
            window.scrollTo(0, 0);
        }, MAX_SCROLL_STEPS);

        await page.waitForTimeout(waitTime);

        let productHtml = '';
        let usedFallback = false;

        if (userSelector) {
            if (includeShadowDom) {
                productHtml = await page.evaluate(captureHtmlInPage, { selector: userSelector, withShadow: true });
            } else {
                productHtml = await page.$$eval(userSelector, (elements) => {
                    return elements.map(el => {
                        // Strip from a copy: removing nodes from the live page
                        // would change the screenshot taken afterwards.
                        const copy = el.cloneNode(true);
                        copy.querySelectorAll('script, style, svg, link, noscript').forEach(node => node.remove());
                        return copy.outerHTML;
                    }).join('\n');
                });
            }
            if (!productHtml || productHtml.trim() === '') usedFallback = true;
        } else {
            usedFallback = true;
        }

        if (usedFallback) {
            productHtml = await page.evaluate(captureHtmlInPage, { selector: null, withShadow: includeShadowDom });
        }

        // Run the user script in a separate Node process with a minimal
        // environment and a hard timeout. Always resolves (never rejects);
        // failures are reported as a string in `result`.
        const runExtractionScript = async (script, html, pageUrl) => {
            if (!script || typeof script !== 'string') return { result: undefined, logs: [] };

            return new Promise((resolve) => {
                const safeEnv = {
                    NODE_ENV: 'production',
                    PATH: process.env.PATH,
                    LANG: process.env.LANG,
                    TZ: process.env.TZ
                };

                const worker = spawn('node', [path.join(__dirname, 'extraction-worker.js')], {
                    stdio: ['pipe', 'pipe', 'pipe'],
                    env: safeEnv
                });

                let stdout = '';
                let stderr = '';

                const workerTimeout = 5000;
                const timer = setTimeout(() => {
                    worker.kill();
                    resolve({ result: 'Worker timed out', logs: [] });
                }, workerTimeout);

                worker.stdout.on('data', (chunk) => {
                    stdout += chunk.toString();
                });

                worker.stderr.on('data', (chunk) => {
                    stderr += chunk.toString();
                });

                worker.on('close', (code) => {
                    clearTimeout(timer);
                    if (code !== 0) {
                        resolve({ result: `Worker exited with code ${code}: ${stderr}`, logs: [] });
                        return;
                    }
                    try {
                        resolve(JSON.parse(stdout));
                    } catch (e) {
                        resolve({ result: `Worker output parse error: ${e.message}. Stdout: ${stdout}`, logs: [] });
                    }
                });

                worker.on('error', (err) => {
                    clearTimeout(timer);
                    resolve({ result: `Worker spawn error: ${err.message}`, logs: [] });
                });

                // If the worker dies before reading its input the write fails
                // (EPIPE). Without a listener that would be an uncaught
                // 'error' event; the 'close'/'error' handlers above already
                // report the failure.
                worker.stdin.on('error', () => { });

                const input = JSON.stringify({
                    script,
                    html,
                    url: pageUrl,
                    includeShadowDom
                });

                worker.stdin.write(input);
                worker.stdin.end();
            });
        };

        // The worker's JSON output is not guaranteed to have the expected shape.
        const extraction = (await runExtractionScript(extractionScript, productHtml, page.url())) || {};
        const extractionLogs = Array.isArray(extraction.logs) ? extraction.logs : [];

        const capturesDir = path.join(__dirname, 'public', 'captures');
        await fs.promises.mkdir(capturesDir, { recursive: true });

        const screenshotName = `${captureRunId}_scrape_${Date.now()}.png`;
        const screenshotPath = path.join(capturesDir, screenshotName);
        try {
            await page.screenshot({ path: screenshotPath, fullPage: false });
        } catch (e) {
            console.error('Screenshot failed:', e.message);
        }

        const rawExtraction = extraction.result !== undefined
            ? extraction.result
            : (extractionLogs.length ? extractionLogs.join('\n') : undefined);
        const formattedExtraction = extractionFormat === 'csv' ? toCsvString(rawExtraction) : rawExtraction;

        const resultData = {
            title: await page.title(),
            url: page.url(),
            html: formatHTML(productHtml),
            data: formattedExtraction,
            is_partial: !usedFallback,
            selector_used: usedFallback ? (userSelector ? `${userSelector} (not found, using body)` : 'body (default)') : userSelector,
            links: await page.$$eval('a[href]', elements => {
                return elements.map(el => el.href).filter(href => href && href.startsWith('http'));
            }),
            screenshot_url: `/captures/${screenshotName}`
        };

        if (!statelessExecution) {
            try {
                await context.storageState({ path: STORAGE_STATE_FILE });
            } catch (e) {
                console.error('Storage state save failed:', e.message);
            }
        }

        // The video file is only finalized once the context is closed.
        const video = page.video();
        await context.close();
        if (video) {
            try {
                const videoPath = await video.path();
                const videoExists = videoPath && await fs.promises.access(videoPath).then(() => true).catch(() => false);
                if (videoExists) {
                    const recordingName = `${captureRunId}_scrape_${Date.now()}.webm`;
                    const recordingPath = path.join(capturesDir, recordingName);
                    try {
                        await fs.promises.rename(videoPath, recordingPath);
                    } catch (err) {
                        // rename cannot cross filesystems; copy + delete instead.
                        if (err && err.code === 'EXDEV') {
                            await fs.promises.copyFile(videoPath, recordingPath);
                            await fs.promises.unlink(videoPath);
                        } else {
                            throw err;
                        }
                    }
                }
            } catch (e) {
                console.error('Recording save failed:', e.message);
            }
        }

        await browser.close();
        return resultData;
    } catch (error) {
        if (context && !statelessExecution) {
            try {
                await context.storageState({ path: STORAGE_STATE_FILE });
            } catch (e) {
                // Best effort only: the context may already be closed.
            }
        }
        // Always attempt both closes so the browser process is not leaked,
        // and always rethrow the original failure.
        await closeQuietly(context, 'Context');
        await closeQuietly(browser, 'Browser');
        throw error;
    }
}
|
|
403
|
+
|
|
404
|
+
/**
 * HTTP handler that runs a scrape and replies with its result as JSON.
 *
 * Options are read from both the request body and the query string; a
 * query-string value overrides a body field of the same name.
 *
 * @param {object} req - Request exposing `body` and `query`.
 * @param {object} res - Response exposing `json()`, `status()` and
 *   `headersSent` (assumes an Express-style response object).
 * @returns {Promise<void>} Resolves once a response was sent or given up on;
 *   failures are reported as HTTP 500 `{ error, details }` rather than thrown.
 */
async function handleScrape(req, res) {
    const data = {
        ...req.body,
        ...req.query
    };

    try {
        const result = await runScrape(data);
        res.json(result);
    } catch (error) {
        const details = error instanceof Error ? error.message : String(error);
        if (res.headersSent) {
            // The response has already started, so a second one cannot be
            // sent; trying would throw and surface as an unhandled rejection.
            console.error('Scrape response failed after headers were sent:', details);
            return;
        }
        res.status(500).json({ error: 'Failed to scrape', details });
    }
}
|
|
417
|
+
|
|
418
|
+
module.exports = { runScrape, handleScrape };
|