stealth-cli 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +295 -0
- package/bin/stealth.js +50 -0
- package/package.json +65 -0
- package/skills/SKILL.md +244 -0
- package/src/browser.js +341 -0
- package/src/client.js +115 -0
- package/src/commands/batch.js +180 -0
- package/src/commands/browse.js +101 -0
- package/src/commands/config.js +85 -0
- package/src/commands/crawl.js +169 -0
- package/src/commands/daemon.js +143 -0
- package/src/commands/extract.js +153 -0
- package/src/commands/fingerprint.js +306 -0
- package/src/commands/interactive.js +284 -0
- package/src/commands/mcp.js +68 -0
- package/src/commands/monitor.js +160 -0
- package/src/commands/pdf.js +109 -0
- package/src/commands/profile.js +112 -0
- package/src/commands/proxy.js +116 -0
- package/src/commands/screenshot.js +96 -0
- package/src/commands/search.js +162 -0
- package/src/commands/serve.js +240 -0
- package/src/config.js +123 -0
- package/src/cookies.js +67 -0
- package/src/daemon-entry.js +19 -0
- package/src/daemon.js +294 -0
- package/src/errors.js +136 -0
- package/src/extractors/base.js +59 -0
- package/src/extractors/bing.js +47 -0
- package/src/extractors/duckduckgo.js +91 -0
- package/src/extractors/github.js +103 -0
- package/src/extractors/google.js +173 -0
- package/src/extractors/index.js +55 -0
- package/src/extractors/youtube.js +87 -0
- package/src/humanize.js +210 -0
- package/src/index.js +32 -0
- package/src/macros.js +36 -0
- package/src/mcp-server.js +341 -0
- package/src/output.js +65 -0
- package/src/profiles.js +308 -0
- package/src/proxy-pool.js +256 -0
- package/src/retry.js +112 -0
- package/src/session.js +159 -0
package/src/browser.js
ADDED
|
@@ -0,0 +1,341 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Core browser module - wraps camoufox-js for anti-detection browsing
|
|
3
|
+
*
|
|
4
|
+
* Supports two modes:
|
|
5
|
+
* 1. Direct mode — launches a new browser per command
|
|
6
|
+
* 2. Daemon mode — reuses background browser (faster)
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import { launchOptions } from 'camoufox-js';
|
|
10
|
+
import { firefox } from 'playwright-core';
|
|
11
|
+
import os from 'os';
|
|
12
|
+
import { withRetry, navigateWithRetry } from './retry.js';
|
|
13
|
+
import { randomDelay, postNavigationBehavior } from './humanize.js';
|
|
14
|
+
import { isDaemonRunning } from './daemon.js';
|
|
15
|
+
import { daemonNavigate, daemonText, daemonScreenshot, daemonRequest } from './client.js';
|
|
16
|
+
import { loadProfile, touchProfile, saveCookiesToProfile, loadCookiesFromProfile } from './profiles.js';
|
|
17
|
+
import { restoreSession, captureSession } from './session.js';
|
|
18
|
+
import { getNextProxy, getRandomProxy, reportProxy } from './proxy-pool.js';
|
|
19
|
+
|
|
20
|
+
/**
 * Map the Node.js platform identifier to the OS label used for fingerprints.
 *
 * @returns {string} 'macos' for darwin, 'windows' for win32, and 'linux' for
 *   every other platform.
 */
function getHostOS() {
  switch (os.platform()) {
    case 'darwin':
      return 'macos';
    case 'win32':
      return 'windows';
    default:
      return 'linux';
  }
}
|
|
29
|
+
|
|
30
|
+
/**
 * Build a proxy configuration object from a proxy string.
 *
 * Accepted forms:
 *   - full URL with any scheme: `http://user:pass@host:port`, `socks5://host:port`
 *   - bare `host:port` (or `user:pass@host:port`), which is treated as HTTP
 *
 * @param {string} [proxyStr] - Proxy string; falsy values mean "no proxy"
 * @returns {{ server: string, username?: string, password?: string } | null}
 *   Proxy settings, or null when no proxy string was given.
 */
function buildProxy(proxyStr) {
  if (!proxyStr) return null;

  // Only an explicit "scheme://" prefix marks a full URL. Checking for a bare
  // "http" prefix would misread hosts such as "httpproxy.local:3128" and would
  // mangle non-HTTP schemes such as "socks5://".
  const hasScheme = /^[a-z][a-z0-9+.-]*:\/\//i.test(proxyStr);

  // URL keeps credentials percent-encoded; hand back the raw values.
  const decode = (value) => {
    if (!value) return undefined;
    try {
      return decodeURIComponent(value);
    } catch {
      // Not valid percent-encoding (e.g. a literal "%") — use it verbatim.
      return value;
    }
  };

  try {
    const url = new URL(hasScheme ? proxyStr : `http://${proxyStr}`);

    return {
      // `host` already omits the port when it is absent or the scheme default,
      // so no dangling ":" is produced.
      server: `${url.protocol}//${url.host}`,
      username: decode(url.username),
      password: decode(url.password),
    };
  } catch {
    // Unparseable input: fall back to a naive host[:port] split.
    const [host, port] = proxyStr.split(':');
    return { server: port ? `http://${host}:${port}` : `http://${host}` };
  }
}
|
|
54
|
+
|
|
55
|
+
/**
 * Launch a stealth browser instance, or hand back a daemon handle.
 *
 * When no proxy, profile or session is requested and `isDaemonRunning()` is
 * true, no browser is launched: the returned handle has `isDaemon: true` and
 * null `browser`/`context`/`page`. Otherwise a Firefox instance is launched
 * with options produced by camoufox's `launchOptions`.
 *
 * Profile handling is best-effort: if the profile cannot be loaded the launch
 * continues with the defaults / explicitly passed values.
 *
 * When a proxy is in use, locale, timezone and geolocation are NOT set on the
 * context; `geoip: true` is passed to `launchOptions` instead.
 *
 * @param {object} opts
 * @param {boolean} [opts.headless=true] - Run in headless mode
 * @param {string} [opts.proxy] - Proxy string (http://user:pass@host:port)
 * @param {boolean} [opts.proxyRotate] - Take a proxy from the pool when none is set
 * @param {string} [opts.profile] - Profile name to use
 * @param {string} [opts.session] - Session name (persist cookies/state)
 * @param {string} [opts.locale] - Browser locale (default: en-US)
 * @param {string} [opts.timezone] - Timezone ID (default: America/Los_Angeles)
 * @param {object} [opts.viewport] - { width, height } (default: 1280x720)
 * @param {boolean} [opts.humanize] - Accepted for compatibility; not read here
 *   (`humanize: true` is always passed to `launchOptions`; see `navigate` for
 *   post-navigation behaviour)
 * @returns {Promise<{ browser, context, page, isDaemon, _meta }>}
 * @throws Re-throws any launch/setup error after closing the launched browser.
 */
export async function launchBrowser(opts = {}) {
  let {
    headless = true,
    proxy: proxyStr,
    proxyRotate = false,
    profile: profileName,
    session: sessionName,
    locale = 'en-US',
    timezone = 'America/Los_Angeles',
    viewport = { width: 1280, height: 720 },
  } = opts;

  // --- Load profile if specified ---
  let profileData = null;
  if (profileName) {
    try {
      profileData = loadProfile(profileName);
      // A profile without a fingerprint must still contribute its proxy and
      // be touched, so default to an empty fingerprint instead of throwing.
      const fp = profileData.fingerprint ?? {};
      locale = fp.locale || locale;
      timezone = fp.timezone || timezone;
      viewport = fp.viewport || viewport;
      if (profileData.proxy && !proxyStr) {
        proxyStr = profileData.proxy;
      }
      touchProfile(profileName);
    } catch {
      // Best-effort: an unreadable profile falls back to the defaults above.
    }
  }

  // --- Proxy pool rotation ---
  if (proxyRotate && !proxyStr) {
    proxyStr = getNextProxy();
  }

  // Daemon is only usable when no proxy/profile/session customisation is needed.
  if (!proxyStr && !profileName && !sessionName && isDaemonRunning()) {
    return {
      browser: null,
      context: null,
      page: null,
      isDaemon: true,
      _meta: { profileName, sessionName, proxyUrl: null },
    };
  }

  const hostOS = profileData?.fingerprint?.os || getHostOS();
  const proxy = buildProxy(proxyStr);

  const options = await launchOptions({
    headless,
    os: hostOS,
    humanize: true,
    enable_cache: true,
    proxy: proxy || undefined,
    geoip: !!proxy,
  });

  const browser = await firefox.launch(options);

  // Everything below can throw; make sure the browser process launched above
  // does not outlive a failed setup.
  try {
    const contextOptions = {
      viewport,
      permissions: ['geolocation'],
    };

    if (!proxy) {
      contextOptions.locale = locale;
      contextOptions.timezoneId = timezone;
      contextOptions.geolocation =
        profileData?.fingerprint?.geo || { latitude: 37.7749, longitude: -122.4194 };
    }

    const context = await browser.newContext(contextOptions);

    // --- Restore profile cookies ---
    if (profileName) {
      await loadCookiesFromProfile(profileName, context);
    }

    // --- Restore session ---
    let sessionInfo = null;
    if (sessionName) {
      sessionInfo = await restoreSession(sessionName, context);
    }

    const page = await context.newPage();

    // If the session recorded a last URL, try to reopen it (failure is ignored).
    if (sessionInfo?.lastUrl && sessionInfo.lastUrl !== 'about:blank') {
      try {
        await page.goto(sessionInfo.lastUrl, { waitUntil: 'domcontentloaded', timeout: 15000 });
      } catch {
        // Best-effort restore; the caller navigates explicitly afterwards.
      }
    }

    return {
      browser,
      context,
      page,
      isDaemon: false,
      _meta: { profileName, sessionName, proxyUrl: proxyStr, sessionInfo },
    };
  } catch (err) {
    await browser.close().catch(() => {});
    throw err;
  }
}
|
|
168
|
+
|
|
169
|
+
/**
 * Close a browser handle and clean up. No-op for daemon handles and for a
 * missing handle.
 *
 * Before closing, profile cookies are saved when `_meta.profileName` is set,
 * and session state is captured when `_meta.sessionName` is set. Every step
 * is best-effort and isolated from the others: a failure while saving state
 * never prevents the context and browser from being closed.
 *
 * @param {object} handle - Handle returned by launchBrowser
 * @returns {Promise<void>}
 */
export async function closeBrowser(handle) {
  if (!handle || handle.isDaemon) return;

  const { browser, context, page, _meta } = handle;

  // Runs one cleanup step, swallowing both synchronous throws and rejections.
  const attempt = async (step) => {
    try {
      await step();
    } catch {
      // Ignore cleanup errors
    }
  };

  // Auto-save profile cookies
  if (_meta?.profileName && context) {
    await attempt(() => saveCookiesToProfile(_meta.profileName, context));
  }

  // Auto-save session
  if (_meta?.sessionName && context && page) {
    await attempt(() => captureSession(_meta.sessionName, context, page, {
      profile: _meta.profileName,
    }));
  }

  if (context) await attempt(() => context.close());
  if (browser) await attempt(() => browser.close());
}
|
|
197
|
+
|
|
198
|
+
/**
 * Navigate the handle to a URL, retrying on failure.
 *
 * Daemon handles go through `daemonNavigate` wrapped in `withRetry`; direct
 * handles go through `navigateWithRetry` on the page and, when `humanize` is
 * set, run `postNavigationBehavior` afterwards.
 *
 * @param {object} handle - { page, isDaemon } from launchBrowser
 * @param {string} url - Target URL
 * @param {object} opts - Navigation options
 * @param {number} [opts.timeout=30000] - Navigation timeout in ms
 * @param {string} [opts.waitUntil='domcontentloaded'] - Load state to wait for
 * @param {boolean} [opts.humanize=false] - Run post-navigation behaviour (direct mode only)
 * @param {number} [opts.retries=2] - Maximum number of retries
 * @returns {Promise<string>} URL reported after navigation
 */
export async function navigate(handle, url, opts = {}) {
  const {
    timeout = 30000,
    waitUntil = 'domcontentloaded',
    humanize = false,
    retries = 2,
  } = opts;

  if (!handle.isDaemon) {
    const landedUrl = await navigateWithRetry(handle.page, url, {
      timeout,
      waitUntil,
      maxRetries: retries,
    });

    // Human behavior after navigation
    if (humanize) {
      await postNavigationBehavior(handle.page);
    }

    return landedUrl;
  }

  // Daemon path: a missing or non-ok reply counts as a failed attempt.
  const viaDaemon = async () => {
    const reply = await daemonNavigate(url, { timeout, waitUntil });
    if (reply?.ok) return reply.url;
    throw new Error(reply?.error || 'Daemon navigation failed');
  };

  return withRetry(viaDaemon, { maxRetries: retries, label: `navigate(daemon)` });
}
|
|
229
|
+
|
|
230
|
+
/**
 * Wait for the page to reach a load state, ignoring timeouts.
 *
 * @param {object|null} page - Page to wait on; a falsy page (daemon mode) is a no-op
 * @param {object} opts
 * @param {number} [opts.timeout=5000] - Maximum wait in ms
 * @param {boolean} [opts.waitForNetwork=true] - Wait for 'networkidle' when true,
 *   otherwise for 'domcontentloaded'
 * @returns {Promise<void>}
 */
export async function waitForReady(page, opts = {}) {
  const { timeout = 5000, waitForNetwork = true } = opts;

  // Daemon mode — no direct page access
  if (!page) return;

  const targetState = waitForNetwork ? 'networkidle' : 'domcontentloaded';

  try {
    await page.waitForLoadState(targetState, { timeout });
  } catch {
    // Timeout is OK
  }
}
|
|
248
|
+
|
|
249
|
+
/**
 * Get an accessibility (ARIA) snapshot of the current page.
 *
 * @param {object} handle - Handle from launchBrowser
 * @returns {Promise<string>} Snapshot text, or '' when it is unavailable or
 *   taking it fails in direct mode
 */
export async function getSnapshot(handle) {
  if (!handle.isDaemon) {
    try {
      await waitForReady(handle.page, { waitForNetwork: false });
      const tree = await handle.page.locator('body').ariaSnapshot({ timeout: 8000 });
      return tree || '';
    } catch {
      return '';
    }
  }

  const reply = await daemonRequest('/snapshot');
  return reply?.snapshot || '';
}
|
|
266
|
+
|
|
267
|
+
/**
 * Extract text content from the current page.
 *
 * In direct mode the body is cloned inside the page, `script`, `style` and
 * `noscript` elements are removed from the clone, and its `innerText`
 * (falling back to `textContent`) is returned.
 *
 * @param {object} handle - Handle from launchBrowser
 * @returns {Promise<string>} Page text, or '' when none is available
 */
export async function getTextContent(handle) {
  if (handle.isDaemon) {
    const reply = await daemonRequest('/text');
    return reply?.text || '';
  }

  // Runs inside the page, not in Node.
  const readVisibleText = () => {
    if (!document.body) return '';

    const copy = document.body.cloneNode(true);
    for (const node of copy.querySelectorAll('script, style, noscript')) {
      node.remove();
    }

    return copy.innerText || copy.textContent || '';
  };

  return handle.page.evaluate(readVisibleText);
}
|
|
286
|
+
|
|
287
|
+
/**
 * Get the title of the current page.
 *
 * @param {object} handle - Handle from launchBrowser
 * @returns {Promise<string>} Page title ('' when the daemon reply has none)
 */
export async function getTitle(handle) {
  if (!handle.isDaemon) {
    return handle.page.title();
  }

  const reply = await daemonRequest('/title');
  return reply?.title || '';
}
|
|
297
|
+
|
|
298
|
+
/**
 * Get the URL of the current page.
 *
 * @param {object} handle - Handle from launchBrowser
 * @returns {Promise<string>} Current URL ('about:blank' when the daemon reply has none)
 */
export async function getUrl(handle) {
  if (!handle.isDaemon) {
    return handle.page.url();
  }

  // Use /title endpoint which returns the current page URL
  const reply = await daemonRequest('/title');
  return reply?.url || 'about:blank';
}
|
|
309
|
+
|
|
310
|
+
/**
 * Take a screenshot of the current page.
 *
 * In direct mode the page's own `screenshot` is used (it receives `path` when
 * one is given) and the image is returned base64-encoded. In daemon mode the
 * daemon reply is returned as-is, and its base64 `data` is written to `path`
 * when both are present.
 *
 * @param {object} handle - Handle from launchBrowser
 * @param {object} opts
 * @param {string} [opts.path] - File to write the image to
 * @param {boolean} [opts.fullPage=false] - Capture the full page
 * @returns {Promise<object|null>} `{ data }` with base64 image data in direct
 *   mode; the raw daemon reply in daemon mode
 */
export async function takeScreenshot(handle, opts = {}) {
  const { path: filePath, fullPage = false } = opts;

  if (!handle.isDaemon) {
    const shotOptions = filePath ? { fullPage, path: filePath } : { fullPage };
    const png = await handle.page.screenshot(shotOptions);
    return { data: png.toString('base64') };
  }

  const reply = await daemonRequest('/screenshot', { fullPage });
  const shouldPersist = Boolean(reply?.data && filePath);
  if (shouldPersist) {
    const fsModule = await import('fs');
    fsModule.writeFileSync(filePath, Buffer.from(reply.data, 'base64'));
  }
  return reply;
}
|
|
331
|
+
|
|
332
|
+
/**
 * Evaluate JavaScript in the current page.
 *
 * @param {object} handle - Handle from launchBrowser
 * @param {string|Function} expression - Passed unchanged to the page (direct
 *   mode) or sent to the daemon's /evaluate route (daemon mode)
 * @returns {Promise<*>} Evaluation result (`undefined` when the daemon reply has none)
 */
export async function evaluate(handle, expression) {
  if (!handle.isDaemon) {
    return handle.page.evaluate(expression);
  }

  const reply = await daemonRequest('/evaluate', { expression });
  return reply?.result;
}
|
package/src/client.js
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Daemon client — communicates with background daemon via unix socket
|
|
3
|
+
*
|
|
4
|
+
* Usage:
|
|
5
|
+
* if (await daemonRequest('/status')) { ... } // daemon is running
|
|
6
|
+
* const result = await daemonNavigate('https://example.com');
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import http from 'http';
|
|
10
|
+
import { SOCKET_PATH, isDaemonRunning } from './daemon.js';
|
|
11
|
+
|
|
12
|
+
/**
 * Send a JSON POST request to the daemon over its unix socket.
 *
 * The promise never rejects: every failure mode resolves to null. The HTTP
 * status code of the reply is not inspected.
 *
 * @param {string} route - Route path (e.g. '/navigate')
 * @param {object} body - Request body (serialised as JSON)
 * @param {number} timeout - Request timeout in ms
 * @returns {Promise<object|null>} Parsed JSON reply, or null when the daemon
 *   is not running, the request errors or times out, the response is cut
 *   short, or the reply is not valid JSON
 */
export async function daemonRequest(route, body = {}, timeout = 35000) {
  if (!isDaemonRunning()) return null;

  return new Promise((resolve) => {
    const postData = JSON.stringify(body);

    const req = http.request(
      {
        socketPath: SOCKET_PATH,
        path: route,
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
          'Content-Length': Buffer.byteLength(postData),
        },
        timeout,
      },
      (res) => {
        // Decode as a stream so multi-byte UTF-8 characters that straddle a
        // chunk boundary are not corrupted by per-chunk string conversion.
        res.setEncoding('utf8');

        let data = '';
        res.on('data', (chunk) => { data += chunk; });
        res.on('end', () => {
          try {
            resolve(JSON.parse(data));
          } catch {
            resolve(null);
          }
        });
        // A response aborted mid-stream never emits 'end'; settle anyway.
        // After a normal 'end' these extra resolve calls are no-ops.
        res.on('error', () => resolve(null));
        res.on('close', () => resolve(null));
      },
    );

    req.on('error', () => resolve(null));
    req.on('timeout', () => {
      req.destroy();
      resolve(null);
    });

    req.write(postData);
    req.end();
  });
}
|
|
60
|
+
|
|
61
|
+
/**
 * Ask the daemon to navigate to a URL.
 *
 * @param {string} url - Target URL
 * @param {object} opts - Extra fields merged into the request body after `url`
 * @returns {Promise<object|null>} Daemon reply, or null if daemon not available
 */
export async function daemonNavigate(url, opts = {}) {
  const payload = { url, ...opts };
  return daemonRequest('/navigate', payload);
}
|
|
67
|
+
|
|
68
|
+
/**
 * Request a page snapshot from the daemon.
 *
 * @param {object} opts - Request body sent to the '/snapshot' route
 * @returns {Promise<object|null>} Daemon reply, or null on failure
 */
export async function daemonSnapshot(opts = {}) {
  const reply = await daemonRequest('/snapshot', opts);
  return reply;
}
|
|
74
|
+
|
|
75
|
+
/**
 * Request the page text from the daemon.
 *
 * @param {object} opts - Request body sent to the '/text' route
 * @returns {Promise<object|null>} Daemon reply, or null on failure
 */
export async function daemonText(opts = {}) {
  const reply = await daemonRequest('/text', opts);
  return reply;
}
|
|
81
|
+
|
|
82
|
+
/**
 * Request a screenshot from the daemon (the reply carries base64 data).
 *
 * @param {object} opts - Request body sent to the '/screenshot' route
 * @returns {Promise<object|null>} Daemon reply, or null on failure
 */
export async function daemonScreenshot(opts = {}) {
  const reply = await daemonRequest('/screenshot', opts);
  return reply;
}
|
|
88
|
+
|
|
89
|
+
/**
 * Request the page title from the daemon.
 *
 * @param {object} opts - Request body sent to the '/title' route
 * @returns {Promise<object|null>} Daemon reply, or null on failure
 */
export async function daemonTitle(opts = {}) {
  const reply = await daemonRequest('/title', opts);
  return reply;
}
|
|
95
|
+
|
|
96
|
+
/**
 * Ask the daemon to evaluate JavaScript in its page.
 *
 * @param {string} expression - Expression to evaluate
 * @param {object} opts - Extra fields merged into the request body after `expression`
 * @returns {Promise<object|null>} Daemon reply, or null on failure
 */
export async function daemonEvaluate(expression, opts = {}) {
  const payload = { expression, ...opts };
  return daemonRequest('/evaluate', payload);
}
|
|
102
|
+
|
|
103
|
+
/**
 * Query the daemon's '/status' route.
 *
 * @returns {Promise<object|null>} Daemon reply, or null on failure
 */
export async function daemonStatus() {
  const reply = await daemonRequest('/status');
  return reply;
}
|
|
109
|
+
|
|
110
|
+
/**
 * Send a request to the daemon's '/shutdown' route.
 *
 * @returns {Promise<object|null>} Daemon reply, or null on failure
 */
export async function daemonShutdown() {
  const reply = await daemonRequest('/shutdown');
  return reply;
}
|
package/src/commands/batch.js
ADDED
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* stealth batch <file> - Execute commands for a list of URLs
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import fs from 'fs';
|
|
6
|
+
import ora from 'ora';
|
|
7
|
+
import {
|
|
8
|
+
launchBrowser, closeBrowser, navigate, getTextContent,
|
|
9
|
+
getTitle, takeScreenshot, waitForReady,
|
|
10
|
+
} from '../browser.js';
|
|
11
|
+
import { navigateWithRetry } from '../retry.js';
|
|
12
|
+
import { randomDelay } from '../humanize.js';
|
|
13
|
+
import { formatOutput, log } from '../output.js';
|
|
14
|
+
|
|
15
|
+
/** Commands accepted by `--command`. */
const BATCH_COMMANDS = ['browse', 'screenshot', 'extract'];

/** Parse an integer CLI option, using `fallback` when it is not a number. */
function parseIntOption(value, fallback) {
  const parsed = Number.parseInt(value, 10);
  return Number.isNaN(parsed) ? fallback : parsed;
}

/** Read the URL file: one entry per line, keeping only lines that start with "http". */
function readBatchUrls(file) {
  return fs.readFileSync(file, 'utf-8')
    .split('\n')
    .map((line) => line.trim())
    // Blank lines and "#" comments are dropped by the same prefix test.
    .filter((line) => line.startsWith('http'));
}

/** Run the selected batch command against the already-navigated page and build its result record. */
async function runBatchCommand(handle, url, opts) {
  switch (opts.command) {
    case 'browse': {
      const title = await getTitle(handle);
      const content = await getTextContent(handle);
      return {
        url,
        title,
        content: content.slice(0, 5000),
        timestamp: new Date().toISOString(),
      };
    }

    case 'screenshot': {
      // Generate filename from URL
      const filename = `${url
        .replace(/https?:\/\//, '')
        .replace(/[^a-zA-Z0-9]/g, '_')
        .slice(0, 80)}.png`;
      const filepath = `${opts.output}/${filename}`;

      await takeScreenshot(handle, { path: filepath });
      return { url, screenshot: filepath, timestamp: new Date().toISOString() };
    }

    case 'extract': {
      const selector = opts.selector || 'body';
      let data;
      if (handle.isDaemon) {
        const { evaluate } = await import('../browser.js');
        // JSON.stringify yields a correctly escaped JS string literal, so
        // selectors containing quotes (e.g. a[href='x']) cannot break or
        // inject into the evaluated expression.
        data = await evaluate(
          handle,
          `document.querySelector(${JSON.stringify(selector)})?.textContent?.trim() || ''`,
        );
      } else {
        data = await handle.page.$eval(selector, (el) => el.textContent?.trim()).catch(() => '');
      }
      return {
        url,
        selector,
        data,
        timestamp: new Date().toISOString(),
      };
    }

    default:
      throw new Error(`Unknown command: ${opts.command}`);
  }
}

/** End a write stream and wait until it has closed, so buffered lines reach disk. */
function closeOutputStream(stream) {
  if (!stream || stream.destroyed) return Promise.resolve();
  return new Promise((resolve) => {
    stream.once('close', resolve);
    stream.end();
  });
}

/**
 * Register the `batch` command: run browse / screenshot / extract for every
 * URL listed in a file, reusing a single browser (or the daemon).
 *
 * Output: with `--format jsonl` (default) and a non-screenshot command, one
 * JSON line per URL is written to `<output>/batch-<timestamp>.jsonl`;
 * otherwise non-screenshot results are printed via `formatOutput`.
 *
 * The process exits with code 1 when the file is missing, holds no URLs, the
 * command is unknown, or a URL fails without `--skip-errors`. The browser is
 * closed and the output stream flushed before exiting.
 *
 * Note: `--concurrency` is accepted but not used by this implementation;
 * URLs are processed sequentially.
 *
 * @param {object} program - Commander-style program to attach the command to
 */
export function registerBatch(program) {
  program
    .command('batch')
    .description('Process a list of URLs in batch')
    .argument('<file>', 'File containing URLs (one per line)')
    .option('-c, --command <cmd>', 'Command to run: browse, screenshot, extract', 'browse')
    .option('-o, --output <dir>', 'Output directory for results', '.')
    .option('-f, --format <format>', 'Output format: json, jsonl, text', 'jsonl')
    .option('--delay <ms>', 'Delay between URLs (ms)', '1000')
    .option('--concurrency <n>', 'Max parallel operations (reuses single browser)', '1')
    .option('-s, --selector <selector>', 'CSS selector for extract mode')
    .option('--proxy <proxy>', 'Proxy server')
    .option('--profile <name>', 'Use a browser profile')
    .option('--proxy-rotate', 'Rotate proxy from pool')
    .option('--no-headless', 'Show browser window')
    .option('--retries <n>', 'Max retries per URL', '2')
    .option('--skip-errors', 'Continue on errors instead of stopping')
    .action(async (file, opts) => {
      // Read URLs from file
      if (!fs.existsSync(file)) {
        log.error(`File not found: ${file}`);
        process.exit(1);
      }

      const urls = readBatchUrls(file);

      if (urls.length === 0) {
        log.error('No valid URLs found in file');
        process.exit(1);
      }

      // Reject a mistyped command before launching a browser and visiting URLs.
      if (!BATCH_COMMANDS.includes(opts.command)) {
        log.error(`Unknown command: ${opts.command}`);
        process.exit(1);
      }

      const retries = parseIntOption(opts.retries, 2);
      const delay = parseIntOption(opts.delay, 1000);

      log.info(`Batch processing ${urls.length} URLs (command: ${opts.command})`);

      const spinner = ora('Launching stealth browser...').start();
      let handle;
      let outputStream;
      let success = 0;
      let failed = 0;
      let aborted = false;

      try {
        handle = await launchBrowser({
          headless: opts.headless,
          proxy: opts.proxy,
          proxyRotate: opts.proxyRotate,
          profile: opts.profile,
        });

        const writesJsonl = opts.format === 'jsonl' && opts.command !== 'screenshot';

        // Both screenshots and the JSONL file are written into the output directory.
        if (opts.command === 'screenshot' || writesJsonl) {
          fs.mkdirSync(opts.output, { recursive: true });
        }

        // Create output stream for jsonl mode
        if (writesJsonl) {
          const outFile = `${opts.output}/batch-${Date.now()}.jsonl`;
          outputStream = fs.createWriteStream(outFile);
          // Without a listener a stream error would crash the process.
          outputStream.on('error', (err) => log.warn(`Output stream error: ${err.message}`));
          log.dim(` Output: ${outFile}`);
        }

        for (let i = 0; i < urls.length; i++) {
          const url = urls[i];
          spinner.text = `[${i + 1}/${urls.length}] ${opts.command}: ${url.slice(0, 50)}...`;

          try {
            if (handle.isDaemon) {
              await navigate(handle, url, { retries });
            } else {
              await navigateWithRetry(handle.page, url, { maxRetries: retries });
              await waitForReady(handle.page, { timeout: 3000 });
            }

            const result = await runBatchCommand(handle, url, opts);

            // Output result. Without a stream the format cannot be 'jsonl'
            // here, so formatOutput handles every remaining case.
            if (outputStream) {
              outputStream.write(`${JSON.stringify(result)}\n`);
            } else if (opts.command !== 'screenshot') {
              console.log(formatOutput(result, opts.format));
            }

            success++;
          } catch (err) {
            failed++;
            log.warn(`[${i + 1}] Failed: ${url} — ${err.message}`);

            if (!opts.skipErrors) {
              throw err;
            }
          }

          // Delay between URLs
          if (i < urls.length - 1) {
            await randomDelay(delay * 0.8, delay * 1.2);
          }
        }

        spinner.stop();
        log.success(`Batch complete: ${success} succeeded, ${failed} failed, ${urls.length} total`);
      } catch (err) {
        aborted = true;
        spinner.stop();
        log.error(`Batch failed: ${err.message}`);
        log.dim(` Completed: ${success}/${urls.length}`);
      } finally {
        // Flush results and release the browser before any exit; calling
        // process.exit() inside the catch block would skip this cleanup.
        await closeOutputStream(outputStream);
        if (handle) await closeBrowser(handle);
      }

      if (aborted) process.exit(1);
    });
}
|