stealth-cli 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +295 -0
- package/bin/stealth.js +50 -0
- package/package.json +65 -0
- package/skills/SKILL.md +244 -0
- package/src/browser.js +341 -0
- package/src/client.js +115 -0
- package/src/commands/batch.js +180 -0
- package/src/commands/browse.js +101 -0
- package/src/commands/config.js +85 -0
- package/src/commands/crawl.js +169 -0
- package/src/commands/daemon.js +143 -0
- package/src/commands/extract.js +153 -0
- package/src/commands/fingerprint.js +306 -0
- package/src/commands/interactive.js +284 -0
- package/src/commands/mcp.js +68 -0
- package/src/commands/monitor.js +160 -0
- package/src/commands/pdf.js +109 -0
- package/src/commands/profile.js +112 -0
- package/src/commands/proxy.js +116 -0
- package/src/commands/screenshot.js +96 -0
- package/src/commands/search.js +162 -0
- package/src/commands/serve.js +240 -0
- package/src/config.js +123 -0
- package/src/cookies.js +67 -0
- package/src/daemon-entry.js +19 -0
- package/src/daemon.js +294 -0
- package/src/errors.js +136 -0
- package/src/extractors/base.js +59 -0
- package/src/extractors/bing.js +47 -0
- package/src/extractors/duckduckgo.js +91 -0
- package/src/extractors/github.js +103 -0
- package/src/extractors/google.js +173 -0
- package/src/extractors/index.js +55 -0
- package/src/extractors/youtube.js +87 -0
- package/src/humanize.js +210 -0
- package/src/index.js +32 -0
- package/src/macros.js +36 -0
- package/src/mcp-server.js +341 -0
- package/src/output.js +65 -0
- package/src/profiles.js +308 -0
- package/src/proxy-pool.js +256 -0
- package/src/retry.js +112 -0
- package/src/session.js +159 -0
package/src/daemon.js
ADDED
|
@@ -0,0 +1,294 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Daemon mode — keeps browser alive in background for instant reuse
|
|
3
|
+
*
|
|
4
|
+
* Architecture:
|
|
5
|
+
* 1. `stealth daemon start` spawns a background HTTP server on a unix socket
|
|
6
|
+
* 2. CLI commands detect the daemon and send requests via HTTP
|
|
7
|
+
* 3. Daemon auto-shuts down after idle timeout (default 5 min)
|
|
8
|
+
*
|
|
9
|
+
* Socket: ~/.stealth/daemon.sock
|
|
10
|
+
* PID: ~/.stealth/daemon.pid
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import http from 'http';
|
|
14
|
+
import fs from 'fs';
|
|
15
|
+
import path from 'path';
|
|
16
|
+
import os from 'os';
|
|
17
|
+
import { launchOptions } from 'camoufox-js';
|
|
18
|
+
import { firefox } from 'playwright-core';
|
|
19
|
+
|
|
20
|
+
const STEALTH_DIR = path.join(os.homedir(), '.stealth');
|
|
21
|
+
const SOCKET_PATH = path.join(STEALTH_DIR, 'daemon.sock');
|
|
22
|
+
const PID_PATH = path.join(STEALTH_DIR, 'daemon.pid');
|
|
23
|
+
const DEFAULT_IDLE_TIMEOUT = 5 * 60 * 1000; // 5 minutes
|
|
24
|
+
|
|
25
|
+
export { SOCKET_PATH, PID_PATH, STEALTH_DIR };
|
|
26
|
+
|
|
27
|
+
/**
 * Check whether a daemon process is currently running.
 *
 * The daemon counts as running when the PID file holds the id of a live
 * process and the socket file exists. When the PID file is unreadable,
 * malformed, or names a process that cannot be signalled, the socket and PID
 * files are removed as stale.
 *
 * @returns {boolean} true when a live process and a socket file were found
 */
export function isDaemonRunning() {
  try {
    if (!fs.existsSync(PID_PATH)) return false;

    const pid = Number.parseInt(fs.readFileSync(PID_PATH, 'utf-8').trim(), 10);

    // A PID of 0 or a negative PID makes process.kill() target a process
    // group instead of a single process, so only positive integers are valid.
    if (!Number.isInteger(pid) || pid <= 0) {
      cleanup();
      return false;
    }

    // Signal 0 performs the existence/permission check without delivering a signal.
    process.kill(pid, 0);

    // The process is alive; it is only usable if the socket is there too.
    return fs.existsSync(SOCKET_PATH);
  } catch {
    // Process not found, permission error, or unreadable PID file
    cleanup();
    return false;
  }
}
|
|
44
|
+
|
|
45
|
+
/**
 * Remove the daemon's socket and PID files.
 *
 * Each removal is best effort: a file that is missing or cannot be deleted is
 * skipped without raising.
 */
function cleanup() {
  for (const stalePath of [SOCKET_PATH, PID_PATH]) {
    try {
      fs.unlinkSync(stalePath);
    } catch {
      // Nothing to remove (or not removable) — ignore and move on.
    }
  }
}
|
|
52
|
+
|
|
53
|
+
/**
 * Map the host platform to the OS name passed to the fingerprint options.
 *
 * @returns {'macos' | 'windows' | 'linux'} 'linux' for every platform other
 *   than darwin and win32
 */
function getHostOS() {
  switch (os.platform()) {
    case 'darwin':
      return 'macos';
    case 'win32':
      return 'windows';
    default:
      return 'linux';
  }
}
|
|
62
|
+
|
|
63
|
+
/**
 * Start the daemon server.
 *
 * Launches one headless Camoufox browser, then serves JSON commands over an
 * HTTP server listening on SOCKET_PATH. Browser contexts (one page each) are
 * cached per caller-supplied `key` and reused across requests. The process
 * exits after `idleTimeout` ms without a request, on SIGTERM/SIGINT, or when
 * `/shutdown` is requested.
 *
 * Routes (JSON request body, JSON response):
 *   /status      daemon pid, uptime, context count, browser state, memory
 *   /navigate    { key?, url, waitUntil?, timeout? }
 *   /snapshot    { key? }  ARIA snapshot of <body>
 *   /text        { key? }  body text without script/style/noscript
 *   /screenshot  { key?, fullPage? }  base64 PNG
 *   /evaluate    { key?, expression }
 *   /title       { key? }
 *   /close       { key }   closes the context stored under `key`
 *   /shutdown    stops the daemon
 *
 * Malformed requests (invalid JSON, missing `url`/`expression`) are answered
 * with status 400; any other failure with status 500. Error bodies have the
 * shape `{ error: string }`.
 *
 * @param {object} [opts]
 * @param {number} [opts.idleTimeout] - idle time in ms before auto-shutdown
 *   (defaults to DEFAULT_IDLE_TIMEOUT)
 * @param {boolean} [opts.verbose=false] - write timestamped log lines to stdout
 * @returns {Promise<void>} resolves once the browser is launched and the
 *   server has been asked to listen
 */
export async function startDaemon(opts = {}) {
  const { idleTimeout = DEFAULT_IDLE_TIMEOUT, verbose = false } = opts;

  // Ensure directory exists
  fs.mkdirSync(STEALTH_DIR, { recursive: true });

  // Clean up stale files
  cleanup();

  const log = (msg) => {
    if (verbose) {
      const ts = new Date().toISOString();
      process.stdout.write(`[${ts}] ${msg}\n`);
    }
  };

  // Marks failures caused by the request itself so they are reported as 400.
  class BadRequestError extends Error {}

  // Launch browser
  log('Launching Camoufox browser...');
  const options = await launchOptions({
    headless: true,
    os: getHostOS(),
    humanize: true,
    enable_cache: true,
  });
  const browser = await firefox.launch(options);
  log('Browser launched');

  // Track contexts for reuse
  // key → { context, page, lastUsed }
  const contexts = new Map();
  // key → in-flight lookup/creation promise, so concurrent requests for the
  // same key share one context instead of each creating (and leaking) one.
  const pendingContexts = new Map();
  let idleTimer = null;
  let shuttingDown = false;

  function resetIdleTimer() {
    if (idleTimer) clearTimeout(idleTimer);
    // Do not re-arm the timer once shutdown has begun.
    if (shuttingDown) return;
    idleTimer = setTimeout(async () => {
      log('Idle timeout reached, shutting down...');
      await shutdown();
    }, idleTimeout);
  }

  // Return the cached entry for `key` if its page still responds; otherwise
  // (re)create the context and page and cache them.
  async function openContext(key, contextOpts) {
    const cached = contexts.get(key);
    if (cached) {
      cached.lastUsed = Date.now();

      // Check if page is still alive
      try {
        await cached.page.evaluate('1');
        return cached;
      } catch {
        // Page died, recreate
        try { await cached.context.close(); } catch {}
        contexts.delete(key);
      }
    }

    const {
      locale = 'en-US',
      timezone = 'America/Los_Angeles',
      viewport = { width: 1280, height: 720 },
    } = contextOpts;

    const context = await browser.newContext({
      viewport,
      locale,
      timezoneId: timezone,
      permissions: ['geolocation'],
      geolocation: { latitude: 37.7749, longitude: -122.4194 },
    });

    let page;
    try {
      page = await context.newPage();
    } catch (err) {
      // Do not leave a page-less context behind when page creation fails.
      await context.close().catch(() => {});
      throw err;
    }

    const entry = { context, page, lastUsed: Date.now() };
    contexts.set(key, entry);

    return entry;
  }

  function getOrCreateContext(key = 'default', contextOpts = {}) {
    resetIdleTimer();

    let task = pendingContexts.get(key);
    if (!task) {
      task = openContext(key, contextOpts).finally(() => pendingContexts.delete(key));
      pendingContexts.set(key, task);
    }
    return task;
  }

  // Handle JSON request body
  function parseBody(req) {
    return new Promise((resolve, reject) => {
      // Collect raw buffers and decode once at the end: decoding chunk by
      // chunk corrupts multi-byte UTF-8 characters split across chunks.
      const chunks = [];
      req.on('data', (chunk) => { chunks.push(chunk); });
      req.on('end', () => {
        const raw = Buffer.concat(chunks).toString('utf-8');
        try {
          // A literal JSON `null` body is treated like an empty body so the
          // route handlers can always destructure it.
          resolve(raw ? JSON.parse(raw) ?? {} : {});
        } catch {
          reject(new BadRequestError('Invalid JSON'));
        }
      });
      req.on('error', reject);
    });
  }

  // HTTP server on unix socket
  const server = http.createServer(async (req, res) => {
    res.setHeader('Content-Type', 'application/json');
    const send = (payload) => res.end(JSON.stringify(payload));

    try {
      const body = await parseBody(req);
      const route = new URL(req.url, 'http://localhost').pathname;

      resetIdleTimer();

      // --- Routes ---

      if (route === '/status') {
        send({
          ok: true,
          pid: process.pid,
          uptime: Math.floor(process.uptime()),
          contexts: contexts.size,
          browserConnected: browser.isConnected(),
          memoryMB: Math.round(process.memoryUsage().rss / 1024 / 1024),
        });
        return;
      }

      if (route === '/navigate') {
        const { key = 'default', url: targetUrl, waitUntil = 'domcontentloaded', timeout = 30000 } = body;
        // Validate before touching the browser so a bad request does not
        // create a context it will never use.
        if (typeof targetUrl !== 'string' || targetUrl === '') {
          throw new BadRequestError('Missing "url"');
        }
        const ctx = await getOrCreateContext(key);
        await ctx.page.goto(targetUrl, { waitUntil, timeout });
        send({ ok: true, url: ctx.page.url() });
        return;
      }

      if (route === '/snapshot') {
        const { key = 'default' } = body;
        const ctx = await getOrCreateContext(key);
        // A failed/timed-out snapshot is reported as an empty string.
        const snapshot = await ctx.page.locator('body').ariaSnapshot({ timeout: 8000 }).catch(() => '');
        send({ ok: true, snapshot, url: ctx.page.url() });
        return;
      }

      if (route === '/text') {
        const { key = 'default' } = body;
        const ctx = await getOrCreateContext(key);
        const text = await ctx.page.evaluate(() => {
          const clone = document.body.cloneNode(true);
          clone.querySelectorAll('script, style, noscript').forEach((el) => el.remove());
          return clone.innerText || '';
        });
        send({ ok: true, text, url: ctx.page.url() });
        return;
      }

      if (route === '/screenshot') {
        const { key = 'default', fullPage = false } = body;
        const ctx = await getOrCreateContext(key);
        const buffer = await ctx.page.screenshot({ type: 'png', fullPage });
        send({ ok: true, data: buffer.toString('base64'), url: ctx.page.url() });
        return;
      }

      if (route === '/evaluate') {
        const { key = 'default', expression } = body;
        if (typeof expression !== 'string') {
          throw new BadRequestError('Missing "expression"');
        }
        const ctx = await getOrCreateContext(key);
        const result = await ctx.page.evaluate(expression);
        send({ ok: true, result });
        return;
      }

      if (route === '/title') {
        const { key = 'default' } = body;
        const ctx = await getOrCreateContext(key);
        const title = await ctx.page.title();
        send({ ok: true, title, url: ctx.page.url() });
        return;
      }

      if (route === '/close') {
        const { key } = body;
        if (key && contexts.has(key)) {
          const ctx = contexts.get(key);
          await ctx.context.close().catch(() => {});
          contexts.delete(key);
        }
        send({ ok: true });
        return;
      }

      if (route === '/shutdown') {
        send({ ok: true, message: 'Shutting down' });
        // Delay so the response can be flushed before the process exits.
        setTimeout(() => shutdown(), 100);
        return;
      }

      res.statusCode = 404;
      send({ error: 'Not found' });
    } catch (err) {
      res.statusCode = err instanceof BadRequestError ? 400 : 500;
      send({ error: err.message });
    }
  });

  async function shutdown() {
    // Signals, the idle timer and /shutdown can all trigger this; run once.
    if (shuttingDown) return;
    shuttingDown = true;
    if (idleTimer) clearTimeout(idleTimer);

    log('Shutting down daemon...');
    for (const [, ctx] of contexts) {
      await ctx.context.close().catch(() => {});
    }
    contexts.clear();
    await browser.close().catch(() => {});
    server.close();
    cleanup();
    log('Daemon stopped');
    process.exit(0);
  }

  // Handle signals
  process.on('SIGTERM', shutdown);
  process.on('SIGINT', shutdown);

  server.on('error', (err) => {
    console.error(`Daemon error: ${err.message}`);
    cleanup();
    process.exit(1);
  });

  // Listen on unix socket
  server.listen(SOCKET_PATH, () => {
    // Write PID file
    fs.writeFileSync(PID_PATH, String(process.pid));
    log(`Daemon started (pid: ${process.pid}, socket: ${SOCKET_PATH})`);
    log(`Idle timeout: ${idleTimeout / 1000}s`);
    resetIdleTimer();
  });
}
|
package/src/errors.js
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Standardized error types with user-friendly messages and hints
|
|
3
|
+
*
|
|
4
|
+
* Exit codes:
|
|
5
|
+
* 0 — success
|
|
6
|
+
* 1 — general error
|
|
7
|
+
* 2 — invalid arguments
|
|
8
|
+
* 3 — browser launch failed
|
|
9
|
+
* 4 — navigation failed
|
|
10
|
+
* 5 — extraction failed
|
|
11
|
+
* 6 — timeout
|
|
12
|
+
* 7 — proxy error
|
|
13
|
+
* 8 — profile/session error
|
|
14
|
+
* 130 — interrupted (SIGINT)
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
/**
 * Base class for CLI errors that carry an exit code and an optional hint.
 *
 * Properties set on every instance:
 *   code  - process exit code (falls back to 1 when not given or falsy)
 *   hint  - user-facing suggestion, or null
 *   cause - underlying error, or null
 */
export class StealthError extends Error {
  /**
   * @param {string} message - human-readable description
   * @param {{ code?: number, hint?: string, cause?: unknown }} [opts]
   */
  constructor(message, opts = {}) {
    super(message);
    const { code, hint, cause } = opts;
    this.name = 'StealthError';
    this.code = code || 1;
    this.hint = hint || null;
    this.cause = cause || null;
  }

  /**
   * Render the message, followed by the hint on its own line when one is set.
   *
   * @returns {string}
   */
  format() {
    if (!this.hint) return this.message;
    return `${this.message}\n Hint: ${this.hint}`;
  }
}
|
|
32
|
+
|
|
33
|
+
/**
 * Raised when the browser cannot be launched (exit code 3 unless overridden).
 */
export class BrowserLaunchError extends StealthError {
  /**
   * @param {string} message - human-readable description
   * @param {{ code?: number, hint?: string, cause?: unknown }} [opts] - a
   *   missing hint is replaced by a re-download suggestion
   */
  constructor(message, opts = {}) {
    const settings = { code: 3, ...opts };
    super(message, settings);

    const fallbackHint = 'Try: npx camoufox-js fetch (re-download browser)';
    this.name = 'BrowserLaunchError';
    this.hint = opts.hint || fallbackHint;
  }
}
|
|
40
|
+
|
|
41
|
+
/**
 * Raised when a page navigation fails (exit code 4).
 *
 * The hint is chosen from the underlying error: a timeout, a network-level
 * failure, or a generic "check the URL" suggestion.
 */
export class NavigationError extends StealthError {
  /**
   * @param {string} url - the URL that could not be loaded
   * @param {Error} [cause] - the underlying navigation error, if any
   */
  constructor(url, cause) {
    const detail = String(cause?.message ?? '');

    let hint = 'Check the URL and your network connection';
    // Match case-insensitively: timeout messages commonly start with a
    // capital "Timeout ...", which a plain includes('timeout') misses.
    if (cause?.name === 'TimeoutError' || /timeout|timed out/i.test(detail)) {
      hint = 'Page load timed out. Try --wait <ms> or --retries <n>';
    } else if (/net::ERR_|NS_ERROR_/.test(detail)) {
      // net::ERR_* is the Chromium error prefix; Firefox-based browsers
      // report NS_ERROR_* codes instead.
      hint = 'Network error. Check DNS, proxy, or firewall';
    }

    super(`Failed to navigate to ${url}`, { code: 4, hint, cause });
    this.name = 'NavigationError';
    this.url = url;
  }
}
|
|
55
|
+
|
|
56
|
+
/**
 * Raised when content extraction fails (exit code 5 unless overridden).
 */
export class ExtractionError extends StealthError {
  /**
   * @param {string} message - human-readable description
   * @param {{ code?: number, hint?: string, cause?: unknown }} [opts] - a
   *   missing hint is replaced by a suggestion to inspect the page snapshot
   */
  constructor(message, opts = {}) {
    const settings = { code: 5, ...opts };
    super(message, settings);

    const fallbackHint = 'The page structure may have changed. Try -f snapshot to inspect';
    this.name = 'ExtractionError';
    this.hint = opts.hint || fallbackHint;
  }
}
|
|
63
|
+
|
|
64
|
+
/**
 * Raised when an operation exceeds its time budget (exit code 6).
 */
export class TimeoutError extends StealthError {
  /**
   * @param {string} operation - name of the operation, used in the message
   * @param {number} timeoutMs - the budget that was exceeded, in milliseconds
   */
  constructor(operation, timeoutMs) {
    const message = `${operation} timed out after ${timeoutMs}ms`;
    const hint = 'Try increasing --wait or --retries';
    super(message, { code: 6, hint });
    this.name = 'TimeoutError';
  }
}
|
|
73
|
+
|
|
74
|
+
/**
 * Raised when a proxy connection fails (exit code 7).
 */
export class ProxyError extends StealthError {
  /**
   * @param {string} proxyUrl - the proxy that failed, used in the message
   * @param {Error} [cause] - the underlying connection error, if any
   */
  constructor(proxyUrl, cause) {
    const message = `Proxy connection failed: ${proxyUrl}`;
    const hint = 'Check proxy URL, credentials, and connectivity. Run: stealth proxy test';
    super(message, { code: 7, hint, cause });
    this.name = 'ProxyError';
  }
}
|
|
84
|
+
|
|
85
|
+
/**
 * Raised for profile/session problems (exit code 8 unless overridden).
 */
export class ProfileError extends StealthError {
  /**
   * @param {string} message - human-readable description
   * @param {{ code?: number, hint?: string, cause?: unknown }} [opts] - a
   *   missing hint is replaced by a suggestion to list the profiles
   */
  constructor(message, opts = {}) {
    const settings = { code: 8, ...opts };
    super(message, settings);

    const fallbackHint = 'Run: stealth profile list';
    this.name = 'ProfileError';
    this.hint = opts.hint || fallbackHint;
  }
}
|
|
92
|
+
|
|
93
|
+
/**
 * Raised when a site detects automation and blocks the request (exit code 4).
 */
export class BlockedError extends StealthError {
  /**
   * @param {string} engine - name of the blocking site/engine, used in the message
   * @param {string} url - the URL that was blocked; stored on `this.url`
   */
  constructor(engine, url) {
    const message = `${engine} detected automation and blocked the request`;
    const hint = 'Try: --proxy <proxy>, --warmup, --humanize, or use a different engine';
    super(message, { code: 4, hint });
    this.name = 'BlockedError';
    this.url = url;
  }
}
|
|
103
|
+
|
|
104
|
+
/**
 * Print an error (plus a hint when one is available), then exit the process.
 *
 * StealthError instances exit with their own `code`; anything else exits
 * with 1 after a hint is derived from well-known substrings of the message.
 *
 * @param {unknown} err - the error to report; non-Error values are stringified
 */
export function handleError(err) {
  const { log } = loadOutput();
  // Not every logger has a dim() method (console does not); fall back to
  // error() so printing a hint can never throw.
  const dim = typeof log.dim === 'function'
    ? (text) => log.dim(text)
    : (text) => log.error(text);

  if (err instanceof StealthError) {
    log.error(err.message);
    if (err.hint) dim(` Hint: ${err.hint}`);
    process.exit(err.code);
  }

  // Unknown error — tolerate null/undefined and non-string messages.
  const rawMessage = err?.message;
  log.error(rawMessage || String(err));

  // Common error patterns → helpful hints
  const msg = String(rawMessage || '');
  if (msg.includes('ECONNREFUSED')) {
    dim(' Hint: Connection refused. Is the target server running?');
  } else if (msg.includes('ENOTFOUND')) {
    dim(' Hint: DNS lookup failed. Check the URL');
  } else if (msg.includes('camoufox')) {
    dim(' Hint: Try: npx camoufox-js fetch');
  }

  process.exit(1);
}
|
|
131
|
+
|
|
132
|
+
/**
 * Provide the logger used by handleError.
 *
 * Kept local to this module to avoid a circular dependency. The logger writes
 * through console.error and exposes the two methods handleError calls:
 * `error` and `dim`. (Plain `console` has no `dim` method.)
 *
 * @returns {{ log: { error: Function, dim: Function } }}
 */
function loadOutput() {
  const log = {
    error: (...args) => console.error(...args),
    dim: (text) => {
      // Only emit the ANSI "dim" escape when stderr is a terminal.
      const styled = process.stderr.isTTY ? `\x1b[2m${text}\x1b[0m` : text;
      console.error(styled);
    },
  };
  return { log };
}
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Base extractor — generic fallback for any search page
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
export const name = 'generic';
|
|
6
|
+
|
|
7
|
+
/**
 * Report whether this extractor accepts a URL.
 *
 * The generic extractor is the fallback, so it accepts everything.
 *
 * @returns {boolean} always true
 */
export function canHandle() {
  const acceptsEverything = true;
  return acceptsEverything;
}
|
|
13
|
+
|
|
14
|
+
/**
 * Give the page up to five seconds to render something that looks like a
 * result link. A timeout or any other failure is ignored.
 *
 * @param {object} page - object exposing `waitForSelector(selector, options)`
 * @returns {Promise<void>}
 */
export async function waitForResults(page) {
  const resultLinkSelector = 'h2 a[href], h3 a[href], li a[href], article a[href], .result a[href]';
  const waitOptions = { timeout: 5000 };

  try {
    await page.waitForSelector(resultLinkSelector, waitOptions);
  } catch {
    // Best effort: extraction proceeds with whatever is on the page.
  }
}
|
|
25
|
+
|
|
26
|
+
/**
 * Extract search results from an arbitrary results page.
 *
 * Collects links found under headings, list items, articles and `.result`
 * elements, keeping the first occurrence of each absolute http(s) URL whose
 * link text is longer than three characters.
 *
 * @param {object} page - page object exposing `waitForSelector` and `evaluate`
 * @param {number} [maxResults=10] - upper bound on the number of results
 * @returns {Promise<Array<{ title: string, url: string, snippet: string }>>}
 */
export async function extractResults(page, maxResults = 10) {
  await waitForResults(page);

  // The callback is handed to page.evaluate, so it must stay self-contained:
  // everything it needs is defined inside it or passed in as `limit`.
  return page.evaluate((limit) => {
    const collected = [];
    const visited = new Set();
    const anchors = document.querySelectorAll(
      'h2 a[href], h3 a[href], li a[href], article a[href], .result a[href]',
    );

    for (const anchor of anchors) {
      if (collected.length >= limit) break;

      const { href } = anchor;
      const label = anchor.textContent?.trim();
      const usable = href?.startsWith('http') && label?.length > 3 && !visited.has(href);
      if (!usable) continue;

      visited.add(href);

      // Look for descriptive text inside the nearest enclosing container.
      const container = anchor.closest('li, article, div, .result');
      const snippetNode = container?.querySelector('p, .snippet, .description, span:not(:has(a))');

      collected.push({
        title: label.slice(0, 200),
        url: href,
        snippet: snippetNode?.textContent?.trim().slice(0, 300) || '',
      });
    }

    return collected;
  }, maxResults);
}
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Bing Search extractor
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
export const name = 'bing';
|
|
6
|
+
|
|
7
|
+
/**
 * Report whether a URL is a Bing search results page.
 *
 * @param {string} url - URL to test
 * @returns {boolean} true when the URL contains "bing.com/search"
 */
export function canHandle(url) {
  const bingSearchPattern = /bing\.com\/search/;
  return bingSearchPattern.test(url);
}
|
|
10
|
+
|
|
11
|
+
/**
 * Give Bing up to five seconds to render an organic result block.
 * A timeout or any other failure is ignored.
 *
 * @param {object} page - object exposing `waitForSelector(selector, options)`
 * @returns {Promise<void>}
 */
export async function waitForResults(page) {
  const organicResultSelector = '#b_results .b_algo, .b_algo';
  const waitOptions = { timeout: 5000 };

  try {
    await page.waitForSelector(organicResultSelector, waitOptions);
  } catch {
    // Best effort: extraction proceeds with whatever is on the page.
  }
}
|
|
16
|
+
|
|
17
|
+
/**
 * Extract organic results from a Bing results page.
 *
 * @param {object} page - page object exposing `waitForSelector` and `evaluate`
 * @param {number} [maxResults=10] - upper bound on the number of results
 * @returns {Promise<Array<{ title: string, url: string, snippet: string, displayUrl: string }>>}
 */
export async function extractResults(page, maxResults = 10) {
  await waitForResults(page);

  // The callback is handed to page.evaluate, so it must stay self-contained.
  return page.evaluate((limit) => {
    const collected = [];
    const blocks = document.querySelectorAll('#b_results .b_algo, .b_algo');

    for (const block of blocks) {
      if (collected.length >= limit) break;

      const anchor = block.querySelector('h2 a[href]');
      if (!anchor) continue;

      const { href } = anchor;
      const title = anchor.textContent?.trim();
      // Keep only absolute http(s) links that have a visible title.
      if (!(href?.startsWith('http') && title)) continue;

      const snippetNode = block.querySelector('.b_caption p, .b_lineclamp2, .b_lineclamp3');
      const citeNode = block.querySelector('cite, .b_attribution cite');

      collected.push({
        title,
        url: href,
        snippet: snippetNode?.textContent?.trim() || '',
        displayUrl: citeNode?.textContent?.trim() || '',
      });
    }

    return collected;
  }, maxResults);
}
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* DuckDuckGo Search extractor
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
export const name = 'duckduckgo';
|
|
6
|
+
|
|
7
|
+
/**
 * Report whether a URL belongs to DuckDuckGo.
 *
 * @param {string} url - URL to test
 * @returns {boolean} true when the URL contains "duckduckgo.com"
 */
export function canHandle(url) {
  const duckDuckGoPattern = /duckduckgo\.com/;
  return duckDuckGoPattern.test(url);
}
|
|
10
|
+
|
|
11
|
+
/**
 * Wait until DuckDuckGo has rendered results in any of its known layouts.
 *
 * All layout selectors are awaited concurrently and the function returns as
 * soon as the first one matches. Waiting for them one after another would
 * stall for a full per-selector timeout on every layout tried before the
 * matching one. The overall budget is unchanged (4 s per known selector), so
 * a page with no results still gives up after the same total time. Failures
 * are ignored.
 *
 * @param {object} page - object exposing `waitForSelector(selector, options)`
 * @returns {Promise<void>}
 */
export async function waitForResults(page) {
  const selectors = [
    '[data-result="web"]',
    'article[data-testid="result"]',
    '.result--web',
    'ol.react-results--main li',
  ];
  const timeout = 4000 * selectors.length;

  try {
    // Promise.any settles on the first match; the rejections of the waits
    // that lose the race are observed by it, so none go unhandled.
    await Promise.any(selectors.map((sel) => page.waitForSelector(sel, { timeout })));
  } catch {
    // No layout matched in time — extraction falls back to whatever is there.
  }
}
|
|
26
|
+
|
|
27
|
+
/**
 * Extract web results from a DuckDuckGo results page.
 *
 * First reads the structured result items; if none yield a result, falls back
 * to collecting de-duplicated links from the results area. Links pointing at
 * duckduckgo.com itself are skipped in both passes.
 *
 * @param {object} page - page object exposing `waitForSelector` and `evaluate`
 * @param {number} [maxResults=10] - upper bound on the number of results
 * @returns {Promise<Array<{ title: string, url: string, snippet: string, displayUrl?: string }>>}
 *   fallback entries carry no `displayUrl`
 */
export async function extractResults(page, maxResults = 10) {
  await waitForResults(page);

  // The callback is handed to page.evaluate, so it must stay self-contained.
  return page.evaluate((limit) => {
    const collected = [];

    // Modern DuckDuckGo layout (React-based)
    const resultItems = document.querySelectorAll(
      'article[data-testid="result"], [data-result="web"], .result--web, ol.react-results--main > li',
    );

    for (const item of resultItems) {
      if (collected.length >= limit) break;

      // Multiple title selectors for different DDG versions
      const anchor = item.querySelector(
        'a[data-testid="result-title-a"], h2 a[href], a.result__a, a[href]:has(h2)',
      );
      if (!anchor) continue;

      const { href } = anchor;
      const title = anchor.textContent?.trim();
      const external = href?.startsWith('http') && !href.includes('duckduckgo.com');
      if (!(external && title)) continue;

      const snippetNode = item.querySelector(
        '[data-testid="result-snippet"], .result__snippet, .E2eLOJr8HctVnDOTM8fs, span.kY2IgmnCmOGjharHErah',
      );
      const displayUrlNode = item.querySelector(
        '[data-testid="result-extras-url-link"], .result__url, a.result__check',
      );

      collected.push({
        title,
        url: href,
        snippet: snippetNode?.textContent?.trim() || '',
        displayUrl: displayUrlNode?.textContent?.trim() || '',
      });
    }

    if (collected.length > 0) return collected;

    // Fallback: extract from any visible links in the results area
    const visited = new Set();
    const looseLinks = document.querySelectorAll(
      '#links a[href], .results a[href], [data-testid="mainline"] a[href]',
    );

    for (const link of looseLinks) {
      if (collected.length >= limit) break;

      const { href } = link;
      const label = link.textContent?.trim();
      const usable =
        href?.startsWith('http') &&
        !href.includes('duckduckgo.com') &&
        label?.length > 5 &&
        !visited.has(href);
      if (!usable) continue;

      visited.add(href);
      collected.push({ title: label.slice(0, 200), url: href, snippet: '' });
    }

    return collected;
  }, maxResults);
}
|