deepspider 0.3.1 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +3 -0
- package/README.md +21 -15
- package/package.json +9 -7
- package/src/agent/core/PanelBridge.js +56 -78
- package/src/agent/core/StreamHandler.js +244 -20
- package/src/agent/index.js +120 -23
- package/src/agent/logger.js +183 -8
- package/src/agent/middleware/memoryFlush.js +48 -0
- package/src/agent/middleware/report.js +95 -37
- package/src/agent/middleware/subagent.js +236 -0
- package/src/agent/middleware/toolAvailability.js +37 -0
- package/src/agent/middleware/toolGuard.js +187 -0
- package/src/agent/middleware/validationWorkflow.js +171 -0
- package/src/agent/prompts/system.js +310 -59
- package/src/agent/run.js +168 -20
- package/src/agent/sessions.js +88 -0
- package/src/agent/skills/anti-detect/SKILL.md +89 -14
- package/src/agent/skills/captcha/SKILL.md +93 -19
- package/src/agent/skills/crawler/SKILL.md +64 -3
- package/src/agent/skills/crawler/evolved.md +9 -1
- package/src/agent/skills/dynamic-analysis/SKILL.md +74 -7
- package/src/agent/skills/env/SKILL.md +75 -0
- package/src/agent/skills/js2python/evolved.md +5 -1
- package/src/agent/skills/sandbox/SKILL.md +35 -0
- package/src/agent/skills/static-analysis/SKILL.md +98 -2
- package/src/agent/skills/static-analysis/evolved.md +5 -1
- package/src/agent/subagents/anti-detect.js +36 -24
- package/src/agent/subagents/captcha.js +35 -28
- package/src/agent/subagents/crawler.js +40 -105
- package/src/agent/subagents/factory.js +129 -9
- package/src/agent/subagents/index.js +4 -13
- package/src/agent/subagents/js2python.js +25 -35
- package/src/agent/subagents/reverse.js +180 -0
- package/src/agent/tools/analysis.js +101 -8
- package/src/agent/tools/anti-detect.js +5 -2
- package/src/agent/tools/browser.js +186 -13
- package/src/agent/tools/capture.js +24 -3
- package/src/agent/tools/correlate.js +129 -15
- package/src/agent/tools/crawler.js +3 -2
- package/src/agent/tools/crawlerGenerator.js +90 -0
- package/src/agent/tools/debug.js +43 -6
- package/src/agent/tools/evolve.js +5 -2
- package/src/agent/tools/extractor.js +5 -1
- package/src/agent/tools/file.js +14 -5
- package/src/agent/tools/generateHook.js +66 -0
- package/src/agent/tools/hookManager.js +19 -9
- package/src/agent/tools/index.js +36 -21
- package/src/agent/tools/nodejs.js +41 -6
- package/src/agent/tools/patch.js +1 -1
- package/src/agent/tools/sandbox.js +21 -1
- package/src/agent/tools/scratchpad.js +70 -0
- package/src/agent/tools/store.js +1 -1
- package/src/agent/tools/tracing.js +26 -0
- package/src/agent/tools/verifyAlgorithm.js +117 -0
- package/src/browser/EnvBridge.js +27 -13
- package/src/browser/client.js +128 -18
- package/src/browser/collector.js +101 -22
- package/src/browser/defaultHooks.js +3 -1
- package/src/browser/hooks/index.js +5 -0
- package/src/browser/interceptors/AntiDebugInterceptor.js +132 -0
- package/src/browser/interceptors/NetworkInterceptor.js +76 -12
- package/src/browser/interceptors/ScriptInterceptor.js +32 -7
- package/src/browser/interceptors/index.js +1 -0
- package/src/browser/ui/analysisPanel.js +541 -464
- package/src/cli/commands/config.js +11 -3
- package/src/config/paths.js +9 -1
- package/src/config/settings.js +7 -1
- package/src/core/PatchGenerator.js +24 -4
- package/src/core/Sandbox.js +140 -3
- package/src/env/EnvCodeGenerator.js +60 -88
- package/src/env/modules/bom/history.js +6 -0
- package/src/env/modules/bom/location.js +6 -0
- package/src/env/modules/bom/navigator.js +13 -0
- package/src/env/modules/bom/screen.js +6 -0
- package/src/env/modules/bom/storage.js +7 -0
- package/src/env/modules/dom/document.js +14 -0
- package/src/env/modules/dom/event.js +4 -0
- package/src/env/modules/index.js +27 -10
- package/src/env/modules/webapi/fetch.js +4 -0
- package/src/env/modules/webapi/url.js +4 -0
- package/src/env/modules/webapi/xhr.js +8 -0
- package/src/store/DataStore.js +125 -42
- package/src/store/Store.js +2 -1
- package/src/agent/subagents/dynamic.js +0 -64
- package/src/agent/subagents/env-agent.js +0 -82
- package/src/agent/subagents/sandbox.js +0 -55
- package/src/agent/subagents/static.js +0 -66
package/src/agent/run.js
CHANGED
|
@@ -10,13 +10,17 @@ import readline from 'readline';
|
|
|
10
10
|
import { readFileSync } from 'fs';
|
|
11
11
|
import { marked } from 'marked';
|
|
12
12
|
import { createDeepSpiderAgent } from './index.js';
|
|
13
|
-
import { fullAnalysisPrompt } from './prompts/system.js';
|
|
13
|
+
import { fullAnalysisPrompt, tracePrompt, decryptPrompt, extractPrompt } from './prompts/system.js';
|
|
14
14
|
import { getBrowser } from '../browser/index.js';
|
|
15
15
|
import { markHookInjected } from './tools/runtime.js';
|
|
16
|
+
import { getDataStore } from '../store/DataStore.js';
|
|
16
17
|
import { createLogger } from './logger.js';
|
|
17
18
|
import { browserTools } from './tools/browser.js';
|
|
18
19
|
import { ensureConfig } from './setup.js';
|
|
20
|
+
import { getConfigValues } from '../config/settings.js';
|
|
21
|
+
import { PATHS, ensureDir } from '../config/paths.js';
|
|
19
22
|
import { StreamHandler, PanelBridge } from './core/index.js';
|
|
23
|
+
import { createCheckpointer, generateThreadId, createSession, listSessions, touchSession, cleanExpiredSessions } from './sessions.js';
|
|
20
24
|
|
|
21
25
|
let rl = null;
|
|
22
26
|
let browser = null;
|
|
@@ -26,14 +30,15 @@ let DEBUG = false;
|
|
|
26
30
|
let debugFn = () => {};
|
|
27
31
|
let agent = null;
|
|
28
32
|
let agentConfig = null;
|
|
33
|
+
let currentThreadId = null;
|
|
34
|
+
let isResuming = false;
|
|
29
35
|
|
|
30
36
|
/**
|
|
31
37
|
* 从文件显示报告(由中间件回调触发)
|
|
32
38
|
*/
|
|
33
39
|
async function showReportFromFile(mdFilePath) {
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
console.log('[report] 错误: 无法获取 page');
|
|
40
|
+
if (!browser) {
|
|
41
|
+
console.log('[report] 错误: 无浏览器实例');
|
|
37
42
|
return;
|
|
38
43
|
}
|
|
39
44
|
|
|
@@ -45,10 +50,15 @@ async function showReportFromFile(mdFilePath) {
|
|
|
45
50
|
const escaped = JSON.stringify(htmlContent);
|
|
46
51
|
const cdp = await browser?.getCDPSession?.();
|
|
47
52
|
if (cdp) {
|
|
48
|
-
await
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
53
|
+
await Promise.race([
|
|
54
|
+
cdp.send('Runtime.evaluate', {
|
|
55
|
+
expression: `window.__deepspider__?.showReport?.(${escaped}, true)`,
|
|
56
|
+
returnByValue: true,
|
|
57
|
+
}),
|
|
58
|
+
new Promise((_, reject) =>
|
|
59
|
+
setTimeout(() => reject(new Error('showReport timeout')), 5000)
|
|
60
|
+
),
|
|
61
|
+
]);
|
|
52
62
|
}
|
|
53
63
|
console.log('[report] 已显示分析报告');
|
|
54
64
|
} catch (e) {
|
|
@@ -56,11 +66,40 @@ async function showReportFromFile(mdFilePath) {
|
|
|
56
66
|
}
|
|
57
67
|
}
|
|
58
68
|
|
|
69
|
+
function getActionPrompt(action) {
|
|
70
|
+
switch (action) {
|
|
71
|
+
case 'trace': return tracePrompt;
|
|
72
|
+
case 'decrypt': return decryptPrompt;
|
|
73
|
+
case 'extract': return extractPrompt;
|
|
74
|
+
case 'full':
|
|
75
|
+
default: return fullAnalysisPrompt;
|
|
76
|
+
}
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
/**
|
|
80
|
+
* 生成轻量浏览器状态摘要(注入 prompt,帮助主 agent 判断和委派)
|
|
81
|
+
* 只含计数信息,不含实际数据
|
|
82
|
+
*/
|
|
83
|
+
function getBrowserStateSummary() {
|
|
84
|
+
try {
|
|
85
|
+
const store = getDataStore();
|
|
86
|
+
const sites = store.getSiteList();
|
|
87
|
+
if (!sites.length) return '';
|
|
88
|
+
|
|
89
|
+
const lines = sites.map(s =>
|
|
90
|
+
` - ${s.hostname}: ${s.responseCount} 条请求, ${s.scriptCount} 个脚本`
|
|
91
|
+
);
|
|
92
|
+
return `\n已捕获数据:\n${lines.join('\n')}`;
|
|
93
|
+
} catch {
|
|
94
|
+
return '';
|
|
95
|
+
}
|
|
96
|
+
}
|
|
97
|
+
|
|
59
98
|
/**
|
|
60
99
|
* 处理浏览器消息(通过 CDP binding 接收)
|
|
61
100
|
*/
|
|
62
|
-
async function handleBrowserMessage(data, page) {
|
|
63
|
-
debugFn(`handleBrowserMessage: 收到消息, type=${data.type}
|
|
101
|
+
async function handleBrowserMessage(data) {
|
|
102
|
+
debugFn(`handleBrowserMessage: 收到消息, type=${data.type}`);
|
|
64
103
|
|
|
65
104
|
const browserReadyPrefix = '[浏览器已就绪] ';
|
|
66
105
|
|
|
@@ -72,12 +111,13 @@ async function handleBrowserMessage(data, page) {
|
|
|
72
111
|
).join('\n');
|
|
73
112
|
|
|
74
113
|
const supplementText = data.text ? `\n\n用户补充说明: ${data.text}` : '';
|
|
114
|
+
const action = data.action || 'full';
|
|
75
115
|
|
|
76
|
-
userPrompt = `${browserReadyPrefix}
|
|
116
|
+
userPrompt = `${browserReadyPrefix}用户选中了以下数据:
|
|
77
117
|
|
|
78
118
|
${elementsDesc}${supplementText}
|
|
79
119
|
|
|
80
|
-
${
|
|
120
|
+
${getActionPrompt(action)}`;
|
|
81
121
|
} else if (data.type === 'generate-config') {
|
|
82
122
|
const config = data.config;
|
|
83
123
|
userPrompt = `${browserReadyPrefix}请使用 crawler 子代理生成爬虫。
|
|
@@ -89,16 +129,21 @@ ${JSON.stringify(config.fields, null, 2)}
|
|
|
89
129
|
|
|
90
130
|
请先用 query_store 查询已有的加密代码,然后整合生成配置和脚本。`;
|
|
91
131
|
} else if (data.type === 'chat') {
|
|
132
|
+
const pageUrl = browser?.getPage()?.url?.() || targetUrl || '';
|
|
133
|
+
const urlLine = pageUrl ? `当前页面: ${pageUrl}\n` : '';
|
|
134
|
+
const stateSummary = getBrowserStateSummary();
|
|
92
135
|
if (data.elements && data.elements.length > 0) {
|
|
93
136
|
const elementsDesc = data.elements.map((el, i) =>
|
|
94
137
|
`${i + 1}. "${el.text?.slice(0, 100) || ''}"\n XPath: ${el.xpath}`
|
|
95
138
|
).join('\n');
|
|
96
|
-
userPrompt = `${browserReadyPrefix}${
|
|
139
|
+
userPrompt = `${browserReadyPrefix}${urlLine}${stateSummary}
|
|
140
|
+
|
|
141
|
+
${data.text}
|
|
97
142
|
|
|
98
143
|
用户选中的元素:
|
|
99
144
|
${elementsDesc}`;
|
|
100
145
|
} else {
|
|
101
|
-
userPrompt = `${browserReadyPrefix}${data.text}`;
|
|
146
|
+
userPrompt = `${browserReadyPrefix}${urlLine}${stateSummary}\n\n${data.text}`;
|
|
102
147
|
}
|
|
103
148
|
} else if (data.type === 'open-file') {
|
|
104
149
|
let filePath = data.path;
|
|
@@ -117,12 +162,41 @@ ${elementsDesc}`;
|
|
|
117
162
|
});
|
|
118
163
|
}
|
|
119
164
|
return;
|
|
165
|
+
} else if (data.type === 'choice') {
|
|
166
|
+
// interrupt 恢复:用户点击了选项
|
|
167
|
+
console.log('\n[浏览器] 用户选择: ' + data.value);
|
|
168
|
+
await streamHandler.resumeInterrupt(data.value);
|
|
169
|
+
console.log('\n');
|
|
170
|
+
process.stdout.write('> ');
|
|
171
|
+
return;
|
|
172
|
+
} else if (data.type === 'confirm-result') {
|
|
173
|
+
// interrupt 恢复:用户点击了确认/取消
|
|
174
|
+
console.log('\n[浏览器] 用户' + (data.confirmed ? '确认' : '取消'));
|
|
175
|
+
await streamHandler.resumeInterrupt(data.confirmed);
|
|
176
|
+
console.log('\n');
|
|
177
|
+
process.stdout.write('> ');
|
|
178
|
+
return;
|
|
179
|
+
} else if (data.type === 'resume') {
|
|
180
|
+
if (isResuming) return;
|
|
181
|
+
isResuming = true;
|
|
182
|
+
console.log('\n[恢复] 用户选择恢复 session: ' + data.threadId);
|
|
183
|
+
currentThreadId = data.threadId;
|
|
184
|
+
agentConfig.configurable.thread_id = data.threadId;
|
|
185
|
+
try {
|
|
186
|
+
await streamHandler.chatStreamResume();
|
|
187
|
+
} finally {
|
|
188
|
+
isResuming = false;
|
|
189
|
+
}
|
|
190
|
+
console.log('\n');
|
|
191
|
+
process.stdout.write('> ');
|
|
192
|
+
return;
|
|
120
193
|
} else {
|
|
121
194
|
return;
|
|
122
195
|
}
|
|
123
196
|
|
|
124
197
|
console.log('\n[浏览器] ' + (data.type === 'analysis' ? '分析请求' : data.type === 'generate-config' ? '生成配置' : '对话'));
|
|
125
198
|
await streamHandler.chatStream(userPrompt);
|
|
199
|
+
if (currentThreadId) touchSession(currentThreadId);
|
|
126
200
|
console.log('\n');
|
|
127
201
|
process.stdout.write('> ');
|
|
128
202
|
}
|
|
@@ -140,7 +214,13 @@ function prompt() {
|
|
|
140
214
|
return;
|
|
141
215
|
}
|
|
142
216
|
|
|
143
|
-
|
|
217
|
+
let enrichedInput = input;
|
|
218
|
+
if (browser) {
|
|
219
|
+
const url = browser.getPage()?.url?.() || targetUrl || '';
|
|
220
|
+
enrichedInput = `[浏览器已就绪] 当前页面: ${url}\n\n${input}`;
|
|
221
|
+
}
|
|
222
|
+
await streamHandler.chatStream(enrichedInput);
|
|
223
|
+
if (currentThreadId) touchSession(currentThreadId);
|
|
144
224
|
console.log('\n');
|
|
145
225
|
prompt();
|
|
146
226
|
});
|
|
@@ -151,6 +231,8 @@ async function init() {
|
|
|
151
231
|
const args = process.argv.slice(2);
|
|
152
232
|
targetUrl = args.find(arg => arg.startsWith('http://') || arg.startsWith('https://'));
|
|
153
233
|
DEBUG = process.env.DEBUG === 'true' || args.includes('--debug');
|
|
234
|
+
const PERSIST = args.includes('--persist');
|
|
235
|
+
const RESUME = args.includes('--resume');
|
|
154
236
|
debugFn = (...a) => { if (DEBUG) console.log('[DEBUG]', ...a); };
|
|
155
237
|
|
|
156
238
|
debugFn('init: 启动');
|
|
@@ -169,23 +251,57 @@ async function init() {
|
|
|
169
251
|
output: process.stdout,
|
|
170
252
|
});
|
|
171
253
|
|
|
172
|
-
const
|
|
254
|
+
const loggerCallbacks = createLogger();
|
|
173
255
|
|
|
174
256
|
async function onReportReady(mdFilePath) {
|
|
175
257
|
console.log('[report] 中间件触发报告显示:', mdFilePath);
|
|
176
258
|
await showReportFromFile(mdFilePath);
|
|
177
259
|
}
|
|
178
260
|
|
|
179
|
-
|
|
261
|
+
// panelBridge 引用,在后面初始化后赋值
|
|
262
|
+
let sharedPanelBridge = null;
|
|
180
263
|
|
|
264
|
+
async function onFileSaved({ path, type }) {
|
|
265
|
+
console.log(`[report] 文件已保存: ${path} (${type})`);
|
|
266
|
+
if (!sharedPanelBridge) return;
|
|
267
|
+
const shortPath = path.replace(process.env.HOME || '', '~');
|
|
268
|
+
await sharedPanelBridge.sendMessage('file-saved', { path: shortPath, type });
|
|
269
|
+
}
|
|
270
|
+
|
|
271
|
+
// 持久化 checkpointer + session 管理
|
|
272
|
+
const checkpointer = createCheckpointer();
|
|
273
|
+
cleanExpiredSessions();
|
|
274
|
+
let domain = targetUrl ? new URL(targetUrl).hostname : null;
|
|
275
|
+
let threadId;
|
|
276
|
+
let autoResume = false;
|
|
277
|
+
|
|
278
|
+
if (RESUME && domain) {
|
|
279
|
+
const existing = listSessions(domain);
|
|
280
|
+
if (existing.length > 0) {
|
|
281
|
+
threadId = existing[0].thread_id;
|
|
282
|
+
autoResume = true;
|
|
283
|
+
console.log(`[恢复] 找到上次 session: ${threadId}`);
|
|
284
|
+
console.log(`[恢复] 上次活跃: ${new Date(existing[0].updated_at).toLocaleString()}, 消息数: ${existing[0].message_count}`);
|
|
285
|
+
}
|
|
286
|
+
}
|
|
287
|
+
|
|
288
|
+
if (!threadId) {
|
|
289
|
+
threadId = domain ? generateThreadId(domain) : `deepspider-${Date.now()}`;
|
|
290
|
+
if (domain) createSession(threadId, domain, targetUrl);
|
|
291
|
+
}
|
|
292
|
+
|
|
293
|
+
agent = createDeepSpiderAgent({ onReportReady, onFileSaved, checkpointer });
|
|
294
|
+
|
|
295
|
+
currentThreadId = threadId;
|
|
181
296
|
agentConfig = {
|
|
182
|
-
configurable: { thread_id:
|
|
297
|
+
configurable: { thread_id: threadId },
|
|
183
298
|
recursionLimit: 5000,
|
|
184
|
-
callbacks:
|
|
299
|
+
callbacks: loggerCallbacks,
|
|
185
300
|
};
|
|
186
301
|
|
|
187
302
|
// 初始化流处理器
|
|
188
303
|
const panelBridge = new PanelBridge(() => browser, debugFn);
|
|
304
|
+
sharedPanelBridge = panelBridge;
|
|
189
305
|
streamHandler = new StreamHandler({
|
|
190
306
|
agent,
|
|
191
307
|
config: agentConfig,
|
|
@@ -201,7 +317,14 @@ async function init() {
|
|
|
201
317
|
console.log(`正在打开: ${targetUrl}\n`);
|
|
202
318
|
try {
|
|
203
319
|
debugFn('init: 获取浏览器实例');
|
|
204
|
-
|
|
320
|
+
const browserOptions = {};
|
|
321
|
+
const config = getConfigValues();
|
|
322
|
+
if (PERSIST || config.persistBrowserData) {
|
|
323
|
+
ensureDir(PATHS.BROWSER_DATA_DIR);
|
|
324
|
+
browserOptions.userDataDir = PATHS.BROWSER_DATA_DIR;
|
|
325
|
+
console.log(`[持久化模式] 浏览器数据保存在 ${PATHS.BROWSER_DATA_DIR}`);
|
|
326
|
+
}
|
|
327
|
+
browser = await getBrowser(browserOptions);
|
|
205
328
|
browser.onMessage = handleBrowserMessage;
|
|
206
329
|
debugFn('init: 导航到目标URL');
|
|
207
330
|
await browser.navigate(targetUrl);
|
|
@@ -209,6 +332,31 @@ async function init() {
|
|
|
209
332
|
debugFn('init: 浏览器就绪');
|
|
210
333
|
console.log('浏览器已就绪,数据自动记录中');
|
|
211
334
|
console.log('点击面板选择按钮(⦿)选择数据进行分析\n');
|
|
335
|
+
|
|
336
|
+
// 恢复逻辑
|
|
337
|
+
if (autoResume) {
|
|
338
|
+
console.log('[恢复] 从上次中断处继续...\n');
|
|
339
|
+
await streamHandler.chatStreamResume();
|
|
340
|
+
console.log('\n');
|
|
341
|
+
} else if (domain) {
|
|
342
|
+
const existing = listSessions(domain).filter(s => s.thread_id !== threadId && s.message_count > 0);
|
|
343
|
+
if (existing.length > 0) {
|
|
344
|
+
const ready = await panelBridge.waitForPanel();
|
|
345
|
+
if (ready) {
|
|
346
|
+
const s = existing[0];
|
|
347
|
+
const ago = Math.round((Date.now() - s.updated_at) / 60000);
|
|
348
|
+
const timeStr = ago < 60 ? `${ago}分钟前` : `${Math.round(ago / 60)}小时前`;
|
|
349
|
+
await panelBridge.sendMessage('resume-available', {
|
|
350
|
+
threadId: s.thread_id,
|
|
351
|
+
domain: s.domain,
|
|
352
|
+
messageCount: s.message_count,
|
|
353
|
+
timeAgo: timeStr,
|
|
354
|
+
});
|
|
355
|
+
} else {
|
|
356
|
+
debugFn('init: 面板未就绪,跳过恢复横幅');
|
|
357
|
+
}
|
|
358
|
+
}
|
|
359
|
+
}
|
|
212
360
|
} catch (error) {
|
|
213
361
|
console.error('启动浏览器失败:', error.message);
|
|
214
362
|
debugFn('init: 浏览器启动失败 -', error.stack);
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* DeepSpider - Session 管理
|
|
3
|
+
* 基于 SQLite 持久化 session 元数据,支持跨进程恢复
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import Database from 'better-sqlite3';
|
|
7
|
+
import { join } from 'path';
|
|
8
|
+
import { SqliteSaver } from '@langchain/langgraph-checkpoint-sqlite';
|
|
9
|
+
import { DEEPSPIDER_HOME, ensureDir } from '../config/paths.js';
|
|
10
|
+
|
|
11
|
+
const DB_PATH = join(DEEPSPIDER_HOME, 'sessions.db');
|
|
12
|
+
const CHECKPOINT_DB_PATH = join(DEEPSPIDER_HOME, 'checkpoints.db');
|
|
13
|
+
const SESSION_EXPIRE_DAYS = 7;
|
|
14
|
+
|
|
15
|
+
let _db = null;
|
|
16
|
+
|
|
17
|
+
function getDb() {
|
|
18
|
+
if (!_db) {
|
|
19
|
+
ensureDir(DEEPSPIDER_HOME);
|
|
20
|
+
_db = new Database(DB_PATH);
|
|
21
|
+
_db.pragma('journal_mode=WAL');
|
|
22
|
+
_db.exec(`
|
|
23
|
+
CREATE TABLE IF NOT EXISTS sessions (
|
|
24
|
+
thread_id TEXT PRIMARY KEY,
|
|
25
|
+
domain TEXT NOT NULL,
|
|
26
|
+
url TEXT NOT NULL,
|
|
27
|
+
created_at INTEGER NOT NULL,
|
|
28
|
+
updated_at INTEGER NOT NULL,
|
|
29
|
+
message_count INTEGER DEFAULT 0,
|
|
30
|
+
status TEXT DEFAULT 'active'
|
|
31
|
+
)`);
|
|
32
|
+
}
|
|
33
|
+
return _db;
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
/**
|
|
37
|
+
* 创建 LangGraph checkpointer(独立 DB 文件,避免与 session 元数据竞争)
|
|
38
|
+
*/
|
|
39
|
+
export function createCheckpointer() {
|
|
40
|
+
ensureDir(DEEPSPIDER_HOME);
|
|
41
|
+
return SqliteSaver.fromConnString(CHECKPOINT_DB_PATH);
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
/**
|
|
45
|
+
* 生成 thread_id
|
|
46
|
+
*/
|
|
47
|
+
export function generateThreadId(domain) {
|
|
48
|
+
return `deepspider-${domain}-${Date.now()}`;
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
/**
|
|
52
|
+
* 创建新 session
|
|
53
|
+
*/
|
|
54
|
+
export function createSession(threadId, domain, url) {
|
|
55
|
+
const now = Date.now();
|
|
56
|
+
getDb().prepare(
|
|
57
|
+
'INSERT INTO sessions (thread_id, domain, url, created_at, updated_at) VALUES (?, ?, ?, ?, ?)'
|
|
58
|
+
).run(threadId, domain, url, now, now);
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
/**
|
|
62
|
+
* 更新 session 活跃时间和消息数
|
|
63
|
+
*/
|
|
64
|
+
export function touchSession(threadId) {
|
|
65
|
+
getDb().prepare('UPDATE sessions SET updated_at = ?, message_count = message_count + 1 WHERE thread_id = ?')
|
|
66
|
+
.run(Date.now(), threadId);
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
/**
|
|
70
|
+
* 清理过期 session
|
|
71
|
+
*/
|
|
72
|
+
export function cleanExpiredSessions() {
|
|
73
|
+
const cutoff = Date.now() - SESSION_EXPIRE_DAYS * 86400000;
|
|
74
|
+
getDb().prepare('DELETE FROM sessions WHERE updated_at < ?').run(cutoff);
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
/**
|
|
78
|
+
* 列出可恢复的 session(按域名过滤,7天内)
|
|
79
|
+
*/
|
|
80
|
+
export function listSessions(domain = null) {
|
|
81
|
+
const db = getDb();
|
|
82
|
+
const cutoff = Date.now() - SESSION_EXPIRE_DAYS * 86400000;
|
|
83
|
+
const sql = domain
|
|
84
|
+
? 'SELECT * FROM sessions WHERE domain = ? AND status = ? AND updated_at >= ? ORDER BY updated_at DESC'
|
|
85
|
+
: 'SELECT * FROM sessions WHERE status = ? AND updated_at >= ? ORDER BY updated_at DESC';
|
|
86
|
+
const params = domain ? [domain, 'active', cutoff] : ['active', cutoff];
|
|
87
|
+
return db.prepare(sql).all(...params);
|
|
88
|
+
}
|
|
@@ -1,28 +1,103 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: anti-detect
|
|
3
3
|
description: |
|
|
4
|
-
反检测经验。浏览器指纹、代理IP
|
|
4
|
+
反检测经验。浏览器指纹、代理IP、TLS指纹、行为检测、风控规避技巧。
|
|
5
|
+
触发:反爬绕过、IP封禁、指纹检测、风控拦截、403/429处理。
|
|
5
6
|
---
|
|
6
7
|
|
|
7
8
|
# 反检测经验
|
|
8
9
|
|
|
10
|
+
## 快速诊断流程
|
|
11
|
+
|
|
12
|
+
```
|
|
13
|
+
请求被拦截?
|
|
14
|
+
├── 403 Forbidden
|
|
15
|
+
│ ├── 换 IP 后正常 → IP 黑名单
|
|
16
|
+
│ ├── 换 IP 仍 403 → 指纹/TLS 检测
|
|
17
|
+
│ └── 带 Cookie 正常 → Cookie 验证
|
|
18
|
+
├── 429 Too Many Requests → 频率限制
|
|
19
|
+
├── 返回验证码页面 → 风控触发(转 captcha)
|
|
20
|
+
├── 返回空数据/假数据 → 静默风控
|
|
21
|
+
└── JS 渲染异常 → 环境检测
|
|
22
|
+
```
|
|
23
|
+
|
|
9
24
|
## 浏览器指纹
|
|
10
25
|
|
|
11
|
-
###
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
26
|
+
### 检测点与绕过
|
|
27
|
+
|
|
28
|
+
| 检测项 | 检测方式 | 绕过策略 |
|
|
29
|
+
|--------|----------|----------|
|
|
30
|
+
| webdriver | `navigator.webdriver` | Patchright 已自动处理 |
|
|
31
|
+
| chrome 对象 | `window.chrome` 存在性 | Patchright 已自动处理 |
|
|
32
|
+
| Canvas | `toDataURL()` 哈希 | 注入噪声或固定返回值 |
|
|
33
|
+
| WebGL | `getParameter()` 渲染器信息 | 伪造 vendor/renderer 字符串 |
|
|
34
|
+
| Audio | `AudioContext` 指纹 | 固定 oscillator 输出 |
|
|
35
|
+
| 字体 | `measureText()` 宽度差异 | 安装常见字体集 |
|
|
36
|
+
| 屏幕 | `screen.width/height` | 设置 viewport 匹配常见分辨率 |
|
|
37
|
+
| 插件 | `navigator.plugins.length` | 注入常见插件列表 |
|
|
38
|
+
|
|
39
|
+
### 指纹一致性原则
|
|
40
|
+
- 同一 Profile 内所有指纹项必须自洽(如 UA 说 Windows 但 platform 说 MacIntel 会被检测)
|
|
41
|
+
- User-Agent 与 navigator 属性、屏幕分辨率、时区要匹配
|
|
42
|
+
- 持久化 Profile 复用,避免每次生成新指纹
|
|
16
43
|
|
|
17
44
|
## 代理 IP
|
|
18
45
|
|
|
19
|
-
###
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
46
|
+
### 代理选型
|
|
47
|
+
|
|
48
|
+
| 类型 | 适用场景 | 特点 |
|
|
49
|
+
|------|----------|------|
|
|
50
|
+
| 数据中心代理 | 大规模采集、对 IP 质量要求不高 | 便宜、速度快、易被识别 |
|
|
51
|
+
| 住宅代理 | 反检测要求高的网站 | 贵、IP 质量高、不易被封 |
|
|
52
|
+
| ISP 代理 | 需要固定 IP 的场景 | 稳定、速度快 |
|
|
53
|
+
| 移动代理 | 移动端 API 采集 | IP 池大、信任度高 |
|
|
54
|
+
|
|
55
|
+
### 轮换策略
|
|
56
|
+
- 每个 IP 请求次数上限(根据目标网站调整,通常 10-50 次)
|
|
57
|
+
- 被封后标记冷却时间,不要立即重试
|
|
58
|
+
- 同一 session 保持同一 IP(避免 Cookie 与 IP 绑定检测)
|
|
59
|
+
|
|
60
|
+
## TLS 指纹
|
|
61
|
+
|
|
62
|
+
### JA3/JA4 指纹
|
|
63
|
+
- 原理:TLS 握手中的 cipher suites、extensions 顺序构成唯一指纹
|
|
64
|
+
- requests 库的 JA3 与真实浏览器不同,容易被识别
|
|
65
|
+
- 绕过方案:
|
|
66
|
+
- `curl_cffi`:模拟 Chrome/Firefox 的 TLS 指纹
|
|
67
|
+
- `tls_client`:Go 实现的 TLS 客户端
|
|
68
|
+
- Patchright/Playwright:真实浏览器,指纹天然正确
|
|
69
|
+
|
|
70
|
+
### HTTP/2 指纹
|
|
71
|
+
- 部分网站检测 HTTP/2 的 SETTINGS 帧和优先级
|
|
72
|
+
- requests 不支持 HTTP/2,用 `httpx` 或 `curl_cffi`
|
|
73
|
+
|
|
74
|
+
## 行为检测
|
|
75
|
+
|
|
76
|
+
### 常见检测维度
|
|
77
|
+
- 鼠标轨迹:是否有自然的移动路径
|
|
78
|
+
- 点击间隔:是否过于均匀
|
|
79
|
+
- 滚动行为:是否有自然的加速减速
|
|
80
|
+
- 页面停留时间:是否过短
|
|
81
|
+
- 请求顺序:是否跳过了正常浏览流程(如直接请求 API 不加载页面)
|
|
82
|
+
|
|
83
|
+
### 应对策略
|
|
84
|
+
- 请求间隔随机化:`random.uniform(1, 3)` 秒
|
|
85
|
+
- 模拟正常浏览流程:先请求页面 → 加载静态资源 → 再请求 API
|
|
86
|
+
- Referer 链完整:每个请求的 Referer 要与浏览路径一致
|
|
87
|
+
|
|
88
|
+
## 常见风控系统
|
|
23
89
|
|
|
24
|
-
|
|
90
|
+
| 系统 | 识别特征 | 绕过难度 |
|
|
91
|
+
|------|----------|----------|
|
|
92
|
+
| Cloudflare | `cf-` 前缀 Cookie、JS Challenge | 高(建议用浏览器方案) |
|
|
93
|
+
| Akamai | `_abck` Cookie、sensor_data | 高 |
|
|
94
|
+
| PerimeterX | `_px` 前缀 Cookie | 高 |
|
|
95
|
+
| 瑞数信息 | `$_ts` 变量、动态 JS | 极高 |
|
|
96
|
+
| 同盾 | 设备指纹 + 行为分析 | 中高 |
|
|
97
|
+
| 极验 | 滑块/点选验证码 | 中(转 captcha 处理) |
|
|
25
98
|
|
|
26
|
-
###
|
|
27
|
-
-
|
|
28
|
-
-
|
|
99
|
+
### 通用原则
|
|
100
|
+
- 能用请求重放就不用浏览器(性能好)
|
|
101
|
+
- 请求重放被拦截再升级到浏览器方案
|
|
102
|
+
- 浏览器方案优先用 Patchright(反检测最好)
|
|
103
|
+
- 遇到瑞数/Akamai 等高强度风控,考虑补环境方案或放弃纯请求
|
|
@@ -2,36 +2,110 @@
|
|
|
2
2
|
name: captcha
|
|
3
3
|
description: |
|
|
4
4
|
验证码处理经验。滑块验证码、图片验证码、点选验证码的识别与绕过技巧。
|
|
5
|
+
触发:验证码识别、滑块绕过、OCR、点选验证码、验证码拦截。
|
|
5
6
|
---
|
|
6
7
|
|
|
7
8
|
# 验证码处理经验
|
|
8
9
|
|
|
10
|
+
## 快速识别流程
|
|
11
|
+
|
|
12
|
+
```
|
|
13
|
+
页面出现验证码?
|
|
14
|
+
├── 有滑动条 + 背景图 + 滑块图 → 滑块验证码
|
|
15
|
+
├── 有图片 + 输入框 → 图片验证码(OCR)
|
|
16
|
+
├── 有背景图 + 文字/图标提示 → 点选验证码
|
|
17
|
+
├── 有旋转图片 → 旋转验证码
|
|
18
|
+
├── 有拼图 → 拼图验证码(类似滑块)
|
|
19
|
+
└── 弹出第三方页面(极验/网易/腾讯)→ 第三方验证码服务
|
|
20
|
+
```
|
|
21
|
+
|
|
9
22
|
## 滑块验证码
|
|
10
23
|
|
|
11
|
-
###
|
|
12
|
-
|
|
13
|
-
|
|
24
|
+
### 缺口检测方法
|
|
25
|
+
|
|
26
|
+
| 方法 | 适用场景 | 精度 |
|
|
27
|
+
|------|----------|------|
|
|
28
|
+
| Canny 边缘检测 | 缺口边缘清晰 | 高 |
|
|
29
|
+
| 模板匹配 | 有独立滑块图片 | 高 |
|
|
30
|
+
| 像素差异对比 | 有完整背景图和缺口背景图 | 最高 |
|
|
31
|
+
| 灰度梯度 | 缺口区域与背景对比度大 | 中 |
|
|
32
|
+
|
|
33
|
+
### 轨迹生成参数
|
|
14
34
|
|
|
15
|
-
|
|
16
|
-
-
|
|
17
|
-
|
|
18
|
-
-
|
|
19
|
-
-
|
|
35
|
+
```
|
|
36
|
+
总时长: 300-800ms(过快被检测,过慢不自然)
|
|
37
|
+
阶段分配:
|
|
38
|
+
- 起步 (0-15%): 加速,速度从 0 到峰值
|
|
39
|
+
- 中段 (15-75%): 匀速或微加速
|
|
40
|
+
- 减速 (75-90%): 明显减速
|
|
41
|
+
- 微调 (90-100%): 缓慢靠近 + 轻微回弹 (1-3px)
|
|
42
|
+
|
|
43
|
+
Y 轴抖动: ±1-2px 随机偏移(纯水平移动会被检测)
|
|
44
|
+
采样间隔: 10-30ms(模拟 requestAnimationFrame)
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
### 常见失败原因
|
|
48
|
+
- 缺口偏移量不准 → 加 ±2px 随机偏移重试
|
|
49
|
+
- 轨迹太规则 → 增加 Y 轴抖动和速度波动
|
|
50
|
+
- 滑动太快 → 总时长调到 500ms+
|
|
51
|
+
- 验证接口校验 → 检查是否需要额外参数(如 trace 数据、加密 token)
|
|
20
52
|
|
|
21
53
|
## 图片验证码
|
|
22
54
|
|
|
23
|
-
### OCR
|
|
24
|
-
|
|
25
|
-
|
|
55
|
+
### OCR 方案选择
|
|
56
|
+
|
|
57
|
+
| 方案 | 识别率 | 成本 | 适用场景 |
|
|
58
|
+
|------|--------|------|----------|
|
|
59
|
+
| ddddocr | 60-80% | 免费 | 简单数字/字母验证码 |
|
|
60
|
+
| Tesseract + 预处理 | 50-70% | 免费 | 清晰文字 |
|
|
61
|
+
| 打码平台 (超级鹰等) | 90%+ | 付费 | 复杂验证码、量大 |
|
|
62
|
+
| 自训练模型 | 95%+ | 开发成本高 | 固定样式、长期使用 |
|
|
26
63
|
|
|
27
|
-
###
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
64
|
+
### 图片预处理流程
|
|
65
|
+
1. 灰度化 → 去除颜色干扰
|
|
66
|
+
2. 二值化 → 分离文字和背景(阈值需调试)
|
|
67
|
+
3. 去噪点 → 中值滤波或形态学操作
|
|
68
|
+
4. 字符分割 → 连通域分析(粘连字符需特殊处理)
|
|
69
|
+
|
|
70
|
+
### 常见失败原因
|
|
71
|
+
- 背景干扰线 → 加强二值化阈值或用形态学开运算
|
|
72
|
+
- 字符粘连 → 投影分割或滴水算法
|
|
73
|
+
- 字体扭曲 → ddddocr 通常比 Tesseract 更好
|
|
74
|
+
- 识别错误 → 刷新验证码重试(最多 3 次)
|
|
31
75
|
|
|
32
76
|
## 点选验证码
|
|
33
77
|
|
|
34
|
-
###
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
78
|
+
### 处理流程
|
|
79
|
+
1. 获取提示信息("请依次点击:X、Y、Z")
|
|
80
|
+
2. 目标检测:定位背景图中每个候选目标的坐标
|
|
81
|
+
3. 匹配:将提示与候选目标对应
|
|
82
|
+
4. 按顺序点击坐标
|
|
83
|
+
|
|
84
|
+
### 难点与应对
|
|
85
|
+
- 文字点选:OCR 识别背景图中的文字位置
|
|
86
|
+
- 图标点选:图像分类匹配提示图标
|
|
87
|
+
- 语义点选("点击最大的"):需要目标检测 + 属性比较
|
|
88
|
+
- 3D/旋转文字:打码平台更可靠
|
|
89
|
+
|
|
90
|
+
## 第三方验证码服务
|
|
91
|
+
|
|
92
|
+
| 服务商 | 常见类型 | 特征 |
|
|
93
|
+
|--------|----------|------|
|
|
94
|
+
| 极验 (GeeTest) | 滑块、点选、九宫格 | `gt` + `challenge` 参数 |
|
|
95
|
+
| 网易易盾 | 滑块、拼图、点选 | `NECaptcha` 对象 |
|
|
96
|
+
| 腾讯防水墙 | 滑块 | `TencentCaptcha` 对象 |
|
|
97
|
+
| 阿里云 | 滑块、智能验证 | `ALIYUN` 前缀 |
|
|
98
|
+
| hCaptcha | 图像分类 | `h-captcha` class |
|
|
99
|
+
| reCAPTCHA | 图像分类、评分 | `g-recaptcha` class |
|
|
100
|
+
|
|
101
|
+
### 通用应对策略
|
|
102
|
+
- 优先尝试绕过:部分验证码有 API 直接获取 token 的方式
|
|
103
|
+
- 浏览器方案:用 Patchright 模拟真实操作
|
|
104
|
+
- 打码平台:极验/hCaptcha 等有专门的打码服务
|
|
105
|
+
- 降低触发频率:通过反检测手段减少验证码出现
|
|
106
|
+
|
|
107
|
+
## 验证失败通用处理
|
|
108
|
+
|
|
109
|
+
1. 第 1 次失败 → 刷新验证码,调整参数重试
|
|
110
|
+
2. 第 2 次失败 → 换方案(如 OCR → 打码平台)
|
|
111
|
+
3. 第 3 次失败 → 返回告知主 agent,建议人工介入或换策略
|