deepspider 0.3.1 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +3 -0
- package/README.md +21 -15
- package/package.json +9 -7
- package/src/agent/core/PanelBridge.js +56 -78
- package/src/agent/core/StreamHandler.js +244 -20
- package/src/agent/index.js +120 -23
- package/src/agent/logger.js +183 -8
- package/src/agent/middleware/memoryFlush.js +48 -0
- package/src/agent/middleware/report.js +95 -37
- package/src/agent/middleware/subagent.js +236 -0
- package/src/agent/middleware/toolAvailability.js +37 -0
- package/src/agent/middleware/toolGuard.js +187 -0
- package/src/agent/middleware/validationWorkflow.js +171 -0
- package/src/agent/prompts/system.js +310 -59
- package/src/agent/run.js +168 -20
- package/src/agent/sessions.js +88 -0
- package/src/agent/skills/anti-detect/SKILL.md +89 -14
- package/src/agent/skills/captcha/SKILL.md +93 -19
- package/src/agent/skills/crawler/SKILL.md +64 -3
- package/src/agent/skills/crawler/evolved.md +9 -1
- package/src/agent/skills/dynamic-analysis/SKILL.md +74 -7
- package/src/agent/skills/env/SKILL.md +75 -0
- package/src/agent/skills/js2python/evolved.md +5 -1
- package/src/agent/skills/sandbox/SKILL.md +35 -0
- package/src/agent/skills/static-analysis/SKILL.md +98 -2
- package/src/agent/skills/static-analysis/evolved.md +5 -1
- package/src/agent/subagents/anti-detect.js +36 -24
- package/src/agent/subagents/captcha.js +35 -28
- package/src/agent/subagents/crawler.js +40 -105
- package/src/agent/subagents/factory.js +129 -9
- package/src/agent/subagents/index.js +4 -13
- package/src/agent/subagents/js2python.js +25 -35
- package/src/agent/subagents/reverse.js +180 -0
- package/src/agent/tools/analysis.js +101 -8
- package/src/agent/tools/anti-detect.js +5 -2
- package/src/agent/tools/browser.js +186 -13
- package/src/agent/tools/capture.js +24 -3
- package/src/agent/tools/correlate.js +129 -15
- package/src/agent/tools/crawler.js +3 -2
- package/src/agent/tools/crawlerGenerator.js +90 -0
- package/src/agent/tools/debug.js +43 -6
- package/src/agent/tools/evolve.js +5 -2
- package/src/agent/tools/extractor.js +5 -1
- package/src/agent/tools/file.js +14 -5
- package/src/agent/tools/generateHook.js +66 -0
- package/src/agent/tools/hookManager.js +19 -9
- package/src/agent/tools/index.js +36 -21
- package/src/agent/tools/nodejs.js +41 -6
- package/src/agent/tools/patch.js +1 -1
- package/src/agent/tools/sandbox.js +21 -1
- package/src/agent/tools/scratchpad.js +70 -0
- package/src/agent/tools/store.js +1 -1
- package/src/agent/tools/tracing.js +26 -0
- package/src/agent/tools/verifyAlgorithm.js +117 -0
- package/src/browser/EnvBridge.js +27 -13
- package/src/browser/client.js +128 -18
- package/src/browser/collector.js +101 -22
- package/src/browser/defaultHooks.js +3 -1
- package/src/browser/hooks/index.js +5 -0
- package/src/browser/interceptors/AntiDebugInterceptor.js +132 -0
- package/src/browser/interceptors/NetworkInterceptor.js +76 -12
- package/src/browser/interceptors/ScriptInterceptor.js +32 -7
- package/src/browser/interceptors/index.js +1 -0
- package/src/browser/ui/analysisPanel.js +541 -464
- package/src/cli/commands/config.js +11 -3
- package/src/config/paths.js +9 -1
- package/src/config/settings.js +7 -1
- package/src/core/PatchGenerator.js +24 -4
- package/src/core/Sandbox.js +140 -3
- package/src/env/EnvCodeGenerator.js +60 -88
- package/src/env/modules/bom/history.js +6 -0
- package/src/env/modules/bom/location.js +6 -0
- package/src/env/modules/bom/navigator.js +13 -0
- package/src/env/modules/bom/screen.js +6 -0
- package/src/env/modules/bom/storage.js +7 -0
- package/src/env/modules/dom/document.js +14 -0
- package/src/env/modules/dom/event.js +4 -0
- package/src/env/modules/index.js +27 -10
- package/src/env/modules/webapi/fetch.js +4 -0
- package/src/env/modules/webapi/url.js +4 -0
- package/src/env/modules/webapi/xhr.js +8 -0
- package/src/store/DataStore.js +125 -42
- package/src/store/Store.js +2 -1
- package/src/agent/subagents/dynamic.js +0 -64
- package/src/agent/subagents/env-agent.js +0 -82
- package/src/agent/subagents/sandbox.js +0 -55
- package/src/agent/subagents/static.js +0 -66
package/.env.example
CHANGED
|
@@ -5,6 +5,9 @@ DEEPSPIDER_API_KEY=your_api_key_here
|
|
|
5
5
|
DEEPSPIDER_BASE_URL=https://api.openai.com/v1
|
|
6
6
|
DEEPSPIDER_MODEL=gpt-4o
|
|
7
7
|
|
|
8
|
+
# 浏览器持久化(可选,保持登录态)
|
|
9
|
+
# DEEPSPIDER_PERSIST_BROWSER=true
|
|
10
|
+
|
|
8
11
|
# LangSmith 追踪配置(可选)
|
|
9
12
|
LANGSMITH_TRACING=true
|
|
10
13
|
LANGSMITH_API_KEY=your_langsmith_api_key_here
|
package/README.md
CHANGED
|
@@ -5,6 +5,8 @@
|
|
|
5
5
|
|
|
6
6
|
> 智能爬虫工程平台 - 基于 DeepAgents + Patchright 的 AI 爬虫 Agent
|
|
7
7
|
|
|
8
|
+
[English](README_EN.md)
|
|
9
|
+
|
|
8
10
|
从 JS 逆向到完整爬虫脚本的一站式 AI Agent 解决方案。
|
|
9
11
|
|
|
10
12
|
## 特性
|
|
@@ -53,6 +55,7 @@ DeepSpider 需要配置 LLM API 才能运行。支持任何兼容 OpenAI 格式
|
|
|
53
55
|
| `apiKey` | `DEEPSPIDER_API_KEY` | API 密钥 |
|
|
54
56
|
| `baseUrl` | `DEEPSPIDER_BASE_URL` | API 地址 |
|
|
55
57
|
| `model` | `DEEPSPIDER_MODEL` | 模型名称 |
|
|
58
|
+
| `persistBrowserData` | `DEEPSPIDER_PERSIST_BROWSER` | 持久化浏览器数据(保持登录态) |
|
|
56
59
|
|
|
57
60
|
优先级:环境变量 > 配置文件 (`~/.deepspider/config/settings.json`) > 默认值
|
|
58
61
|
|
|
@@ -92,6 +95,9 @@ deepspider config set model deepseek-chat
|
|
|
92
95
|
# 启动 Agent - 指定目标网站
|
|
93
96
|
deepspider https://example.com
|
|
94
97
|
|
|
98
|
+
# 启动 Agent - 持久化浏览器数据(一次性)
|
|
99
|
+
deepspider --persist https://example.com
|
|
100
|
+
|
|
95
101
|
# 启动 Agent - 纯交互模式
|
|
96
102
|
deepspider
|
|
97
103
|
|
|
@@ -103,6 +109,9 @@ deepspider config list # 查看所有配置
|
|
|
103
109
|
deepspider config set apiKey sk-xxx
|
|
104
110
|
deepspider config set model gpt-4o
|
|
105
111
|
|
|
112
|
+
# 持久化浏览器数据(需要登录的网站,下次启动自动恢复登录态)
|
|
113
|
+
deepspider config set persistBrowserData true
|
|
114
|
+
|
|
106
115
|
# 检查更新
|
|
107
116
|
deepspider update
|
|
108
117
|
```
|
|
@@ -132,11 +141,15 @@ pnpm test
|
|
|
132
141
|
|
|
133
142
|
### 使用流程
|
|
134
143
|
|
|
135
|
-
1. **启动**: `
|
|
144
|
+
1. **启动**: `deepspider https://target-site.com`
|
|
136
145
|
2. **等待**: 浏览器打开,系统自动记录数据(不消耗 API)
|
|
137
146
|
3. **操作**: 在网站上登录、翻页、触发目标请求
|
|
138
147
|
4. **选择**: 点击面板的选择按钮 ⦿,进入选择模式
|
|
139
|
-
5. **分析**:
|
|
148
|
+
5. **分析**: 点击目标数据元素,选择快捷操作:
|
|
149
|
+
- **追踪数据来源** — 定位选中数据的 API 接口
|
|
150
|
+
- **分析加密参数** — 识别并逆向加密参数
|
|
151
|
+
- **完整分析并生成爬虫** — 端到端:逆向、验证、生成代码
|
|
152
|
+
- **提取页面结构** — 分析 DOM 结构,生成选择器和字段配置
|
|
140
153
|
6. **对话**: 在面板或 CLI 继续提问,深入分析
|
|
141
154
|
|
|
142
155
|
## 架构
|
|
@@ -150,19 +163,14 @@ pnpm test
|
|
|
150
163
|
┌───────────────┼───────────────┐
|
|
151
164
|
▼ ▼ ▼
|
|
152
165
|
┌─────────────┐ ┌─────────────┐ ┌─────────────┐
|
|
153
|
-
│
|
|
154
|
-
│
|
|
166
|
+
│reverse-agent│ │captcha-agent│ │anti-detect │
|
|
167
|
+
│ 逆向分析 │ │ 验证码处理 │ │ 反检测 │
|
|
155
168
|
└──────┬──────┘ └─────────────┘ └─────────────┘
|
|
156
169
|
▼
|
|
157
170
|
┌─────────────┐
|
|
158
|
-
│
|
|
159
|
-
│
|
|
160
|
-
|
|
161
|
-
▼
|
|
162
|
-
┌─────────────┐ ┌─────────────┐
|
|
163
|
-
│sandbox-agent│ ──▶ │js2python │
|
|
164
|
-
│ 沙箱验证 │ │ 代码转换 │
|
|
165
|
-
└─────────────┘ └─────────────┘
|
|
171
|
+
│js2python │
|
|
172
|
+
│ 代码转换 │
|
|
173
|
+
└─────────────┘
|
|
166
174
|
```
|
|
167
175
|
|
|
168
176
|
### 子代理体系
|
|
@@ -170,9 +178,7 @@ pnpm test
|
|
|
170
178
|
| 子代理 | 职责 | 核心工具 |
|
|
171
179
|
|--------|------|----------|
|
|
172
180
|
| crawler | 爬虫编排:整合各模块、生成完整脚本 | file, store, crawler |
|
|
173
|
-
|
|
|
174
|
-
| dynamic | 动态分析:浏览器控制、Hook、数据采集 | browser, debug, capture |
|
|
175
|
-
| sandbox | 沙箱执行:环境补全、代码执行 | sandbox, env, patch |
|
|
181
|
+
| reverse | 逆向分析全流程:反混淆、断点调试、Hook、沙箱验证、补环境 | tracing, deobfuscate, debug, capture, sandbox, env |
|
|
176
182
|
| js2python | JS转Python:加密代码转换、验证 | python, analyzer |
|
|
177
183
|
| captcha | 验证码处理:OCR、滑块、点选 | captcha_ocr, captcha_slide |
|
|
178
184
|
| anti-detect | 反检测:指纹管理、代理池 | proxy, fingerprint |
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "deepspider",
|
|
3
|
-
"version": "0.3.1",
|
|
3
|
+
"version": "0.4.0",
|
|
4
4
|
"description": "智能爬虫工程平台 - 基于 DeepAgents + Patchright 的 AI 爬虫 Agent",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "src/index.js",
|
|
@@ -19,11 +19,11 @@
|
|
|
19
19
|
"cli": "node bin/cli.js",
|
|
20
20
|
"mcp": "node src/mcp/server.js",
|
|
21
21
|
"agent": "node bin/cli.js",
|
|
22
|
-
"test": "node --test test
|
|
22
|
+
"test": "node --test 'test/*.test.js'",
|
|
23
23
|
"lint": "eslint src/",
|
|
24
24
|
"lint:fix": "eslint src/ --fix",
|
|
25
25
|
"setup:crypto": "uv venv .venv --python 3.11 2>/dev/null || true && uv pip install -r requirements-crypto.txt",
|
|
26
|
-
"postinstall": "patchright install chromium && npm rebuild isolated-vm 2>/dev/null || true",
|
|
26
|
+
"postinstall": "patchright install chromium && npm rebuild isolated-vm better-sqlite3 2>/dev/null || true",
|
|
27
27
|
"prepare": "husky"
|
|
28
28
|
},
|
|
29
29
|
"keywords": [
|
|
@@ -54,20 +54,22 @@
|
|
|
54
54
|
"@babel/parser": "^7.28.6",
|
|
55
55
|
"@babel/traverse": "^7.28.6",
|
|
56
56
|
"@babel/types": "^7.28.6",
|
|
57
|
-
"@langchain/anthropic": "^1.3.
|
|
58
|
-
"@langchain/core": "^1.1.
|
|
57
|
+
"@langchain/anthropic": "^1.3.17",
|
|
58
|
+
"@langchain/core": "^1.1.24",
|
|
59
59
|
"@langchain/langgraph": "^1.1.2",
|
|
60
|
+
"@langchain/langgraph-checkpoint-sqlite": "^1.0.1",
|
|
60
61
|
"@langchain/openai": "^1.2.3",
|
|
61
62
|
"@modelcontextprotocol/sdk": "^1.26.0",
|
|
63
|
+
"better-sqlite3": "^12.6.2",
|
|
62
64
|
"crypto-js": "^4.2.0",
|
|
63
|
-
"deepagents": "^1.6
|
|
65
|
+
"deepagents": "^1.7.6",
|
|
64
66
|
"dotenv": "^17.2.3",
|
|
65
67
|
"hono": "4.11.7",
|
|
66
68
|
"isolated-vm": "^6.0.2",
|
|
67
69
|
"js-md5": "^0.8.3",
|
|
68
70
|
"js-sha256": "^0.11.1",
|
|
69
71
|
"jsencrypt": "^3.5.4",
|
|
70
|
-
"langchain": "^1.2.
|
|
72
|
+
"langchain": "^1.2.24",
|
|
71
73
|
"marked": "^17.0.1",
|
|
72
74
|
"patchright": "^1.57.0",
|
|
73
75
|
"sm-crypto": "^0.4.0",
|
|
package/src/agent/core/PanelBridge.js
CHANGED
|
@@ -1,22 +1,12 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* DeepSpider - 面板通信桥接
|
|
3
|
-
*
|
|
3
|
+
* 处理与浏览器面板的结构化消息通信
|
|
4
4
|
*/
|
|
5
5
|
|
|
6
6
|
export class PanelBridge {
|
|
7
7
|
constructor(browserGetter, debugFn = () => {}) {
|
|
8
8
|
this.getBrowser = browserGetter;
|
|
9
9
|
this.debug = debugFn;
|
|
10
|
-
this.textBuffer = '';
|
|
11
|
-
this.hasStartedAssistantMsg = false;
|
|
12
|
-
}
|
|
13
|
-
|
|
14
|
-
/**
|
|
15
|
-
* 重置状态
|
|
16
|
-
*/
|
|
17
|
-
reset() {
|
|
18
|
-
this.textBuffer = '';
|
|
19
|
-
this.hasStartedAssistantMsg = false;
|
|
20
10
|
}
|
|
21
11
|
|
|
22
12
|
/**
|
|
@@ -28,10 +18,16 @@ export class PanelBridge {
|
|
|
28
18
|
if (!cdp) return null;
|
|
29
19
|
|
|
30
20
|
try {
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
21
|
+
// 3s 超时:断点暂停时 Runtime.evaluate 会永远挂住,必须限时
|
|
22
|
+
const result = await Promise.race([
|
|
23
|
+
cdp.send('Runtime.evaluate', {
|
|
24
|
+
expression: code,
|
|
25
|
+
returnByValue: true,
|
|
26
|
+
}),
|
|
27
|
+
new Promise((_, reject) =>
|
|
28
|
+
setTimeout(() => reject(new Error('evaluateInPage timeout (debugger paused?)')), 3000)
|
|
29
|
+
),
|
|
30
|
+
]);
|
|
35
31
|
return result.result?.value;
|
|
36
32
|
} catch (e) {
|
|
37
33
|
this.debug('evaluateInPage 失败:', e.message);
|
|
@@ -40,81 +36,51 @@ export class PanelBridge {
|
|
|
40
36
|
}
|
|
41
37
|
|
|
42
38
|
/**
|
|
43
|
-
*
|
|
39
|
+
* 等待面板 JS 初始化完成
|
|
44
40
|
*/
|
|
45
|
-
async
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
try {
|
|
53
|
-
const escaped = JSON.stringify(content.trim());
|
|
54
|
-
const code = `window.__deepspider__?.addMessage?.('${role}', ${escaped})`;
|
|
55
|
-
await this.evaluateInPage(code);
|
|
56
|
-
} catch {
|
|
57
|
-
// ignore
|
|
41
|
+
async waitForPanel(timeoutMs = 5000) {
|
|
42
|
+
const start = Date.now();
|
|
43
|
+
while (Date.now() - start < timeoutMs) {
|
|
44
|
+
const ready = await this.evaluateInPage('!!window.__deepspider__?.addStructuredMessage');
|
|
45
|
+
if (ready) return true;
|
|
46
|
+
await new Promise(r => setTimeout(r, 200));
|
|
58
47
|
}
|
|
48
|
+
return false;
|
|
59
49
|
}
|
|
60
50
|
|
|
61
51
|
/**
|
|
62
|
-
*
|
|
52
|
+
* 批量发送消息到面板(单次 CDP 调用)
|
|
63
53
|
*/
|
|
64
|
-
async
|
|
65
|
-
if (!
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
await this.flushPanelText();
|
|
71
|
-
}
|
|
54
|
+
async sendBatch(messages) {
|
|
55
|
+
if (!messages?.length) return;
|
|
56
|
+
const escaped = JSON.stringify(messages);
|
|
57
|
+
await this.evaluateInPage(
|
|
58
|
+
`(function(msgs){var ds=window.__deepspider__;if(!ds)return;msgs.forEach(function(m){ds.addStructuredMessage?.(m.type,m.data);})})(${escaped})`
|
|
59
|
+
);
|
|
72
60
|
}
|
|
73
61
|
|
|
74
62
|
/**
|
|
75
|
-
*
|
|
63
|
+
* 发送结构化消息到前端面板
|
|
76
64
|
*/
|
|
77
|
-
async
|
|
78
|
-
if (!this.textBuffer.trim()) return;
|
|
79
|
-
|
|
80
|
-
const browser = this.getBrowser();
|
|
81
|
-
const page = browser?.getPage?.();
|
|
82
|
-
if (!page) {
|
|
83
|
-
this.textBuffer = '';
|
|
84
|
-
return;
|
|
85
|
-
}
|
|
86
|
-
|
|
65
|
+
async sendMessage(type, data) {
|
|
87
66
|
try {
|
|
88
|
-
const
|
|
89
|
-
const
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
const fn = window.__deepspider__?.addMessage;
|
|
94
|
-
if (typeof fn === 'function') {
|
|
95
|
-
fn('assistant', ${escaped});
|
|
96
|
-
return { ok: true };
|
|
97
|
-
}
|
|
98
|
-
return { ok: false };
|
|
99
|
-
})()`;
|
|
100
|
-
await this.evaluateInPage(code);
|
|
101
|
-
this.hasStartedAssistantMsg = true;
|
|
102
|
-
} else {
|
|
103
|
-
const code = `(function() {
|
|
104
|
-
const fn = window.__deepspider__?.appendToLastMessage;
|
|
105
|
-
if (typeof fn === 'function') {
|
|
106
|
-
fn('assistant', ${escaped});
|
|
107
|
-
return { ok: true };
|
|
108
|
-
}
|
|
109
|
-
return { ok: false };
|
|
110
|
-
})()`;
|
|
111
|
-
await this.evaluateInPage(code);
|
|
112
|
-
}
|
|
67
|
+
const escapedType = JSON.stringify(type);
|
|
68
|
+
const escapedData = JSON.stringify(data);
|
|
69
|
+
await this.evaluateInPage(
|
|
70
|
+
`window.__deepspider__?.addStructuredMessage?.(${escapedType}, ${escapedData})`
|
|
71
|
+
);
|
|
113
72
|
} catch {
|
|
114
73
|
// ignore
|
|
115
74
|
}
|
|
75
|
+
}
|
|
116
76
|
|
|
117
|
-
|
|
77
|
+
/**
|
|
78
|
+
* 按 role 发送消息到前端面板
|
|
79
|
+
*/
|
|
80
|
+
async sendToPanel(role, content) {
|
|
81
|
+
if (!content?.trim()) return;
|
|
82
|
+
const type = role === 'system' ? 'system' : role === 'user' ? 'user' : 'text';
|
|
83
|
+
await this.sendMessage(type, { content: content.trim() });
|
|
118
84
|
}
|
|
119
85
|
|
|
120
86
|
/**
|
|
@@ -125,9 +91,21 @@ export class PanelBridge {
|
|
|
125
91
|
}
|
|
126
92
|
|
|
127
93
|
/**
|
|
128
|
-
*
|
|
94
|
+
* 删除面板中最后一条 assistant 消息
|
|
95
|
+
* 用于 interrupt 场景:LLM 在调用 interrupt 工具前输出的冗余描述文字需要清除
|
|
129
96
|
*/
|
|
130
|
-
async
|
|
131
|
-
await this.evaluateInPage(`
|
|
97
|
+
async removeLastAssistantMessage() {
|
|
98
|
+
await this.evaluateInPage(`
|
|
99
|
+
(function() {
|
|
100
|
+
const ds = window.__deepspider__;
|
|
101
|
+
if (!ds?.chatMessages) return;
|
|
102
|
+
for (let i = ds.chatMessages.length - 1; i >= 0; i--) {
|
|
103
|
+
if (ds.chatMessages[i].role === 'assistant') {
|
|
104
|
+
ds.chatMessages.splice(i, 1);
|
|
105
|
+
break;
|
|
106
|
+
}
|
|
107
|
+
}
|
|
108
|
+
})()
|
|
109
|
+
`);
|
|
132
110
|
}
|
|
133
111
|
}
|