deepspider 0.3.1 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. package/.env.example +3 -0
  2. package/README.md +21 -15
  3. package/package.json +9 -7
  4. package/src/agent/core/PanelBridge.js +56 -78
  5. package/src/agent/core/StreamHandler.js +244 -20
  6. package/src/agent/index.js +120 -23
  7. package/src/agent/logger.js +183 -8
  8. package/src/agent/middleware/memoryFlush.js +48 -0
  9. package/src/agent/middleware/report.js +95 -37
  10. package/src/agent/middleware/subagent.js +236 -0
  11. package/src/agent/middleware/toolAvailability.js +37 -0
  12. package/src/agent/middleware/toolGuard.js +187 -0
  13. package/src/agent/middleware/validationWorkflow.js +171 -0
  14. package/src/agent/prompts/system.js +310 -59
  15. package/src/agent/run.js +168 -20
  16. package/src/agent/sessions.js +88 -0
  17. package/src/agent/skills/anti-detect/SKILL.md +89 -14
  18. package/src/agent/skills/captcha/SKILL.md +93 -19
  19. package/src/agent/skills/crawler/SKILL.md +64 -3
  20. package/src/agent/skills/crawler/evolved.md +9 -1
  21. package/src/agent/skills/dynamic-analysis/SKILL.md +74 -7
  22. package/src/agent/skills/env/SKILL.md +75 -0
  23. package/src/agent/skills/js2python/evolved.md +5 -1
  24. package/src/agent/skills/sandbox/SKILL.md +35 -0
  25. package/src/agent/skills/static-analysis/SKILL.md +98 -2
  26. package/src/agent/skills/static-analysis/evolved.md +5 -1
  27. package/src/agent/subagents/anti-detect.js +36 -24
  28. package/src/agent/subagents/captcha.js +35 -28
  29. package/src/agent/subagents/crawler.js +40 -105
  30. package/src/agent/subagents/factory.js +129 -9
  31. package/src/agent/subagents/index.js +4 -13
  32. package/src/agent/subagents/js2python.js +25 -35
  33. package/src/agent/subagents/reverse.js +180 -0
  34. package/src/agent/tools/analysis.js +101 -8
  35. package/src/agent/tools/anti-detect.js +5 -2
  36. package/src/agent/tools/browser.js +186 -13
  37. package/src/agent/tools/capture.js +24 -3
  38. package/src/agent/tools/correlate.js +129 -15
  39. package/src/agent/tools/crawler.js +3 -2
  40. package/src/agent/tools/crawlerGenerator.js +90 -0
  41. package/src/agent/tools/debug.js +43 -6
  42. package/src/agent/tools/evolve.js +5 -2
  43. package/src/agent/tools/extractor.js +5 -1
  44. package/src/agent/tools/file.js +14 -5
  45. package/src/agent/tools/generateHook.js +66 -0
  46. package/src/agent/tools/hookManager.js +19 -9
  47. package/src/agent/tools/index.js +36 -21
  48. package/src/agent/tools/nodejs.js +41 -6
  49. package/src/agent/tools/patch.js +1 -1
  50. package/src/agent/tools/sandbox.js +21 -1
  51. package/src/agent/tools/scratchpad.js +70 -0
  52. package/src/agent/tools/store.js +1 -1
  53. package/src/agent/tools/tracing.js +26 -0
  54. package/src/agent/tools/verifyAlgorithm.js +117 -0
  55. package/src/browser/EnvBridge.js +27 -13
  56. package/src/browser/client.js +128 -18
  57. package/src/browser/collector.js +101 -22
  58. package/src/browser/defaultHooks.js +3 -1
  59. package/src/browser/hooks/index.js +5 -0
  60. package/src/browser/interceptors/AntiDebugInterceptor.js +132 -0
  61. package/src/browser/interceptors/NetworkInterceptor.js +76 -12
  62. package/src/browser/interceptors/ScriptInterceptor.js +32 -7
  63. package/src/browser/interceptors/index.js +1 -0
  64. package/src/browser/ui/analysisPanel.js +541 -464
  65. package/src/cli/commands/config.js +11 -3
  66. package/src/config/paths.js +9 -1
  67. package/src/config/settings.js +7 -1
  68. package/src/core/PatchGenerator.js +24 -4
  69. package/src/core/Sandbox.js +140 -3
  70. package/src/env/EnvCodeGenerator.js +60 -88
  71. package/src/env/modules/bom/history.js +6 -0
  72. package/src/env/modules/bom/location.js +6 -0
  73. package/src/env/modules/bom/navigator.js +13 -0
  74. package/src/env/modules/bom/screen.js +6 -0
  75. package/src/env/modules/bom/storage.js +7 -0
  76. package/src/env/modules/dom/document.js +14 -0
  77. package/src/env/modules/dom/event.js +4 -0
  78. package/src/env/modules/index.js +27 -10
  79. package/src/env/modules/webapi/fetch.js +4 -0
  80. package/src/env/modules/webapi/url.js +4 -0
  81. package/src/env/modules/webapi/xhr.js +8 -0
  82. package/src/store/DataStore.js +125 -42
  83. package/src/store/Store.js +2 -1
  84. package/src/agent/subagents/dynamic.js +0 -64
  85. package/src/agent/subagents/env-agent.js +0 -82
  86. package/src/agent/subagents/sandbox.js +0 -55
  87. package/src/agent/subagents/static.js +0 -66
package/.env.example CHANGED
@@ -5,6 +5,9 @@ DEEPSPIDER_API_KEY=your_api_key_here
5
5
  DEEPSPIDER_BASE_URL=https://api.openai.com/v1
6
6
  DEEPSPIDER_MODEL=gpt-4o
7
7
 
8
+ # 浏览器持久化(可选,保持登录态)
9
+ # DEEPSPIDER_PERSIST_BROWSER=true
10
+
8
11
  # LangSmith 追踪配置(可选)
9
12
  LANGSMITH_TRACING=true
10
13
  LANGSMITH_API_KEY=your_langsmith_api_key_here
package/README.md CHANGED
@@ -5,6 +5,8 @@
5
5
 
6
6
  > 智能爬虫工程平台 - 基于 DeepAgents + Patchright 的 AI 爬虫 Agent
7
7
 
8
+ [English](README_EN.md)
9
+
8
10
  从 JS 逆向到完整爬虫脚本的一站式 AI Agent 解决方案。
9
11
 
10
12
  ## 特性
@@ -53,6 +55,7 @@ DeepSpider 需要配置 LLM API 才能运行。支持任何兼容 OpenAI 格式
53
55
  | `apiKey` | `DEEPSPIDER_API_KEY` | API 密钥 |
54
56
  | `baseUrl` | `DEEPSPIDER_BASE_URL` | API 地址 |
55
57
  | `model` | `DEEPSPIDER_MODEL` | 模型名称 |
58
+ | `persistBrowserData` | `DEEPSPIDER_PERSIST_BROWSER` | 持久化浏览器数据(保持登录态) |
56
59
 
57
60
  优先级:环境变量 > 配置文件 (`~/.deepspider/config/settings.json`) > 默认值
58
61
 
@@ -92,6 +95,9 @@ deepspider config set model deepseek-chat
92
95
  # 启动 Agent - 指定目标网站
93
96
  deepspider https://example.com
94
97
 
98
+ # 启动 Agent - 持久化浏览器数据(一次性)
99
+ deepspider --persist https://example.com
100
+
95
101
  # 启动 Agent - 纯交互模式
96
102
  deepspider
97
103
 
@@ -103,6 +109,9 @@ deepspider config list # 查看所有配置
103
109
  deepspider config set apiKey sk-xxx
104
110
  deepspider config set model gpt-4o
105
111
 
112
+ # 持久化浏览器数据(需要登录的网站,下次启动自动恢复登录态)
113
+ deepspider config set persistBrowserData true
114
+
106
115
  # 检查更新
107
116
  deepspider update
108
117
  ```
@@ -132,11 +141,15 @@ pnpm test
132
141
 
133
142
  ### 使用流程
134
143
 
135
- 1. **启动**: `pnpm run agent https://target-site.com`
144
+ 1. **启动**: `deepspider https://target-site.com`
136
145
  2. **等待**: 浏览器打开,系统自动记录数据(不消耗 API)
137
146
  3. **操作**: 在网站上登录、翻页、触发目标请求
138
147
  4. **选择**: 点击面板的选择按钮 ⦿,进入选择模式
139
- 5. **分析**: 点击目标数据,确认后发送给 Agent
148
+ 5. **分析**: 点击目标数据元素,选择快捷操作:
149
+ - **追踪数据来源** — 定位选中数据的 API 接口
150
+ - **分析加密参数** — 识别并逆向加密参数
151
+ - **完整分析并生成爬虫** — 端到端:逆向、验证、生成代码
152
+ - **提取页面结构** — 分析 DOM 结构,生成选择器和字段配置
140
153
  6. **对话**: 在面板或 CLI 继续提问,深入分析
141
154
 
142
155
  ## 架构
@@ -150,19 +163,14 @@ pnpm test
150
163
  ┌───────────────┼───────────────┐
151
164
  ▼ ▼ ▼
152
165
  ┌─────────────┐ ┌─────────────┐ ┌─────────────┐
153
- static-agent │ │captcha-agent│ │anti-detect │
154
- 静态分析 │ │ 验证码处理 │ │ 反检测 │
166
+ reverse-agent│ │captcha-agent│ │anti-detect │
167
+ 逆向分析 │ │ 验证码处理 │ │ 反检测 │
155
168
  └──────┬──────┘ └─────────────┘ └─────────────┘
156
169
 
157
170
  ┌─────────────┐
158
- dynamic-agent
159
- 动态调试
160
- └──────┬──────┘
161
-
162
- ┌─────────────┐ ┌─────────────┐
163
- │sandbox-agent│ ──▶ │js2python │
164
- │ 沙箱验证 │ │ 代码转换 │
165
- └─────────────┘ └─────────────┘
171
+ js2python
172
+ 代码转换
173
+ └─────────────┘
166
174
  ```
167
175
 
168
176
  ### 子代理体系
@@ -170,9 +178,7 @@ pnpm test
170
178
  | 子代理 | 职责 | 核心工具 |
171
179
  |--------|------|----------|
172
180
  | crawler | 爬虫编排:整合各模块、生成完整脚本 | file, store, crawler |
173
- | static | 静态分析:解包、反混淆、加密定位 | webcrack, deobfuscate, analyze |
174
- | dynamic | 动态分析:浏览器控制、Hook、数据采集 | browser, debug, capture |
175
- | sandbox | 沙箱执行:环境补全、代码执行 | sandbox, env, patch |
181
+ | reverse | 逆向分析全流程:反混淆、断点调试、Hook、沙箱验证、补环境 | tracing, deobfuscate, debug, capture, sandbox, env |
176
182
  | js2python | JS转Python:加密代码转换、验证 | python, analyzer |
177
183
  | captcha | 验证码处理:OCR、滑块、点选 | captcha_ocr, captcha_slide |
178
184
  | anti-detect | 反检测:指纹管理、代理池 | proxy, fingerprint |
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "deepspider",
3
- "version": "0.3.1",
3
+ "version": "0.4.0",
4
4
  "description": "智能爬虫工程平台 - 基于 DeepAgents + Patchright 的 AI 爬虫 Agent",
5
5
  "type": "module",
6
6
  "main": "src/index.js",
@@ -19,11 +19,11 @@
19
19
  "cli": "node bin/cli.js",
20
20
  "mcp": "node src/mcp/server.js",
21
21
  "agent": "node bin/cli.js",
22
- "test": "node --test test/",
22
+ "test": "node --test 'test/*.test.js'",
23
23
  "lint": "eslint src/",
24
24
  "lint:fix": "eslint src/ --fix",
25
25
  "setup:crypto": "uv venv .venv --python 3.11 2>/dev/null || true && uv pip install -r requirements-crypto.txt",
26
- "postinstall": "patchright install chromium && npm rebuild isolated-vm 2>/dev/null || true",
26
+ "postinstall": "patchright install chromium && npm rebuild isolated-vm better-sqlite3 2>/dev/null || true",
27
27
  "prepare": "husky"
28
28
  },
29
29
  "keywords": [
@@ -54,20 +54,22 @@
54
54
  "@babel/parser": "^7.28.6",
55
55
  "@babel/traverse": "^7.28.6",
56
56
  "@babel/types": "^7.28.6",
57
- "@langchain/anthropic": "^1.3.12",
58
- "@langchain/core": "^1.1.17",
57
+ "@langchain/anthropic": "^1.3.17",
58
+ "@langchain/core": "^1.1.24",
59
59
  "@langchain/langgraph": "^1.1.2",
60
+ "@langchain/langgraph-checkpoint-sqlite": "^1.0.1",
60
61
  "@langchain/openai": "^1.2.3",
61
62
  "@modelcontextprotocol/sdk": "^1.26.0",
63
+ "better-sqlite3": "^12.6.2",
62
64
  "crypto-js": "^4.2.0",
63
- "deepagents": "^1.6.0",
65
+ "deepagents": "^1.7.6",
64
66
  "dotenv": "^17.2.3",
65
67
  "hono": "4.11.7",
66
68
  "isolated-vm": "^6.0.2",
67
69
  "js-md5": "^0.8.3",
68
70
  "js-sha256": "^0.11.1",
69
71
  "jsencrypt": "^3.5.4",
70
- "langchain": "^1.2.15",
72
+ "langchain": "^1.2.24",
71
73
  "marked": "^17.0.1",
72
74
  "patchright": "^1.57.0",
73
75
  "sm-crypto": "^0.4.0",
package/src/agent/core/PanelBridge.js CHANGED
@@ -1,22 +1,12 @@
1
1
  /**
2
2
  * DeepSpider - 面板通信桥接
3
- * 处理与浏览器面板的消息通信
3
+ * 处理与浏览器面板的结构化消息通信
4
4
  */
5
5
 
6
6
  export class PanelBridge {
7
7
  constructor(browserGetter, debugFn = () => {}) {
8
8
  this.getBrowser = browserGetter;
9
9
  this.debug = debugFn;
10
- this.textBuffer = '';
11
- this.hasStartedAssistantMsg = false;
12
- }
13
-
14
- /**
15
- * 重置状态
16
- */
17
- reset() {
18
- this.textBuffer = '';
19
- this.hasStartedAssistantMsg = false;
20
10
  }
21
11
 
22
12
  /**
@@ -28,10 +18,16 @@ export class PanelBridge {
28
18
  if (!cdp) return null;
29
19
 
30
20
  try {
31
- const result = await cdp.send('Runtime.evaluate', {
32
- expression: code,
33
- returnByValue: true,
34
- });
21
+ // 3s 超时:断点暂停时 Runtime.evaluate 会永远挂住,必须限时
22
+ const result = await Promise.race([
23
+ cdp.send('Runtime.evaluate', {
24
+ expression: code,
25
+ returnByValue: true,
26
+ }),
27
+ new Promise((_, reject) =>
28
+ setTimeout(() => reject(new Error('evaluateInPage timeout (debugger paused?)')), 3000)
29
+ ),
30
+ ]);
35
31
  return result.result?.value;
36
32
  } catch (e) {
37
33
  this.debug('evaluateInPage 失败:', e.message);
@@ -40,81 +36,51 @@ export class PanelBridge {
40
36
  }
41
37
 
42
38
  /**
43
- * 发送消息到前端面板
39
+ * 等待面板 JS 初始化完成
44
40
  */
45
- async sendToPanel(role, content) {
46
- if (!content?.trim()) return;
47
-
48
- const browser = this.getBrowser();
49
- const page = browser?.getPage?.();
50
- if (!page) return;
51
-
52
- try {
53
- const escaped = JSON.stringify(content.trim());
54
- const code = `window.__deepspider__?.addMessage?.('${role}', ${escaped})`;
55
- await this.evaluateInPage(code);
56
- } catch {
57
- // ignore
41
+ async waitForPanel(timeoutMs = 5000) {
42
+ const start = Date.now();
43
+ while (Date.now() - start < timeoutMs) {
44
+ const ready = await this.evaluateInPage('!!window.__deepspider__?.addStructuredMessage');
45
+ if (ready) return true;
46
+ await new Promise(r => setTimeout(r, 200));
58
47
  }
48
+ return false;
59
49
  }
60
50
 
61
51
  /**
62
- * 累积文本到缓冲区(用于 LLM 流式输出)
52
+ * 批量发送消息到面板(单次 CDP 调用)
63
53
  */
64
- async appendToPanel(text) {
65
- if (!text) return;
66
- this.textBuffer += text;
67
-
68
- // 每累积一定量或遇到换行时刷新
69
- if (this.textBuffer.length > 200 || text.includes('\n')) {
70
- await this.flushPanelText();
71
- }
54
+ async sendBatch(messages) {
55
+ if (!messages?.length) return;
56
+ const escaped = JSON.stringify(messages);
57
+ await this.evaluateInPage(
58
+ `(function(msgs){var ds=window.__deepspider__;if(!ds)return;msgs.forEach(function(m){ds.addStructuredMessage?.(m.type,m.data);})})(${escaped})`
59
+ );
72
60
  }
73
61
 
74
62
  /**
75
- * 刷新累积的文本到面板
63
+ * 发送结构化消息到前端面板
76
64
  */
77
- async flushPanelText() {
78
- if (!this.textBuffer.trim()) return;
79
-
80
- const browser = this.getBrowser();
81
- const page = browser?.getPage?.();
82
- if (!page) {
83
- this.textBuffer = '';
84
- return;
85
- }
86
-
65
+ async sendMessage(type, data) {
87
66
  try {
88
- const content = this.textBuffer.trim();
89
- const escaped = JSON.stringify(content);
90
-
91
- if (!this.hasStartedAssistantMsg) {
92
- const code = `(function() {
93
- const fn = window.__deepspider__?.addMessage;
94
- if (typeof fn === 'function') {
95
- fn('assistant', ${escaped});
96
- return { ok: true };
97
- }
98
- return { ok: false };
99
- })()`;
100
- await this.evaluateInPage(code);
101
- this.hasStartedAssistantMsg = true;
102
- } else {
103
- const code = `(function() {
104
- const fn = window.__deepspider__?.appendToLastMessage;
105
- if (typeof fn === 'function') {
106
- fn('assistant', ${escaped});
107
- return { ok: true };
108
- }
109
- return { ok: false };
110
- })()`;
111
- await this.evaluateInPage(code);
112
- }
67
+ const escapedType = JSON.stringify(type);
68
+ const escapedData = JSON.stringify(data);
69
+ await this.evaluateInPage(
70
+ `window.__deepspider__?.addStructuredMessage?.(${escapedType}, ${escapedData})`
71
+ );
113
72
  } catch {
114
73
  // ignore
115
74
  }
75
+ }
116
76
 
117
- this.textBuffer = '';
77
+ /**
78
+ * 按 role 发送消息到前端面板
79
+ */
80
+ async sendToPanel(role, content) {
81
+ if (!content?.trim()) return;
82
+ const type = role === 'system' ? 'system' : role === 'user' ? 'user' : 'text';
83
+ await this.sendMessage(type, { content: content.trim() });
118
84
  }
119
85
 
120
86
  /**
@@ -125,9 +91,21 @@ export class PanelBridge {
125
91
  }
126
92
 
127
93
  /**
128
- * 完成消息,触发渲染
94
+ * 删除面板中最后一条 assistant 消息
95
+ * 用于 interrupt 场景:LLM 在调用 interrupt 工具前输出的冗余描述文字需要清除
129
96
  */
130
- async finalizeMessage(role) {
131
- await this.evaluateInPage(`window.__deepspider__?.finalizeMessage?.("${role}")`);
97
+ async removeLastAssistantMessage() {
98
+ await this.evaluateInPage(`
99
+ (function() {
100
+ const ds = window.__deepspider__;
101
+ if (!ds?.chatMessages) return;
102
+ for (let i = ds.chatMessages.length - 1; i >= 0; i--) {
103
+ if (ds.chatMessages[i].role === 'assistant') {
104
+ ds.chatMessages.splice(i, 1);
105
+ break;
106
+ }
107
+ }
108
+ })()
109
+ `);
132
110
  }
133
111
  }