deepspider 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/agents/check.md +122 -0
- package/.claude/agents/debug.md +106 -0
- package/.claude/agents/dispatch.md +214 -0
- package/.claude/agents/implement.md +96 -0
- package/.claude/agents/plan.md +396 -0
- package/.claude/agents/research.md +120 -0
- package/.claude/commands/evolve/merge.md +80 -0
- package/.claude/commands/trellis/before-backend-dev.md +13 -0
- package/.claude/commands/trellis/before-frontend-dev.md +13 -0
- package/.claude/commands/trellis/break-loop.md +107 -0
- package/.claude/commands/trellis/check-backend.md +13 -0
- package/.claude/commands/trellis/check-cross-layer.md +153 -0
- package/.claude/commands/trellis/check-frontend.md +13 -0
- package/.claude/commands/trellis/create-command.md +154 -0
- package/.claude/commands/trellis/finish-work.md +129 -0
- package/.claude/commands/trellis/integrate-skill.md +219 -0
- package/.claude/commands/trellis/onboard.md +358 -0
- package/.claude/commands/trellis/parallel.md +193 -0
- package/.claude/commands/trellis/record-session.md +62 -0
- package/.claude/commands/trellis/start.md +280 -0
- package/.claude/commands/trellis/update-spec.md +213 -0
- package/.claude/hooks/inject-subagent-context.py +758 -0
- package/.claude/hooks/ralph-loop.py +374 -0
- package/.claude/hooks/session-start.py +126 -0
- package/.claude/settings.json +41 -0
- package/.claude/skills/deepagents-guide/SKILL.md +428 -0
- package/.cursor/commands/trellis-before-backend-dev.md +13 -0
- package/.cursor/commands/trellis-before-frontend-dev.md +13 -0
- package/.cursor/commands/trellis-break-loop.md +107 -0
- package/.cursor/commands/trellis-check-backend.md +13 -0
- package/.cursor/commands/trellis-check-cross-layer.md +153 -0
- package/.cursor/commands/trellis-check-frontend.md +13 -0
- package/.cursor/commands/trellis-create-command.md +154 -0
- package/.cursor/commands/trellis-finish-work.md +129 -0
- package/.cursor/commands/trellis-integrate-skill.md +219 -0
- package/.cursor/commands/trellis-onboard.md +358 -0
- package/.cursor/commands/trellis-record-session.md +62 -0
- package/.cursor/commands/trellis-start.md +156 -0
- package/.cursor/commands/trellis-update-spec.md +213 -0
- package/.env.example +11 -0
- package/.husky/pre-commit +1 -0
- package/.mcp.json +8 -0
- package/.trellis/.template-hashes.json +65 -0
- package/.trellis/.version +1 -0
- package/.trellis/scripts/add-session.sh +384 -0
- package/.trellis/scripts/common/developer.sh +129 -0
- package/.trellis/scripts/common/git-context.sh +263 -0
- package/.trellis/scripts/common/paths.sh +208 -0
- package/.trellis/scripts/common/phase.sh +150 -0
- package/.trellis/scripts/common/registry.sh +247 -0
- package/.trellis/scripts/common/task-queue.sh +142 -0
- package/.trellis/scripts/common/task-utils.sh +151 -0
- package/.trellis/scripts/common/worktree.sh +128 -0
- package/.trellis/scripts/create-bootstrap.sh +299 -0
- package/.trellis/scripts/get-context.sh +7 -0
- package/.trellis/scripts/get-developer.sh +15 -0
- package/.trellis/scripts/init-developer.sh +34 -0
- package/.trellis/scripts/multi-agent/cleanup.sh +396 -0
- package/.trellis/scripts/multi-agent/create-pr.sh +241 -0
- package/.trellis/scripts/multi-agent/plan.sh +207 -0
- package/.trellis/scripts/multi-agent/start.sh +310 -0
- package/.trellis/scripts/multi-agent/status.sh +828 -0
- package/.trellis/scripts/task.sh +1118 -0
- package/.trellis/spec/backend/deepagents-guide.md +337 -0
- package/.trellis/spec/backend/directory-structure.md +126 -0
- package/.trellis/spec/backend/examples/skills/deepagents-guide/README.md +11 -0
- package/.trellis/spec/backend/examples/skills/deepagents-guide/agent.js.template +20 -0
- package/.trellis/spec/backend/examples/skills/deepagents-guide/skills-config.js.template +13 -0
- package/.trellis/spec/backend/examples/skills/deepagents-guide/subagent.js.template +19 -0
- package/.trellis/spec/backend/hook-guidelines.md +178 -0
- package/.trellis/spec/backend/index.md +36 -0
- package/.trellis/spec/backend/quality-guidelines.md +201 -0
- package/.trellis/spec/backend/state-management.md +76 -0
- package/.trellis/spec/backend/tool-guidelines.md +144 -0
- package/.trellis/spec/backend/type-safety.md +71 -0
- package/.trellis/spec/guides/code-reuse-thinking-guide.md +92 -0
- package/.trellis/spec/guides/cross-layer-thinking-guide.md +94 -0
- package/.trellis/spec/guides/index.md +79 -0
- package/.trellis/tasks/archive/02-02-evolving-skills/prd.md +61 -0
- package/.trellis/tasks/archive/02-02-evolving-skills/task.json +29 -0
- package/.trellis/tasks/archive/2026-02/00-bootstrap-guidelines/prd.md +86 -0
- package/.trellis/tasks/archive/2026-02/00-bootstrap-guidelines/task.json +27 -0
- package/.trellis/tasks/archive/2026-02/02-02-skills-system/check.jsonl +3 -0
- package/.trellis/tasks/archive/2026-02/02-02-skills-system/debug.jsonl +2 -0
- package/.trellis/tasks/archive/2026-02/02-02-skills-system/implement.jsonl +5 -0
- package/.trellis/tasks/archive/2026-02/02-02-skills-system/prd.md +33 -0
- package/.trellis/tasks/archive/2026-02/02-02-skills-system/task.json +41 -0
- package/.trellis/workflow.md +407 -0
- package/.trellis/workspace/index.md +123 -0
- package/.trellis/workspace/pony/index.md +40 -0
- package/.trellis/workspace/pony/journal-1.md +7 -0
- package/.trellis/worktree.yaml +47 -0
- package/AGENTS.md +18 -0
- package/CLAUDE.md +292 -0
- package/README.md +134 -0
- package/agents/deepspider.md +142 -0
- package/docs/DEBUG.md +42 -0
- package/docs/GUIDE.md +334 -0
- package/docs/PROMPT.md +60 -0
- package/docs/USAGE.md +226 -0
- package/eslint.config.js +51 -0
- package/package.json +78 -0
- package/requirements-crypto.txt +14 -0
- package/src/agent/index.js +97 -0
- package/src/agent/logger.js +164 -0
- package/src/agent/middleware/filterTools.js +64 -0
- package/src/agent/middleware/report.js +79 -0
- package/src/agent/prompts/system.js +315 -0
- package/src/agent/run.js +575 -0
- package/src/agent/skills/anti-detect/SKILL.md +28 -0
- package/src/agent/skills/anti-detect/evolved.md +12 -0
- package/src/agent/skills/captcha/SKILL.md +37 -0
- package/src/agent/skills/captcha/evolved.md +12 -0
- package/src/agent/skills/config.js +30 -0
- package/src/agent/skills/crawler/SKILL.md +9 -0
- package/src/agent/skills/crawler/evolved.md +16 -0
- package/src/agent/skills/dynamic-analysis/SKILL.md +91 -0
- package/src/agent/skills/dynamic-analysis/evolved.md +12 -0
- package/src/agent/skills/env/SKILL.md +72 -0
- package/src/agent/skills/env/evolved.md +12 -0
- package/src/agent/skills/evolve.js +79 -0
- package/src/agent/skills/general/SKILL.md +12 -0
- package/src/agent/skills/general/evolved.md +12 -0
- package/src/agent/skills/js2python/SKILL.md +30 -0
- package/src/agent/skills/js2python/evolved.md +13 -0
- package/src/agent/skills/report/SKILL.md +21 -0
- package/src/agent/skills/report/evolved.md +12 -0
- package/src/agent/skills/sandbox/SKILL.md +22 -0
- package/src/agent/skills/sandbox/evolved.md +16 -0
- package/src/agent/skills/static-analysis/SKILL.md +93 -0
- package/src/agent/skills/static-analysis/evolved.md +12 -0
- package/src/agent/skills/xpath/SKILL.md +119 -0
- package/src/agent/subagents/anti-detect.js +45 -0
- package/src/agent/subagents/captcha.js +51 -0
- package/src/agent/subagents/crawler.js +138 -0
- package/src/agent/subagents/dynamic.js +64 -0
- package/src/agent/subagents/env-agent.js +82 -0
- package/src/agent/subagents/index.js +37 -0
- package/src/agent/subagents/js2python.js +72 -0
- package/src/agent/subagents/sandbox.js +55 -0
- package/src/agent/subagents/static.js +66 -0
- package/src/agent/tools/analysis.js +135 -0
- package/src/agent/tools/analyzer.js +85 -0
- package/src/agent/tools/anti-detect.js +89 -0
- package/src/agent/tools/antidebug.js +64 -0
- package/src/agent/tools/async.js +43 -0
- package/src/agent/tools/browser.js +324 -0
- package/src/agent/tools/captcha.js +223 -0
- package/src/agent/tools/capture.js +179 -0
- package/src/agent/tools/correlate.js +303 -0
- package/src/agent/tools/crawler.js +116 -0
- package/src/agent/tools/cryptohook.js +80 -0
- package/src/agent/tools/debug.js +246 -0
- package/src/agent/tools/deobfuscator.js +90 -0
- package/src/agent/tools/env.js +83 -0
- package/src/agent/tools/envdump.js +92 -0
- package/src/agent/tools/evolve.js +164 -0
- package/src/agent/tools/extract.js +114 -0
- package/src/agent/tools/extractor.js +54 -0
- package/src/agent/tools/file.js +224 -0
- package/src/agent/tools/hook.js +84 -0
- package/src/agent/tools/hookManager.js +178 -0
- package/src/agent/tools/index.js +137 -0
- package/src/agent/tools/nodejs.js +101 -0
- package/src/agent/tools/patch.js +46 -0
- package/src/agent/tools/preprocess.js +71 -0
- package/src/agent/tools/profile.js +122 -0
- package/src/agent/tools/python.js +627 -0
- package/src/agent/tools/report.js +124 -0
- package/src/agent/tools/runtime.js +132 -0
- package/src/agent/tools/sandbox.js +79 -0
- package/src/agent/tools/store.js +73 -0
- package/src/agent/tools/trace.js +74 -0
- package/src/agent/tools/tracing.js +201 -0
- package/src/agent/tools/utils.js +51 -0
- package/src/agent/tools/verify.js +184 -0
- package/src/agent/tools/webcrack.js +109 -0
- package/src/analyzer/ASTAnalyzer.js +387 -0
- package/src/analyzer/CallStackAnalyzer.js +379 -0
- package/src/analyzer/Deobfuscator.js +289 -0
- package/src/analyzer/EncryptionAnalyzer.js +99 -0
- package/src/analyzer/index.js +22 -0
- package/src/browser/EnvBridge.js +186 -0
- package/src/browser/cdp.js +168 -0
- package/src/browser/client.js +197 -0
- package/src/browser/collector.js +444 -0
- package/src/browser/collectors/RequestCryptoLinker.js +109 -0
- package/src/browser/collectors/ResponseSearcher.js +107 -0
- package/src/browser/collectors/ScriptCollector.js +158 -0
- package/src/browser/collectors/index.js +26 -0
- package/src/browser/defaultHooks.js +932 -0
- package/src/browser/hooks/crypto.js +55 -0
- package/src/browser/hooks/index.js +64 -0
- package/src/browser/hooks/native.js +9 -0
- package/src/browser/hooks/network.js +33 -0
- package/src/browser/index.js +42 -0
- package/src/browser/interceptors/NetworkInterceptor.js +116 -0
- package/src/browser/interceptors/ScriptInterceptor.js +76 -0
- package/src/browser/interceptors/index.js +6 -0
- package/src/browser/ui/analysisPanel.js +1782 -0
- package/src/browser/ui/confirmDialog.js +158 -0
- package/src/browser/ui/panel.html +152 -0
- package/src/browser/ui/selector.js +170 -0
- package/src/config/index.js +5 -0
- package/src/config/paths.js +71 -0
- package/src/config/patterns/crypto.js +36 -0
- package/src/config/profiles/chrome.json +71 -0
- package/src/config/profiles/firefox.json +44 -0
- package/src/config/profiles/safari.json +38 -0
- package/src/core/EnvMonitor.js +200 -0
- package/src/core/PatchGenerator.js +278 -0
- package/src/core/Sandbox.js +181 -0
- package/src/env/AntiAntiDebug.js +111 -0
- package/src/env/AsyncHook.js +68 -0
- package/src/env/BrowserAPIList.js +265 -0
- package/src/env/CookieHook.js +48 -0
- package/src/env/CryptoHook.js +205 -0
- package/src/env/EnvCodeGenerator.js +157 -0
- package/src/env/EnvDumper.js +356 -0
- package/src/env/EnvExtractor.js +220 -0
- package/src/env/HookBase.js +618 -0
- package/src/env/NetworkHook.js +159 -0
- package/src/env/modules/bom/history.js +29 -0
- package/src/env/modules/bom/location.js +26 -0
- package/src/env/modules/bom/navigator.js +70 -0
- package/src/env/modules/bom/screen.js +26 -0
- package/src/env/modules/bom/storage.js +23 -0
- package/src/env/modules/dom/document.js +110 -0
- package/src/env/modules/dom/event.js +51 -0
- package/src/env/modules/index.js +34 -0
- package/src/env/modules/webapi/fetch.js +46 -0
- package/src/env/modules/webapi/url.js +47 -0
- package/src/env/modules/webapi/xhr.js +48 -0
- package/src/index.js +27 -0
- package/src/mcp/server.js +89 -0
- package/src/store/DataStore.js +708 -0
- package/src/store/Store.js +158 -0
- package/src/store/Validator.js +24 -0
- package/test/analyze.test.js +90 -0
- package/test/envdump.test.js +74 -0
- package/test/flow.test.js +90 -0
- package/test/hooks.test.js +138 -0
- package/test/plugin.test.js +35 -0
- package/test/refactor-full.test.js +30 -0
- package/test/refactor.test.js +21 -0
- package/test/samples/obfuscated.js +61 -0
- package/test/samples/original.js +66 -0
- package/test/samples/v10_eval_chain.js +52 -0
- package/test/samples/v11_bytecode_vm.js +81 -0
- package/test/samples/v12_polymorphic.js +69 -0
- package/test/samples/v1_ob_basic.js +98 -0
- package/test/samples/v2_ob_advanced.js +99 -0
- package/test/samples/v3_jjencode.js +77 -0
- package/test/samples/v4_aaencode.js +73 -0
- package/test/samples/v5_control_flow.js +86 -0
- package/test/samples/v6_string_encryption.js +71 -0
- package/test/samples/v7_jsvmp.js +83 -0
- package/test/samples/v8_anti_debug.js +79 -0
- package/test/samples/v9_proxy_trap.js +49 -0
- package/test/samples.test.js +96 -0
- package/test/webcrack.test.js +55 -0
package/CLAUDE.md
ADDED
|
@@ -0,0 +1,292 @@
|
|
|
1
|
+
# DeepSpider - 智能爬虫工程平台
|
|
2
|
+
|
|
3
|
+
> 基于 DeepAgents + Patchright 的智能爬虫 Agent,覆盖爬虫全生命周期
|
|
4
|
+
|
|
5
|
+
## 功能
|
|
6
|
+
|
|
7
|
+
### 逆向分析
|
|
8
|
+
- 真实浏览器动态分析 (Patchright + CDP)
|
|
9
|
+
- Webpack/Browserify 解包 (webcrack)
|
|
10
|
+
- 混淆代码分析与反混淆
|
|
11
|
+
- 加密算法识别 (CryptoJS/RSA Hook)
|
|
12
|
+
- 请求参数追踪
|
|
13
|
+
- JS 转 Python 代码生成
|
|
14
|
+
|
|
15
|
+
### 验证码处理
|
|
16
|
+
- 图片验证码 OCR 识别 (ddddocr)
|
|
17
|
+
- 滑块验证码轨迹模拟
|
|
18
|
+
- 点选验证码目标检测
|
|
19
|
+
- 打码平台集成
|
|
20
|
+
|
|
21
|
+
### 反检测与风控
|
|
22
|
+
- 浏览器指纹管理
|
|
23
|
+
- 代理 IP 池管理
|
|
24
|
+
- 请求特征伪装
|
|
25
|
+
- 风控规避策略
|
|
26
|
+
|
|
27
|
+
### 爬虫编排
|
|
28
|
+
- 智能流程规划
|
|
29
|
+
- 完整爬虫脚本生成
|
|
30
|
+
- 端到端测试验证
|
|
31
|
+
- 按需调用,灵活组合
|
|
32
|
+
|
|
33
|
+
## 项目结构
|
|
34
|
+
|
|
35
|
+
```
|
|
36
|
+
deepspider/
|
|
37
|
+
├── src/
|
|
38
|
+
│ ├── agent/ # DeepAgent 系统
|
|
39
|
+
│ │ ├── index.js # 主入口
|
|
40
|
+
│ │ ├── run.js # Agent 运行入口
|
|
41
|
+
│ │ ├── tools/ # 工具集(90+)
|
|
42
|
+
│ │ ├── subagents/ # 子代理
|
|
43
|
+
│ │ └── prompts/ # 系统提示
|
|
44
|
+
│ ├── browser/ # 浏览器运行时
|
|
45
|
+
│ │ ├── client.js # Patchright 客户端
|
|
46
|
+
│ │ ├── cdp.js # CDP 会话管理
|
|
47
|
+
│ │ ├── defaultHooks.js # 默认注入的 Hook
|
|
48
|
+
│ │ ├── interceptors/ # CDP 拦截器
|
|
49
|
+
│ │ │ ├── NetworkInterceptor.js
|
|
50
|
+
│ │ │ └── ScriptInterceptor.js
|
|
51
|
+
│ │ ├── ui/ # 浏览器内 UI
|
|
52
|
+
│ │ │ └── analysisPanel.js
|
|
53
|
+
│ │ └── hooks/ # Hook 脚本
|
|
54
|
+
│ ├── store/ # 数据存储
|
|
55
|
+
│ │ └── DataStore.js # 文件系统存储
|
|
56
|
+
│ ├── analyzer/ # 静态分析器
|
|
57
|
+
│ ├── core/ # 核心模块
|
|
58
|
+
│ ├── env/ # 环境补丁模块
|
|
59
|
+
│ └── mcp/ # MCP 服务
|
|
60
|
+
├── bin/cli.js # CLI 入口
|
|
61
|
+
└── test/ # 测试
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## 依赖版本
|
|
65
|
+
|
|
66
|
+
```json
|
|
67
|
+
{
|
|
68
|
+
"@babel/parser": "^7.26.0",
|
|
69
|
+
"@babel/traverse": "^7.26.0",
|
|
70
|
+
"@babel/generator": "^7.26.0",
|
|
71
|
+
"deepagents": "^1.6.0",
|
|
72
|
+
"@langchain/core": "^1.1.17",
|
|
73
|
+
"@langchain/anthropic": "^1.3.12",
|
|
74
|
+
"patchright": "^1.51.1",
|
|
75
|
+
"webcrack": "^2.15.1",
|
|
76
|
+
"isolated-vm": "^6.0.2",
|
|
77
|
+
"zod": "^4.3.6"
|
|
78
|
+
}
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
## 架构
|
|
82
|
+
|
|
83
|
+
### 子代理体系
|
|
84
|
+
|
|
85
|
+
| 子代理 | 职责 | 核心工具 |
|
|
86
|
+
|--------|------|----------|
|
|
87
|
+
| crawler | 爬虫编排:整合各模块、生成完整脚本 | file, store, crawler |
|
|
88
|
+
| static | 静态分析:解包、反混淆、加密定位 | webcrack, deobfuscate, analyze |
|
|
89
|
+
| dynamic | 动态分析:浏览器控制、Hook、数据采集 | browser, debug, capture |
|
|
90
|
+
| sandbox | 沙箱执行:环境补全、代码执行 | sandbox, env, patch |
|
|
91
|
+
| js2python | JS转Python:加密代码转换、验证 | python, analyzer |
|
|
92
|
+
| env-agent | 环境补全:生成浏览器环境模拟代码 | env, sandbox |
|
|
93
|
+
| captcha | 验证码处理:OCR、滑块、点选 | captcha_ocr, captcha_slide |
|
|
94
|
+
| anti-detect | 反检测:指纹管理、代理池 | proxy, fingerprint |
|
|
95
|
+
|
|
96
|
+
### 智能调度流程
|
|
97
|
+
|
|
98
|
+
根据目标网站复杂度,按需调用子代理:
|
|
99
|
+
|
|
100
|
+
```
|
|
101
|
+
用户:爬取目标网站
|
|
102
|
+
↓
|
|
103
|
+
┌─────────────────────────────────────┐
|
|
104
|
+
│ crawler-agent 分析目标 │
|
|
105
|
+
│ 判断网站复杂度,规划流程 │
|
|
106
|
+
└─────────────────────────────────────┘
|
|
107
|
+
↓
|
|
108
|
+
┌─────────────────────────────────────┐
|
|
109
|
+
│ 按需调用子代理 │
|
|
110
|
+
│ │
|
|
111
|
+
│ Level 1 简单: static → js2python │
|
|
112
|
+
│ Level 2 中等: + captcha + dynamic │
|
|
113
|
+
│ Level 3 复杂: + anti-detect + e2e │
|
|
114
|
+
└─────────────────────────────────────┘
|
|
115
|
+
↓
|
|
116
|
+
┌─────────────────────────────────────┐
|
|
117
|
+
│ 输出完整爬虫脚本 │
|
|
118
|
+
│ 简单: 单文件脚本 │
|
|
119
|
+
│ 复杂: 完整项目结构 │
|
|
120
|
+
└─────────────────────────────────────┘
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
### 浏览器交互流程
|
|
124
|
+
|
|
125
|
+
```
|
|
126
|
+
pnpm run agent https://example.com
|
|
127
|
+
↓
|
|
128
|
+
┌─────────────────────────────────────┐
|
|
129
|
+
│ 浏览器启动,自动注入 Hook │
|
|
130
|
+
│ CDP 拦截器记录请求/脚本 │
|
|
131
|
+
│ 数据存储到 .deepspider-data/ │
|
|
132
|
+
└─────────────────────────────────────┘
|
|
133
|
+
↓
|
|
134
|
+
┌─────────────────────────────────────┐
|
|
135
|
+
│ 用户在网站操作(登录、翻页等) │
|
|
136
|
+
│ 系统持续记录数据 │
|
|
137
|
+
└─────────────────────────────────────┘
|
|
138
|
+
↓
|
|
139
|
+
┌─────────────────────────────────────┐
|
|
140
|
+
│ 用户点击面板选择按钮(⦿) │
|
|
141
|
+
│ 选择元素 → 显示操作菜单 │
|
|
142
|
+
│ │
|
|
143
|
+
│ 操作选项: │
|
|
144
|
+
│ - 添加为字段(爬虫配置) │
|
|
145
|
+
│ - 追踪数据来源 │
|
|
146
|
+
│ - 分析加密逻辑 │
|
|
147
|
+
│ - 完整流程分析 │
|
|
148
|
+
└─────────────────────────────────────┘
|
|
149
|
+
↓
|
|
150
|
+
┌─────────────────────────────────────┐
|
|
151
|
+
│ 选择多个字段后点击"生成配置" │
|
|
152
|
+
│ crawler 子代理整合分析结果 │
|
|
153
|
+
│ 输出 config.json + crawler.py │
|
|
154
|
+
└─────────────────────────────────────┘
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
## 代码规范
|
|
158
|
+
|
|
159
|
+
### 浏览器交互
|
|
160
|
+
|
|
161
|
+
与浏览器的交互优先使用 CDP(Chrome DevTools Protocol)方式,而非 `page.evaluate()`。
|
|
162
|
+
|
|
163
|
+
CDP session 应复用,通过 `browser.getCDPSession()` 获取:
|
|
164
|
+
|
|
165
|
+
```javascript
|
|
166
|
+
// 复用 CDP session 执行 JS
|
|
167
|
+
async function evaluateViaCDP(browser, expression) {
|
|
168
|
+
const cdp = await browser.getCDPSession();
|
|
169
|
+
if (!cdp) return null;
|
|
170
|
+
const result = await cdp.send('Runtime.evaluate', {
|
|
171
|
+
expression,
|
|
172
|
+
returnByValue: true,
|
|
173
|
+
});
|
|
174
|
+
return result.result?.value;
|
|
175
|
+
}
|
|
176
|
+
|
|
177
|
+
// 使用示例
|
|
178
|
+
const logs = await evaluateViaCDP(browser, `window.__deepspider__?.getAllLogs?.()`);
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
### Babel AST 遍历
|
|
182
|
+
|
|
183
|
+
使用 `@babel/traverse` 而非 acorn-walk:
|
|
184
|
+
|
|
185
|
+
```javascript
|
|
186
|
+
import { parse } from '@babel/parser';
|
|
187
|
+
import traverse from '@babel/traverse';
|
|
188
|
+
|
|
189
|
+
// 解析代码
|
|
190
|
+
const ast = parse(code, {
|
|
191
|
+
sourceType: 'unambiguous',
|
|
192
|
+
plugins: ['jsx', 'typescript', 'decorators-legacy'],
|
|
193
|
+
errorRecovery: true,
|
|
194
|
+
});
|
|
195
|
+
|
|
196
|
+
// 遍历 AST
|
|
197
|
+
traverse.default(ast, {
|
|
198
|
+
FunctionDeclaration(path) {
|
|
199
|
+
const node = path.node;
|
|
200
|
+
// 处理函数声明
|
|
201
|
+
},
|
|
202
|
+
CallExpression(path) {
|
|
203
|
+
const node = path.node;
|
|
204
|
+
// 处理调用表达式
|
|
205
|
+
}
|
|
206
|
+
});
|
|
207
|
+
|
|
208
|
+
// 遍历子节点(在 visitor 内部)
|
|
209
|
+
path.traverse({
|
|
210
|
+
Identifier(innerPath) {
|
|
211
|
+
// 处理内部标识符
|
|
212
|
+
}
|
|
213
|
+
});
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
### LangChain 工具定义
|
|
217
|
+
|
|
218
|
+
使用 `@langchain/core/tools`:
|
|
219
|
+
|
|
220
|
+
```javascript
|
|
221
|
+
import { z } from 'zod';
|
|
222
|
+
import { tool } from '@langchain/core/tools';
|
|
223
|
+
|
|
224
|
+
const myTool = tool(
|
|
225
|
+
async ({ param1, param2 }) => {
|
|
226
|
+
// 工具逻辑
|
|
227
|
+
return result;
|
|
228
|
+
},
|
|
229
|
+
{
|
|
230
|
+
name: 'tool_name',
|
|
231
|
+
description: '工具描述',
|
|
232
|
+
schema: z.object({
|
|
233
|
+
param1: z.string().describe('参数1描述'),
|
|
234
|
+
param2: z.number().optional().default(100),
|
|
235
|
+
}),
|
|
236
|
+
}
|
|
237
|
+
);
|
|
238
|
+
```
|
|
239
|
+
|
|
240
|
+
### DeepAgent 创建
|
|
241
|
+
|
|
242
|
+
```javascript
|
|
243
|
+
import { ChatAnthropic } from '@langchain/anthropic';
|
|
244
|
+
import { createDeepAgent } from 'deepagents';
|
|
245
|
+
|
|
246
|
+
export const agent = createDeepAgent({
|
|
247
|
+
model: new ChatAnthropic({
|
|
248
|
+
model: 'claude-sonnet-4-20250514',
|
|
249
|
+
temperature: 0,
|
|
250
|
+
}),
|
|
251
|
+
tools: [tool1, tool2],
|
|
252
|
+
systemPrompt: '系统提示',
|
|
253
|
+
});
|
|
254
|
+
```
|
|
255
|
+
|
|
256
|
+
## 运行
|
|
257
|
+
|
|
258
|
+
```bash
|
|
259
|
+
# 安装依赖
|
|
260
|
+
pnpm install
|
|
261
|
+
|
|
262
|
+
# 安装 Python 加密库(用于运行生成的 Python 代码)
|
|
263
|
+
pnpm run setup:crypto
|
|
264
|
+
|
|
265
|
+
# 配置环境变量
|
|
266
|
+
cp .env.example .env
|
|
267
|
+
# 编辑 .env 填入:
|
|
268
|
+
# LLM_API_KEY=your-api-key
|
|
269
|
+
# LLM_BASE_URL=https://api.openai.com/v1 # 可选,兼容 OpenAI 格式的任意供应商
|
|
270
|
+
# LLM_MODEL=gpt-4o # 可选,默认 gpt-4o
|
|
271
|
+
|
|
272
|
+
# Agent 模式(推荐)- 指定目标网站
|
|
273
|
+
pnpm run agent https://example.com
|
|
274
|
+
|
|
275
|
+
# Agent 模式 - 纯交互(不启动浏览器)
|
|
276
|
+
pnpm run agent
|
|
277
|
+
|
|
278
|
+
# MCP 服务(供 Claude Code 等调用)
|
|
279
|
+
pnpm run mcp
|
|
280
|
+
|
|
281
|
+
# 测试
|
|
282
|
+
pnpm test
|
|
283
|
+
```
|
|
284
|
+
|
|
285
|
+
### Agent 使用流程
|
|
286
|
+
|
|
287
|
+
1. **启动**: `pnpm run agent https://target-site.com`
|
|
288
|
+
2. **等待**: 浏览器打开,系统自动记录数据(不消耗 API)
|
|
289
|
+
3. **操作**: 在网站上登录、翻页、触发目标请求
|
|
290
|
+
4. **选择**: 点击面板的选择按钮(⦿),进入选择模式
|
|
291
|
+
5. **分析**: 点击目标数据,确认后发送给 Agent
|
|
292
|
+
6. **对话**: 在面板或 CLI 继续提问,深入分析
|
package/README.md
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
# DeepSpider
|
|
2
|
+
|
|
3
|
+
> 智能爬虫工程平台 - 基于 DeepAgents + Patchright
|
|
4
|
+
|
|
5
|
+
从 JS 逆向分析到完整爬虫脚本的一站式 AI Agent 解决方案。
|
|
6
|
+
|
|
7
|
+
## 特性
|
|
8
|
+
|
|
9
|
+
- **逆向分析**: Webpack 解包、反混淆、加密算法识别与定位
|
|
10
|
+
- **动态调试**: 真实浏览器 + CDP 断点调试、Hook 注入
|
|
11
|
+
- **代码转换**: JS 加密逻辑自动转 Python
|
|
12
|
+
- **验证码处理**: 滑块、点选、图片验证码
|
|
13
|
+
- **反检测**: 指纹伪装、代理轮换、风控规避
|
|
14
|
+
- **爬虫编排**: 智能调度,输出可运行的 Python 爬虫
|
|
15
|
+
|
|
16
|
+
## 快速开始
|
|
17
|
+
|
|
18
|
+
### 安装
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
# 安装依赖
|
|
22
|
+
pnpm install
|
|
23
|
+
|
|
24
|
+
# 安装 Python 加密库(用于运行生成的 Python 代码)
|
|
25
|
+
pnpm run setup:crypto
|
|
26
|
+
|
|
27
|
+
# 配置环境变量
|
|
28
|
+
cp .env.example .env
|
|
29
|
+
# 编辑 .env 填入 LLM_API_KEY
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
### 使用
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
# Agent 模式(推荐)- 指定目标网站
|
|
36
|
+
pnpm run agent https://example.com
|
|
37
|
+
|
|
38
|
+
# Agent 模式 - 纯交互(不启动浏览器)
|
|
39
|
+
pnpm run agent
|
|
40
|
+
|
|
41
|
+
# MCP 服务(供 Claude Code 等调用)
|
|
42
|
+
pnpm run mcp
|
|
43
|
+
|
|
44
|
+
# 运行测试
|
|
45
|
+
pnpm test
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
### 使用流程
|
|
49
|
+
|
|
50
|
+
1. **启动**: `pnpm run agent https://target-site.com`
|
|
51
|
+
2. **等待**: 浏览器打开,系统自动记录数据(不消耗 API)
|
|
52
|
+
3. **操作**: 在网站上登录、翻页、触发目标请求
|
|
53
|
+
4. **选择**: 点击面板的选择按钮 ⦿,进入选择模式
|
|
54
|
+
5. **分析**: 点击目标数据,确认后发送给 Agent
|
|
55
|
+
6. **对话**: 在面板或 CLI 继续提问,深入分析
|
|
56
|
+
|
|
57
|
+
## 架构
|
|
58
|
+
|
|
59
|
+
```
|
|
60
|
+
┌─────────────────────────────────────────────────────┐
|
|
61
|
+
│ DeepSpider │
|
|
62
|
+
│ (爬虫编排 - 智能调度) │
|
|
63
|
+
└──────────────────────┬──────────────────────────────┘
|
|
64
|
+
│ 按需调用
|
|
65
|
+
┌───────────────┼───────────────┐
|
|
66
|
+
▼ ▼ ▼
|
|
67
|
+
┌─────────────┐ ┌─────────────┐ ┌─────────────┐
|
|
68
|
+
│static-agent │ │captcha-agent│ │anti-detect │
|
|
69
|
+
│ 静态分析 │ │ 验证码处理 │ │ 反检测 │
|
|
70
|
+
└──────┬──────┘ └─────────────┘ └─────────────┘
|
|
71
|
+
▼
|
|
72
|
+
┌─────────────┐
|
|
73
|
+
│dynamic-agent│
|
|
74
|
+
│ 动态调试 │
|
|
75
|
+
└──────┬──────┘
|
|
76
|
+
▼
|
|
77
|
+
┌─────────────┐ ┌─────────────┐
|
|
78
|
+
│sandbox-agent│ ──▶ │js2python │
|
|
79
|
+
│ 沙箱验证 │ │ 代码转换 │
|
|
80
|
+
└─────────────┘ └─────────────┘
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
### 子代理体系
|
|
84
|
+
|
|
85
|
+
| 子代理 | 职责 | 核心工具 |
|
|
86
|
+
|--------|------|----------|
|
|
87
|
+
| crawler | 爬虫编排:整合各模块、生成完整脚本 | file, store, crawler |
|
|
88
|
+
| static | 静态分析:解包、反混淆、加密定位 | webcrack, deobfuscate, analyze |
|
|
89
|
+
| dynamic | 动态分析:浏览器控制、Hook、数据采集 | browser, debug, capture |
|
|
90
|
+
| sandbox | 沙箱执行:环境补全、代码执行 | sandbox, env, patch |
|
|
91
|
+
| js2python | JS转Python:加密代码转换、验证 | python, analyzer |
|
|
92
|
+
| captcha | 验证码处理:OCR、滑块、点选 | captcha_ocr, captcha_slide |
|
|
93
|
+
| anti-detect | 反检测:指纹管理、代理池 | proxy, fingerprint |
|
|
94
|
+
|
|
95
|
+
## 项目结构
|
|
96
|
+
|
|
97
|
+
```
|
|
98
|
+
deepspider/
|
|
99
|
+
├── src/
|
|
100
|
+
│ ├── agent/ # DeepAgent 系统
|
|
101
|
+
│ │ ├── tools/ # 工具集(90+)
|
|
102
|
+
│ │ ├── subagents/ # 子代理
|
|
103
|
+
│ │ ├── skills/ # 领域技能
|
|
104
|
+
│ │ └── prompts/ # 系统提示
|
|
105
|
+
│ ├── browser/ # 浏览器运行时
|
|
106
|
+
│ │ ├── client.js # Patchright 客户端
|
|
107
|
+
│ │ ├── cdp.js # CDP 会话管理
|
|
108
|
+
│ │ ├── defaultHooks.js # 默认注入的 Hook
|
|
109
|
+
│ │ ├── interceptors/ # CDP 拦截器
|
|
110
|
+
│ │ └── ui/ # 浏览器内 UI 面板
|
|
111
|
+
│ ├── analyzer/ # 静态分析器
|
|
112
|
+
│ ├── env/ # 环境补丁模块
|
|
113
|
+
│ ├── store/ # 数据存储
|
|
114
|
+
│ └── mcp/ # MCP 服务
|
|
115
|
+
├── bin/cli.js # CLI 入口
|
|
116
|
+
└── test/ # 测试
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
## 核心技术
|
|
120
|
+
|
|
121
|
+
- **DeepAgents**: 多代理协作框架
|
|
122
|
+
- **Patchright**: 反检测浏览器自动化
|
|
123
|
+
- **CDP**: Chrome DevTools Protocol 深度集成
|
|
124
|
+
- **webcrack**: Webpack/Browserify 解包
|
|
125
|
+
- **isolated-vm**: 安全沙箱执行
|
|
126
|
+
|
|
127
|
+
## 文档
|
|
128
|
+
|
|
129
|
+
- [开发使用指南](docs/GUIDE.md)
|
|
130
|
+
- [调试指南](docs/DEBUG.md)
|
|
131
|
+
|
|
132
|
+
## License
|
|
133
|
+
|
|
134
|
+
MIT
|
package/agents/deepspider.md
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
---
|
|
2
|
+
description: JavaScript 逆向分析专家。动态调试、代码解包、反混淆、加密捕获、环境补全。
|
|
3
|
+
capabilities: ["动态调试", "代码解包", "反混淆", "加密捕获", "环境补全"]
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
你是 DeepSpider,一个专业的 JavaScript 逆向工程专家。
|
|
7
|
+
|
|
8
|
+
## 核心能力
|
|
9
|
+
|
|
10
|
+
1. **动态调试** - Patchright 反检测浏览器 + CDP 断点调试
|
|
11
|
+
2. **代码解包** - Webpack/Browserify 自动解包 (webcrack)
|
|
12
|
+
3. **反混淆** - AST 解析、控制流还原、字符串解密
|
|
13
|
+
4. **加密捕获** - Hook 捕获 CryptoJS/RSA 加密调用
|
|
14
|
+
5. **环境补全** - 检测并补全浏览器环境依赖
|
|
15
|
+
|
|
16
|
+
## 工作原则
|
|
17
|
+
|
|
18
|
+
1. **最小补丁**: 只补充必要的环境,避免过度补丁
|
|
19
|
+
2. **迭代验证**: 每次补丁后验证执行结果
|
|
20
|
+
3. **清晰输出**: 生成可独立运行的代码
|
|
21
|
+
|
|
22
|
+
## 工作流程
|
|
23
|
+
|
|
24
|
+
### 完整分析流程
|
|
25
|
+
```
|
|
26
|
+
1. preprocess_code 预处理(自动解包或反混淆)
|
|
27
|
+
2. deobfuscate 深度反混淆
|
|
28
|
+
3. analyze_encryption 定位加密入口
|
|
29
|
+
4. launch_browser 启动浏览器
|
|
30
|
+
5. set_breakpoint 设置断点
|
|
31
|
+
6. collect_env 采集环境数据
|
|
32
|
+
7. generate_patch 生成补丁
|
|
33
|
+
8. sandbox_execute 沙箱验证
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
### 补环境任务
|
|
37
|
+
```
|
|
38
|
+
1. 读取目标 JS 代码
|
|
39
|
+
2. 使用 sandbox_execute 执行,捕获错误
|
|
40
|
+
3. 分析缺失的环境属性
|
|
41
|
+
4. 使用 generate_patch 生成补丁
|
|
42
|
+
5. 使用 sandbox_inject 注入补丁
|
|
43
|
+
6. 重复直到成功
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
### 加密分析任务
|
|
47
|
+
```
|
|
48
|
+
1. launch_browser 打开目标页面
|
|
49
|
+
2. 注入 Hook 脚本捕获加密调用
|
|
50
|
+
3. 触发页面操作
|
|
51
|
+
4. get_hook_logs 获取捕获的密钥和参数
|
|
52
|
+
5. 输出分析报告
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
## 环境补丁模板
|
|
56
|
+
|
|
57
|
+
### Navigator
|
|
58
|
+
```javascript
|
|
59
|
+
const navigator = {
|
|
60
|
+
userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
|
|
61
|
+
platform: 'Win32',
|
|
62
|
+
language: 'zh-CN',
|
|
63
|
+
languages: ['zh-CN', 'en'],
|
|
64
|
+
cookieEnabled: true,
|
|
65
|
+
onLine: true,
|
|
66
|
+
hardwareConcurrency: 8
|
|
67
|
+
};
|
|
68
|
+
Object.defineProperty(navigator, 'webdriver', { get: () => false });
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
### Document
|
|
72
|
+
```javascript
|
|
73
|
+
const document = {
|
|
74
|
+
cookie: '',
|
|
75
|
+
referrer: '',
|
|
76
|
+
domain: 'example.com',
|
|
77
|
+
title: '',
|
|
78
|
+
createElement: (tag) => ({ tagName: tag, style: {} }),
|
|
79
|
+
getElementById: () => null,
|
|
80
|
+
querySelector: () => null,
|
|
81
|
+
querySelectorAll: () => []
|
|
82
|
+
};
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
### Window
|
|
86
|
+
```javascript
|
|
87
|
+
const window = {
|
|
88
|
+
innerWidth: 1920,
|
|
89
|
+
innerHeight: 1080,
|
|
90
|
+
devicePixelRatio: 1,
|
|
91
|
+
location: { href: 'https://example.com/', hostname: 'example.com', protocol: 'https:' },
|
|
92
|
+
navigator,
|
|
93
|
+
document,
|
|
94
|
+
localStorage: { getItem: () => null, setItem: () => {} },
|
|
95
|
+
sessionStorage: { getItem: () => null, setItem: () => {} },
|
|
96
|
+
atob: (s) => Buffer.from(s, 'base64').toString('latin1'),
|
|
97
|
+
btoa: (s) => Buffer.from(s, 'latin1').toString('base64')
|
|
98
|
+
};
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
## 加密模式识别
|
|
102
|
+
|
|
103
|
+
| 模式 | 特征 |
|
|
104
|
+
|------|------|
|
|
105
|
+
| MD5 | `md5(`, 32位十六进制输出 |
|
|
106
|
+
| SHA256 | `sha256`, `SHA256`, 64位十六进制输出 |
|
|
107
|
+
| AES | `CryptoJS.AES`, `aes.*encrypt` |
|
|
108
|
+
| RSA | `JSEncrypt`, `RSAKey` |
|
|
109
|
+
| Base64 | `btoa`, `atob` |
|
|
110
|
+
| HMAC | `hmac`, `HMAC` |
|
|
111
|
+
|
|
112
|
+
## 输出格式
|
|
113
|
+
|
|
114
|
+
分析完成后,输出:
|
|
115
|
+
1. **执行结果**: 代码执行的返回值
|
|
116
|
+
2. **补丁代码**: 完整的环境补丁(可独立运行)
|
|
117
|
+
3. **分析报告**: 检测到的加密算法、关键函数等
|
|
118
|
+
|
|
119
|
+
## Hook 模板
|
|
120
|
+
|
|
121
|
+
### CryptoJS Hook (自动注入)
|
|
122
|
+
```javascript
|
|
123
|
+
// 由 browser/hooks/crypto.js 自动注入
|
|
124
|
+
// 捕获 CryptoJS.AES.encrypt 等调用
|
|
125
|
+
// 输出: Key, IV, Mode, 明文, 密文
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
### RSA Hook (自动注入)
|
|
129
|
+
```javascript
|
|
130
|
+
// 由 browser/hooks/crypto.js 自动注入
|
|
131
|
+
// 捕获 JSEncrypt.encrypt 调用
|
|
132
|
+
// 输出: 公钥, 明文, 密文
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
### 自定义函数 Hook
|
|
136
|
+
```javascript
|
|
137
|
+
const _orig = window.func;
|
|
138
|
+
window.func = function(...args) {
|
|
139
|
+
console.log('[Hook] func:', args);
|
|
140
|
+
return _orig.apply(this, args);
|
|
141
|
+
};
|
|
142
|
+
```
|
package/docs/DEBUG.md
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# DeepSpider 调试指南
|
|
2
|
+
|
|
3
|
+
## 1. MCP 服务测试
|
|
4
|
+
|
|
5
|
+
```bash
|
|
6
|
+
# 启动 MCP 服务
|
|
7
|
+
pnpm run mcp
|
|
8
|
+
|
|
9
|
+
# MCP Inspector 测试
|
|
10
|
+
npx @modelcontextprotocol/inspector node src/mcp/server.js
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
## 2. 浏览器调试
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
# 测试浏览器启动
|
|
17
|
+
node test/browser.test.js
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
浏览器使用 Patchright (反检测 Playwright),默认非 headless 模式。
|
|
21
|
+
|
|
22
|
+
## 3. 工具验证
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
# 验证所有工具导入
|
|
26
|
+
node -e "import('./src/agent/tools/index.js').then(m => console.log('工具数:', m.allTools.length))"
|
|
27
|
+
|
|
28
|
+
# 测试单个工具
|
|
29
|
+
node --input-type=module -e "
|
|
30
|
+
import { preprocessCode } from './src/agent/tools/preprocess.js';
|
|
31
|
+
preprocessCode.invoke({ code: 'var a=1' }).then(console.log);
|
|
32
|
+
"
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
## 4. 常见问题
|
|
36
|
+
|
|
37
|
+
| 问题 | 排查方法 |
|
|
38
|
+
|------|----------|
|
|
39
|
+
| MCP 启动失败 | 直接运行 `node src/mcp/server.js`,根据终端输出的报错信息排查 |
|
|
40
|
+
| 浏览器启动失败 | 检查 patchright 安装 |
|
|
41
|
+
| 沙箱执行失败 | 检查 isolated-vm 依赖 |
|
|
42
|
+
| webcrack 解包失败 | 确认是 Webpack/Browserify 格式 |
|