deepspider 0.2.11 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (146)
  1. package/README.md +71 -24
  2. package/bin/cli.js +45 -0
  3. package/package.json +10 -4
  4. package/src/agent/core/PanelBridge.js +133 -0
  5. package/src/agent/core/RetryManager.js +51 -0
  6. package/src/agent/core/StreamHandler.js +263 -0
  7. package/src/agent/core/index.js +7 -0
  8. package/src/agent/errors/ErrorClassifier.js +43 -0
  9. package/src/agent/errors/SpiderError.js +68 -0
  10. package/src/agent/errors/index.js +19 -0
  11. package/src/agent/run.js +67 -460
  12. package/src/agent/setup.js +14 -14
  13. package/src/agent/subagents/factory.js +60 -0
  14. package/src/agent/subagents/index.js +3 -0
  15. package/src/agent/tools/report.js +36 -4
  16. package/src/browser/client.js +47 -10
  17. package/src/cli/commands/config.js +94 -0
  18. package/src/cli/commands/help.js +34 -0
  19. package/src/cli/commands/update.js +78 -0
  20. package/src/cli/commands/version.js +9 -0
  21. package/src/cli/config.js +15 -0
  22. package/src/config/settings.js +102 -0
  23. package/.claude/agents/check.md +0 -122
  24. package/.claude/agents/debug.md +0 -106
  25. package/.claude/agents/dispatch.md +0 -214
  26. package/.claude/agents/implement.md +0 -96
  27. package/.claude/agents/plan.md +0 -396
  28. package/.claude/agents/research.md +0 -120
  29. package/.claude/commands/evolve/merge.md +0 -80
  30. package/.claude/commands/trellis/before-backend-dev.md +0 -13
  31. package/.claude/commands/trellis/before-frontend-dev.md +0 -13
  32. package/.claude/commands/trellis/break-loop.md +0 -107
  33. package/.claude/commands/trellis/check-backend.md +0 -13
  34. package/.claude/commands/trellis/check-cross-layer.md +0 -153
  35. package/.claude/commands/trellis/check-frontend.md +0 -13
  36. package/.claude/commands/trellis/create-command.md +0 -154
  37. package/.claude/commands/trellis/finish-work.md +0 -129
  38. package/.claude/commands/trellis/integrate-skill.md +0 -219
  39. package/.claude/commands/trellis/onboard.md +0 -358
  40. package/.claude/commands/trellis/parallel.md +0 -193
  41. package/.claude/commands/trellis/record-session.md +0 -62
  42. package/.claude/commands/trellis/start.md +0 -280
  43. package/.claude/commands/trellis/update-spec.md +0 -213
  44. package/.claude/hooks/inject-subagent-context.py +0 -758
  45. package/.claude/hooks/ralph-loop.py +0 -374
  46. package/.claude/hooks/session-start.py +0 -126
  47. package/.claude/settings.json +0 -41
  48. package/.claude/skills/deepagents-guide/SKILL.md +0 -428
  49. package/.cursor/commands/trellis-before-backend-dev.md +0 -13
  50. package/.cursor/commands/trellis-before-frontend-dev.md +0 -13
  51. package/.cursor/commands/trellis-break-loop.md +0 -107
  52. package/.cursor/commands/trellis-check-backend.md +0 -13
  53. package/.cursor/commands/trellis-check-cross-layer.md +0 -153
  54. package/.cursor/commands/trellis-check-frontend.md +0 -13
  55. package/.cursor/commands/trellis-create-command.md +0 -154
  56. package/.cursor/commands/trellis-finish-work.md +0 -129
  57. package/.cursor/commands/trellis-integrate-skill.md +0 -219
  58. package/.cursor/commands/trellis-onboard.md +0 -358
  59. package/.cursor/commands/trellis-record-session.md +0 -62
  60. package/.cursor/commands/trellis-start.md +0 -156
  61. package/.cursor/commands/trellis-update-spec.md +0 -213
  62. package/.github/workflows/publish.yml +0 -63
  63. package/.husky/pre-commit +0 -1
  64. package/.mcp.json +0 -8
  65. package/.trellis/.template-hashes.json +0 -65
  66. package/.trellis/.version +0 -1
  67. package/.trellis/scripts/add-session.sh +0 -384
  68. package/.trellis/scripts/common/developer.sh +0 -129
  69. package/.trellis/scripts/common/git-context.sh +0 -263
  70. package/.trellis/scripts/common/paths.sh +0 -208
  71. package/.trellis/scripts/common/phase.sh +0 -150
  72. package/.trellis/scripts/common/registry.sh +0 -247
  73. package/.trellis/scripts/common/task-queue.sh +0 -142
  74. package/.trellis/scripts/common/task-utils.sh +0 -151
  75. package/.trellis/scripts/common/worktree.sh +0 -128
  76. package/.trellis/scripts/create-bootstrap.sh +0 -299
  77. package/.trellis/scripts/get-context.sh +0 -7
  78. package/.trellis/scripts/get-developer.sh +0 -15
  79. package/.trellis/scripts/init-developer.sh +0 -34
  80. package/.trellis/scripts/multi-agent/cleanup.sh +0 -396
  81. package/.trellis/scripts/multi-agent/create-pr.sh +0 -241
  82. package/.trellis/scripts/multi-agent/plan.sh +0 -207
  83. package/.trellis/scripts/multi-agent/start.sh +0 -310
  84. package/.trellis/scripts/multi-agent/status.sh +0 -828
  85. package/.trellis/scripts/task.sh +0 -1118
  86. package/.trellis/spec/backend/ci-cd-guidelines.md +0 -73
  87. package/.trellis/spec/backend/deepagents-guide.md +0 -380
  88. package/.trellis/spec/backend/directory-structure.md +0 -126
  89. package/.trellis/spec/backend/examples/skills/deepagents-guide/README.md +0 -11
  90. package/.trellis/spec/backend/examples/skills/deepagents-guide/agent.js.template +0 -20
  91. package/.trellis/spec/backend/examples/skills/deepagents-guide/skills-config.js.template +0 -13
  92. package/.trellis/spec/backend/examples/skills/deepagents-guide/subagent.js.template +0 -19
  93. package/.trellis/spec/backend/hook-guidelines.md +0 -218
  94. package/.trellis/spec/backend/index.md +0 -37
  95. package/.trellis/spec/backend/quality-guidelines.md +0 -302
  96. package/.trellis/spec/backend/state-management.md +0 -76
  97. package/.trellis/spec/backend/tool-guidelines.md +0 -144
  98. package/.trellis/spec/backend/type-safety.md +0 -71
  99. package/.trellis/spec/guides/code-reuse-thinking-guide.md +0 -92
  100. package/.trellis/spec/guides/cross-layer-thinking-guide.md +0 -94
  101. package/.trellis/spec/guides/index.md +0 -79
  102. package/.trellis/tasks/archive/02-02-evolving-skills/prd.md +0 -61
  103. package/.trellis/tasks/archive/02-02-evolving-skills/task.json +0 -29
  104. package/.trellis/tasks/archive/2026-02/00-bootstrap-guidelines/prd.md +0 -86
  105. package/.trellis/tasks/archive/2026-02/00-bootstrap-guidelines/task.json +0 -27
  106. package/.trellis/tasks/archive/2026-02/02-02-skills-system/check.jsonl +0 -3
  107. package/.trellis/tasks/archive/2026-02/02-02-skills-system/debug.jsonl +0 -2
  108. package/.trellis/tasks/archive/2026-02/02-02-skills-system/implement.jsonl +0 -5
  109. package/.trellis/tasks/archive/2026-02/02-02-skills-system/prd.md +0 -33
  110. package/.trellis/tasks/archive/2026-02/02-02-skills-system/task.json +0 -41
  111. package/.trellis/workflow.md +0 -407
  112. package/.trellis/workspace/index.md +0 -123
  113. package/.trellis/workspace/pony/index.md +0 -42
  114. package/.trellis/workspace/pony/journal-1.md +0 -125
  115. package/.trellis/worktree.yaml +0 -47
  116. package/AGENTS.md +0 -18
  117. package/CLAUDE.md +0 -315
  118. package/agents/deepspider.md +0 -142
  119. package/docs/DEBUG.md +0 -42
  120. package/docs/GUIDE.md +0 -334
  121. package/docs/PROMPT.md +0 -60
  122. package/docs/USAGE.md +0 -226
  123. package/eslint.config.js +0 -51
  124. package/test/analyze.test.js +0 -90
  125. package/test/envdump.test.js +0 -74
  126. package/test/flow.test.js +0 -90
  127. package/test/hooks.test.js +0 -138
  128. package/test/plugin.test.js +0 -35
  129. package/test/refactor-full.test.js +0 -30
  130. package/test/refactor.test.js +0 -21
  131. package/test/samples/obfuscated.js +0 -61
  132. package/test/samples/original.js +0 -66
  133. package/test/samples/v10_eval_chain.js +0 -52
  134. package/test/samples/v11_bytecode_vm.js +0 -81
  135. package/test/samples/v12_polymorphic.js +0 -69
  136. package/test/samples/v1_ob_basic.js +0 -98
  137. package/test/samples/v2_ob_advanced.js +0 -99
  138. package/test/samples/v3_jjencode.js +0 -77
  139. package/test/samples/v4_aaencode.js +0 -73
  140. package/test/samples/v5_control_flow.js +0 -86
  141. package/test/samples/v6_string_encryption.js +0 -71
  142. package/test/samples/v7_jsvmp.js +0 -83
  143. package/test/samples/v8_anti_debug.js +0 -79
  144. package/test/samples/v9_proxy_trap.js +0 -49
  145. package/test/samples.test.js +0 -96
  146. package/test/webcrack.test.js +0 -55
package/.trellis/workspace/pony/journal-1.md DELETED
@@ -1,125 +0,0 @@
1
- # Journal - pony (Part 1)
2
-
3
- > AI development session journal
4
- > Started: 2026-01-30
5
-
6
- ---
7
-
8
-
9
- ## Session 1: 环境变量重命名与配置检测
10
-
11
- **Date**: 2026-02-03
12
- **Task**: 环境变量重命名与配置检测
13
-
14
- ### Summary
15
-
16
- (Add summary)
17
-
18
- ### Main Changes
19
-
20
- ## 完成内容
21
-
22
- 重命名环境变量为项目专属前缀,并添加启动时配置检测。
23
-
24
- | 变更 | 说明 |
25
- |------|------|
26
- | 环境变量重命名 | LLM_* → DEEPSPIDER_API_KEY/BASE_URL/MODEL |
27
- | 配置检测模块 | 新增 setup.js,启动时检测必要配置 |
28
- | 文档更新 | README.md, CLAUDE.md 同步更新 |
29
-
30
- ## 设计决策
31
-
32
- 从第一性原理分析,采用简化方案:
33
- - 移除交互式配置向导(200行→47行)
34
- - 只做检测+提示,不做选择
35
- - 符合 Unix 哲学
36
-
37
- ## 变更文件
38
-
39
- - `.env.example` - 环境变量模板
40
- - `src/agent/index.js` - 读取新变量名
41
- - `src/agent/run.js` - 添加配置检测调用
42
- - `src/agent/setup.js` - 新增配置检测模块
43
- - `README.md`, `CLAUDE.md` - 文档更新
44
-
45
- ### Git Commits
46
-
47
- | Hash | Message |
48
- |------|---------|
49
- | `4aa6cad` | (see git log) |
50
-
51
- ### Testing
52
-
53
- - [OK] (Add test results)
54
-
55
- ### Status
56
-
57
- [OK] **Completed**
58
-
59
- ### Next Steps
60
-
61
- - None - task complete
62
-
63
- ## Session 2: GitHub Actions 自动发布 npm
64
-
65
- **Date**: 2026-02-03
66
- **Task**: GitHub Actions 自动发布 npm
67
-
68
- ### Summary
69
-
70
- (Add summary)
71
-
72
- ### Main Changes
73
-
74
- ## 完成内容
75
-
76
- 实现 GitHub Actions 自动发布到 npm。
77
-
78
- | 变更 | 说明 |
79
- |------|------|
80
- | GitHub Actions | 添加 .github/workflows/publish.yml |
81
- | 触发条件 | 推送 v* 标签时自动发布 |
82
- | CI 流程 | lint → publish |
83
- | Node.js | 使用 v20 + --ignore-scripts 跳过原生模块编译 |
84
-
85
- ## 遇到的问题与解决
86
-
87
- 1. **pnpm lockfile 不匹配** → 添加 --no-frozen-lockfile
88
- 2. **isolated-vm 编译失败** → 添加 --ignore-scripts
89
- 3. **NPM_TOKEN 认证失败** → 使用 NODE_AUTH_TOKEN 环境变量
90
-
91
- ## 发布流程
92
-
93
- ```bash
94
- npm version patch && git push && git push --tags
95
- ```
96
-
97
- ## 变更文件
98
-
99
- - `.github/workflows/publish.yml` - GitHub Actions 配置
100
-
101
- ### Git Commits
102
-
103
- | Hash | Message |
104
- |------|---------|
105
- | `4ff9a25` | (see git log) |
106
- | `debdc4e` | (see git log) |
107
- | `ab56fe2` | (see git log) |
108
- | `67f9c55` | (see git log) |
109
- | `b13b03d` | (see git log) |
110
- | `63a6304` | (see git log) |
111
- | `327ca39` | (see git log) |
112
- | `78de837` | (see git log) |
113
- | `46ce73e` | (see git log) |
114
-
115
- ### Testing
116
-
117
- - [OK] (Add test results)
118
-
119
- ### Status
120
-
121
- [OK] **Completed**
122
-
123
- ### Next Steps
124
-
125
- - None - task complete
package/.trellis/worktree.yaml DELETED
@@ -1,47 +0,0 @@
1
- # Worktree Configuration for Multi-Agent Pipeline
2
- # Used for worktree initialization in multi-agent workflows
3
- #
4
- # All paths are relative to project root
5
-
6
- #-------------------------------------------------------------------------------
7
- # Paths
8
- #-------------------------------------------------------------------------------
9
-
10
- # Worktree storage directory (relative to project root)
11
- worktree_dir: ../trellis-worktrees
12
-
13
- #-------------------------------------------------------------------------------
14
- # Files to Copy
15
- #-------------------------------------------------------------------------------
16
-
17
- # Files to copy to each worktree (each worktree needs independent copy)
18
- # These files contain sensitive info or need worktree-independent config
19
- copy:
20
- # Environment variables (uncomment and customize as needed)
21
- # - .env
22
- # - .env.local
23
- # Workflow config
24
- - .trellis/.developer
25
-
26
- #-------------------------------------------------------------------------------
27
- # Post-Create Hooks
28
- #-------------------------------------------------------------------------------
29
-
30
- # Commands to run after creating worktree
31
- # Executed in worktree directory, in order, abort on failure
32
- post_create:
33
- # Install dependencies (uncomment based on your package manager)
34
- # - npm install
35
- # - pnpm install --frozen-lockfile
36
- # - yarn install --frozen-lockfile
37
-
38
- #-------------------------------------------------------------------------------
39
- # Check Agent Verification (Ralph Loop)
40
- #-------------------------------------------------------------------------------
41
-
42
- # Commands to verify code quality before allowing check agent to finish
43
- # If configured, Ralph Loop will run these commands - all must pass to allow completion
44
- # If not configured or empty, trusts agent's completion markers
45
- verify:
46
- # - pnpm lint
47
- # - pnpm typecheck
package/AGENTS.md DELETED
@@ -1,18 +0,0 @@
1
- <!-- TRELLIS:START -->
2
- # Trellis Instructions
3
-
4
- These instructions are for AI assistants working in this project.
5
-
6
- Use the `/trellis:start` command when starting a new session to:
7
- - Initialize your developer identity
8
- - Understand current project context
9
- - Read relevant guidelines
10
-
11
- Use `@/.trellis/` to learn:
12
- - Development workflow (`workflow.md`)
13
- - Project structure guidelines (`spec/`)
14
- - Developer workspace (`workspace/`)
15
-
16
- Keep this managed block so 'trellis update' can refresh the instructions.
17
-
18
- <!-- TRELLIS:END -->
package/CLAUDE.md DELETED
@@ -1,315 +0,0 @@
1
- # DeepSpider - 智能爬虫工程平台
2
-
3
- > 基于 DeepAgents + Patchright 的智能爬虫 Agent,覆盖爬虫全生命周期
4
-
5
- ## 分析方法论
6
-
7
- **每次都分别从资深爬虫工程师和资深技术架构师的两个角度进行理性的辩论性的分析。**
8
- 从最佳实践出发,结合当前项目的实际架构。
9
-
10
- ## 功能
11
-
12
- ### 逆向分析
13
- - 真实浏览器动态分析 (Patchright + CDP)
14
- - Webpack/Browserify 解包 (webcrack)
15
- - 混淆代码分析与反混淆
16
- - 加密算法识别 (CryptoJS/RSA Hook)
17
- - 请求参数追踪
18
- - JS 转 Python 代码生成
19
-
20
- ### 验证码处理
21
- - 图片验证码 OCR 识别 (ddddocr)
22
- - 滑块验证码轨迹模拟
23
- - 点选验证码目标检测
24
- - 打码平台集成
25
-
26
- ### 反检测与风控
27
- - 浏览器指纹管理
28
- - 代理 IP 池管理
29
- - 请求特征伪装
30
- - 风控规避策略
31
-
32
- ### 爬虫编排
33
- - 智能流程规划
34
- - 完整爬虫脚本生成
35
- - 端到端测试验证
36
- - 按需调用,灵活组合
37
-
38
- ## 项目结构
39
-
40
- ```
41
- deepspider/
42
- ├── src/
43
- │ ├── agent/ # DeepAgent 系统
44
- │ │ ├── index.js # 主入口
45
- │ │ ├── run.js # Agent 运行入口
46
- │ │ ├── tools/ # 工具集(90+)
47
- │ │ ├── subagents/ # 子代理
48
- │ │ └── prompts/ # 系统提示
49
- │ ├── browser/ # 浏览器运行时
50
- │ │ ├── client.js # Patchright 客户端
51
- │ │ ├── cdp.js # CDP 会话管理
52
- │ │ ├── defaultHooks.js # 默认注入的 Hook
53
- │ │ ├── interceptors/ # CDP 拦截器
54
- │ │ │ ├── NetworkInterceptor.js
55
- │ │ │ └── ScriptInterceptor.js
56
- │ │ ├── ui/ # 浏览器内 UI
57
- │ │ │ └── analysisPanel.js
58
- │ │ └── hooks/ # Hook 脚本
59
- │ ├── store/ # 数据存储
60
- │ │ └── DataStore.js # 文件系统存储
61
- │ ├── analyzer/ # 静态分析器
62
- │ ├── core/ # 核心模块
63
- │ ├── env/ # 环境补丁模块
64
- │ └── mcp/ # MCP 服务
65
- ├── bin/cli.js # CLI 入口
66
- └── test/ # 测试
67
- ```
68
-
69
- ## 依赖版本
70
-
71
- ```json
72
- {
73
- "@babel/parser": "^7.26.0",
74
- "@babel/traverse": "^7.26.0",
75
- "@babel/generator": "^7.26.0",
76
- "deepagents": "^1.6.0",
77
- "@langchain/core": "^1.1.17",
78
- "@langchain/anthropic": "^1.3.12",
79
- "patchright": "^1.51.1",
80
- "webcrack": "^2.15.1",
81
- "isolated-vm": "^6.0.2",
82
- "zod": "^4.3.6"
83
- }
84
- ```
85
-
86
- ## 架构
87
-
88
- ### 子代理体系
89
-
90
- | 子代理 | 职责 | 核心工具 |
91
- |--------|------|----------|
92
- | crawler | 爬虫编排:整合各模块、生成完整脚本 | file, store, crawler |
93
- | static | 静态分析:解包、反混淆、加密定位 | webcrack, deobfuscate, analyze |
94
- | dynamic | 动态分析:浏览器控制、Hook、数据采集 | browser, debug, capture |
95
- | sandbox | 沙箱执行:环境补全、代码执行 | sandbox, env, patch |
96
- | js2python | JS转Python:加密代码转换、验证 | python, analyzer |
97
- | env-agent | 环境补全:生成浏览器环境模拟代码 | env, sandbox |
98
- | captcha | 验证码处理:OCR、滑块、点选 | captcha_ocr, captcha_slide |
99
- | anti-detect | 反检测:指纹管理、代理池 | proxy, fingerprint |
100
-
101
- ### 智能调度流程
102
-
103
- 根据目标网站复杂度,按需调用子代理:
104
-
105
- ```
106
- 用户:爬取目标网站
107
-
108
- ┌─────────────────────────────────────┐
109
- │ crawler-agent 分析目标 │
110
- │ 判断网站复杂度,规划流程 │
111
- └─────────────────────────────────────┘
112
-
113
- ┌─────────────────────────────────────┐
114
- │ 按需调用子代理 │
115
- │ │
116
- │ Level 1 简单: static → js2python │
117
- │ Level 2 中等: + captcha + dynamic │
118
- │ Level 3 复杂: + anti-detect + e2e │
119
- └─────────────────────────────────────┘
120
-
121
- ┌─────────────────────────────────────┐
122
- │ 输出完整爬虫脚本 │
123
- │ 简单: 单文件脚本 │
124
- │ 复杂: 完整项目结构 │
125
- └─────────────────────────────────────┘
126
- ```
127
-
128
- ### 浏览器交互流程
129
-
130
- ```
131
- pnpm run agent https://example.com
132
-
133
- ┌─────────────────────────────────────┐
134
- │ 浏览器启动,自动注入 Hook │
135
- │ CDP 拦截器记录请求/脚本 │
136
- │ 数据存储到 .deepspider-data/ │
137
- └─────────────────────────────────────┘
138
-
139
- ┌─────────────────────────────────────┐
140
- │ 用户在网站操作(登录、翻页等) │
141
- │ 系统持续记录数据 │
142
- └─────────────────────────────────────┘
143
-
144
- ┌─────────────────────────────────────┐
145
- │ 用户点击面板选择按钮(⦿) │
146
- │ 选择元素 → 显示操作菜单 │
147
- │ │
148
- │ 操作选项: │
149
- │ - 添加为字段(爬虫配置) │
150
- │ - 追踪数据来源 │
151
- │ - 分析加密逻辑 │
152
- │ - 完整流程分析 │
153
- └─────────────────────────────────────┘
154
-
155
- ┌─────────────────────────────────────┐
156
- │ 选择多个字段后点击"生成配置" │
157
- │ crawler 子代理整合分析结果 │
158
- │ 输出 config.json + crawler.py │
159
- └─────────────────────────────────────┘
160
- ```
161
-
162
- ## 代码规范
163
-
164
- ### Hook 内部数据过滤
165
-
166
- 系统内部操作(消息存储、前后端通信)不应触发 Hook 记录。使用统一标记过滤:
167
-
168
- ```javascript
169
- // Storage: 使用 deepspider_ 前缀
170
- sessionStorage.setItem('deepspider_messages', data); // 不触发 Hook
171
-
172
- // JSON: 使用 __ds__ 标记
173
- const msg = { __ds__: true, type: 'chat', text: '...' }; // 不触发 Hook
174
- ```
175
-
176
- | 场景 | 过滤方式 | 示例 |
177
- |------|----------|------|
178
- | sessionStorage | `deepspider_` 前缀 | `deepspider_chat_messages` |
179
- | 发送到后端的消息 | `__ds__: true` | `{ __ds__: true, type: 'chat' }` |
180
- | 面板消息对象 | `__ds__: true` | `{ __ds__: true, role, content }` |
181
-
182
- ### 浏览器交互
183
-
184
- 与浏览器的交互优先使用 CDP(Chrome DevTools Protocol)方式,而非 `page.evaluate()`。
185
-
186
- CDP session 应复用,通过 `browser.getCDPSession()` 获取:
187
-
188
- ```javascript
189
- // 复用 CDP session 执行 JS
190
- async function evaluateViaCDP(browser, expression) {
191
- const cdp = await browser.getCDPSession();
192
- if (!cdp) return null;
193
- const result = await cdp.send('Runtime.evaluate', {
194
- expression,
195
- returnByValue: true,
196
- });
197
- return result.result?.value;
198
- }
199
-
200
- // 使用示例
201
- const logs = await evaluateViaCDP(browser, `window.__deepspider__?.getAllLogs?.()`);
202
- ```
203
-
204
- ### Babel AST 遍历
205
-
206
- 使用 `@babel/traverse` 而非 acorn-walk:
207
-
208
- ```javascript
209
- import { parse } from '@babel/parser';
210
- import traverse from '@babel/traverse';
211
-
212
- // 解析代码
213
- const ast = parse(code, {
214
- sourceType: 'unambiguous',
215
- plugins: ['jsx', 'typescript', 'decorators-legacy'],
216
- errorRecovery: true,
217
- });
218
-
219
- // 遍历 AST
220
- traverse.default(ast, {
221
- FunctionDeclaration(path) {
222
- const node = path.node;
223
- // 处理函数声明
224
- },
225
- CallExpression(path) {
226
- const node = path.node;
227
- // 处理调用表达式
228
- }
229
- });
230
-
231
- // 遍历子节点(在 visitor 内部)
232
- path.traverse({
233
- Identifier(innerPath) {
234
- // 处理内部标识符
235
- }
236
- });
237
- ```
238
-
239
- ### LangChain 工具定义
240
-
241
- 使用 `@langchain/core/tools`:
242
-
243
- ```javascript
244
- import { z } from 'zod';
245
- import { tool } from '@langchain/core/tools';
246
-
247
- const myTool = tool(
248
- async ({ param1, param2 }) => {
249
- // 工具逻辑
250
- return result;
251
- },
252
- {
253
- name: 'tool_name',
254
- description: '工具描述',
255
- schema: z.object({
256
- param1: z.string().describe('参数1描述'),
257
- param2: z.number().optional().default(100),
258
- }),
259
- }
260
- );
261
- ```
262
-
263
- ### DeepAgent 创建
264
-
265
- ```javascript
266
- import { ChatAnthropic } from '@langchain/anthropic';
267
- import { createDeepAgent } from 'deepagents';
268
-
269
- export const agent = createDeepAgent({
270
- model: new ChatAnthropic({
271
- model: 'claude-sonnet-4-20250514',
272
- temperature: 0,
273
- }),
274
- tools: [tool1, tool2],
275
- systemPrompt: '系统提示',
276
- });
277
- ```
278
-
279
- ## 运行
280
-
281
- ```bash
282
- # 安装依赖
283
- pnpm install
284
-
285
- # 安装 Python 加密库(用于运行生成的 Python 代码)
286
- pnpm run setup:crypto
287
-
288
- # 配置环境变量
289
- cp .env.example .env
290
- # 编辑 .env 填入:
291
- # DEEPSPIDER_API_KEY=your-api-key
292
- # DEEPSPIDER_BASE_URL=https://api.openai.com/v1
293
- # DEEPSPIDER_MODEL=gpt-4o
294
-
295
- # Agent 模式(推荐)- 指定目标网站
296
- pnpm run agent https://example.com
297
-
298
- # Agent 模式 - 纯交互(不启动浏览器)
299
- pnpm run agent
300
-
301
- # MCP 服务(供 Claude Code 等调用)
302
- pnpm run mcp
303
-
304
- # 测试
305
- pnpm test
306
- ```
307
-
308
- ### Agent 使用流程
309
-
310
- 1. **启动**: `pnpm run agent https://target-site.com`
311
- 2. **等待**: 浏览器打开,系统自动记录数据(不消耗 API)
312
- 3. **操作**: 在网站上登录、翻页、触发目标请求
313
- 4. **选择**: 点击面板的选择按钮(⦿),进入选择模式
314
- 5. **分析**: 点击目标数据,确认后发送给 Agent
315
- 6. **对话**: 在面板或 CLI 继续提问,深入分析
package/agents/deepspider.md DELETED
@@ -1,142 +0,0 @@
1
- ---
2
- description: 智能爬虫 Agent。JS逆向、动态调试、代码解包、反混淆、加密捕获、爬虫生成。
3
- capabilities: ["JS逆向", "动态调试", "代码解包", "反混淆", "加密捕获", "爬虫生成"]
4
- ---
5
-
6
- 你是 DeepSpider,一个智能爬虫 Agent。
7
-
8
- ## 核心能力
9
-
10
- 1. **动态调试** - Patchright 反检测浏览器 + CDP 断点调试
11
- 2. **代码解包** - Webpack/Browserify 自动解包 (webcrack)
12
- 3. **反混淆** - AST 解析、控制流还原、字符串解密
13
- 4. **加密捕获** - Hook 捕获 CryptoJS/RSA 加密调用
14
- 5. **环境补全** - 检测并补全浏览器环境依赖
15
-
16
- ## 工作原则
17
-
18
- 1. **最小补丁**: 只补充必要的环境,避免过度补丁
19
- 2. **迭代验证**: 每次补丁后验证执行结果
20
- 3. **清晰输出**: 生成可独立运行的代码
21
-
22
- ## 工作流程
23
-
24
- ### 完整分析流程
25
- ```
26
- 1. preprocess_code 预处理(自动解包或反混淆)
27
- 2. deobfuscate 深度反混淆
28
- 3. analyze_encryption 定位加密入口
29
- 4. launch_browser 启动浏览器
30
- 5. set_breakpoint 设置断点
31
- 6. collect_env 采集环境数据
32
- 7. generate_patch 生成补丁
33
- 8. sandbox_execute 沙箱验证
34
- ```
35
-
36
- ### 补环境任务
37
- ```
38
- 1. 读取目标 JS 代码
39
- 2. 使用 sandbox_execute 执行,捕获错误
40
- 3. 分析缺失的环境属性
41
- 4. 使用 generate_patch 生成补丁
42
- 5. 使用 sandbox_inject 注入补丁
43
- 6. 重复直到成功
44
- ```
45
-
46
- ### 加密分析任务
47
- ```
48
- 1. launch_browser 打开目标页面
49
- 2. 注入 Hook 脚本捕获加密调用
50
- 3. 触发页面操作
51
- 4. get_hook_logs 获取捕获的密钥和参数
52
- 5. 输出分析报告
53
- ```
54
-
55
- ## 环境补丁模板
56
-
57
- ### Navigator
58
- ```javascript
59
- const navigator = {
60
- userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
61
- platform: 'Win32',
62
- language: 'zh-CN',
63
- languages: ['zh-CN', 'en'],
64
- cookieEnabled: true,
65
- onLine: true,
66
- hardwareConcurrency: 8
67
- };
68
- Object.defineProperty(navigator, 'webdriver', { get: () => false });
69
- ```
70
-
71
- ### Document
72
- ```javascript
73
- const document = {
74
- cookie: '',
75
- referrer: '',
76
- domain: 'example.com',
77
- title: '',
78
- createElement: (tag) => ({ tagName: tag, style: {} }),
79
- getElementById: () => null,
80
- querySelector: () => null,
81
- querySelectorAll: () => []
82
- };
83
- ```
84
-
85
- ### Window
86
- ```javascript
87
- const window = {
88
- innerWidth: 1920,
89
- innerHeight: 1080,
90
- devicePixelRatio: 1,
91
- location: { href: 'https://example.com/', hostname: 'example.com', protocol: 'https:' },
92
- navigator,
93
- document,
94
- localStorage: { getItem: () => null, setItem: () => {} },
95
- sessionStorage: { getItem: () => null, setItem: () => {} },
96
- atob: (s) => Buffer.from(s, 'base64').toString(),
97
- btoa: (s) => Buffer.from(s).toString('base64')
98
- };
99
- ```
100
-
101
- ## 加密模式识别
102
-
103
- | 模式 | 特征 |
104
- |------|------|
105
- | MD5 | `md5(`, 32位十六进制输出 |
106
- | SHA256 | `sha256`, `SHA256`, 64位十六进制输出 |
107
- | AES | `CryptoJS.AES`, `aes.*encrypt` |
108
- | RSA | `JSEncrypt`, `RSAKey` |
109
- | Base64 | `btoa`, `atob` |
110
- | HMAC | `hmac`, `HMAC` |
111
-
112
- ## 输出格式
113
-
114
- 分析完成后,输出:
115
- 1. **执行结果**: 代码执行的返回值
116
- 2. **补丁代码**: 完整的环境补丁(可独立运行)
117
- 3. **分析报告**: 检测到的加密算法、关键函数等
118
-
119
- ## Hook 模板
120
-
121
- ### CryptoJS Hook (自动注入)
122
- ```javascript
123
- // 由 browser/hooks/crypto.js 自动注入
124
- // 捕获 CryptoJS.AES.encrypt 等调用
125
- // 输出: Key, IV, Mode, 明文, 密文
126
- ```
127
-
128
- ### RSA Hook (自动注入)
129
- ```javascript
130
- // 由 browser/hooks/crypto.js 自动注入
131
- // 捕获 JSEncrypt.encrypt 调用
132
- // 输出: 公钥, 明文, 密文
133
- ```
134
-
135
- ### 自定义函数 Hook
136
- ```javascript
137
- const _orig = window.func;
138
- window.func = function(...args) {
139
- console.log('[Hook] func:', args);
140
- return _orig.apply(this, args);
141
- };
142
- ```
package/docs/DEBUG.md DELETED
@@ -1,42 +0,0 @@
1
- # DeepSpider 调试指南
2
-
3
- ## 1. MCP 服务测试
4
-
5
- ```bash
6
- # 启动 MCP 服务
7
- pnpm run mcp
8
-
9
- # MCP Inspector 测试
10
- npx @modelcontextprotocol/inspector node src/mcp/server.js
11
- ```
12
-
13
- ## 2. 浏览器调试
14
-
15
- ```bash
16
- # 测试浏览器启动
17
- node test/browser.test.js
18
- ```
19
-
20
- 浏览器使用 Patchright (反检测 Playwright),默认非 headless 模式。
21
-
22
- ## 3. 工具验证
23
-
24
- ```bash
25
- # 验证所有工具导入
26
- node -e "import('./src/agent/tools/index.js').then(m => console.log('工具数:', m.allTools.length))"
27
-
28
- # 测试单个工具
29
- node -e "
30
- import { preprocessCode } from './src/agent/tools/preprocess.js';
31
- preprocessCode.invoke({ code: 'var a=1' }).then(console.log);
32
- "
33
- ```
34
-
35
- ## 4. 常见问题
36
-
37
- | 问题 | 排查方法 |
38
- |------|----------|
39
- | MCP 启动失败 | `node src/mcp/server.js` |
40
- | 浏览器启动失败 | 检查 patchright 安装 |
41
- | 沙箱执行失败 | 检查 isolated-vm 依赖 |
42
- | webcrack 解包失败 | 确认是 Webpack/Browserify 格式 |