deepspider 0.3.0 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +3 -0
- package/README.md +13 -13
- package/package.json +6 -6
- package/src/agent/core/PanelBridge.js +29 -77
- package/src/agent/core/StreamHandler.js +139 -14
- package/src/agent/index.js +51 -12
- package/src/agent/logger.js +184 -9
- package/src/agent/middleware/report.js +42 -16
- package/src/agent/middleware/subagent.js +233 -0
- package/src/agent/middleware/toolGuard.js +77 -0
- package/src/agent/middleware/validationWorkflow.js +171 -0
- package/src/agent/prompts/system.js +181 -59
- package/src/agent/run.js +41 -6
- package/src/agent/skills/crawler/SKILL.md +64 -3
- package/src/agent/skills/crawler/evolved.md +9 -1
- package/src/agent/skills/dynamic-analysis/SKILL.md +74 -7
- package/src/agent/skills/env/SKILL.md +75 -0
- package/src/agent/skills/evolve.js +0 -3
- package/src/agent/skills/sandbox/SKILL.md +35 -0
- package/src/agent/skills/static-analysis/SKILL.md +98 -2
- package/src/agent/subagents/anti-detect.js +10 -20
- package/src/agent/subagents/captcha.js +7 -19
- package/src/agent/subagents/crawler.js +25 -37
- package/src/agent/subagents/factory.js +109 -9
- package/src/agent/subagents/index.js +4 -13
- package/src/agent/subagents/js2python.js +7 -19
- package/src/agent/subagents/reverse.js +180 -0
- package/src/agent/tools/analysis.js +84 -1
- package/src/agent/tools/anti-detect.js +5 -2
- package/src/agent/tools/browser.js +160 -0
- package/src/agent/tools/captcha.js +1 -1
- package/src/agent/tools/capture.js +24 -3
- package/src/agent/tools/correlate.js +129 -15
- package/src/agent/tools/crawler.js +2 -1
- package/src/agent/tools/crawlerGenerator.js +90 -0
- package/src/agent/tools/debug.js +43 -6
- package/src/agent/tools/evolve.js +6 -3
- package/src/agent/tools/extractor.js +5 -1
- package/src/agent/tools/file.js +16 -7
- package/src/agent/tools/generateHook.js +66 -0
- package/src/agent/tools/hookManager.js +19 -9
- package/src/agent/tools/index.js +33 -20
- package/src/agent/tools/nodejs.js +41 -6
- package/src/agent/tools/python.js +4 -4
- package/src/agent/tools/report.js +2 -2
- package/src/agent/tools/runtime.js +1 -1
- package/src/agent/tools/sandbox.js +21 -1
- package/src/agent/tools/scratchpad.js +70 -0
- package/src/agent/tools/tracing.js +26 -0
- package/src/agent/tools/verifyAlgorithm.js +117 -0
- package/src/analyzer/EncryptionAnalyzer.js +2 -2
- package/src/browser/EnvBridge.js +27 -13
- package/src/browser/client.js +124 -18
- package/src/browser/collector.js +101 -22
- package/src/browser/defaultHooks.js +3 -1
- package/src/browser/hooks/index.js +5 -0
- package/src/browser/interceptors/AntiDebugInterceptor.js +132 -0
- package/src/browser/interceptors/NetworkInterceptor.js +77 -13
- package/src/browser/interceptors/ScriptInterceptor.js +34 -9
- package/src/browser/interceptors/index.js +1 -0
- package/src/browser/ui/analysisPanel.js +469 -464
- package/src/cli/commands/config.js +11 -3
- package/src/config/paths.js +9 -1
- package/src/config/settings.js +7 -1
- package/src/core/PatchGenerator.js +26 -6
- package/src/core/Sandbox.js +140 -3
- package/src/env/EnvCodeGenerator.js +60 -88
- package/src/env/modules/bom/history.js +6 -0
- package/src/env/modules/bom/location.js +6 -0
- package/src/env/modules/bom/navigator.js +13 -0
- package/src/env/modules/bom/screen.js +6 -0
- package/src/env/modules/bom/storage.js +7 -0
- package/src/env/modules/dom/document.js +14 -0
- package/src/env/modules/dom/event.js +4 -0
- package/src/env/modules/index.js +27 -10
- package/src/env/modules/webapi/fetch.js +4 -0
- package/src/env/modules/webapi/url.js +4 -0
- package/src/env/modules/webapi/xhr.js +8 -0
- package/src/store/DataStore.js +130 -47
- package/src/store/Store.js +2 -1
- package/src/agent/subagents/dynamic.js +0 -64
- package/src/agent/subagents/env-agent.js +0 -82
- package/src/agent/subagents/sandbox.js +0 -55
- package/src/agent/subagents/static.js +0 -66
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* DeepSpider - 统一算法验证工具
|
|
3
|
+
* 合并 verify_md5/sha256/hmac/aes + identify_encryption
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import { z } from 'zod';
|
|
7
|
+
import { tool } from '@langchain/core/tools';
|
|
8
|
+
import crypto from 'crypto';
|
|
9
|
+
|
|
10
|
+
/**
 * Heuristically label a ciphertext by its surface features.
 *
 * The checks are independent, so one input can receive several labels
 * (every hex string also satisfies the Base64 character set, for example).
 * Labels are returned in a fixed order: digest guess, Base64, Hex, JWT.
 *
 * @param {string} ciphertext - Text to inspect.
 * @returns {string[]} Matching feature labels; empty when nothing matches.
 */
function identifyPattern(ciphertext) {
  const features = [];
  const isHex = /^[0-9a-fA-F]+$/.test(ciphertext);

  // Hex-digest lengths of the common hash functions. A length match alone is
  // not evidence: a 32-character Base64 string is not an MD5 digest, so the
  // guess is only offered when the text is pure hex.
  const digestByHexLength = new Map([
    [32, '可能是 MD5'],
    [40, '可能是 SHA1'],
    [64, '可能是 SHA256'],
    [128, '可能是 SHA512'],
  ]);
  const digestGuess = isHex ? digestByHexLength.get(ciphertext.length) : undefined;
  if (digestGuess) features.push(digestGuess);

  // Base64 alphabet followed by optional '=' padding.
  if (/^[A-Za-z0-9+/]+=*$/.test(ciphertext)) features.push('Base64 编码');
  if (isHex) features.push('Hex 编码');
  // Three dot-separated base64url segments, the first starting with "eyJ".
  if (/^eyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+$/.test(ciphertext)) features.push('JWT Token');

  return features;
}
|
|
27
|
+
|
|
28
|
+
/**
 * `verify_algorithm` tool: checks whether a value was produced by a standard
 * algorithm, or describes a ciphertext when no algorithm is named.
 *
 * Identification mode (no `algorithm`): returns a JSON string with a preview
 * of `expected` (first 50 characters), its length, and the labels from
 * `identifyPattern`.
 *
 * Verification mode (`algorithm` + `input` + `expected`): recomputes the value
 * with Node's `crypto` module and returns a JSON string containing the
 * computed value, the expected value, and whether they match.
 *   - md5 / sha1 / sha256 / sha512: hex digest of `input`.
 *   - hmac: hex HMAC of `input` with `key`; hash defaults to sha256.
 *   - aes: `input` (UTF-8) encrypted to Base64. `key` and `iv` are read as
 *     UTF-8 bytes, the key size is derived from the key's byte length, the IV
 *     defaults to 16 zero bytes, and ECB mode is given no IV.
 *
 * Every failure is reported as a JSON string of the form `{ error }`; the
 * handler does not throw for missing parameters or cipher errors.
 */
export const verifyAlgorithm = tool(
  async ({ algorithm, input, expected, key, iv, hmacHash, aesMode }) => {
    // Identification mode: no algorithm given, only `expected` is described.
    if (!algorithm) {
      return JSON.stringify({
        ciphertext: expected.slice(0, 50) + (expected.length > 50 ? '...' : ''),
        length: expected.length,
        features: identifyPattern(expected),
      }, null, 2);
    }

    // Verification mode needs an input. Only a missing value is rejected:
    // the empty string is a legitimate message (the digest of "" is a
    // standard test vector), so a truthiness check would be wrong here.
    if (input == null) {
      return JSON.stringify({ error: `验证 ${algorithm} 需要 input 参数` });
    }

    let computed;
    let algoLabel;

    switch (algorithm) {
      case 'md5':
      case 'sha1':
      case 'sha256':
      case 'sha512': {
        computed = crypto.createHash(algorithm).update(input).digest('hex');
        algoLabel = algorithm.toUpperCase();
        break;
      }
      case 'hmac': {
        if (!key) return JSON.stringify({ error: 'HMAC 需要 key 参数' });
        const hash = hmacHash || 'sha256';
        computed = crypto.createHmac(hash, key).update(input).digest('hex');
        algoLabel = `HMAC-${hash.toUpperCase()}`;
        break;
      }
      case 'aes': {
        if (!key) return JSON.stringify({ error: 'AES 需要 key 参数' });
        try {
          const keyBuf = Buffer.from(key, 'utf8');
          const ivBuf = iv ? Buffer.from(iv, 'utf8') : Buffer.alloc(16, 0);
          const mode = aesMode || 'cbc';
          // The cipher name is built from the key's byte length, so a key that
          // is not 16/24/32 bytes makes createCipheriv throw; that error is
          // returned to the caller below.
          const cipher = crypto.createCipheriv(
            `aes-${keyBuf.length * 8}-${mode}`,
            keyBuf,
            mode === 'ecb' ? null : ivBuf
          );
          computed = cipher.update(input, 'utf8', 'base64') + cipher.final('base64');
          algoLabel = `AES-${keyBuf.length * 8}-${mode.toUpperCase()}`;
        } catch (e) {
          return JSON.stringify({ error: e.message });
        }
        break;
      }
      default:
        return JSON.stringify({ error: `未知算法: ${algorithm}` });
    }

    // Hex digests are compared case-insensitively; Base64 (AES) is
    // case-sensitive and must match exactly.
    const match = algorithm === 'aes'
      ? computed === expected
      : computed.toLowerCase() === expected.toLowerCase();

    return JSON.stringify({
      algorithm: algoLabel,
      input,
      computed,
      expected,
      match,
      conclusion: match ? `标准 ${algoLabel}` : '可能魔改或参数不同',
    }, null, 2);
  },
  {
    name: 'verify_algorithm',
    description: `验证是否为标准加密算法,或根据密文特征识别算法类型。

验证模式:传入 algorithm + input + expected,对比计算结果
识别模式:只传 expected(密文),自动识别可能的算法类型`,
    schema: z.object({
      algorithm: z.enum(['md5', 'sha1', 'sha256', 'sha512', 'hmac', 'aes']).optional()
        .describe('算法类型。不传则进入识别模式'),
      input: z.string().optional().describe('原始输入'),
      expected: z.string().describe('目标加密结果(验证模式)或待识别的密文(识别模式)'),
      key: z.string().optional().describe('密钥(HMAC/AES 需要)'),
      iv: z.string().optional().describe('IV 向量(AES 可选)'),
      hmacHash: z.enum(['md5', 'sha1', 'sha256', 'sha512']).optional().describe('HMAC 哈希算法,默认 sha256'),
      aesMode: z.enum(['cbc', 'ecb']).optional().describe('AES 模式,默认 cbc'),
    }),
  }
);

// All tools defined by this module, as a list.
export const verifyAlgorithmTools = [verifyAlgorithm];
|
|
@@ -51,7 +51,7 @@ export class EncryptionAnalyzer {
|
|
|
51
51
|
}
|
|
52
52
|
|
|
53
53
|
traceParam(code, paramName) {
|
|
54
|
-
const
|
|
54
|
+
const _ast = this.astAnalyzer.parse(code);
|
|
55
55
|
const traces = [];
|
|
56
56
|
|
|
57
57
|
// 简化实现:查找参数使用位置
|
|
@@ -80,7 +80,7 @@ export class EncryptionAnalyzer {
|
|
|
80
80
|
// 使用模式库进行深度检测
|
|
81
81
|
detectWithPatterns(code) {
|
|
82
82
|
const detected = [];
|
|
83
|
-
for (const [
|
|
83
|
+
for (const [_key, pattern] of Object.entries(cryptoPatterns)) {
|
|
84
84
|
for (const sig of pattern.signatures) {
|
|
85
85
|
if (sig.test(code)) {
|
|
86
86
|
detected.push({
|
package/src/browser/EnvBridge.js
CHANGED
|
@@ -26,24 +26,38 @@ export class EnvBridge {
|
|
|
26
26
|
|
|
27
27
|
for (const path of missingPaths) {
|
|
28
28
|
try {
|
|
29
|
-
// 1.
|
|
29
|
+
// 1. 尝试从真实浏览器采集
|
|
30
30
|
const collected = await this.collector.collect(path, { depth: 2 });
|
|
31
31
|
|
|
32
|
-
if (
|
|
33
|
-
results.
|
|
34
|
-
|
|
35
|
-
}
|
|
32
|
+
if (collected.success) {
|
|
33
|
+
results.collected.push(path);
|
|
34
|
+
this.collectedData.set(path, collected);
|
|
36
35
|
|
|
37
|
-
|
|
38
|
-
|
|
36
|
+
// 2. 生成补丁代码
|
|
37
|
+
const patch = this._generatePatch(path, collected);
|
|
38
|
+
if (patch) {
|
|
39
|
+
results.patched.push({ path, code: patch });
|
|
40
|
+
continue;
|
|
41
|
+
}
|
|
42
|
+
}
|
|
39
43
|
|
|
40
|
-
//
|
|
41
|
-
const
|
|
42
|
-
if (
|
|
43
|
-
results.patched.push({ path, code:
|
|
44
|
+
// 3. 采集失败时 fallback 到 PatchGenerator
|
|
45
|
+
const fallback = await this.patchGenerator.generate(path);
|
|
46
|
+
if (fallback.code) {
|
|
47
|
+
results.patched.push({ path, code: fallback.code, source: fallback.source });
|
|
48
|
+
} else {
|
|
49
|
+
results.failed.push({ path, reason: 'no_patch' });
|
|
44
50
|
}
|
|
45
51
|
|
|
46
52
|
} catch (e) {
|
|
53
|
+
// 浏览器不可用时也 fallback
|
|
54
|
+
try {
|
|
55
|
+
const fallback = await this.patchGenerator.generate(path);
|
|
56
|
+
if (fallback.code) {
|
|
57
|
+
results.patched.push({ path, code: fallback.code, source: fallback.source });
|
|
58
|
+
continue;
|
|
59
|
+
}
|
|
60
|
+
} catch { /* ignore */ }
|
|
47
61
|
results.failed.push({ path, reason: 'error', error: e.message });
|
|
48
62
|
}
|
|
49
63
|
}
|
|
@@ -66,7 +80,7 @@ export class EnvBridge {
|
|
|
66
80
|
// 根据数据类型生成不同的补丁
|
|
67
81
|
switch (data.type) {
|
|
68
82
|
case 'string':
|
|
69
|
-
return `${parentPath}.${propName} =
|
|
83
|
+
return `${parentPath}.${propName} = ${JSON.stringify(data.value)};`;
|
|
70
84
|
|
|
71
85
|
case 'number':
|
|
72
86
|
return `${parentPath}.${propName} = ${data.value};`;
|
|
@@ -123,7 +137,7 @@ export class EnvBridge {
|
|
|
123
137
|
|
|
124
138
|
_serializeValue(data) {
|
|
125
139
|
switch (data.type) {
|
|
126
|
-
case 'string': return
|
|
140
|
+
case 'string': return JSON.stringify(data.value);
|
|
127
141
|
case 'number': return data.value;
|
|
128
142
|
case 'boolean': return data.value;
|
|
129
143
|
case 'null': return 'null';
|
package/src/browser/client.js
CHANGED
|
@@ -8,6 +8,7 @@ import { EventEmitter } from 'events';
|
|
|
8
8
|
import { getDefaultHookScript } from './defaultHooks.js';
|
|
9
9
|
import { NetworkInterceptor } from './interceptors/NetworkInterceptor.js';
|
|
10
10
|
import { ScriptInterceptor } from './interceptors/ScriptInterceptor.js';
|
|
11
|
+
import { AntiDebugInterceptor } from './interceptors/AntiDebugInterceptor.js';
|
|
11
12
|
import { getDataStore } from '../store/DataStore.js';
|
|
12
13
|
|
|
13
14
|
export class BrowserClient extends EventEmitter {
|
|
@@ -20,9 +21,13 @@ export class BrowserClient extends EventEmitter {
|
|
|
20
21
|
this.cdpSession = null;
|
|
21
22
|
this.networkInterceptor = null;
|
|
22
23
|
this.scriptInterceptor = null;
|
|
24
|
+
this.antiDebugInterceptor = null;
|
|
23
25
|
this.hookScript = null;
|
|
24
26
|
this.onMessage = null;
|
|
25
27
|
this._isCleaningUp = false;
|
|
28
|
+
// CDP session 健康检查节流
|
|
29
|
+
this._cdpLastCheck = 0;
|
|
30
|
+
this._cdpCheckInterval = 5000; // 5秒内不重复检查
|
|
26
31
|
}
|
|
27
32
|
|
|
28
33
|
/**
|
|
@@ -37,6 +42,7 @@ export class BrowserClient extends EventEmitter {
|
|
|
37
42
|
headless = false,
|
|
38
43
|
executablePath = null,
|
|
39
44
|
args = [],
|
|
45
|
+
userDataDir = null,
|
|
40
46
|
} = options;
|
|
41
47
|
|
|
42
48
|
const launchOptions = {
|
|
@@ -53,12 +59,22 @@ export class BrowserClient extends EventEmitter {
|
|
|
53
59
|
launchOptions.executablePath = executablePath;
|
|
54
60
|
}
|
|
55
61
|
|
|
56
|
-
this.
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
ignoreHTTPSErrors
|
|
61
|
-
|
|
62
|
+
this._persistent = !!userDataDir;
|
|
63
|
+
|
|
64
|
+
if (userDataDir) {
|
|
65
|
+
// 持久化模式:launchPersistentContext 返回 BrowserContext
|
|
66
|
+
launchOptions.ignoreHTTPSErrors = true;
|
|
67
|
+
this.context = await chromium.launchPersistentContext(userDataDir, launchOptions);
|
|
68
|
+
this.browser = this.context.browser();
|
|
69
|
+
this.emit('launched', { headless, persistent: true });
|
|
70
|
+
} else {
|
|
71
|
+
// 临时模式(原有逻辑)
|
|
72
|
+
this.browser = await chromium.launch(launchOptions);
|
|
73
|
+
this.emit('launched', { headless });
|
|
74
|
+
this.context = await this.browser.newContext({
|
|
75
|
+
ignoreHTTPSErrors: true,
|
|
76
|
+
});
|
|
77
|
+
}
|
|
62
78
|
|
|
63
79
|
// 保存 hook 脚本
|
|
64
80
|
this.hookScript = getDefaultHookScript();
|
|
@@ -66,11 +82,22 @@ export class BrowserClient extends EventEmitter {
|
|
|
66
82
|
// 使用 addInitScript 在 context 级别注入
|
|
67
83
|
await this.context.addInitScript(this.hookScript);
|
|
68
84
|
|
|
69
|
-
|
|
85
|
+
// 持久化上下文自带默认页面,临时模式需要新建
|
|
86
|
+
this.page = this._persistent
|
|
87
|
+
? (this.context.pages()[0] || await this.context.newPage())
|
|
88
|
+
: await this.context.newPage();
|
|
70
89
|
|
|
71
90
|
// 监听新页面创建(弹窗、新标签页)
|
|
72
91
|
this.context.on('page', async (newPage) => {
|
|
73
92
|
console.log('[BrowserClient] 检测到新页面');
|
|
93
|
+
|
|
94
|
+
// 清理旧页面的 CDP session(避免泄漏)
|
|
95
|
+
if (this.cdpSession && this._cdpSessionPage && this._cdpSessionPage !== newPage) {
|
|
96
|
+
await this.cdpSession.detach().catch(() => {});
|
|
97
|
+
this.cdpSession = null;
|
|
98
|
+
this._cdpSessionPage = null;
|
|
99
|
+
}
|
|
100
|
+
|
|
74
101
|
this.pages.push(newPage);
|
|
75
102
|
this.page = newPage; // 切换到新页面
|
|
76
103
|
await this.setupPage(newPage);
|
|
@@ -96,6 +123,13 @@ export class BrowserClient extends EventEmitter {
|
|
|
96
123
|
*/
|
|
97
124
|
async setupPage(page) {
|
|
98
125
|
try {
|
|
126
|
+
// 如果这是当前页面的重新设置,先清理旧的 session
|
|
127
|
+
if (page === this.page && this.cdpSession && this._cdpSessionPage === page) {
|
|
128
|
+
await this.cdpSession.detach().catch(() => {});
|
|
129
|
+
this.cdpSession = null;
|
|
130
|
+
this._cdpSessionPage = null;
|
|
131
|
+
}
|
|
132
|
+
|
|
99
133
|
const cdp = await page.context().newCDPSession(page);
|
|
100
134
|
|
|
101
135
|
// 1. 启用 Runtime 域
|
|
@@ -125,11 +159,22 @@ export class BrowserClient extends EventEmitter {
|
|
|
125
159
|
await networkInterceptor.start();
|
|
126
160
|
await scriptInterceptor.start();
|
|
127
161
|
|
|
128
|
-
//
|
|
162
|
+
// 反无限 debugger:必须在 ScriptInterceptor 之后(Debugger 域已启用)
|
|
163
|
+
const antiDebugInterceptor = new AntiDebugInterceptor(cdp);
|
|
164
|
+
await antiDebugInterceptor.start();
|
|
165
|
+
|
|
166
|
+
// ScriptInterceptor 拉取源码后通知 AntiDebugInterceptor,避免重复 CDP 调用
|
|
167
|
+
scriptInterceptor.onSource = (scriptId, source) => {
|
|
168
|
+
antiDebugInterceptor.checkScript(scriptId, source);
|
|
169
|
+
};
|
|
170
|
+
|
|
171
|
+
// 保存引用(仅对当前活动页面)
|
|
129
172
|
if (page === this.page) {
|
|
130
173
|
this.cdpSession = cdp;
|
|
174
|
+
this._cdpSessionPage = page; // 关键:设置标记,让 getCDPSession 知道这是当前页面的 session
|
|
131
175
|
this.networkInterceptor = networkInterceptor;
|
|
132
176
|
this.scriptInterceptor = scriptInterceptor;
|
|
177
|
+
this.antiDebugInterceptor = antiDebugInterceptor;
|
|
133
178
|
}
|
|
134
179
|
|
|
135
180
|
// 监听页面导航
|
|
@@ -146,17 +191,52 @@ export class BrowserClient extends EventEmitter {
|
|
|
146
191
|
}
|
|
147
192
|
|
|
148
193
|
/**
|
|
149
|
-
* 获取 CDP
|
|
194
|
+
* 获取 CDP 会话(复用已有 session,仅在 page 变化时重建)
|
|
150
195
|
*/
|
|
151
196
|
async getCDPSession() {
|
|
152
|
-
|
|
153
|
-
|
|
197
|
+
if (!this.page) return this.cdpSession;
|
|
198
|
+
|
|
199
|
+
// page 未变且 session 存在 → 复用
|
|
200
|
+
if (this.cdpSession && this._cdpSessionPage === this.page) {
|
|
201
|
+
// 节流:避免频繁健康检查
|
|
202
|
+
const now = Date.now();
|
|
203
|
+
if (now - this._cdpLastCheck < this._cdpCheckInterval) {
|
|
204
|
+
return this.cdpSession;
|
|
205
|
+
}
|
|
206
|
+
|
|
207
|
+
try {
|
|
208
|
+
// 通过简单的 Runtime.evaluate 验证 session 是否还活着
|
|
209
|
+
await this.cdpSession.send('Runtime.evaluate', { expression: '1' });
|
|
210
|
+
this._cdpLastCheck = now;
|
|
211
|
+
return this.cdpSession;
|
|
212
|
+
} catch {
|
|
213
|
+
// session 已失效,需要重新创建
|
|
214
|
+
console.log('[BrowserClient] CDP session 已失效,重新创建');
|
|
215
|
+
this.cdpSession = null;
|
|
216
|
+
this._cdpSessionPage = null;
|
|
217
|
+
}
|
|
218
|
+
}
|
|
219
|
+
|
|
220
|
+
// page 变了或 session 失效 → detach 旧 session,创建新的
|
|
221
|
+
if (this.cdpSession) {
|
|
154
222
|
try {
|
|
155
|
-
|
|
156
|
-
} catch
|
|
157
|
-
|
|
158
|
-
return null;
|
|
223
|
+
await this.cdpSession.detach();
|
|
224
|
+
} catch {
|
|
225
|
+
// 忽略 detach 错误(session 可能已断开)
|
|
159
226
|
}
|
|
227
|
+
this.cdpSession = null;
|
|
228
|
+
}
|
|
229
|
+
|
|
230
|
+
try {
|
|
231
|
+
this.cdpSession = await this.page.context().newCDPSession(this.page);
|
|
232
|
+
this._cdpSessionPage = this.page;
|
|
233
|
+
this._cdpLastCheck = Date.now();
|
|
234
|
+
console.log('[BrowserClient] CDP session 已创建');
|
|
235
|
+
} catch (e) {
|
|
236
|
+
console.error('[BrowserClient] 创建 CDP session 失败:', e.message);
|
|
237
|
+
this.cdpSession = null;
|
|
238
|
+
this._cdpSessionPage = null;
|
|
239
|
+
return null;
|
|
160
240
|
}
|
|
161
241
|
return this.cdpSession;
|
|
162
242
|
}
|
|
@@ -165,8 +245,19 @@ export class BrowserClient extends EventEmitter {
|
|
|
165
245
|
* 导航到 URL
|
|
166
246
|
*/
|
|
167
247
|
async navigate(url, options = {}) {
|
|
168
|
-
const { waitUntil = 'domcontentloaded' } = options;
|
|
169
|
-
|
|
248
|
+
const { waitUntil = 'domcontentloaded', timeout = 30000 } = options;
|
|
249
|
+
try {
|
|
250
|
+
await this.page.goto(url, { waitUntil, timeout });
|
|
251
|
+
} catch (e) {
|
|
252
|
+
// 超时不一定是错误,页面可能仍在加载,继续执行
|
|
253
|
+
if (e.message?.includes('timeout')) {
|
|
254
|
+
console.log('[BrowserClient] 导航超时,继续等待页面稳定...');
|
|
255
|
+
// 等待一小段时间让页面尽可能完成加载
|
|
256
|
+
await this.page.waitForTimeout(2000);
|
|
257
|
+
} else {
|
|
258
|
+
throw e;
|
|
259
|
+
}
|
|
260
|
+
}
|
|
170
261
|
return this.page.url();
|
|
171
262
|
}
|
|
172
263
|
|
|
@@ -208,6 +299,9 @@ export class BrowserClient extends EventEmitter {
|
|
|
208
299
|
await this.scriptInterceptor.stop?.().catch(() => {});
|
|
209
300
|
this.scriptInterceptor = null;
|
|
210
301
|
}
|
|
302
|
+
if (this.antiDebugInterceptor) {
|
|
303
|
+
this.antiDebugInterceptor = null;
|
|
304
|
+
}
|
|
211
305
|
|
|
212
306
|
// 分离 CDP session
|
|
213
307
|
if (this.cdpSession) {
|
|
@@ -216,7 +310,16 @@ export class BrowserClient extends EventEmitter {
|
|
|
216
310
|
}
|
|
217
311
|
|
|
218
312
|
// 关闭浏览器
|
|
219
|
-
if (this.
|
|
313
|
+
if (this._persistent) {
|
|
314
|
+
// 持久化模式:关闭 context 即保存数据并关闭浏览器
|
|
315
|
+
if (this.context) {
|
|
316
|
+
await this.context.close();
|
|
317
|
+
this.context = null;
|
|
318
|
+
this.browser = null;
|
|
319
|
+
this.page = null;
|
|
320
|
+
this.pages = [];
|
|
321
|
+
}
|
|
322
|
+
} else if (this.browser) {
|
|
220
323
|
await this.browser.close();
|
|
221
324
|
this.browser = null;
|
|
222
325
|
this.context = null;
|
|
@@ -229,6 +332,9 @@ export class BrowserClient extends EventEmitter {
|
|
|
229
332
|
this.emit('error', e);
|
|
230
333
|
} finally {
|
|
231
334
|
this._isCleaningUp = false;
|
|
335
|
+
// 重置 CDP 相关状态
|
|
336
|
+
this._cdpLastCheck = 0;
|
|
337
|
+
this._cdpSessionPage = null;
|
|
232
338
|
}
|
|
233
339
|
}
|
|
234
340
|
}
|
package/src/browser/collector.js
CHANGED
|
@@ -15,13 +15,17 @@ export class EnvCollector {
|
|
|
15
15
|
* @param {object} options - 采集选项
|
|
16
16
|
*/
|
|
17
17
|
async collect(path, options = {}) {
|
|
18
|
-
const { depth = 1, includeProto = false, useCache = true } = options;
|
|
18
|
+
const { depth = 1, includeProto = false, useCache = true, timeout = 5000 } = options;
|
|
19
19
|
|
|
20
20
|
if (useCache && this.cache.has(path)) {
|
|
21
21
|
return this.cache.get(path);
|
|
22
22
|
}
|
|
23
23
|
|
|
24
|
-
|
|
24
|
+
// 使用 Promise.race 添加超时保护
|
|
25
|
+
const evaluatePromise = this.page.evaluate(({ path, depth, includeProto: _includeProto }) => {
|
|
26
|
+
// 用于检测循环引用的 WeakSet
|
|
27
|
+
const seen = new WeakSet();
|
|
28
|
+
|
|
25
29
|
function getByPath(obj, path) {
|
|
26
30
|
return path.split('.').reduce((o, k) => o && o[k], obj);
|
|
27
31
|
}
|
|
@@ -40,29 +44,55 @@ export class EnvCollector {
|
|
|
40
44
|
return { type, value: val };
|
|
41
45
|
}
|
|
42
46
|
|
|
47
|
+
// 检测循环引用
|
|
48
|
+
if (seen.has(val)) {
|
|
49
|
+
return { type: 'object', value: '[Circular]', circular: true };
|
|
50
|
+
}
|
|
51
|
+
|
|
43
52
|
if (currentDepth >= maxDepth) {
|
|
44
53
|
return { type: 'object', value: '[Object]', truncated: true };
|
|
45
54
|
}
|
|
46
55
|
|
|
56
|
+
seen.add(val);
|
|
57
|
+
|
|
47
58
|
if (Array.isArray(val)) {
|
|
48
59
|
return {
|
|
49
60
|
type: 'array',
|
|
50
|
-
|
|
61
|
+
length: val.length,
|
|
62
|
+
value: val.slice(0, 20).map(v => serialize(v, currentDepth + 1, maxDepth))
|
|
51
63
|
};
|
|
52
64
|
}
|
|
53
65
|
|
|
54
66
|
const result = { type: 'object', properties: {} };
|
|
55
|
-
|
|
67
|
+
let keys;
|
|
68
|
+
try {
|
|
69
|
+
keys = Object.getOwnPropertyNames(val);
|
|
70
|
+
} catch (e) {
|
|
71
|
+
return { type: 'object', value: '[Error accessing keys]', error: e.message };
|
|
72
|
+
}
|
|
56
73
|
|
|
57
|
-
for (const key of keys.slice(0,
|
|
74
|
+
for (const key of keys.slice(0, 30)) {
|
|
58
75
|
try {
|
|
59
76
|
const desc = Object.getOwnPropertyDescriptor(val, key);
|
|
77
|
+
if (!desc) continue;
|
|
78
|
+
|
|
79
|
+
// 安全处理:避免触发有副作用的 getter
|
|
60
80
|
if (desc.get) {
|
|
81
|
+
// 对于 getter,只记录描述符信息,不执行 getter
|
|
82
|
+
result.properties[key] = {
|
|
83
|
+
type: 'getter',
|
|
84
|
+
hasGetter: true,
|
|
85
|
+
enumerable: desc.enumerable,
|
|
86
|
+
configurable: desc.configurable
|
|
87
|
+
};
|
|
88
|
+
} else if (desc.set && desc.value === undefined) {
|
|
89
|
+
// 只有 setter 没有 getter
|
|
61
90
|
result.properties[key] = {
|
|
62
|
-
|
|
63
|
-
|
|
91
|
+
type: 'setter',
|
|
92
|
+
hasSetter: true
|
|
64
93
|
};
|
|
65
94
|
} else {
|
|
95
|
+
// 普通值
|
|
66
96
|
result.properties[key] = serialize(desc.value, currentDepth + 1, maxDepth);
|
|
67
97
|
}
|
|
68
98
|
} catch (e) {
|
|
@@ -89,15 +119,19 @@ export class EnvCollector {
|
|
|
89
119
|
|
|
90
120
|
let descriptor = null;
|
|
91
121
|
if (parent) {
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
122
|
+
try {
|
|
123
|
+
const desc = Object.getOwnPropertyDescriptor(parent, propName);
|
|
124
|
+
if (desc) {
|
|
125
|
+
descriptor = {
|
|
126
|
+
configurable: desc.configurable,
|
|
127
|
+
enumerable: desc.enumerable,
|
|
128
|
+
writable: desc.writable,
|
|
129
|
+
hasGetter: !!desc.get,
|
|
130
|
+
hasSetter: !!desc.set
|
|
131
|
+
};
|
|
132
|
+
}
|
|
133
|
+
} catch (e) {
|
|
134
|
+
// 忽略描述符读取错误
|
|
101
135
|
}
|
|
102
136
|
}
|
|
103
137
|
|
|
@@ -112,7 +146,19 @@ export class EnvCollector {
|
|
|
112
146
|
}
|
|
113
147
|
}, { path, depth, includeProto });
|
|
114
148
|
|
|
115
|
-
|
|
149
|
+
// 添加超时
|
|
150
|
+
const timeoutPromise = new Promise((_, reject) =>
|
|
151
|
+
setTimeout(() => reject(new Error('采集超时')), timeout)
|
|
152
|
+
);
|
|
153
|
+
|
|
154
|
+
let result;
|
|
155
|
+
try {
|
|
156
|
+
result = await Promise.race([evaluatePromise, timeoutPromise]);
|
|
157
|
+
} catch (e) {
|
|
158
|
+
result = { success: false, error: e.message };
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
if (result?.success && useCache) {
|
|
116
162
|
this.cache.set(path, result);
|
|
117
163
|
}
|
|
118
164
|
|
|
@@ -154,9 +200,12 @@ export class EnvCollector {
|
|
|
154
200
|
* 深度采集整个对象
|
|
155
201
|
*/
|
|
156
202
|
async collectDeep(rootPath, options = {}) {
|
|
157
|
-
const { maxDepth = 3, maxProps = 100 } = options;
|
|
203
|
+
const { maxDepth = 3, maxProps = 100, timeout = 5000 } = options;
|
|
204
|
+
|
|
205
|
+
const evaluatePromise = this.page.evaluate(({ rootPath, maxDepth, maxProps }) => {
|
|
206
|
+
// 用于检测循环引用的 WeakSet
|
|
207
|
+
const seen = new WeakSet();
|
|
158
208
|
|
|
159
|
-
return await this.page.evaluate(({ rootPath, maxDepth, maxProps }) => {
|
|
160
209
|
function getByPath(obj, path) {
|
|
161
210
|
return path.split('.').reduce((o, k) => o && o[k], obj);
|
|
162
211
|
}
|
|
@@ -165,19 +214,38 @@ export class EnvCollector {
|
|
|
165
214
|
if (depth > maxDepth || collected.size > maxProps) return;
|
|
166
215
|
if (!obj || typeof obj !== 'object') return;
|
|
167
216
|
|
|
217
|
+
// 检测循环引用
|
|
218
|
+
if (seen.has(obj)) return;
|
|
219
|
+
seen.add(obj);
|
|
220
|
+
|
|
168
221
|
const keys = Object.getOwnPropertyNames(obj);
|
|
169
|
-
for (const key of keys) {
|
|
222
|
+
for (const key of keys.slice(0, 30)) {
|
|
170
223
|
if (collected.size > maxProps) break;
|
|
171
224
|
|
|
172
225
|
const fullPath = path ? `${path}.${key}` : key;
|
|
173
226
|
try {
|
|
174
|
-
const
|
|
175
|
-
|
|
227
|
+
const desc = Object.getOwnPropertyDescriptor(obj, key);
|
|
228
|
+
if (!desc) continue;
|
|
229
|
+
|
|
230
|
+
// 安全处理:避免触发有副作用的 getter
|
|
231
|
+
let val;
|
|
232
|
+
let type;
|
|
233
|
+
if (desc.get) {
|
|
234
|
+
type = 'getter';
|
|
235
|
+
val = '[Getter]';
|
|
236
|
+
} else if (desc.set && desc.value === undefined) {
|
|
237
|
+
type = 'setter';
|
|
238
|
+
val = '[Setter]';
|
|
239
|
+
} else {
|
|
240
|
+
val = desc.value;
|
|
241
|
+
type = typeof val;
|
|
242
|
+
}
|
|
176
243
|
|
|
177
244
|
collected.set(fullPath, {
|
|
178
245
|
type,
|
|
179
246
|
value: type === 'function' ? '[Function]' :
|
|
180
247
|
type === 'object' ? '[Object]' :
|
|
248
|
+
type === 'getter' || type === 'setter' ? val :
|
|
181
249
|
val
|
|
182
250
|
});
|
|
183
251
|
|
|
@@ -204,6 +272,17 @@ export class EnvCollector {
|
|
|
204
272
|
properties: Object.fromEntries(collected)
|
|
205
273
|
};
|
|
206
274
|
}, { rootPath, maxDepth, maxProps });
|
|
275
|
+
|
|
276
|
+
// 添加超时保护
|
|
277
|
+
const timeoutPromise = new Promise((_, reject) =>
|
|
278
|
+
setTimeout(() => reject(new Error('collectDeep timeout')), timeout)
|
|
279
|
+
);
|
|
280
|
+
|
|
281
|
+
try {
|
|
282
|
+
return await Promise.race([evaluatePromise, timeoutPromise]);
|
|
283
|
+
} catch (e) {
|
|
284
|
+
return { success: false, error: e.message };
|
|
285
|
+
}
|
|
207
286
|
}
|
|
208
287
|
|
|
209
288
|
// === 特殊环境采集 ===
|
|
@@ -160,7 +160,9 @@ function getCookieHook() {
|
|
|
160
160
|
return value;
|
|
161
161
|
},
|
|
162
162
|
set: function(val) {
|
|
163
|
-
|
|
163
|
+
// 解析 cookie name(cookie 格式: "name=value; expires=...; path=...")
|
|
164
|
+
const cookieName = val?.split('=')[0]?.trim();
|
|
165
|
+
deepspider.log('cookie', { action: 'write', name: cookieName, value: val });
|
|
164
166
|
return cookieDesc.set.call(document, val);
|
|
165
167
|
},
|
|
166
168
|
configurable: true
|
|
@@ -7,6 +7,7 @@
|
|
|
7
7
|
export class HookManager {
|
|
8
8
|
constructor() {
|
|
9
9
|
this.logs = [];
|
|
10
|
+
this.maxLogs = 5000;
|
|
10
11
|
this.onLog = null;
|
|
11
12
|
this.injected = false;
|
|
12
13
|
}
|
|
@@ -37,6 +38,10 @@ export class HookManager {
|
|
|
37
38
|
text,
|
|
38
39
|
timestamp: Date.now(),
|
|
39
40
|
});
|
|
41
|
+
// 超过上限时丢弃最旧的 20%
|
|
42
|
+
if (this.logs.length > this.maxLogs) {
|
|
43
|
+
this.logs = this.logs.slice(Math.floor(this.maxLogs * 0.2));
|
|
44
|
+
}
|
|
40
45
|
if (this.onLog) {
|
|
41
46
|
this.onLog({ type: msg.type(), text });
|
|
42
47
|
}
|