npm - deepspider - Versions diffs - 0.3.2 → 0.4.0 - Mend

deepspider 0.3.2 → 0.4.0

This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (31)

package/README.md +8 -2
package/package.json +4 -2
package/src/agent/core/PanelBridge.js +34 -8
package/src/agent/core/StreamHandler.js +114 -15
package/src/agent/index.js +72 -14
package/src/agent/middleware/memoryFlush.js +48 -0
package/src/agent/middleware/report.js +77 -45
package/src/agent/middleware/subagent.js +4 -1
package/src/agent/middleware/toolAvailability.js +37 -0
package/src/agent/middleware/toolGuard.js +141 -31
package/src/agent/prompts/system.js +130 -1
package/src/agent/run.js +127 -14
package/src/agent/sessions.js +88 -0
package/src/agent/skills/anti-detect/SKILL.md +89 -14
package/src/agent/skills/captcha/SKILL.md +93 -19
package/src/agent/skills/js2python/evolved.md +5 -1
package/src/agent/skills/static-analysis/evolved.md +5 -1
package/src/agent/subagents/anti-detect.js +27 -5
package/src/agent/subagents/captcha.js +28 -9
package/src/agent/subagents/crawler.js +26 -79
package/src/agent/subagents/factory.js +24 -4
package/src/agent/subagents/js2python.js +18 -16
package/src/agent/tools/analysis.js +17 -7
package/src/agent/tools/browser.js +26 -13
package/src/agent/tools/crawler.js +1 -1
package/src/agent/tools/crawlerGenerator.js +2 -2
package/src/agent/tools/index.js +3 -1
package/src/agent/tools/patch.js +1 -1
package/src/agent/tools/store.js +1 -1
package/src/browser/client.js +5 -1
package/src/browser/ui/analysisPanel.js +72 -0

package/README.md CHANGED Viewed

@@ -5,6 +5,8 @@
 > 智能爬虫工程平台 - 基于 DeepAgents + Patchright 的 AI 爬虫 Agent
+[English](README_EN.md)
 从 JS 逆向到完整爬虫脚本的一站式 AI Agent 解决方案。
 ## 特性
@@ -139,11 +141,15 @@ pnpm test
 ### 使用流程
-1. **启动**: `pnpm run agent https://target-site.com`
+1. **启动**: `deepspider https://target-site.com`
 2. **等待**: 浏览器打开，系统自动记录数据（不消耗 API）
 3. **操作**: 在网站上登录、翻页、触发目标请求
 4. **选择**: 点击面板的选择按钮 ⦿，进入选择模式
-5. **分析**: 点击目标数据，确认后发送给 Agent
+5. **分析**: 点击目标数据元素，选择快捷操作：
+   - **追踪数据来源** — 定位选中数据的 API 接口
+   - **分析加密参数** — 识别并逆向加密参数
+   - **完整分析并生成爬虫** — 端到端：逆向、验证、生成代码
+   - **提取页面结构** — 分析 DOM 结构，生成选择器和字段配置
 6. **对话**: 在面板或 CLI 继续提问，深入分析
 ## 架构

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "deepspider",
-  "version": "0.3.2",
+  "version": "0.4.0",
   "description": "智能爬虫工程平台 - 基于 DeepAgents + Patchright 的 AI 爬虫 Agent",
   "type": "module",
   "main": "src/index.js",
@@ -23,7 +23,7 @@
     "lint": "eslint src/",
     "lint:fix": "eslint src/ --fix",
     "setup:crypto": "uv venv .venv --python 3.11 2>/dev/null || true && uv pip install -r requirements-crypto.txt",
-    "postinstall": "patchright install chromium && npm rebuild isolated-vm 2>/dev/null || true",
+    "postinstall": "patchright install chromium && npm rebuild isolated-vm better-sqlite3 2>/dev/null || true",
     "prepare": "husky"
   },
   "keywords": [
@@ -57,8 +57,10 @@
     "@langchain/anthropic": "^1.3.17",
     "@langchain/core": "^1.1.24",
     "@langchain/langgraph": "^1.1.2",
+    "@langchain/langgraph-checkpoint-sqlite": "^1.0.1",
     "@langchain/openai": "^1.2.3",
     "@modelcontextprotocol/sdk": "^1.26.0",
+    "better-sqlite3": "^12.6.2",
     "crypto-js": "^4.2.0",
     "deepagents": "^1.7.6",
     "dotenv": "^17.2.3",

package/src/agent/core/PanelBridge.js CHANGED Viewed

@@ -18,10 +18,16 @@ export class PanelBridge {
     if (!cdp) return null;
     try {
-      const result = await cdp.send('Runtime.evaluate', {
-        expression: code,
-        returnByValue: true,
-      });
+      // 3s 超时：断点暂停时 Runtime.evaluate 会永远挂住，必须限时
+      const result = await Promise.race([
+        cdp.send('Runtime.evaluate', {
+          expression: code,
+          returnByValue: true,
+        }),
+        new Promise((_, reject) =>
+          setTimeout(() => reject(new Error('evaluateInPage timeout (debugger paused?)')), 3000)
+        ),
+      ]);
       return result.result?.value;
     } catch (e) {
       this.debug('evaluateInPage 失败:', e.message);
@@ -29,14 +35,34 @@ export class PanelBridge {
     }
   }
+  /**
+   * 等待面板 JS 初始化完成
+   */
+  async waitForPanel(timeoutMs = 5000) {
+    const start = Date.now();
+    while (Date.now() - start < timeoutMs) {
+      const ready = await this.evaluateInPage('!!window.__deepspider__?.addStructuredMessage');
+      if (ready) return true;
+      await new Promise(r => setTimeout(r, 200));
+    }
+    return false;
+  }
+  /**
+   * 批量发送消息到面板（单次 CDP 调用）
+   */
+  async sendBatch(messages) {
+    if (!messages?.length) return;
+    const escaped = JSON.stringify(messages);
+    await this.evaluateInPage(
+      `(function(msgs){var ds=window.__deepspider__;if(!ds)return;msgs.forEach(function(m){ds.addStructuredMessage?.(m.type,m.data);})})(${escaped})`
+    );
+  }
   /**
    * 发送结构化消息到前端面板
    */
   async sendMessage(type, data) {
-    const browser = this.getBrowser();
-    const page = browser?.getPage?.();
-    if (!page) return;
     try {
       const escapedType = JSON.stringify(type);
       const escapedData = JSON.stringify(data);

package/src/agent/core/StreamHandler.js CHANGED Viewed

@@ -13,6 +13,31 @@ function cleanDSML(text) {
   return text ? text.replace(DSML_PATTERN, '') : text;
 }
+// 流式事件停滞超时（单个事件间隔上限）
+const STALL_TIMEOUT_MS = 150000; // 150s — 超过此时间无新事件则中断流
+/**
+ * 包装异步迭代器，每个 next() 加独立超时
+ * 防止 LLM API 或 middleware 无响应时 for-await 永久挂起
+ */
+async function* withStallTimeout(asyncIterator, timeoutMs = STALL_TIMEOUT_MS) {
+  while (true) {
+    let timer;
+    const result = await Promise.race([
+      asyncIterator.next(),
+      new Promise((_, reject) => {
+        timer = setTimeout(
+          () => reject(new Error(`Stream timeout: no events for ${Math.round(timeoutMs / 1000)}s`)),
+          timeoutMs,
+        );
+      }),
+    ]);
+    clearTimeout(timer);
+    if (result.done) break;
+    yield result.value;
+  }
+}
 // 人工介入配置
 const INTERVENTION_CONFIG = {
   idleTimeoutMs: 120000,  // 2分钟无响应触发提示
@@ -63,7 +88,7 @@ export class StreamHandler {
       );
       this.debug('chatStream: 开始遍历事件');
-      for await (const event of eventStream) {
+      for await (const event of withStallTimeout(eventStream)) {
         lastEventTime = Date.now();
         eventCount++;
@@ -73,10 +98,12 @@ export class StreamHandler {
         await this._handleStreamEvent(event);
-        if (event.event === 'on_chat_model_end' && event.name === 'ChatOpenAI') {
+        if (event.event === 'on_chat_model_end') {
           const output = event.data?.output;
           if (output?.content) {
-            finalResponse = output.content;
+            finalResponse = typeof output.content === 'string'
+              ? output.content
+              : output.content.filter(c => c.type === 'text').map(c => c.text).join('');
             this.debug(`chatStream: 收到最终响应, 长度=${finalResponse.length}`);
           }
         }
@@ -86,10 +113,15 @@ export class StreamHandler {
       console.log(`\n[完成] 共处理 ${eventCount} 个事件`);
       // 发送剩余累积文本
-      await this._flushFullResponse();
+      const flushed = await this._flushFullResponse();
       // 检测 interrupt 并渲染到面板
-      await this._checkAndRenderInterrupt();
+      const hasInterrupt = await this._checkAndRenderInterrupt();
+      // 兜底：如果没有文本输出也没有 interrupt，发送完成通知
+      if (!flushed && !hasInterrupt && eventCount > 0 && lastToolCall) {
+        await this.panelBridge.sendToPanel('system', '✅ 任务完成');
+      }
       await this.panelBridge.setBusy(false);
@@ -126,23 +158,31 @@ export class StreamHandler {
         { ...this.config, version: 'v2' }
       );
-      for await (const event of eventStream) {
+      for await (const event of withStallTimeout(eventStream)) {
         lastEventTime = Date.now();
         eventCount++;
         await this._handleStreamEvent(event);
-        if (event.event === 'on_chat_model_end' && event.name === 'ChatOpenAI') {
+        if (event.event === 'on_chat_model_end') {
           const output = event.data?.output;
           if (output?.content) {
-            finalResponse = output.content;
+            finalResponse = typeof output.content === 'string'
+              ? output.content
+              : output.content.filter(c => c.type === 'text').map(c => c.text).join('');
           }
         }
       }
       clearInterval(heartbeat);
-      await this._flushFullResponse();
-      await this._checkAndRenderInterrupt();
+      const flushed = await this._flushFullResponse();
+      const hasInterrupt = await this._checkAndRenderInterrupt();
+      // 兜底：如果没有文本输出也没有 interrupt，发送完成通知
+      if (!flushed && !hasInterrupt && eventCount > 0) {
+        await this.panelBridge.sendToPanel('system', '✅ 任务完成');
+      }
       await this.panelBridge.setBusy(false);
       console.log(`\n[恢复完成] 共处理 ${eventCount} 个事件`);
@@ -168,6 +208,23 @@ export class StreamHandler {
     await this.panelBridge.setBusy(true);
     this.debug(`chatStreamResume: 从检查点恢复, retryCount=${retryCount}`);
+    // 恢复前：检查 checkpoint 是否有实际消息
+    if (retryCount === 0) {
+      try {
+        const state = await this.agent.getState(this.config);
+        const messages = state?.values?.messages;
+        if (!messages?.length) {
+          console.log('[恢复] checkpoint 无历史消息，跳过恢复');
+          await this.panelBridge.sendToPanel('system', '该会话无历史记录，请重新开始分析');
+          await this.panelBridge.setBusy(false);
+          return '[无历史消息]';
+        }
+        await this._restoreHistoryToPanel(messages);
+      } catch (e) {
+        this.debug('chatStreamResume: getState 失败:', e.message);
+      }
+    }
     const heartbeat = setInterval(() => {
       const elapsed = Math.round((Date.now() - lastEventTime) / 1000);
       if (elapsed > 30) {
@@ -181,23 +238,30 @@ export class StreamHandler {
         { ...this.config, version: 'v2' }
       );
-      for await (const event of eventStream) {
+      for await (const event of withStallTimeout(eventStream)) {
         lastEventTime = Date.now();
         eventCount++;
         await this._handleStreamEvent(event);
-        if (event.event === 'on_chat_model_end' && event.name === 'ChatOpenAI') {
+        if (event.event === 'on_chat_model_end') {
           const output = event.data?.output;
           if (output?.content) {
-            finalResponse = output.content;
+            finalResponse = typeof output.content === 'string'
+              ? output.content
+              : output.content.filter(c => c.type === 'text').map(c => c.text).join('');
           }
         }
       }
       clearInterval(heartbeat);
-      await this._flushFullResponse();
-      await this._checkAndRenderInterrupt();
+      const flushed2 = await this._flushFullResponse();
+      const hasInterrupt2 = await this._checkAndRenderInterrupt();
+      if (!flushed2 && !hasInterrupt2 && eventCount > 0) {
+        await this.panelBridge.sendToPanel('system', '✅ 任务完成');
+      }
       await this.panelBridge.setBusy(false);
       console.log(`\n[恢复完成] 共处理 ${eventCount} 个事件`);
@@ -219,14 +283,49 @@ export class StreamHandler {
     }
   }
+  /**
+   * 从 checkpoint 恢复历史消息到前端面板
+   */
+  async _restoreHistoryToPanel(messages) {
+    try {
+      if (!messages?.length) return;
+      this.debug(`_restoreHistoryToPanel: ${messages.length} 条历史消息`);
+      const batch = [];
+      for (const msg of messages) {
+        const type = msg._getType?.() || msg.constructor?.name;
+        const content = Array.isArray(msg.content)
+          ? msg.content.filter(c => c.type === 'text').map(c => c.text).join('')
+          : (typeof msg.content === 'string' ? msg.content : '');
+        if (!content.trim()) continue;
+        if (type === 'human') {
+          batch.push({ type: 'user', data: { content } });
+        } else if (type === 'ai') {
+          batch.push({ type: 'text', data: { content } });
+        } else if (type === 'tool') {
+          const summary = content.length > 200 ? content.slice(0, 200) + '...' : content;
+          batch.push({ type: 'system', data: { content: `[工具结果] ${summary}` } });
+        }
+      }
+      await this.panelBridge.sendBatch(batch);
+    } catch (e) {
+      this.debug('_restoreHistoryToPanel 失败:', e.message);
+    }
+  }
   /**
    * 发送剩余累积文本到面板
+   * 返回 true 如果有文本被发送
    */
   async _flushFullResponse() {
     if (this.fullResponse?.trim()) {
       await this.panelBridge.sendToPanel('assistant', this.fullResponse);
+      this.fullResponse = '';
+      return true;
     }
     this.fullResponse = '';
+    return false;
   }
   /**

package/src/agent/index.js CHANGED Viewed

@@ -7,8 +7,8 @@
 import 'dotenv/config';
 import { StateBackend, FilesystemBackend, createFilesystemMiddleware, createPatchToolCallsMiddleware } from 'deepagents';
 import { createAgent, toolRetryMiddleware, summarizationMiddleware, anthropicPromptCachingMiddleware, todoListMiddleware, humanInTheLoopMiddleware } from 'langchain';
-import { ChatOpenAI } from '@langchain/openai';
-import { MemorySaver } from '@langchain/langgraph';
+import { ChatAnthropic } from '@langchain/anthropic';
+import { SqliteSaver } from '@langchain/langgraph-checkpoint-sqlite';
 import { coreTools } from './tools/index.js';
 import { allSubagents } from './subagents/index.js';
@@ -17,7 +17,10 @@ import { createReportMiddleware } from './middleware/report.js';
 import { createFilterToolsMiddleware } from './middleware/filterTools.js';
 import { createCustomSubAgentMiddleware } from './middleware/subagent.js';
 import { createToolGuardMiddleware } from './middleware/toolGuard.js';
+import { createToolCallLimitMiddleware } from './subagents/factory.js';
 import { createValidationWorkflowMiddleware } from './middleware/validationWorkflow.js';
+import { createMemoryFlushMiddleware } from './middleware/memoryFlush.js';
+import { createToolAvailabilityMiddleware } from './middleware/toolAvailability.js';
 // createDeepAgent 内部拼接的 BASE_PROMPT
 const BASE_PROMPT = 'In order to complete the objective that the user asks of you, you have access to a number of standard tools.';
@@ -29,9 +32,48 @@ const config = {
   model: process.env.DEEPSPIDER_MODEL || 'gpt-4o',
 };
+/**
+ * 递归移除 JSON Schema 中 Anthropic API 不支持的关键字
+ * Zod v4 的 toJSONSchema 会生成 $schema 和 propertyNames，Anthropic 拒绝
+ * additionalProperties: {} 空对象也不被接受，改成 true
+ */
+function stripUnsupportedSchemaKeys(obj) {
+  if (!obj || typeof obj !== 'object') return obj;
+  if (Array.isArray(obj)) return obj.map(stripUnsupportedSchemaKeys);
+  const res = {};
+  for (const k in obj) {
+    if (k === '$schema' || k === 'propertyNames') continue;
+    // additionalProperties: {} → true (空对象等于"任意类型"，但Anthropic不接受空对象)
+    if (k === 'additionalProperties' && obj[k] !== null && typeof obj[k] === 'object' && Object.keys(obj[k]).length === 0) {
+      res[k] = true;
+      continue;
+    }
+    res[k] = stripUnsupportedSchemaKeys(obj[k]);
+  }
+  return res;
+}
+/**
+ * 自定义 fetch：拦截 LLM API 请求，strip 工具 schema 中 Zod v4 生成的不兼容字段
+ * 保留作为安全网，防止 $schema / propertyNames / additionalProperties:{} 泄漏到 API
+ */
+const _origFetch = globalThis.fetch;
+globalThis.fetch = async function(url, opts) {
+  if (opts?.body && typeof opts.body === 'string' && opts.body.includes('"tools"')) {
+    try {
+      const body = JSON.parse(opts.body);
+      if (body.tools) {
+        body.tools = stripUnsupportedSchemaKeys(body.tools);
+        opts = { ...opts, body: JSON.stringify(body) };
+      }
+    } catch { /* ignore parse errors on non-LLM requests */ }
+  }
+  return _origFetch(url, opts);
+};
 /**
  * 创建 LLM 模型实例
- * 使用 ChatOpenAI 兼容 OpenAI 格式的任意供应商
+ * 使用 ChatAnthropic 发送原生 Anthropic 格式，避免代理的 OpenAI→Anthropic 转换引入 schema 错误
  */
 function createModel(options = {}) {
   const {
@@ -40,10 +82,13 @@ function createModel(options = {}) {
     baseUrl = config.baseUrl,
   } = options;
-  return new ChatOpenAI({
+  // ChatAnthropic 的 baseURL 不含 /v1（SDK 自动拼接）
+  const anthropicBaseUrl = baseUrl?.replace(/\/v1\/?$/, '') || undefined;
+  return new ChatAnthropic({
     model,
-    apiKey,
-    configuration: baseUrl ? { baseURL: baseUrl } : undefined,
+    anthropicApiKey: apiKey,
+    anthropicApiUrl: anthropicBaseUrl,
     temperature: 0,
   });
 }
@@ -59,18 +104,27 @@ export function createDeepSpiderAgent(options = {}) {
     enableMemory = true,
     enableInterrupt = false,
     onReportReady = null,  // 报告就绪回调
+    onFileSaved = null,    // 文件保存通知回调
+    checkpointer,
   } = options;
-  // 创建 LLM 模型实例
+  // 创建 LLM 模型实例（加 timeout 防止 API 无响应时 streamEvents 永久挂起）
   const llm = createModel({ model, apiKey, baseUrl });
+  llm.timeout = 120000; // 120s — 主 LLM 超时
+  // 摘要专用 LLM：故意不设 timeout
+  // 原因：summarizationMiddleware 的 createSummary 有 try-catch，超时会返回错误字符串，
+  // 但 beforeModel 仍会用这个错误字符串替换所有原始消息（REMOVE_ALL_MESSAGES），导致数据丢失。
+  // 安全网由 StreamHandler.withStallTimeout (150s) 提供 — 它在 BeforeModelNode 完成前触发，
+  // 不会写入 checkpoint，原始数据得以保留。
+  const summaryLlm = createModel({ model, apiKey, baseUrl });
   // 后端配置：使用文件系统持久化
   const backend = enableMemory
     ? new FilesystemBackend({ rootDir: './.deepspider-agent' })
     : new StateBackend();
-  // Checkpointer：保存对话状态，支持断点恢复
-  const checkpointer = new MemorySaver();
+  const resolvedCheckpointer = checkpointer ?? SqliteSaver.fromConnString(':memory:');
   // 人机交互配置
   const interruptOn = enableInterrupt
@@ -84,7 +138,7 @@ export function createDeepSpiderAgent(options = {}) {
   const subagentDefaultMiddleware = [
     todoListMiddleware(),
     createFilesystemMiddleware({ backend }),
-    summarizationMiddleware({ model: llm, trigger: { tokens: 170000 }, keep: { messages: 6 } }),
+    summarizationMiddleware({ model: summaryLlm, trigger: { tokens: 100000 }, keep: { messages: 6 } }),
     anthropicPromptCachingMiddleware({ unsupportedModelBehavior: 'ignore' }),
     createPatchToolCallsMiddleware(),
   ];
@@ -107,7 +161,10 @@ export function createDeepSpiderAgent(options = {}) {
         generalPurposeAgent: false,
         defaultInterruptOn: interruptOn,
       }),
-      summarizationMiddleware({ model: llm, trigger: { tokens: 170000 }, keep: { messages: 6 } }),
+      // === 预警 + 拦截（在 summarization 之前）===
+      createMemoryFlushMiddleware(),
+      createToolAvailabilityMiddleware(),
+      summarizationMiddleware({ model: summaryLlm, trigger: { tokens: 100000 }, keep: { messages: 6 } }),
       anthropicPromptCachingMiddleware({ unsupportedModelBehavior: 'ignore' }),
       createPatchToolCallsMiddleware(),
       // === HITL（如果启用）===
@@ -122,15 +179,16 @@ export function createDeepSpiderAgent(options = {}) {
         },
       }),
       createToolGuardMiddleware(),
+      createToolCallLimitMiddleware(200),
       createFilterToolsMiddleware(),
       createValidationWorkflowMiddleware(),
-      createReportMiddleware({ onReportReady }),
+      createReportMiddleware({ onReportReady, onFileSaved }),
     ],
-    checkpointer,
+    checkpointer: resolvedCheckpointer,
   });
 }
-// 默认导出
+// 默认导出（内存模式，兼容 MCP 等非 CLI 场景）
 export const agent = createDeepSpiderAgent();
 export default agent;

package/src/agent/middleware/memoryFlush.js ADDED Viewed

@@ -0,0 +1,48 @@
+/**
+ * DeepSpider - Memory Flush 中间件
+ * 在 summarization 触发前（85k token），注入 SystemMessage 提醒 Agent 保存关键进度
+ */
+import { createMiddleware, countTokensApproximately } from 'langchain';
+import { SystemMessage } from '@langchain/core/messages';
+const FLUSH_THRESHOLD = 85000;
+const FLUSH_REMINDER = `⚠️ 上下文即将被压缩（当前接近 token 上限）。
+请立即使用 save_memo 工具保存以下关键信息，否则压缩后将丢失：
+1. 当前分析目标和已完成的步骤
+2. 已发现的关键参数、加密逻辑、请求链路
+3. 下一步计划
+保存后继续正常工作。`;
+export function createMemoryFlushMiddleware() {
+  let flushed = false;
+  return createMiddleware({
+    name: 'memoryFlushMiddleware',
+    beforeModel: async (state) => {
+      const tokens = countTokensApproximately(state.messages);
+      // token 骤降（summarization 已执行），重置标记
+      if (flushed && tokens < FLUSH_THRESHOLD * 0.5) {
+        flushed = false;
+      }
+      // 达到阈值且未提醒过，注入提醒
+      if (!flushed && tokens >= FLUSH_THRESHOLD) {
+        flushed = true;
+        return {
+          ...state,
+          messages: [
+            ...state.messages,
+            new SystemMessage(FLUSH_REMINDER),
+          ],
+        };
+      }
+      return state;
+    },
+  });
+}