agentpage 0.0.12 → 0.0.15

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -6,9 +6,11 @@ import { Type } from "@sinclair/typebox";
6
6
  *
7
7
  * 统一集中在该文件,避免在主循环中散落“魔法数字”。
8
8
  */
9
- const DEFAULT_MAX_ROUNDS = 10;
9
+ const DEFAULT_MAX_ROUNDS = 40;
10
10
  const DEFAULT_RECOVERY_WAIT_MS = 100;
11
- const DEFAULT_ACTION_RECOVERY_ROUNDS = 5;
11
+ const DEFAULT_ACTION_RECOVERY_ROUNDS = 2;
12
+ const DEFAULT_NOT_FOUND_RETRY_ROUNDS = 2;
13
+ const DEFAULT_NOT_FOUND_RETRY_WAIT_MS = 2e3;
12
14
  /** 快照起始标记 — 用于在消息中识别快照边界 */
13
15
  const SNAPSHOT_START = "<!-- SNAPSHOT_START -->";
14
16
  /** 快照结束标记 */
@@ -18,15 +20,15 @@ const SNAPSHOT_OUTDATED = "[此快照已过期,请参考对话中最新的快
18
20
 
19
21
  //#endregion
20
22
  //#region src/core/agent-loop/helpers.ts
21
- /** 异步睡眠,确保恢复重试按顺序串行执行。 */
23
+ /** 异步睡眠(中)/ Async sleep utility (EN). */
22
24
  function sleep$1(ms) {
23
25
  return new Promise((resolve) => setTimeout(resolve, ms));
24
26
  }
25
- /** 将工具返回内容统一转为字符串,便于拼接进消息。 */
27
+ /** 统一内容为字符串(中)/ Normalize tool content to string (EN). */
26
28
  function toContentString(content) {
27
29
  return typeof content === "string" ? content : JSON.stringify(content, null, 2);
28
30
  }
29
- /** 判定工具失败是否属于“元素不存在”,用于触发快照恢复。 */
31
+ /** 元素不存在判定(中)/ Detect element-not-found failure (EN). */
30
32
  function isElementNotFoundResult(result) {
31
33
  const details = result.details;
32
34
  if (details && typeof details === "object") {
@@ -35,15 +37,14 @@ function isElementNotFoundResult(result) {
35
37
  const content = toContentString(result.content);
36
38
  return content.includes("未找到") && content.includes("元素");
37
39
  }
38
- /** 为同一动作构造稳定 key,用于统计恢复重试次数。 */
40
+ /** 生成稳定调用键(中)/ Build stable key for a tool call (EN). */
39
41
  function buildToolCallKey(name, input) {
40
42
  return `${name}:${JSON.stringify(input)}`;
41
43
  }
42
44
  /**
43
- * 解析恢复等待时长:
44
- * - 优先 `waitMs`
45
- * - 其次 `waitSeconds`
46
- * - 最后回退默认值
45
+ * 解析恢复等待时长(中)/ Resolve recovery wait duration (EN).
46
+ * 优先级:waitMs > waitSeconds > 默认值。
47
+ * Priority: waitMs > waitSeconds > default value.
47
48
  */
48
49
  function resolveRecoveryWaitMs(input) {
49
50
  if (!input || typeof input !== "object") return DEFAULT_RECOVERY_WAIT_MS;
@@ -54,74 +55,93 @@ function resolveRecoveryWaitMs(input) {
54
55
  if (typeof waitSeconds === "number" && Number.isFinite(waitSeconds)) return Math.max(0, Math.floor(waitSeconds * 1e3));
55
56
  return DEFAULT_RECOVERY_WAIT_MS;
56
57
  }
57
- /** 将工具输入压缩成简短文本,用于轨迹展示。 */
58
- function formatToolInputBrief(input) {
59
- if (!input || typeof input !== "object") return "";
60
- const params = input;
61
- const parts = [];
62
- for (const key of [
63
- "action",
64
- "selector",
65
- "waitMs",
66
- "waitSeconds",
67
- "url",
68
- "text"
69
- ]) {
70
- const value = params[key];
71
- if (value === void 0 || value === null) continue;
72
- if (typeof value === "string") parts.push(`${key}=${JSON.stringify(value).slice(0, 80)}`);
73
- else if (typeof value === "number" || typeof value === "boolean") parts.push(`${key}=${String(value)}`);
74
- }
75
- if (parts.length === 0) return "";
76
- return ` (${parts.join(", ")})`;
77
- }
78
- /** 从工具参数中读取 action。 */
58
+ /** 读取工具 action(中)/ Read tool action from input (EN). */
79
59
  function getToolAction(input) {
80
60
  if (!input || typeof input !== "object") return void 0;
81
61
  const action = input.action;
82
62
  return typeof action === "string" ? action : void 0;
83
63
  }
84
- /** 判定工具结果是否标记 error */
64
+ /** 判定错误标记(中)/ Check whether result is marked as error (EN). */
85
65
  function hasToolError(result) {
86
66
  return result.details && typeof result.details === "object" ? Boolean(result.details.error) : false;
87
67
  }
88
- /** 读取当前页面 URL(通过 page_info 工具)。 */
89
- async function readPageUrl(registry) {
90
- const result = await registry.dispatch("page_info", { action: "get_url" });
91
- return typeof result.content === "string" ? result.content : void 0;
92
- }
93
- /** 读取当前页面快照(通过 page_info 工具)。 */
94
- async function readPageSnapshot(registry, maxDepth = 8) {
68
+
69
+ //#endregion
70
+ //#region src/core/agent-loop/snapshot.ts
71
+ /**
72
+ * 读取页面快照(中)/ Read current page snapshot (EN).
73
+ *
74
+ * 默认关闭 viewportOnly,优先完整性。
75
+ * viewportOnly defaults to false to prioritize completeness.
76
+ */
77
+ async function readPageSnapshot(registry, options) {
95
78
  return toContentString((await registry.dispatch("page_info", {
96
79
  action: "snapshot",
97
- maxDepth
80
+ maxDepth: options?.maxDepth ?? 8,
81
+ viewportOnly: options?.viewportOnly ?? false,
82
+ pruneLayout: options?.pruneLayout ?? true,
83
+ maxNodes: options?.maxNodes ?? 500,
84
+ maxChildren: options?.maxChildren ?? 30,
85
+ maxTextLength: options?.maxTextLength ?? 40
98
86
  })).content);
99
87
  }
100
- /** 转义正则特殊字符 */
88
+ /** 包裹快照(中)/ Wrap snapshot with boundary markers (EN). */
89
+ function wrapSnapshot(snapshot) {
90
+ return `${SNAPSHOT_START}\n${snapshot}\n${SNAPSHOT_END}`;
91
+ }
92
+ /** 转义正则字符(中)/ Escape regex special chars (EN). */
101
93
  function escapeRegex(str) {
102
94
  return str.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
103
95
  }
104
- /** 匹配快照标记对及其内容的正则 */
96
+ /** 快照块匹配正则(中)/ Regex for snapshot blocks (EN). */
105
97
  const SNAPSHOT_REGEX = new RegExp(`${escapeRegex(SNAPSHOT_START)}[\\s\\S]*?${escapeRegex(SNAPSHOT_END)}`, "g");
106
- /** 用标记包裹快照内容,便于后续去重识别。 */
107
- function wrapSnapshot(snapshot) {
108
- return `${SNAPSHOT_START}\n${snapshot}\n${SNAPSHOT_END}`;
109
- }
110
- /** 检测文本中是否包含快照标记。 */
98
+ /** 是否包含快照标记(中)/ Check whether text includes snapshot markers (EN). */
111
99
  function containsSnapshot(text) {
112
100
  return text.includes(SNAPSHOT_START);
113
101
  }
114
102
  /**
115
- * system prompt 中剥离已过期的快照内容。
116
- * 当消息历史中已有更新的快照时调用,避免 AI 参考过时信息。
103
+ * 剥离旧快照(中)/ Strip outdated snapshot blocks from system prompt (EN).
117
104
  */
118
105
  function stripSnapshotFromPrompt(prompt) {
119
106
  if (!containsSnapshot(prompt)) return prompt;
120
107
  return prompt.replace(SNAPSHOT_REGEX, SNAPSHOT_OUTDATED);
121
108
  }
109
+
110
+ //#endregion
111
+ //#region src/core/agent-loop/messages.ts
122
112
  /**
123
- * 格式化工具结果为简短一行摘要。
124
- * 成功操作保留首行描述;失败操作标注错误代码。
113
+ * 显式 UI 意图判定(中)/ Detect explicit intent to operate AutoPilot UI (EN).
114
+ */
115
+ function isExplicitAgentUiRequest(userMessage) {
116
+ const lower = userMessage.toLowerCase();
117
+ const compact = lower.replace(/[\s\p{P}\p{S}]+/gu, "");
118
+ const hasAgentUiKeyword = /(chat|dock|chatinput|sendbutton|shortcut|quicktest)/i.test(lower) || /(聊天|对话|指令输入框|消息输入框|输入框|发送按钮|发送|快捷测试|测试按钮|聊天面板)/.test(compact);
119
+ const hasActionVerb = /(press|click|type|fill|send|input|submit|enter)/i.test(lower) || /(输入|点击|发送|填写|填入|操作|提交|回车|按下)/.test(compact);
120
+ return hasAgentUiKeyword && hasActionVerb;
121
+ }
122
+ /** 输入摘要(中)/ Build brief text for tool input (EN). */
123
+ function formatToolInputBrief(input) {
124
+ if (!input || typeof input !== "object") return "";
125
+ const params = input;
126
+ const parts = [];
127
+ for (const key of [
128
+ "action",
129
+ "selector",
130
+ "waitMs",
131
+ "waitSeconds",
132
+ "url",
133
+ "text"
134
+ ]) {
135
+ const value = params[key];
136
+ if (value === void 0 || value === null) continue;
137
+ if (typeof value === "string") parts.push(`${key}=${JSON.stringify(value).slice(0, 80)}`);
138
+ else if (typeof value === "number" || typeof value === "boolean") parts.push(`${key}=${String(value)}`);
139
+ }
140
+ if (parts.length === 0) return "";
141
+ return ` (${parts.join(", ")})`;
142
+ }
143
+ /**
144
+ * 结果摘要(中)/ Build one-line summary for tool result (EN).
125
145
  */
126
146
  function formatToolResultBrief(result) {
127
147
  const firstLine = toContentString(result.content).split("\n").find((l) => l.trim())?.trim().slice(0, 80) ?? "";
@@ -132,25 +152,36 @@ function formatToolResultBrief(result) {
132
152
  return `✓ ${firstLine}`;
133
153
  }
134
154
  /**
135
- * 构建发送给 AI 的紧凑消息数组。
136
- *
137
- * 核心思路:保留用户原始消息与 system prompt 不变,
138
- * 只将循环中产出的 assistant(含 toolCalls)+ tool(结果)消息对
139
- * 压缩为一条 assistant 摘要 + 一条 user 上下文。
155
+ * 构建紧凑消息数组(中)/ Build compact AI message array (EN).
140
156
  *
141
- * 消息结构:
142
- * - 首轮:[...history, { user: 原始消息 }]
143
- * - 后续:[...history, { user: 原始消息 }, { assistant: 工具执行摘要 }, { user: 当前状态+快照 }]
157
+ * Round 0: task + snapshot.
158
+ * Round 1+: master goal + done steps + execution context + latest snapshot.
144
159
  *
145
- * 固定最多 history.length + 3 条消息,不随轮次增长。
160
+ * 新增渐进式语义(中)/ Progressive semantics (EN):
161
+ * - `remainingInstruction`:当前轮次仍待执行的文本。
162
+ * - `previousRoundTasks`:上一轮已执行的任务数组,避免重复计划。
163
+ * - 消息中要求模型输出 `REMAINING: ...` 或 `REMAINING: DONE`,供下一轮继续消费。
146
164
  */
147
- function buildCompactMessages(userMessage, trace, latestSnapshot, currentUrl, history) {
165
+ function buildCompactMessages(userMessage, trace, latestSnapshot, currentUrl, history, remainingInstruction, previousRoundTasks) {
148
166
  const messages = history ? [...history] : [];
149
- messages.push({
150
- role: "user",
151
- content: userMessage
152
- });
153
- if (trace.length === 0) return messages;
167
+ const allowAgentUiInteraction = isExplicitAgentUiRequest(userMessage);
168
+ const activeInstruction = remainingInstruction && remainingInstruction.trim() ? remainingInstruction.trim() : userMessage;
169
+ if (trace.length === 0) {
170
+ const parts = [
171
+ userMessage,
172
+ "",
173
+ "## Progressive execution state",
174
+ "Current remaining instruction to execute this round:",
175
+ activeInstruction
176
+ ];
177
+ if (currentUrl) parts.push("", `URL: ${currentUrl}`);
178
+ if (latestSnapshot) parts.push("", "## Current page snapshot", "Apply task-reduction model directly from this snapshot. Do NOT restate the task.", "Use hash IDs (e.g. #a1b2c) from the snapshot as selector params.", "Do NOT call page_info (get_url/get_title/query_all/snapshot).", "Batch independent visible actions in one round.", "If action changes DOM (open modal/navigate), stop that batch and continue next round.", "For dropdown/select fields, use dom with action=select_option (or fill on a select).", allowAgentUiInteraction ? "User explicitly asked to operate AutoPilot UI. You may interact with chat input/send/dock only as requested." : "Do NOT interact with any AI chat UI elements (chat input, send button, dock). Only operate on the actual page content.", "Output one line: REMAINING: <new remaining task after this round> or REMAINING: DONE", wrapSnapshot(latestSnapshot));
179
+ messages.push({
180
+ role: "user",
181
+ content: parts.join("\n")
182
+ });
183
+ return messages;
184
+ }
154
185
  const traceParts = [];
155
186
  for (let i = 0; i < trace.length; i++) {
156
187
  const entry = trace[i];
@@ -158,23 +189,40 @@ function buildCompactMessages(userMessage, trace, latestSnapshot, currentUrl, hi
158
189
  const brief = formatToolResultBrief(entry.result);
159
190
  const status = isError ? "❌" : "✅";
160
191
  const marker = entry.marker ? ` ${entry.marker}` : "";
161
- traceParts.push(`${status} 步骤${i + 1}: ${entry.name}${formatToolInputBrief(entry.input)} → ${brief}${marker}`);
192
+ traceParts.push(`${status} ${i + 1}. ${entry.name}${formatToolInputBrief(entry.input)} → ${brief}${marker}`);
162
193
  }
163
194
  messages.push({
164
195
  role: "assistant",
165
- content: `## 已完成的操作步骤(以下步骤已执行,请勿重复)\n\n${traceParts.join("\n")}`
196
+ content: `Done steps (do NOT repeat):\n${traceParts.join("\n")}`
166
197
  });
167
- const contextParts = ["以上步骤已经执行完毕。请结合用户的原始请求、已完成的步骤和下方的当前页面快照,判断下一步该做什么。", "**注意:不要重复已成功(✅)的操作,只执行尚未完成的下一步。**"];
198
+ const hasErrors = trace.some((e) => hasToolError(e.result));
199
+ const contextParts = [
200
+ "## Execution context",
201
+ "Current remaining instruction:",
202
+ activeInstruction,
203
+ "",
204
+ "Task-reduction model:",
205
+ "Input: current remaining instruction + previous round executed actions + this-round actions.",
206
+ "Output: new remaining instruction after removing this-round actions.",
207
+ "Start from visible page state directly. Do NOT restate task. Do NOT output planning text.",
208
+ "Execute all independent visible sub-tasks in one round.",
209
+ "Do NOT act on elements not present in this snapshot yet.",
210
+ "If action changes DOM (open modal/navigate), stop after that batch and continue next round.",
211
+ "Do NOT call page_info (get_url/get_title/query_all/snapshot).",
212
+ "For dropdown/select fields, use dom with action=select_option (or fill on a select).",
213
+ allowAgentUiInteraction ? "User explicitly asked to operate AutoPilot UI. You may interact with chat input/send/dock only as requested." : "Do NOT interact with any AI chat UI elements (chat input, send button, dock). Only operate on the actual page content."
214
+ ];
215
+ if (hasErrors) contextParts.push("", "The last step failed. Retry with a different approach, or skip and continue with other visible targets.");
216
+ else contextParts.push("", "If the goal is fully done, reply with a short summary (no tool calls).");
217
+ if (previousRoundTasks && previousRoundTasks.length > 0) contextParts.push("", "Previous round planned task array (already executed):", ...previousRoundTasks.map((task, index) => `${index + 1}. ${task}`));
218
+ contextParts.push("", "After this round, include one plain text line:", "REMAINING: <new remaining instruction after this-round actions>", "or REMAINING: DONE");
168
219
  const lastEntry = trace[trace.length - 1];
169
220
  if (hasToolError(lastEntry.result)) {
170
221
  const stripped = toContentString(lastEntry.result.content).replace(SNAPSHOT_REGEX, "").trim();
171
- if (stripped && stripped.length < 500) {
172
- contextParts.push("", "### 最近失败操作详情", stripped);
173
- contextParts.push("请换一种方式完成该步骤,或跳过该步骤继续后续操作。");
174
- }
222
+ if (stripped && stripped.length < 300) contextParts.push("", "Last error: " + stripped);
175
223
  }
176
- if (currentUrl) contextParts.push("", `当前页面:${currentUrl}`);
177
- if (latestSnapshot) contextParts.push("", "## 当前页面 DOM 快照(这是页面的真实当前状态)", wrapSnapshot(latestSnapshot));
224
+ if (currentUrl) contextParts.push("", `URL: ${currentUrl}`);
225
+ if (latestSnapshot) contextParts.push("", "## Latest DOM snapshot", "Use hash IDs from this snapshot. Do NOT call page_info — this is already the latest.", wrapSnapshot(latestSnapshot));
178
226
  messages.push({
179
227
  role: "user",
180
228
  content: contextParts.join("\n")
@@ -182,39 +230,339 @@ function buildCompactMessages(userMessage, trace, latestSnapshot, currentUrl, hi
182
230
  return messages;
183
231
  }
184
232
 
233
+ //#endregion
234
+ //#region src/core/agent-loop/recovery.ts
235
+ /** 冗余 page_info 动作(中)/ Redundant page_info actions to intercept (EN). */
236
+ const REDUNDANT_PAGE_INFO_ACTIONS = new Set([
237
+ "snapshot",
238
+ "query_all",
239
+ "get_url",
240
+ "get_title",
241
+ "get_viewport"
242
+ ]);
243
+ /**
244
+ * 冗余 page_info 检查(中)/ Check whether page_info call is redundant (EN).
245
+ */
246
+ function checkRedundantSnapshot(toolName, toolInput, _latestSnapshot, round) {
247
+ if (toolName !== "page_info") return null;
248
+ const action = getToolAction(toolInput);
249
+ if (action && REDUNDANT_PAGE_INFO_ACTIONS.has(action)) return {
250
+ content: `page_info.${action} is blocked in loop execution. A snapshot is provided by the framework; continue with actionable tools directly.`,
251
+ details: {
252
+ code: "REDUNDANT_PAGE_INFO_SKIPPED",
253
+ action,
254
+ round
255
+ }
256
+ };
257
+ return null;
258
+ }
259
+ /**
260
+ * 快照防抖(中)/ Debounce repeated snapshot calls (EN).
261
+ */
262
+ function applySnapshotDebounce(toolName, toolInput, result, consecutiveCount) {
263
+ if (toolName === "page_info" && getToolAction(toolInput) === "snapshot") {
264
+ const newCount = consecutiveCount + 1;
265
+ if (newCount >= 2) return {
266
+ consecutiveCount: newCount,
267
+ result: {
268
+ content: [toContentString(result.content), "Redundant snapshot detected. Continue with remaining actionable steps using the latest snapshot; avoid additional snapshot unless navigation or uncertainty changes."].join("\n"),
269
+ details: {
270
+ error: true,
271
+ code: "REDUNDANT_SNAPSHOT",
272
+ consecutiveSnapshotCalls: newCount
273
+ }
274
+ }
275
+ };
276
+ return {
277
+ result,
278
+ consecutiveCount: newCount
279
+ };
280
+ }
281
+ return {
282
+ result,
283
+ consecutiveCount: 0
284
+ };
285
+ }
286
+ /**
287
+ * 元素未找到恢复(中)/ Recover from element-not-found failures (EN).
288
+ *
289
+ * 前两次自动恢复,超过上限后返回终止提示。
290
+ * Auto-recovers for initial attempts, then returns max-recovery signal.
291
+ */
292
+ async function handleElementRecovery(toolName, toolInput, result, recoveryAttempts, registry, pageContext, callbacks) {
293
+ if (toolName !== "dom" || !isElementNotFoundResult(result)) return null;
294
+ const key = buildToolCallKey(toolName, toolInput);
295
+ const attempts = (recoveryAttempts.get(key) ?? 0) + 1;
296
+ recoveryAttempts.set(key, attempts);
297
+ const recoveryWaitMs = resolveRecoveryWaitMs(toolInput);
298
+ if (attempts <= DEFAULT_ACTION_RECOVERY_ROUNDS) {
299
+ await sleep$1(recoveryWaitMs);
300
+ callbacks?.onBeforeRecoverySnapshot?.();
301
+ pageContext.latestSnapshot = await readPageSnapshot(registry);
302
+ return {
303
+ content: [toContentString(result.content), `Recovery ${attempts}/${DEFAULT_ACTION_RECOVERY_ROUNDS}: snapshot refreshed, re-locate target.`].join("\n"),
304
+ details: {
305
+ error: true,
306
+ code: "ELEMENT_NOT_FOUND_RECOVERY",
307
+ recoveryAttempt: attempts,
308
+ recoveryMaxRounds: DEFAULT_ACTION_RECOVERY_ROUNDS
309
+ }
310
+ };
311
+ }
312
+ return {
313
+ content: [toContentString(result.content), `Max recovery attempts (${DEFAULT_ACTION_RECOVERY_ROUNDS}) reached. Try a different target.`].join("\n"),
314
+ details: {
315
+ error: true,
316
+ code: "ELEMENT_NOT_FOUND_MAX_RECOVERY_REACHED",
317
+ recoveryAttempt: attempts,
318
+ recoveryMaxRounds: DEFAULT_ACTION_RECOVERY_ROUNDS
319
+ }
320
+ };
321
+ }
322
+ /** 导航后快照刷新(中)/ Refresh snapshot after navigation actions (EN). */
323
+ async function handleNavigationUrlChange(toolName, toolInput, result, registry, pageContext, callbacks) {
324
+ if (toolName !== "navigate") return;
325
+ const action = getToolAction(toolInput);
326
+ if ((action === "goto" || action === "back" || action === "forward" || action === "reload") && !hasToolError(result)) {
327
+ callbacks?.onBeforeRecoverySnapshot?.();
328
+ pageContext.latestSnapshot = await readPageSnapshot(registry);
329
+ }
330
+ }
331
+ /** 只读工具集合(中)/ Read-only tool set (EN). */
332
+ const READ_ONLY_TOOLS = new Set(["page_info"]);
333
+ /**
334
+ * 空转检测(中)/ Detect idle loops dominated by read-only actions (EN).
335
+ * 返回 -1 表示应终止循环。
336
+ * Returns -1 when loop should terminate.
337
+ */
338
+ function detectIdleLoop(toolCallNames, consecutiveReadOnlyRounds) {
339
+ if (toolCallNames.every((name) => READ_ONLY_TOOLS.has(name))) {
340
+ const newCount = consecutiveReadOnlyRounds + 1;
341
+ return newCount >= 2 ? -1 : newCount;
342
+ }
343
+ return 0;
344
+ }
345
+
185
346
  //#endregion
186
347
  //#region src/core/agent-loop/index.ts
187
348
  /**
188
- * 执行 Agent 决策循环(环境无关)。
349
+ * Agent Loop 主流程(中)/ Core environment-agnostic agent loop (EN).
350
+ *
351
+ * 负责消息构建、AI 决策、工具执行、恢复保护与指标汇总。
352
+ * Orchestrates message build, AI decisions, tool execution, recovery, and metrics.
353
+ *
354
+ * 流程图(文本):
355
+ *
356
+ * 轮次开始
357
+ * │
358
+ * ├─ 确保快照可用
359
+ * ├─ 构建紧凑消息(目标 + 剩余任务 + 执行轨迹 + 快照)
360
+ * ├─ 调用模型
361
+ * ├─ 无 toolCalls ? 结束 : 执行工具
362
+ * ├─ 应用保护机制(冗余拦截/恢复/导航检测/空转/防自转)
363
+ * ├─ 刷新快照
364
+ * ▼
365
+ * 下一轮或停机
366
+ */
367
+ /**
368
+ * 执行 Agent 循环(中)/ Execute the agent loop (EN).
189
369
  *
190
- * 完整流程:
191
- * 1. 获取已注册的工具列表
192
- * 2. 循环:发消息给 AI → 检查是否返回 tool_call → 执行 → 反馈 → 继续
193
- * 3. AI 不再调用工具时,返回最终回复
370
+ * 每轮:确保快照 → 构建消息 → 调用 AI → 执行工具 → 保护处理 → 刷新快照。
371
+ * Per round: ensure snapshot -> build messages -> call AI -> execute tools -> apply protections -> refresh snapshot.
194
372
  */
195
373
  async function executeAgentLoop(params) {
196
- const { client, registry, systemPrompt, message, history, dryRun = false, maxRounds = DEFAULT_MAX_ROUNDS, callbacks } = params;
374
+ const { client, registry, systemPrompt, message, initialSnapshot, history, dryRun = false, maxRounds = DEFAULT_MAX_ROUNDS, callbacks } = params;
197
375
  const tools = registry.getDefinitions();
198
376
  const allToolCalls = [];
199
377
  const fullToolTrace = [];
200
378
  const actionRecoveryAttempts = /* @__PURE__ */ new Map();
201
- const pageContext = {};
379
+ const pageContext = { latestSnapshot: initialSnapshot };
202
380
  let finalReply = "";
381
+ let consecutiveSnapshotCalls = 0;
382
+ let consecutiveReadOnlyRounds = 0;
383
+ let usedRounds = 0;
384
+ let inputTokens = 0;
385
+ let outputTokens = 0;
386
+ let remainingInstruction = message.trim();
387
+ let previousRoundTasks = [];
388
+ let lastPlannedBatchKey = "";
389
+ let consecutiveSamePlannedBatch = 0;
390
+ let lastRoundHadError = false;
391
+ let recoveryCount = 0;
392
+ let redundantInterceptCount = 0;
393
+ let pendingNotFoundRetry;
394
+ let snapshotReadCount = 0;
395
+ let snapshotSizeTotal = 0;
396
+ let snapshotSizeMax = 0;
397
+ /**
398
+ * 记录快照统计(中)/ Record snapshot metrics (EN).
399
+ *
400
+ * 用于输出可观测指标:读取次数、平均长度、最大长度。
401
+ * Used for observability metrics: read count, avg size, max size.
402
+ */
403
+ const recordSnapshotStats = (snapshot) => {
404
+ if (typeof snapshot !== "string") return;
405
+ snapshotReadCount += 1;
406
+ snapshotSizeTotal += snapshot.length;
407
+ if (snapshot.length > snapshotSizeMax) snapshotSizeMax = snapshot.length;
408
+ };
409
+ /**
410
+ * 刷新页面快照(中)/ Refresh page snapshot (EN).
411
+ *
412
+ * 只做两件事:读取最新快照 + 更新快照统计。
413
+ * Does exactly two things: read latest snapshot + update metrics.
414
+ */
415
+ const refreshSnapshot = async () => {
416
+ pageContext.latestSnapshot = await readPageSnapshot(registry);
417
+ recordSnapshotStats(pageContext.latestSnapshot);
418
+ };
419
+ if (pageContext.latestSnapshot) recordSnapshotStats(pageContext.latestSnapshot);
420
+ /**
421
+ * 追加工具轨迹(中)/ Append tool trace entry (EN).
422
+ *
423
+ * 同时写入:
424
+ * - allToolCalls:对外返回结果
425
+ * - fullToolTrace:下一轮消息上下文
426
+ */
427
+ const appendToolTrace = (round, name, input, result) => {
428
+ allToolCalls.push({
429
+ name,
430
+ input,
431
+ result
432
+ });
433
+ fullToolTrace.push({
434
+ round,
435
+ name,
436
+ input,
437
+ result
438
+ });
439
+ };
440
+ /**
441
+ * 生成任务数组(中)/ Build normalized task array (EN).
442
+ *
443
+ * 将本轮 toolCalls 归一化成稳定字符串数组,便于:
444
+ * - 回传到下一轮消息上下文(提醒已执行计划)
445
+ * - 进行“是否与上一轮完全相同”的比较
446
+ */
447
+ const buildTaskArray = (toolCalls) => toolCalls.map((tc) => {
448
+ const inputText = JSON.stringify(tc.input);
449
+ return `${tc.name}:${inputText}`;
450
+ });
451
+ /**
452
+ * 判定动作是否会触发 DOM 结构变化(中)/ Whether action may cause DOM-shape change (EN).
453
+ *
454
+ * 触发后应强制断轮,等待下一轮新快照继续。
455
+ * Force round break after such action and continue with refreshed snapshot next round.
456
+ */
457
+ const shouldForceRoundBreak = (toolName, toolInput) => {
458
+ const action = getToolAction(toolInput);
459
+ if (toolName === "navigate") return action === "goto" || action === "back" || action === "forward" || action === "reload";
460
+ if (toolName === "dom") return action === "click" || action === "press";
461
+ if (toolName === "evaluate") return true;
462
+ return false;
463
+ };
464
+ /**
465
+ * 将“找不到元素”的失败任务整理成可重试清单(中)/ Build retry task list for not-found failures (EN).
466
+ */
467
+ const collectMissingTask = (name, input, result) => {
468
+ if (!isElementNotFoundResult(result)) return null;
469
+ return {
470
+ name,
471
+ input,
472
+ reason: toContentString(result.content).slice(0, 240)
473
+ };
474
+ };
475
+ /**
476
+ * 解析 REMAINING 协议(中)/ Parse REMAINING protocol from model text (EN).
477
+ *
478
+ * 支持:
479
+ * - `REMAINING: <text>` → 继续下一轮消费该剩余文本
480
+ * - `REMAINING: DONE` → 剩余任务为空
481
+ * 返回 null 表示本轮没有提供 REMAINING 标记。
482
+ */
483
+ const parseRemainingInstruction = (text) => {
484
+ if (!text) return null;
485
+ const match = text.match(/REMAINING\s*:\s*([\s\S]*)$/i);
486
+ if (!match) return null;
487
+ const value = match[1].trim();
488
+ return /^done$/i.test(value) ? "" : value;
489
+ };
490
+ /**
491
+ * 推进下一轮描述(中)/ Derive next-round instruction from model text (EN).
492
+ *
493
+ * 优先 REMAINING 协议;若未提供,则把本轮 content 视为“更新后的任务描述”。
494
+ * Priority: REMAINING protocol first; otherwise treat current content as updated instruction.
495
+ */
496
+ const deriveNextInstruction = (text, currentInstruction) => {
497
+ const parsed = parseRemainingInstruction(text);
498
+ if (parsed !== null) return {
499
+ nextInstruction: parsed,
500
+ hasRemainingProtocol: true
501
+ };
502
+ return {
503
+ nextInstruction: currentInstruction,
504
+ hasRemainingProtocol: false
505
+ };
506
+ };
203
507
  for (let round = 0; round < maxRounds; round++) {
204
508
  callbacks?.onRound?.(round);
205
- const effectivePrompt = pageContext.latestSnapshot ? stripSnapshotFromPrompt(systemPrompt) : systemPrompt;
206
- const chatMessages = buildCompactMessages(message, fullToolTrace, pageContext.latestSnapshot, pageContext.currentUrl, history);
509
+ usedRounds = round + 1;
510
+ if (!pageContext.latestSnapshot) await refreshSnapshot();
511
+ const effectivePrompt = stripSnapshotFromPrompt(systemPrompt);
512
+ const chatMessages = buildCompactMessages(message, fullToolTrace, pageContext.latestSnapshot, pageContext.currentUrl, history, remainingInstruction, previousRoundTasks);
513
+ if (pendingNotFoundRetry && pendingNotFoundRetry.tasks.length > 0) chatMessages.push({
514
+ role: "user",
515
+ content: [
516
+ "## Not-found retry context",
517
+ `Retry attempt: ${pendingNotFoundRetry.attempt}/${DEFAULT_NOT_FOUND_RETRY_ROUNDS}`,
518
+ "These tool targets were not found in previous execution:",
519
+ ...pendingNotFoundRetry.tasks.map((task, i) => `${i + 1}. ${task.name}(${JSON.stringify(task.input)}) -> ${task.reason}`),
520
+ "Only retry unresolved targets that are now visible in the latest snapshot.",
521
+ "If still not found, return no tool calls and include REMAINING with the unresolved part."
522
+ ].join("\n")
523
+ });
207
524
  const response = await client.chat({
208
525
  systemPrompt: effectivePrompt,
209
526
  messages: chatMessages,
210
527
  tools
211
528
  });
529
+ inputTokens += response.usage?.inputTokens ?? 0;
530
+ outputTokens += response.usage?.outputTokens ?? 0;
531
+ const nextInstructionState = deriveNextInstruction(response.text, remainingInstruction);
532
+ remainingInstruction = nextInstructionState.nextInstruction;
212
533
  if (!response.toolCalls || response.toolCalls.length === 0) {
534
+ if (pendingNotFoundRetry) {
535
+ const unresolvedHint = response.text?.toLowerCase() ?? "";
536
+ if ((unresolvedHint.includes("找不到") || unresolvedHint.includes("未找到") || unresolvedHint.includes("not found") || unresolvedHint.includes("cannot find") || unresolvedHint.includes("unable to locate")) && pendingNotFoundRetry.attempt < DEFAULT_NOT_FOUND_RETRY_ROUNDS) {
537
+ pendingNotFoundRetry = {
538
+ ...pendingNotFoundRetry,
539
+ attempt: pendingNotFoundRetry.attempt + 1
540
+ };
541
+ callbacks?.onText?.(`未命中目标,准备第 ${pendingNotFoundRetry.attempt} 次重试(等待 ${DEFAULT_NOT_FOUND_RETRY_WAIT_MS}ms)...`);
542
+ await sleep$1(DEFAULT_NOT_FOUND_RETRY_WAIT_MS);
543
+ await refreshSnapshot();
544
+ continue;
545
+ }
546
+ pendingNotFoundRetry = void 0;
547
+ }
213
548
  finalReply = response.text ?? "";
214
549
  if (finalReply) callbacks?.onText?.(finalReply);
215
550
  break;
216
551
  }
217
- if (response.text) callbacks?.onText?.(response.text);
552
+ const plannedBatchKey = JSON.stringify(response.toolCalls.map((tc) => ({
553
+ name: tc.name,
554
+ input: tc.input
555
+ })));
556
+ if (plannedBatchKey === lastPlannedBatchKey) consecutiveSamePlannedBatch += 1;
557
+ else {
558
+ consecutiveSamePlannedBatch = 1;
559
+ lastPlannedBatchKey = plannedBatchKey;
560
+ }
561
+ if (consecutiveSamePlannedBatch >= 2 && !lastRoundHadError) {
562
+ finalReply = response.text?.trim() || "任务已完成。";
563
+ if (finalReply) callbacks?.onText?.(finalReply);
564
+ break;
565
+ }
218
566
  if (dryRun) {
219
567
  finalReply = response.text ? response.text + "\n\n" : "";
220
568
  finalReply += "🔧 AI 请求调用以下工具(dry-run 模式,未执行):\n";
@@ -229,103 +577,57 @@ async function executeAgentLoop(params) {
229
577
  }
230
578
  break;
231
579
  }
580
+ let roundHasError = false;
581
+ const executedTaskCalls = [];
582
+ const roundMissingTasks = [];
232
583
  for (const tc of response.toolCalls) {
233
- callbacks?.onToolCall?.(tc.name, tc.input);
234
- const latestUrl = await readPageUrl(registry);
235
- if (latestUrl) {
236
- if (!pageContext.currentUrl) pageContext.currentUrl = latestUrl;
237
- else if (latestUrl !== pageContext.currentUrl) {
238
- pageContext.currentUrl = latestUrl;
239
- callbacks?.onBeforeRecoverySnapshot?.(latestUrl);
240
- pageContext.latestSnapshot = await readPageSnapshot(registry, 8);
241
- if (tc.name === "dom") {
242
- const result = {
243
- content: `URL 已变更为 ${latestUrl},请基于最新快照重新定位目标元素。`,
244
- details: {
245
- error: true,
246
- code: "URL_CHANGED_REQUIRE_NEW_SNAPSHOT",
247
- url: latestUrl
248
- }
249
- };
250
- allToolCalls.push({
251
- name: tc.name,
252
- input: tc.input,
253
- result
254
- });
255
- fullToolTrace.push({
256
- round,
257
- name: tc.name,
258
- input: tc.input,
259
- result,
260
- marker: "[URL变化待重定位]"
261
- });
262
- callbacks?.onToolResult?.(tc.name, result);
263
- continue;
264
- }
265
- }
584
+ const redundant = checkRedundantSnapshot(tc.name, tc.input, pageContext.latestSnapshot, round);
585
+ if (redundant) {
586
+ appendToolTrace(round, tc.name, tc.input, redundant);
587
+ redundantInterceptCount += 1;
588
+ callbacks?.onToolResult?.(tc.name, redundant);
589
+ continue;
266
590
  }
591
+ callbacks?.onToolCall?.(tc.name, tc.input);
267
592
  let result = await registry.dispatch(tc.name, tc.input);
268
- if (tc.name === "dom" && isElementNotFoundResult(result)) {
269
- const key = buildToolCallKey(tc.name, tc.input);
270
- const attempts = (actionRecoveryAttempts.get(key) ?? 0) + 1;
271
- actionRecoveryAttempts.set(key, attempts);
272
- const recoveryWaitMs = resolveRecoveryWaitMs(tc.input);
273
- if (attempts <= DEFAULT_ACTION_RECOVERY_ROUNDS) {
274
- await sleep$1(recoveryWaitMs);
275
- callbacks?.onBeforeRecoverySnapshot?.();
276
- pageContext.latestSnapshot = await readPageSnapshot(registry, 8);
277
- result = {
278
- content: [
279
- toContentString(result.content),
280
- "",
281
- `自动恢复 ${attempts}/${DEFAULT_ACTION_RECOVERY_ROUNDS}:已刷新快照,请重新定位目标元素。`
282
- ].join("\n"),
283
- details: {
284
- error: true,
285
- code: "ELEMENT_NOT_FOUND_RECOVERY",
286
- recoveryAttempt: attempts,
287
- recoveryMaxRounds: DEFAULT_ACTION_RECOVERY_ROUNDS
288
- }
289
- };
290
- } else result = {
291
- content: [
292
- toContentString(result.content),
293
- "",
294
- `已达到最大自动恢复次数(${DEFAULT_ACTION_RECOVERY_ROUNDS})。请调整操作目标后重试。`
295
- ].join("\n"),
296
- details: {
297
- error: true,
298
- code: "ELEMENT_NOT_FOUND_MAX_RECOVERY_REACHED",
299
- recoveryAttempt: attempts,
300
- recoveryMaxRounds: DEFAULT_ACTION_RECOVERY_ROUNDS
301
- }
302
- };
303
- }
304
- allToolCalls.push({
305
- name: tc.name,
306
- input: tc.input,
307
- result
308
- });
309
- fullToolTrace.push({
310
- round,
593
+ const debounced = applySnapshotDebounce(tc.name, tc.input, result, consecutiveSnapshotCalls);
594
+ result = debounced.result;
595
+ consecutiveSnapshotCalls = debounced.consecutiveCount;
596
+ const recovered = await handleElementRecovery(tc.name, tc.input, result, actionRecoveryAttempts, registry, pageContext, callbacks);
597
+ if (recovered) result = recovered;
598
+ if (recovered?.details && typeof recovered.details === "object" && recovered.details.code === "ELEMENT_NOT_FOUND_RECOVERY") recoveryCount += 1;
599
+ appendToolTrace(round, tc.name, tc.input, result);
600
+ executedTaskCalls.push({
311
601
  name: tc.name,
312
- input: tc.input,
313
- result
602
+ input: tc.input
314
603
  });
315
- if (tc.name === "page_info" && getToolAction(tc.input) === "snapshot") pageContext.latestSnapshot = toContentString(result.content);
316
- if (tc.name === "navigate") {
317
- const action = getToolAction(tc.input);
318
- if ((action === "goto" || action === "back" || action === "forward" || action === "reload") && !hasToolError(result)) {
319
- const newUrl = await readPageUrl(registry);
320
- if (newUrl && newUrl !== pageContext.currentUrl) {
321
- pageContext.currentUrl = newUrl;
322
- callbacks?.onBeforeRecoverySnapshot?.(newUrl);
323
- pageContext.latestSnapshot = await readPageSnapshot(registry, 8);
324
- }
325
- }
604
+ const missingTask = collectMissingTask(tc.name, tc.input, result);
605
+ if (missingTask) roundMissingTasks.push(missingTask);
606
+ if (result.details && typeof result.details === "object") roundHasError = roundHasError || Boolean(result.details.error);
607
+ if (tc.name === "page_info" && getToolAction(tc.input) === "snapshot") {
608
+ pageContext.latestSnapshot = toContentString(result.content);
609
+ recordSnapshotStats(pageContext.latestSnapshot);
326
610
  }
611
+ await handleNavigationUrlChange(tc.name, tc.input, result, registry, pageContext, callbacks);
327
612
  callbacks?.onToolResult?.(tc.name, result);
613
+ if (shouldForceRoundBreak(tc.name, tc.input)) break;
328
614
  }
615
+ if (roundMissingTasks.length > 0) pendingNotFoundRetry = {
616
+ attempt: 1,
617
+ tasks: roundMissingTasks
618
+ };
619
+ else pendingNotFoundRetry = void 0;
620
+ if (!nextInstructionState.hasRemainingProtocol) roundHasError = true;
621
+ lastRoundHadError = roundHasError;
622
+ previousRoundTasks = buildTaskArray(executedTaskCalls);
623
+ const idleResult = detectIdleLoop(executedTaskCalls.map((tc) => tc.name), consecutiveReadOnlyRounds);
624
+ if (idleResult === -1) {
625
+ finalReply = response.text || "任务已完成。";
626
+ if (finalReply) callbacks?.onText?.(finalReply);
627
+ break;
628
+ }
629
+ consecutiveReadOnlyRounds = idleResult;
630
+ await refreshSnapshot();
329
631
  }
330
632
  const resultMessages = [...history ?? [], {
331
633
  role: "user",
@@ -335,70 +637,146 @@ async function executeAgentLoop(params) {
335
637
  role: "assistant",
336
638
  content: finalReply
337
639
  });
640
+ const successfulToolCalls = allToolCalls.filter((tc) => {
641
+ const details = tc.result.details;
642
+ return !(details && typeof details === "object" && Boolean(details.error));
643
+ }).length;
644
+ const failedToolCalls = allToolCalls.length - successfulToolCalls;
645
+ const metrics = {
646
+ roundCount: usedRounds,
647
+ totalToolCalls: allToolCalls.length,
648
+ successfulToolCalls,
649
+ failedToolCalls,
650
+ toolSuccessRate: allToolCalls.length > 0 ? Number((successfulToolCalls / allToolCalls.length).toFixed(4)) : 1,
651
+ recoveryCount,
652
+ redundantInterceptCount,
653
+ snapshotReadCount,
654
+ latestSnapshotSize: pageContext.latestSnapshot?.length ?? 0,
655
+ avgSnapshotSize: snapshotReadCount > 0 ? Math.round(snapshotSizeTotal / snapshotReadCount) : 0,
656
+ maxSnapshotSize: snapshotSizeMax,
657
+ inputTokens,
658
+ outputTokens
659
+ };
660
+ callbacks?.onMetrics?.(metrics);
338
661
  return {
339
662
  reply: finalReply,
340
663
  toolCalls: allToolCalls,
341
- messages: resultMessages
664
+ messages: resultMessages,
665
+ metrics
342
666
  };
343
667
  }
344
668
 
345
669
  //#endregion
346
670
  //#region src/core/ai-client/constants.ts
347
- /**
348
- * 各 Provider 的默认 API 端点。
349
- *
350
- * - openai → OpenAI 官方 API
351
- * - copilot → GitHub Models API(使用 OpenAI 兼容格式)
352
- * - anthropic → Anthropic Messages API
353
- */
671
+ /** 默认端点映射(中)/ Default API endpoints by provider (EN). */
354
672
/**
 * Default API base URL for each supported provider id.
 */
const PROVIDER_ENDPOINTS = {
	openai: "https://api.openai.com/v1",
	copilot: "https://models.inference.ai.azure.com",
	anthropic: "https://api.anthropic.com",
	deepseek: "https://api.deepseek.com"
};
/**
 * Look up the default endpoint for a provider id.
 *
 * Only own keys of PROVIDER_ENDPOINTS are considered, so names inherited from
 * Object.prototype ("constructor", "toString", ...) are not treated as providers.
 *
 * @param {unknown} provider - Candidate provider id.
 * @returns {string | undefined} The default endpoint, or undefined when unknown.
 */
function lookupProviderEndpoint(provider) {
	if (typeof provider !== "string") return void 0;
	if (!Object.prototype.hasOwnProperty.call(PROVIDER_ENDPOINTS, provider)) return void 0;
	return PROVIDER_ENDPOINTS[provider];
}
/**
 * Validate that a provider id is supported.
 *
 * @param {string} provider - Provider id to check.
 * @throws {Error} When the provider has no entry in PROVIDER_ENDPOINTS; the
 *   message lists the supported ids.
 */
function validateProvider(provider) {
	if (!lookupProviderEndpoint(provider)) {
		const supported = Object.keys(PROVIDER_ENDPOINTS).join(", ");
		throw new Error(`Unknown AI provider: ${String(provider)}. Supported: ${supported}`);
	}
}
/**
 * Resolve the API base URL for a client config.
 *
 * A caller-supplied `config.baseURL` wins; otherwise the provider default is
 * used; otherwise the empty string. Trailing slashes are removed because the
 * request builders append paths that start with "/" (e.g. "/chat/completions").
 *
 * @param {{ baseURL?: string, provider?: string }} config - Client configuration.
 * @returns {string} Base URL without trailing slashes.
 */
function resolveBaseURL(config) {
	const base = config.baseURL ?? lookupProviderEndpoint(config.provider) ?? "";
	if (typeof base !== "string") return base;
	let end = base.length;
	while (end > 0 && base[end - 1] === "/") end -= 1;
	return base.slice(0, end);
}
380
689
  /**
381
- * 清理 TypeBox Schema 去除 Symbol 等不可序列化的属性。
382
- *
383
- * TypeBox 的 Type.Object() 产物包含 Symbol key(如 [Kind]、[Hint]),
384
- * 这些 Symbol 在 JSON.stringify 时会被忽略,但某些 AI API 端点
385
- * 对 JSON Schema 做严格校验时可能报错。
386
- *
387
- * 通过 JSON roundtrip(stringify → parse)清理掉所有不可序列化的属性。
690
+ * 清理 schema(中)/ Clean non-serializable fields from schema (EN).
388
691
  */
389
692
/**
 * Return a plain-JSON copy of a schema object.
 *
 * The copy is produced by a JSON.stringify -> JSON.parse round trip, so
 * Symbol-keyed properties, functions and undefined-valued properties are
 * dropped and the result shares no references with the input.
 *
 * @param {unknown} schema - Schema to clean.
 * @returns {unknown} The JSON-only copy, or undefined when the input has no
 *   JSON representation (undefined, a function or a symbol).
 */
function cleanSchema(schema) {
	const serialized = JSON.stringify(schema);
	// JSON.stringify returns undefined (not a string) for values without a JSON
	// form; JSON.parse(undefined) would throw a SyntaxError.
	if (serialized === void 0) return void 0;
	return JSON.parse(serialized);
}
392
695
 
393
696
  //#endregion
394
- //#region src/core/ai-client/custom.ts
697
+ //#region src/core/ai-client/sse.ts
395
698
  /**
396
- * 可继承的 AI 客户端基类 实现 AIClient 接口。
699
+ * 通用 SSE(JSON) 消费器(中)/ Generic SSE(JSON) consumer (EN).
397
700
  *
398
- * 设计原则:
399
- * - 实现 `AIClient` 接口 可直接传入 `executeAgentLoop()` `WebAgent`
400
- * - 构造时注入 `chatHandler` → 无需继承即可自定义对话逻辑
401
- * - `chat()` 方法可被子类覆盖 支持继承式扩展(添加中间件逻辑)
701
+ * 读取 response.body,按 SSE 规则拼装并分发 JSON data 事件。
702
+ * Reads response body, assembles SSE frames, and dispatches JSON data events.
703
+ */
704
/**
 * Consume a Server-Sent Events response whose `data:` payloads are JSON.
 *
 * Reads `response.body`, splits it into lines, assembles `event:` / `data:`
 * fields into frames (a blank line ends a frame) and calls `onEvent` with the
 * parsed JSON payload of each frame.
 *
 * Consumption stops when the stream ends, when a frame's data is `[DONE]`
 * (unless `options.stopOnDone` is false), or when `onEvent` returns `false`.
 * Whenever consumption stops before the stream has ended - including on a
 * read timeout or read error - the reader is cancelled.
 *
 * Frames whose data is not valid JSON, and exceptions thrown by `onEvent`,
 * do not abort the stream (best-effort); they are reported to
 * `options.onError` when that is a function.
 *
 * @param {{ body?: ReadableStream<Uint8Array> | null }} response - Fetch-style response.
 * @param {(data: unknown, meta: { event?: string, rawData: string }) => unknown} onEvent
 *   Frame handler; return (or resolve to) `false` to stop consuming.
 * @param {{ stopOnDone?: boolean, readTimeoutMs?: number, onError?: (error: unknown, rawData: string) => void }} [options]
 *   `stopOnDone` defaults to true; `readTimeoutMs` <= 0 or unset disables the per-read timeout.
 * @returns {Promise<void>}
 * @throws {Error} `SSE read timeout (<ms>ms)` when a single read exceeds
 *   `readTimeoutMs`; read errors from the underlying stream are rethrown.
 */
async function consumeSSEJSON(response, onEvent, options = {}) {
	if (!response.body) return;
	const reader = response.body.getReader();
	const decoder = new TextDecoder();
	const stopOnDone = options.stopOnDone ?? true;
	const readTimeoutMs = options.readTimeoutMs;
	const reportError = typeof options.onError === "function" ? options.onError : void 0;
	let buffer = "";
	let currentEvent;
	let dataLines = [];
	let streamEnded = false;

	/** Read one chunk, rejecting if it takes longer than readTimeoutMs. */
	function readChunk() {
		if (!readTimeoutMs || readTimeoutMs <= 0) return reader.read();
		return new Promise((resolve, reject) => {
			const timer = setTimeout(() => {
				reject(new Error(`SSE read timeout (${readTimeoutMs}ms)`));
			}, readTimeoutMs);
			reader.read().then((value) => {
				clearTimeout(timer);
				resolve(value);
			}, (error) => {
				clearTimeout(timer);
				reject(error);
			});
		});
	}

	/** Dispatch the frame collected so far; resolves to false when consumption must stop. */
	async function flushEvent() {
		const event = currentEvent;
		const rawData = dataLines.join("\n").trim();
		dataLines = [];
		currentEvent = void 0;
		if (!rawData) return true;
		if (stopOnDone && rawData === "[DONE]") return false;
		let payload;
		try {
			payload = JSON.parse(rawData);
		} catch (error) {
			// Malformed frame: skip it and keep the stream alive.
			reportError?.(error, rawData);
			return true;
		}
		try {
			if (await onEvent(payload, {
				event,
				rawData
			}) === false) return false;
		} catch (error) {
			// A failing handler must not abort the whole stream (best-effort).
			reportError?.(error, rawData);
		}
		return true;
	}

	/** Interpret one line of the stream; resolves to false when consumption must stop. */
	async function processLine(rawLine) {
		// trim() also removes the "\r" of CRLF line endings.
		const line = rawLine.trim();
		if (!line) return flushEvent();
		if (line.startsWith(":")) return true;
		if (line.startsWith("event:")) {
			currentEvent = line.slice(6).trim() || void 0;
			return true;
		}
		if (line.startsWith("data:")) dataLines.push(line.slice(5).trimStart());
		return true;
	}

	try {
		while (true) {
			const { done, value } = await readChunk();
			if (done) {
				streamEnded = true;
				// Flush bytes still held by the decoder, then handle a final line
				// that was not terminated by "\n" before dispatching the last frame.
				buffer += decoder.decode();
				const lastLine = buffer;
				buffer = "";
				if (!lastLine || await processLine(lastLine)) await flushEvent();
				return;
			}
			buffer += decoder.decode(value, { stream: true });
			const lines = buffer.split("\n");
			buffer = lines.pop() ?? "";
			for (const rawLine of lines) {
				if (!await processLine(rawLine)) return;
			}
		}
	} finally {
		// Stopped early ([DONE], handler returned false, timeout, read error):
		// tell the producer we are no longer interested.
		if (!streamEnded) await reader.cancel().catch(() => void 0);
		try {
			reader.releaseLock();
		} catch {
			// Releasing can throw while a read is still pending; nothing to recover.
		}
	}
}
775
+
776
+ //#endregion
777
+ //#region src/core/ai-client/custom.ts
778
+ /**
779
+ * BaseAIClient 实现(中)/ BaseAIClient implementation of AIClient (EN).
402
780
  */
403
781
  var BaseAIClient = class {
404
782
  /** 用户提供的对话处理函数 */
@@ -407,47 +785,21 @@ var BaseAIClient = class {
407
785
  this.chatHandler = options.chatHandler;
408
786
  }
409
787
  /**
410
- * 发送对话请求并获取 AI 响应。
411
- *
412
- * 默认实现直接委托给 `chatHandler`。
413
- * 子类可覆盖此方法添加中间件逻辑(日志、重试、缓存等)。
414
- *
415
- * @param params - 统一格式的聊天参数
416
- * @returns 统一格式的 AI 响应
788
+ * 发送对话请求(中)/ Dispatch chat request via handler (EN).
417
789
  */
418
790
  async chat(params) {
419
791
  return this.chatHandler(params);
420
792
  }
793
+ /** SSE 消费复用入口(中)/ Reusable SSE(JSON) consumer for subclasses (EN). */
794
+ async consumeSSEJSON(response, onEvent, options) {
795
+ return consumeSSEJSON(response, onEvent, options);
796
+ }
421
797
  };
422
798
 
423
799
  //#endregion
424
800
  //#region src/core/ai-client/openai.ts
425
801
  /**
426
- * OpenAI / Copilot AI 客户端 继承 BaseAIClient。
427
- *
428
- * 封装完整的 OpenAI Chat Completions API 调用流程:
429
- * 1. buildOpenAIRequest() → 构建 HTTP 请求
430
- * 2. fetch() → 发送请求
431
- * 3. parseOpenAIResponse() → 解析响应为统一格式
432
- *
433
- * 使用示例:
434
- * ```ts
435
- * const client = new OpenAIClient({
436
- * provider: "openai",
437
- * model: "gpt-4o",
438
- * apiKey: "sk-xxx",
439
- * });
440
- * const response = await client.chat({ systemPrompt, messages, tools });
441
- * ```
442
- *
443
- * 也可用于 Copilot(GitHub Models):
444
- * ```ts
445
- * const client = new OpenAIClient({
446
- * provider: "copilot",
447
- * model: "gpt-4o",
448
- * apiKey: "ghp_xxx",
449
- * });
450
- * ```
802
+ * OpenAIClient 类(中)/ OpenAIClient class for OpenAI & Copilot (EN).
451
803
  */
452
804
  var OpenAIClient = class extends BaseAIClient {
453
805
  /** AI 客户端配置(provider / model / apiKey / baseURL) */
@@ -455,30 +807,35 @@ var OpenAIClient = class extends BaseAIClient {
455
807
  constructor(config) {
456
808
  super({ chatHandler: async (params) => {
457
809
  const req = buildOpenAIRequest(this.config, params);
458
- const res = await fetch(req.url, {
810
+ if (!(this.config.stream ?? true)) {
811
+ const res = await fetch(req.url, {
812
+ method: req.method,
813
+ headers: req.headers,
814
+ body: req.body
815
+ });
816
+ if (!res.ok) {
817
+ const errText = await res.text();
818
+ throw new Error(`AI API ${res.status}: ${errText.slice(0, 500)}`);
819
+ }
820
+ return parseOpenAIResponse(await res.json());
821
+ }
822
+ const streamRes = await fetch(req.url, {
459
823
  method: req.method,
460
824
  headers: req.headers,
461
825
  body: req.body
462
826
  });
463
- if (!res.ok) {
464
- const errText = await res.text();
465
- throw new Error(`AI API ${res.status}: ${errText.slice(0, 500)}`);
827
+ if (!streamRes.ok) {
828
+ const errText = await streamRes.text();
829
+ throw new Error(`AI API ${streamRes.status}: ${errText.slice(0, 500)}`);
466
830
  }
467
- return parseOpenAIResponse(await res.json());
831
+ if ((streamRes.headers.get("content-type") ?? "").includes("application/json")) return parseOpenAIResponse(await streamRes.json());
832
+ return parseOpenAIStream(streamRes, 2e4);
468
833
  } });
469
834
  this.config = config;
470
835
  }
471
836
  };
472
837
  /**
473
- * 将统一格式的 ChatParams 转换为 OpenAI Chat Completions API 请求。
474
- *
475
- * 转换逻辑:
476
- * - system prompt → `{ role: "system", content }` 消息
477
- * - 工具定义 → `tools` 数组(function calling 格式)
478
- * - 工具结果 → 拆分为多条 `{ role: "tool", tool_call_id }` 消息
479
- * - AI 回复含工具调用 → `tool_calls` 字段
480
- *
481
- * 默认参数:temperature=0.3, max_tokens=8192, tool_choice="auto"
838
+ * 构建 OpenAI 请求(中)/ Build OpenAI chat request payload (EN).
482
839
  */
483
840
  function buildOpenAIRequest(config, params) {
484
841
  const baseURL = resolveBaseURL(config);
@@ -496,11 +853,16 @@ function buildOpenAIRequest(config, params) {
496
853
  model: config.model,
497
854
  messages: openaiMessages,
498
855
  temperature: .3,
499
- max_tokens: 8192
856
+ max_tokens: 4096
500
857
  };
858
+ if (config.stream ?? true) {
859
+ body.stream = true;
860
+ body.stream_options = { include_usage: true };
861
+ }
501
862
  if (openaiTools && openaiTools.length > 0) {
502
863
  body.tools = openaiTools;
503
864
  body.tool_choice = "auto";
865
+ body.parallel_tool_calls = true;
504
866
  }
505
867
  return {
506
868
  url: `${baseURL}/chat/completions`,
@@ -513,14 +875,7 @@ function buildOpenAIRequest(config, params) {
513
875
  };
514
876
  }
515
877
  /**
516
- * OpenAI Chat Completions API 原始响应解析为统一的 AIChatResponse
517
- *
518
- * 解析要点:
519
- * - 文本回复 → `choice.message.content`
520
- * - 工具调用 → `choice.message.tool_calls`,arguments 为 JSON 字符串需 parse
521
- * - Token 用量 → `usage.prompt_tokens` / `usage.completion_tokens`
522
- *
523
- * @throws 无有效 choice 时抛出 Error
878
+ * 解析 OpenAI 响应(中)/ Parse raw OpenAI response into AIChatResponse (EN).
524
879
  */
525
880
  function parseOpenAIResponse(data) {
526
881
  const d = data;
@@ -542,12 +897,7 @@ function parseOpenAIResponse(data) {
542
897
  };
543
898
  }
544
899
  /**
545
- * 将统一消息格式转换为 OpenAI 消息数组。
546
- *
547
- * 三种特殊消息的处理:
548
- * 1. tool 消息(工具结果)→ 每个结果拆分为单独的 `role: "tool"` 消息
549
- * 2. assistant 含 toolCalls → 附带 `tool_calls` 字段
550
- * 3. 其他消息 → 直接映射 role + content
900
+ * 消息转换(中)/ Convert unified messages to OpenAI format (EN).
551
901
  */
552
902
  function convertMessages$1(systemPrompt, messages) {
553
903
  const result = [{
@@ -577,26 +927,56 @@ function convertMessages$1(systemPrompt, messages) {
577
927
  });
578
928
  return result;
579
929
  }
930
+ /**
931
+ * 解析 OpenAI SSE(中)/ Parse OpenAI SSE stream into unified response (EN).
932
+ */
933
/**
 * Parse an OpenAI-style Chat Completions SSE stream into a unified response.
 *
 * Text deltas are concatenated. Tool-call deltas are merged per `index`:
 * `arguments` fragments are appended in arrival order, and `id` / `name` are
 * taken from the first fragment that carries them. `usage` is taken from the
 * last chunk that reports it.
 *
 * A tool call whose accumulated arguments are empty is parsed as `{}`; one
 * whose arguments are not valid JSON is omitted from the result.
 *
 * @param {{ body?: unknown, json: () => Promise<unknown> }} response - Fetch-style response.
 * @param {number} [readTimeoutMs=20000] - Per-read timeout forwarded to consumeSSEJSON.
 * @returns {Promise<{ text?: string, toolCalls?: Array<{ id: string, name: string, input: unknown }>, usage?: { inputTokens: number, outputTokens: number } }>}
 */
async function parseOpenAIStream(response, readTimeoutMs = 2e4) {
	// Without a readable body there is nothing to stream; treat it as plain JSON.
	if (!response.body) return parseOpenAIResponse(await response.json());
	let text = "";
	const toolCallMap = new Map();
	let usage;
	await consumeSSEJSON(response, (chunk) => {
		// Ignore frames that are valid JSON but not objects (e.g. `null`).
		if (!chunk || typeof chunk !== "object") return;
		const delta = chunk.choices?.[0]?.delta;
		if (delta?.content) text += delta.content;
		const fragments = Array.isArray(delta?.tool_calls) ? delta.tool_calls : [];
		for (const fragment of fragments) {
			const idx = fragment.index ?? 0;
			let entry = toolCallMap.get(idx);
			if (!entry) {
				entry = {
					id: "",
					name: "",
					arguments: ""
				};
				toolCallMap.set(idx, entry);
			}
			// id / name may arrive on any fragment; keep the first non-empty value.
			if (!entry.id && fragment.id) entry.id = fragment.id;
			if (!entry.name && fragment.function?.name) entry.name = fragment.function.name;
			if (fragment.function?.arguments) entry.arguments += fragment.function.arguments;
		}
		if (chunk.usage) usage = {
			inputTokens: chunk.usage.prompt_tokens ?? 0,
			outputTokens: chunk.usage.completion_tokens ?? 0
		};
	}, {
		readTimeoutMs,
		stopOnDone: true
	});
	const toolCalls = [];
	const ordered = [...toolCallMap.entries()].sort((a, b) => a[0] - b[0]);
	for (const [, call] of ordered) {
		let input;
		try {
			// A call with no arguments streams an empty string, which is not valid JSON.
			input = JSON.parse(call.arguments.trim() || "{}");
		} catch {
			// Unparseable (e.g. truncated) arguments: omit this call, keep the others.
			continue;
		}
		toolCalls.push({
			id: call.id,
			name: call.name,
			input
		});
	}
	return {
		text: text || void 0,
		toolCalls: toolCalls.length > 0 ? toolCalls : void 0,
		usage
	};
}
580
975
 
581
976
  //#endregion
582
977
  //#region src/core/ai-client/anthropic.ts
583
978
  /**
584
- * Anthropic AI 客户端 继承 BaseAIClient。
585
- *
586
- * 封装完整的 Anthropic Messages API 调用流程:
587
- * 1. buildAnthropicRequest() → 构建 HTTP 请求
588
- * 2. fetch() → 发送请求
589
- * 3. parseAnthropicResponse() → 解析响应为统一格式
590
- *
591
- * 使用示例:
592
- * ```ts
593
- * const client = new AnthropicClient({
594
- * provider: "anthropic",
595
- * model: "claude-sonnet-4-20250514",
596
- * apiKey: "sk-ant-xxx",
597
- * });
598
- * const response = await client.chat({ systemPrompt, messages, tools });
599
- * ```
979
+ * AnthropicClient 类(中)/ AnthropicClient class (EN).
600
980
  */
601
981
  var AnthropicClient = class extends BaseAIClient {
602
982
  /** AI 客户端配置(provider / model / apiKey / baseURL) */
@@ -604,6 +984,18 @@ var AnthropicClient = class extends BaseAIClient {
604
984
  constructor(config) {
605
985
  super({ chatHandler: async (params) => {
606
986
  const req = buildAnthropicRequest(this.config, params);
987
+ if (!(this.config.stream ?? true)) {
988
+ const res = await fetch(req.url, {
989
+ method: req.method,
990
+ headers: req.headers,
991
+ body: req.body
992
+ });
993
+ if (!res.ok) {
994
+ const errText = await res.text();
995
+ throw new Error(`AI API ${res.status}: ${errText.slice(0, 500)}`);
996
+ }
997
+ return parseAnthropicResponse(await res.json());
998
+ }
607
999
  const res = await fetch(req.url, {
608
1000
  method: req.method,
609
1001
  headers: req.headers,
@@ -613,22 +1005,14 @@ var AnthropicClient = class extends BaseAIClient {
613
1005
  const errText = await res.text();
614
1006
  throw new Error(`AI API ${res.status}: ${errText.slice(0, 500)}`);
615
1007
  }
616
- return parseAnthropicResponse(await res.json());
1008
+ if ((res.headers.get("content-type") ?? "").includes("application/json")) return parseAnthropicResponse(await res.json());
1009
+ return parseAnthropicStream(res);
617
1010
  } });
618
1011
  this.config = config;
619
1012
  }
620
1013
  };
621
1014
  /**
622
- * 将统一格式的 ChatParams 转换为 Anthropic Messages API 请求。
623
- *
624
- * 关键格式差异(与 OpenAI 相比):
625
- * - system prompt → body.system 字段(非消息数组元素)
626
- * - 工具定义 → input_schema(而非 parameters)
627
- * - 工具结果 → user 角色 + tool_result content block
628
- * - AI 工具调用 → assistant 角色 + tool_use content block
629
- *
630
- * max_tokens 策略:opus 模型 16384,其他模型 8192。
631
- * 认证头使用 `x-api-key`(而非 Authorization Bearer)。
1015
+ * 构建 Anthropic 请求(中)/ Build Anthropic Messages API request (EN).
632
1016
  */
633
1017
  function buildAnthropicRequest(config, params) {
634
1018
  const baseURL = resolveBaseURL(config);
@@ -645,6 +1029,7 @@ function buildAnthropicRequest(config, params) {
645
1029
  system: systemPrompt,
646
1030
  messages: anthropicMessages
647
1031
  };
1032
+ if (config.stream ?? true) body.stream = true;
648
1033
  if (anthropicTools && anthropicTools.length > 0) body.tools = anthropicTools;
649
1034
  return {
650
1035
  url: `${baseURL}/v1/messages`,
@@ -658,13 +1043,7 @@ function buildAnthropicRequest(config, params) {
658
1043
  };
659
1044
  }
660
1045
  /**
661
- * Anthropic Messages API 原始响应解析为统一的 AIChatResponse。
662
- *
663
- * Anthropic 使用 content block 数组返回多种内容:
664
- * - type="text" → 文本回复(可能多个,合并为一个字符串)
665
- * - type="tool_use" → 工具调用(id + name + input)
666
- *
667
- * Token 用量字段名也不同:input_tokens / output_tokens(非 prompt_tokens)。
1046
+ * 解析 Anthropic 响应(中)/ Parse raw Anthropic response (EN).
668
1047
  */
669
1048
  function parseAnthropicResponse(data) {
670
1049
  const d = data;
@@ -684,12 +1063,7 @@ function parseAnthropicResponse(data) {
684
1063
  };
685
1064
  }
686
1065
  /**
687
- * 将统一消息格式转换为 Anthropic 消息数组。
688
- *
689
- * 关键差异处理:
690
- * 1. 过滤 system 消息(Anthropic 通过 body.system 传入)
691
- * 2. tool 角色消息 → user 角色 + tool_result content block
692
- * 3. assistant 含 toolCalls → text + tool_use content blocks
1066
+ * 消息格式转换(中)/ Convert unified messages to Anthropic format (EN).
693
1067
  */
694
1068
  function convertMessages(messages) {
695
1069
  return messages.filter((m) => m.role !== "system").map((m) => {
@@ -724,64 +1098,80 @@ function convertMessages(messages) {
724
1098
  };
725
1099
  });
726
1100
  }
1101
+ /**
1102
+ * 解析 Anthropic SSE(中)/ Parse Anthropic SSE stream (EN).
1103
+ */
1104
/**
 * Parse an Anthropic Messages SSE stream into a unified response.
 *
 * Handled event types:
 * - `message_start`       -> input token count
 * - `content_block_start` -> begins a tool call when the block type is `tool_use`
 * - `content_block_delta` -> appends text (`text_delta`) or tool input JSON (`input_json_delta`)
 * - `content_block_stop`  -> finalizes the pending tool call (empty input parses as `{}`;
 *                            unparseable input drops that call)
 * - `message_delta`       -> output token count (kept unchanged when the event has no usage)
 * - `error`               -> remembered and thrown once the stream has been consumed
 *
 * @param {{ body?: unknown, json: () => Promise<unknown> }} response - Fetch-style response.
 * @param {number} [readTimeoutMs] - Optional per-read timeout forwarded to
 *   consumeSSEJSON; when omitted no timeout is applied.
 * @returns {Promise<{ text?: string, toolCalls?: Array<{ id: string, name: string, input: unknown }>, usage?: { inputTokens: number, outputTokens: number } }>}
 * @throws {Error} `AI API stream error: <type>: <message>` when the stream carries an `error` event.
 */
async function parseAnthropicStream(response, readTimeoutMs) {
	// Without a readable body there is nothing to stream; treat it as plain JSON.
	if (!response.body) return parseAnthropicResponse(await response.json());
	let text = "";
	const toolCalls = [];
	let currentToolUse = null;
	let inputTokens = 0;
	let outputTokens = 0;
	let streamError;
	await consumeSSEJSON(response, (event) => {
		// Ignore frames that are valid JSON but not objects (e.g. `null`).
		if (!event || typeof event !== "object") return;
		switch (event.type) {
			case "message_start":
				inputTokens = event.message?.usage?.input_tokens ?? 0;
				break;
			case "content_block_start": {
				const block = event.content_block;
				if (block?.type === "tool_use") currentToolUse = {
					id: block.id ?? "",
					name: block.name ?? "",
					inputJson: ""
				};
				break;
			}
			case "content_block_delta": {
				const delta = event.delta;
				if (delta?.type === "text_delta") text += delta.text ?? "";
				else if (delta?.type === "input_json_delta" && currentToolUse) currentToolUse.inputJson += delta.partial_json ?? "";
				break;
			}
			case "content_block_stop":
				if (currentToolUse) {
					try {
						toolCalls.push({
							id: currentToolUse.id,
							name: currentToolUse.name,
							input: JSON.parse(currentToolUse.inputJson || "{}")
						});
					} catch {
						// Unparseable tool input: drop this call, keep the rest of the message.
					}
					currentToolUse = null;
				}
				break;
			case "message_delta":
				// Keep the last known count when this delta carries no usage.
				outputTokens = event.usage?.output_tokens ?? outputTokens;
				break;
			case "error": {
				// Recorded here and thrown after consumption so the failure cannot be
				// lost inside the consumer's per-event error handling.
				const detail = event.error ?? {};
				streamError = new Error(`AI API stream error: ${detail.type ?? "unknown"}: ${detail.message ?? ""}`);
				return false;
			}
		}
	}, {
		stopOnDone: false,
		readTimeoutMs
	});
	if (streamError) throw streamError;
	return {
		text: text || void 0,
		toolCalls: toolCalls.length > 0 ? toolCalls : void 0,
		usage: inputTokens > 0 || outputTokens > 0 ? {
			inputTokens,
			outputTokens
		} : void 0
	};
}
727
1157
 
728
1158
  //#endregion
729
1159
  //#region src/core/ai-client/deepseek.ts
730
1160
  /**
731
- * DeepSeek AI 客户端。
732
- *
733
- * DeepSeek 使用 OpenAI 兼容的 Chat Completions API 格式,
734
- * 因此直接继承 OpenAIClient,复用请求构建和响应解析逻辑。
735
- *
736
- * 差异点(相对于 OpenAI):
737
- * - 端点:https://api.deepseek.com(Constants 中配置)
738
- * - 模型:deepseek-chat(V3)、deepseek-reasoner(R1)等
739
- * - 认证:Authorization: Bearer <API Key>(与 OpenAI 相同)
740
- * - tool_calls 格式与 OpenAI 完全一致
741
- *
742
- * 继承关系:
743
- * BaseAIClient(custom.ts)
744
- * └── OpenAIClient(openai.ts)
745
- * └── DeepSeekClient(本文件)— 可覆盖默认参数
746
- *
747
- * 使用示例:
748
- * ```ts
749
- * const client = new DeepSeekClient({
750
- * provider: "deepseek",
751
- * model: "deepseek-chat",
752
- * apiKey: "sk-xxx",
753
- * });
754
- * const response = await client.chat({ systemPrompt, messages, tools });
755
- * ```
1161
+ * DeepSeek 客户端封装(中)/ DeepSeek client wrapper (EN).
756
1162
  *
757
- * 参考文档:
758
- * - Tool Calls: https://api-docs.deepseek.com/zh-cn/guides/tool_calls
759
- * - Chat API: https://api-docs.deepseek.com/zh-cn/api/create-chat-completion/
1163
+ * DeepSeek 与 OpenAI Chat Completions 兼容,直接复用 OpenAIClient。
1164
+ * DeepSeek is OpenAI-compatible, so it reuses OpenAIClient behavior.
760
1165
  */
761
1166
  /**
762
- * DeepSeek AI 客户端 继承 OpenAIClient
763
- *
764
- * DeepSeek API 与 OpenAI Chat Completions API 完全兼容,
765
- * 包括 tool_calls、function calling、消息格式等。
766
- *
767
- * 如需自定义 DeepSeek 特有行为(如 strict 模式、思考模式等),
768
- * 可在此类中覆盖相关方法。
1167
+ * DeepSeek 客户端类(中)/ DeepSeek client class extending OpenAIClient (EN).
769
1168
  */
770
1169
  var DeepSeekClient = class extends OpenAIClient {};
771
1170
 
772
1171
  //#endregion
773
1172
  //#region src/core/ai-client/index.ts
774
1173
  /**
775
- * 创建 AI 客户端(高层 API)。
776
- *
777
- * 根据 provider 自动创建对应的客户端类实例:
778
- * - openai / copilot → new OpenAIClient(config)
779
- * - anthropic → new AnthropicClient(config)
780
- *
781
- * 返回 AIClient 接口,调用 chat() 即可与 AI 对话。
782
- *
783
- * @param config - 包含 provider、model、apiKey 等配置
784
- * @returns AIClient 实例(OpenAIClient 或 AnthropicClient)
1174
+ * 创建 AI 客户端(中)/ Create AI client by provider (EN).
785
1175
  */
786
1176
  function createAIClient(config) {
787
1177
  validateProvider(config.provider);
@@ -849,31 +1239,78 @@ var ToolRegistry = class {
849
1239
  //#endregion
850
1240
  //#region src/core/system-prompt.ts
851
1241
  /**
852
- * 构建系统提示词。
853
- * 由两部分组成:身份描述 + 可用工具列表。
1242
+ * 规范化额外指令(中)/ Normalize additional instructions (EN).
1243
+ */
1244
/**
 * Normalize extra prompt instructions into a list of non-empty strings.
 *
 * Accepts a single string or an array. Each string is trimmed; entries that
 * are empty after trimming, and entries that are not strings, are dropped.
 *
 * @param {string | unknown[] | null | undefined} input - Raw extra instructions.
 * @returns {string[]} Trimmed, non-empty instruction lines (empty array for falsy input).
 */
function normalizeExtraInstructions(input) {
	if (!input) return [];
	const candidates = Array.isArray(input) ? input : [input];
	const lines = [];
	for (const candidate of candidates) {
		// Non-string entries have no trim(); skip them instead of throwing.
		if (typeof candidate !== "string") continue;
		const line = candidate.trim();
		if (line) lines.push(line);
	}
	return lines;
}
1248
+ /**
1249
+ * 构建系统提示词(中)/ Build system prompt (EN).
1250
+ *
1251
+ * 约束:
1252
+ * - 输出给模型的提示词正文统一为英文。
1253
+ * - 中文仅用于代码注释,便于团队维护。
1254
+ *
1255
+ * Constraints:
1256
+ * - Prompt text sent to model stays English-only.
1257
+ * - Chinese content is used in code comments only for maintainability.
854
1258
  */
855
1259
  function buildSystemPrompt(params = {}) {
856
1260
  const sections = [];
857
- sections.push("You are AutoPilot, an AI agent embedded in the user's web page.\nYou can click, fill forms, read content, navigate, and execute JavaScript.\n\n## 操作规则\n\n1. 快照中每个元素末尾的 `#xxxx` 是 hash ID。操作时**必须**用 `#xxxx` 作为 dom 工具的 selector 参数。\n2. **禁止**猜测 CSS 选择器,只用快照中的 hash ID。\n3. 多个相似元素时,根据层级结构、所在功能区域和用户意图判断目标。\n4. 快照看不到目标时,先滚动页面或用 snapshot 获取更深层级。\n5. 破坏性操作前先与用户确认。\n\n## 决策流程\n\n每一轮你都会收到:**用户的原始请求**、**已完成的操作步骤**、**当前页面 DOM 快照**。\n你必须严格按以下流程决策:\n\n1. **阅读用户请求** — 理解最终目标。\n2. **审查已完成步骤** — 标记 ✅ 的操作已成功执行,**不要重复**;标记 ❌ 的操作失败了,需要换一种方式。\n3. **对照当前快照** — 确认页面当前状态,找到下一步要操作的目标元素。\n4. **只执行下一步** — 基于以上判断,只调用完成目标所需的下一个工具调用,不跳步、不重复。\n\n**关键**:已完成的步骤代表页面已经发生了变化,当前快照才是页面的真实状态。");
1261
+ sections.push([
1262
+ "You are AutoPilot, an AI agent controlling the current web page via tools.",
1263
+ "",
1264
+ "## Core Rules",
1265
+ "- Work from CURRENT snapshot + CURRENT remaining task directly. Do not restate the request.",
1266
+ "- Treat each round as task reduction:",
1267
+ " Input: (1) current remaining task, (2) previous round executed actions, (3) actions you execute this round.",
1268
+ " Output: new remaining task after removing this-round actions.",
1269
+ "- Use only visible targets from snapshot. Use #hashID as selector. Do not guess CSS selectors.",
1270
+ "- Batch independent visible actions in one round. Do not split one form into many rounds unnecessarily.",
1271
+ "- If an action will change DOM (open modal, navigate), stop after that action batch and continue next round with new snapshot.",
1272
+ "- Do NOT call page_info (snapshot/query/get_url/get_title). Snapshot is already provided every round.",
1273
+ "- For dropdown/select, use dom action=select_option (or fill on select).",
1274
+ "- Do NOT interact with AutoPilot UI unless user explicitly asks.",
1275
+ "",
1276
+ "## Output Contract",
1277
+ "- Return tool calls for this round.",
1278
+ "- Also include one plain text line:",
1279
+ " REMAINING: <new remaining task after this round>",
1280
+ " or REMAINING: DONE",
1281
+ "",
1282
+ "## Minimal Example",
1283
+ "Task: click button -> type \"abc\" in input -> send",
1284
+ "Round1 execute: click button",
1285
+ "Remaining: type \"abc\" in input -> send",
1286
+ "Round2 execute: type \"abc\" in input",
1287
+ "Remaining: send",
1288
+ "Round3 execute: send",
1289
+ "Remaining: DONE"
1290
+ ].join("\n"));
858
1291
  const tools = params.tools ?? [];
859
1292
  if (tools.length > 0) {
860
1293
  const toolLines = tools.map((t) => `- **${t.name}**: ${t.description}`);
861
1294
  sections.push("## Available Tools\n\n" + toolLines.join("\n") + "\n\nUse tools when needed to complete the user's request.");
862
1295
  }
1296
+ if (params.thinkingLevel) sections.push(["## Reasoning Profile", `- Thinking level: ${params.thinkingLevel}`].join("\n"));
1297
+ const extraInstructions = normalizeExtraInstructions(params.extraInstructions);
1298
+ if (extraInstructions.length > 0) sections.push(["## Extra Instructions", ...extraInstructions.map((line) => `- ${line}`)].join("\n"));
863
1299
  return sections.join("\n\n");
864
1300
  }
865
1301
 
866
1302
  //#endregion
867
- //#region src/web/dom-tool.ts
1303
+ //#region src/web/tools/dom-tool.ts
868
1304
  /**
869
1305
  * DOM Tool — 基于 Web API 的 DOM 操作工具。
870
1306
  *
871
1307
  * 替代 Playwright 的 click/fill/type 等操作,直接在页面上下文中执行。
872
1308
  * 运行环境:浏览器 Content Script。
873
1309
  *
874
- * 支持 11 种动作:
1310
+ * 支持 12 种动作:
875
1311
  * click — 点击元素
876
- * fill — 填写输入框(清空后设值)
1312
+ * fill — 填写可编辑控件(input/textarea/select/contenteditable)
1313
+ * select_option — 选择下拉框选项(value/label)
877
1314
  * type — 逐字符模拟键入
878
1315
  * focus — 聚焦元素
879
1316
  * hover — 鼠标悬停(触发 mouseenter/mouseover)
@@ -910,7 +1347,7 @@ function queryElement(selector) {
910
1347
  const el = document.querySelector(selector);
911
1348
  if (!el) return `未找到匹配 "${selector}" 的元素`;
912
1349
  return el;
913
- } catch (e) {
1350
+ } catch {
914
1351
  return `选择器语法错误: ${selector}`;
915
1352
  }
916
1353
  }
@@ -988,11 +1425,11 @@ function createDomTool() {
988
1425
  name: "dom",
989
1426
  description: [
990
1427
  "Perform DOM operations on the current page.",
991
- "Actions: click, fill, type, focus, hover, press, get_text, get_attr, set_attr, add_class, remove_class.",
1428
+ "Actions: click, fill, select_option, type, focus, hover, press, get_text, get_attr, set_attr, add_class, remove_class.",
992
1429
  "Use the hash ID from DOM snapshot (e.g. #a1b2c) as selector."
993
1430
  ].join(" "),
994
1431
  schema: Type.Object({
995
- action: Type.String({ description: "DOM action: click | fill | type | focus | hover | press | get_text | get_attr | set_attr | add_class | remove_class" }),
1432
+ action: Type.String({ description: "DOM action: click | fill | select_option | type | focus | hover | press | get_text | get_attr | set_attr | add_class | remove_class" }),
996
1433
  selector: Type.String({ description: "Element ref ID from snapshot (e.g. #r0, #r5) or CSS selector" }),
997
1434
  value: Type.Optional(Type.String({ description: "Value for fill/type/set_attr actions" })),
998
1435
  key: Type.Optional(Type.String({ description: "Key name for press action (e.g. Enter, Escape, Tab, ArrowDown, ArrowUp, Backspace, Delete, Space)" })),
@@ -1046,6 +1483,15 @@ function createDomTool() {
1046
1483
  try {
1047
1484
  switch (action) {
1048
1485
  case "click":
1486
+ if (el instanceof HTMLOptionElement) {
1487
+ const parent = el.parentElement;
1488
+ if (parent instanceof HTMLSelectElement) {
1489
+ parent.focus();
1490
+ parent.value = el.value;
1491
+ dispatchInputEvents(parent);
1492
+ return { content: `已选择 ${describeElement(parent)} 的选项 "${el.value}"` };
1493
+ }
1494
+ }
1049
1495
  if (el instanceof HTMLElement) {
1050
1496
  el.focus();
1051
1497
  el.click();
@@ -1091,6 +1537,24 @@ function createDomTool() {
1091
1537
  el.focus();
1092
1538
  el.value = value;
1093
1539
  dispatchInputEvents(el);
1540
+ } else if (el instanceof HTMLSelectElement) {
1541
+ el.focus();
1542
+ let matched = false;
1543
+ for (const option of Array.from(el.options)) if (option.value === value) {
1544
+ el.value = option.value;
1545
+ matched = true;
1546
+ break;
1547
+ }
1548
+ if (!matched) {
1549
+ const normalized = value.trim().toLowerCase();
1550
+ for (const option of Array.from(el.options)) if (option.text.trim().toLowerCase() === normalized) {
1551
+ el.value = option.value;
1552
+ matched = true;
1553
+ break;
1554
+ }
1555
+ }
1556
+ if (!matched) return { content: `"${selector}" 下拉框中不存在选项 "${value}"` };
1557
+ dispatchInputEvents(el);
1094
1558
  } else if (el instanceof HTMLElement && el.isContentEditable) {
1095
1559
  el.focus();
1096
1560
  el.textContent = value;
@@ -1098,6 +1562,29 @@ function createDomTool() {
1098
1562
  } else return { content: `"${selector}" 不是可编辑元素` };
1099
1563
  return { content: `已填写 ${describeElement(el)}: "${value}"` };
1100
1564
  }
1565
+ case "select_option": {
1566
+ const value = params.value;
1567
+ if (value === void 0) return { content: "缺少 value 参数" };
1568
+ if (!(el instanceof HTMLSelectElement)) return { content: `"${selector}" 不是下拉框元素` };
1569
+ el.focus();
1570
+ let matched = false;
1571
+ for (const option of Array.from(el.options)) if (option.value === value) {
1572
+ el.value = option.value;
1573
+ matched = true;
1574
+ break;
1575
+ }
1576
+ if (!matched) {
1577
+ const normalized = value.trim().toLowerCase();
1578
+ for (const option of Array.from(el.options)) if (option.text.trim().toLowerCase() === normalized) {
1579
+ el.value = option.value;
1580
+ matched = true;
1581
+ break;
1582
+ }
1583
+ }
1584
+ if (!matched) return { content: `"${selector}" 下拉框中不存在选项 "${value}"` };
1585
+ dispatchInputEvents(el);
1586
+ return { content: `已选择 ${describeElement(el)}: "${el.value}"` };
1587
+ }
1101
1588
  case "type": {
1102
1589
  const value = params.value;
1103
1590
  if (value === void 0) return { content: "缺少 value 参数" };
@@ -1166,7 +1653,7 @@ function createDomTool() {
1166
1653
  }
1167
1654
 
1168
1655
  //#endregion
1169
- //#region src/web/page-info-tool.ts
1656
+ //#region src/web/tools/page-info-tool.ts
1170
1657
  /**
1171
1658
  * Page Info Tool — 基于 Web API 的页面信息获取工具。
1172
1659
  *
@@ -1206,6 +1693,11 @@ function generateSnapshot(root = document.body, options = {}) {
1206
1693
  const maxDepth = opts.maxDepth ?? 6;
1207
1694
  const viewportOnly = opts.viewportOnly ?? true;
1208
1695
  const pruneLayout = opts.pruneLayout ?? true;
1696
+ const maxNodes = opts.maxNodes ?? 220;
1697
+ const maxChildren = opts.maxChildren ?? 25;
1698
+ const maxTextLength = opts.maxTextLength ?? 40;
1699
+ let emittedNodes = 0;
1700
+ let truncatedByNodeBudget = false;
1209
1701
  const refStore = opts.refStore;
1210
1702
  const SKIP_TAGS = new Set([
1211
1703
  "SCRIPT",
@@ -1247,14 +1739,18 @@ function generateSnapshot(root = document.body, options = {}) {
1247
1739
  "title",
1248
1740
  "for",
1249
1741
  "action",
1250
- "method",
1251
- "target",
1252
- "min",
1253
- "max",
1254
- "pattern",
1255
- "maxlength",
1256
- "tabindex"
1742
+ "method"
1257
1743
  ];
1744
+ const INTERACTIVE_TAGS = new Set([
1745
+ "A",
1746
+ "BUTTON",
1747
+ "INPUT",
1748
+ "TEXTAREA",
1749
+ "SELECT",
1750
+ "OPTION",
1751
+ "LABEL",
1752
+ "SUMMARY"
1753
+ ]);
1258
1754
  /** 布尔状态属性 — 只在存在时输出(无值),如 disabled、checked */
1259
1755
  const BOOLEAN_ATTRS = [
1260
1756
  "disabled",
@@ -1262,13 +1758,8 @@ function generateSnapshot(root = document.body, options = {}) {
1262
1758
  "readonly",
1263
1759
  "required",
1264
1760
  "selected",
1265
- "hidden",
1266
- "multiple",
1267
- "autofocus",
1268
- "open"
1761
+ "hidden"
1269
1762
  ];
1270
- /** 内联事件属性前缀 */
1271
- const EVENT_PREFIX = "on";
1272
1763
  /**
1273
1764
  * 计算元素在父节点中同标签兄弟里的序号(1-based,XPath 规范)。
1274
1765
  * 如果同标签兄弟只有一个,返回空字符串(无需索引消歧)。
@@ -1311,9 +1802,22 @@ function generateSnapshot(root = document.body, options = {}) {
1311
1802
  if (directText) return false;
1312
1803
  return true;
1313
1804
  }
1805
+ function isInteractiveElement(el) {
1806
+ if (INTERACTIVE_TAGS.has(el.tagName)) return true;
1807
+ if (el.hasAttribute("onclick")) return true;
1808
+ if (el.hasAttribute("role")) return true;
1809
+ if (el.hasAttribute("tabindex")) return true;
1810
+ if (el.hasAttribute("aria-label")) return true;
1811
+ return false;
1812
+ }
1314
1813
  function walk(el, depth, parentPath) {
1814
+ if (emittedNodes >= maxNodes) {
1815
+ truncatedByNodeBudget = true;
1816
+ return "";
1817
+ }
1315
1818
  if (depth > maxDepth) return "";
1316
1819
  if (SKIP_TAGS.has(el.tagName)) return "";
1820
+ if (el.hasAttribute("data-autopilot-ignore")) return "";
1317
1821
  const style = window.getComputedStyle(el);
1318
1822
  if (style.display === "none" || style.visibility === "hidden") return "";
1319
1823
  if (!isInViewport(el, depth)) return "";
@@ -1325,22 +1829,19 @@ function generateSnapshot(root = document.body, options = {}) {
1325
1829
  if (elId) attrs.push(`id="${elId}"`);
1326
1830
  const className = el.getAttribute("class")?.trim();
1327
1831
  if (className) {
1328
- const classes = className.split(/\s+/).filter((c) => c && !c.startsWith("data-v-") && c.length < 30).slice(0, 2).join(" ");
1329
- if (classes) attrs.push(`class="${classes}"`);
1832
+ const cls = className.split(/\s+/).find((c) => c && !c.startsWith("data-v-") && c.length < 25 && !/^[a-z]{1,2}\d|^_|^css-/.test(c));
1833
+ if (cls) attrs.push(`class="${cls}"`);
1330
1834
  }
1331
1835
  for (const attr of INTERACTIVE_ATTRS) {
1332
1836
  const val = el.getAttribute(attr);
1333
1837
  if (val) attrs.push(`${attr}="${val}"`);
1334
1838
  }
1335
1839
  for (const attr of BOOLEAN_ATTRS) if (el.hasAttribute(attr)) attrs.push(attr);
1336
- const events = [];
1337
- for (const attrObj of Array.from(el.attributes)) if (attrObj.name.startsWith(EVENT_PREFIX)) events.push(attrObj.name);
1338
- if (events.length > 0) attrs.push(`events=[${events.join(",")}]`);
1339
- const dataAttrs = [];
1340
- for (const attrObj of Array.from(el.attributes)) if (attrObj.name.startsWith("data-") && !attrObj.name.match(/^data-v-/) && dataAttrs.length < 2) dataAttrs.push(`${attrObj.name}="${attrObj.value.slice(0, 30)}"`);
1341
- if (dataAttrs.length > 0) attrs.push(...dataAttrs);
1840
+ if (el.hasAttribute("onclick")) attrs.push("onclick");
1841
+ const testId = el.getAttribute("data-testid") || el.getAttribute("data-test-id");
1842
+ if (testId) attrs.push(`data-testid="${testId.slice(0, 25)}"`);
1342
1843
  if ((el instanceof HTMLInputElement || el instanceof HTMLTextAreaElement) && el.value) {
1343
- const currentVal = el.value.slice(0, 60);
1844
+ const currentVal = el.value.slice(0, 40);
1344
1845
  if (el.getAttribute("value") !== currentVal) attrs.push(`val="${currentVal}"`);
1345
1846
  }
1346
1847
  let directText = "";
@@ -1353,28 +1854,45 @@ function generateSnapshot(root = document.body, options = {}) {
1353
1854
  }
1354
1855
  directText = directText.trim();
1355
1856
  if (isEmptyLayoutContainer(el, directText)) {
1857
+ const allChildren = Array.from(el.children);
1858
+ const interactiveChildren = allChildren.filter(isInteractiveElement);
1859
+ const nonInteractiveChildren = allChildren.filter((child) => !isInteractiveElement(child));
1860
+ const orderedChildren = [...interactiveChildren, ...nonInteractiveChildren];
1861
+ const selectedChildren = orderedChildren.slice(0, maxChildren);
1862
+ const omittedChildren = orderedChildren.length - selectedChildren.length;
1356
1863
  const childLines = [];
1357
- for (let i = 0; i < el.children.length; i++) {
1358
- const childResult = walk(el.children[i], depth, currentPath);
1864
+ for (let i = 0; i < selectedChildren.length; i++) {
1865
+ const childResult = walk(selectedChildren[i], depth, currentPath);
1359
1866
  if (childResult) childLines.push(childResult);
1360
1867
  }
1868
+ if (omittedChildren > 0) childLines.push(`${" ".repeat(depth)}... (${omittedChildren} children omitted)`);
1361
1869
  return childLines.join("\n");
1362
1870
  }
1363
1871
  let line = `${indent}[${tag}]`;
1364
- if (directText) line += ` "${directText.slice(0, 60)}"`;
1872
+ if (directText) line += ` "${directText.slice(0, maxTextLength)}"`;
1365
1873
  if (attrs.length) line += ` ${attrs.join(" ")}`;
1366
1874
  if (refStore) {
1367
1875
  const hashId = refStore.set(el, currentPath);
1368
1876
  line += ` #${hashId}`;
1369
1877
  } else line += ` ref="${currentPath}"`;
1370
1878
  const lines = [line];
1371
- for (let i = 0; i < el.children.length; i++) {
1372
- const childResult = walk(el.children[i], depth + 1, currentPath);
1879
+ emittedNodes++;
1880
+ const allChildren = Array.from(el.children);
1881
+ const interactiveChildren = allChildren.filter(isInteractiveElement);
1882
+ const nonInteractiveChildren = allChildren.filter((child) => !isInteractiveElement(child));
1883
+ const orderedChildren = [...interactiveChildren, ...nonInteractiveChildren];
1884
+ const selectedChildren = orderedChildren.slice(0, maxChildren);
1885
+ const omittedChildren = orderedChildren.length - selectedChildren.length;
1886
+ for (let i = 0; i < selectedChildren.length; i++) {
1887
+ const childResult = walk(selectedChildren[i], depth + 1, currentPath);
1373
1888
  if (childResult) lines.push(childResult);
1374
1889
  }
1890
+ if (omittedChildren > 0) lines.push(`${indent} ... (${omittedChildren} children omitted)`);
1375
1891
  return lines.join("\n");
1376
1892
  }
1377
- return walk(root, 0, "") || "(空页面)";
1893
+ const output = walk(root, 0, "") || "(空页面)";
1894
+ if (!truncatedByNodeBudget) return output;
1895
+ return `${output}\n... (snapshot truncated: maxNodes=${maxNodes})`;
1378
1896
  }
1379
1897
  /**
1380
1898
  * 查询所有匹配元素并返回摘要信息(标签、文本、关键属性)。
@@ -1395,7 +1913,7 @@ function queryAllElements(selector, limit = 20) {
1395
1913
  }
1396
1914
  if (elements.length > limit) results.push(` ...还有 ${elements.length - limit} 个元素`);
1397
1915
  return results.join("\n");
1398
- } catch (e) {
1916
+ } catch {
1399
1917
  return `选择器语法错误: ${selector}`;
1400
1918
  }
1401
1919
  }
@@ -1412,7 +1930,10 @@ function createPageInfoTool() {
1412
1930
  selector: Type.Optional(Type.String({ description: "CSS selector for query_all action" })),
1413
1931
  maxDepth: Type.Optional(Type.Number({ description: "Max depth for snapshot (default: 6)" })),
1414
1932
  viewportOnly: Type.Optional(Type.Boolean({ description: "Only snapshot elements visible in viewport (default: true)" })),
1415
- pruneLayout: Type.Optional(Type.Boolean({ description: "Collapse empty layout containers like div/span (default: true)" }))
1933
+ pruneLayout: Type.Optional(Type.Boolean({ description: "Collapse empty layout containers like div/span (default: true)" })),
1934
+ maxNodes: Type.Optional(Type.Number({ description: "Maximum nodes to include in snapshot (default: 220)" })),
1935
+ maxChildren: Type.Optional(Type.Number({ description: "Maximum children per element (default: 25)" })),
1936
+ maxTextLength: Type.Optional(Type.Number({ description: "Maximum text length per node (default: 40)" }))
1416
1937
  }),
1417
1938
  execute: async (params) => {
1418
1939
  const action = params.action;
@@ -1436,10 +1957,16 @@ function createPageInfoTool() {
1436
1957
  const maxDepth = params.maxDepth ?? 6;
1437
1958
  const viewportOnly = params.viewportOnly ?? true;
1438
1959
  const pruneLayout = params.pruneLayout ?? true;
1960
+ const maxNodes = params.maxNodes ?? 220;
1961
+ const maxChildren = params.maxChildren ?? 25;
1962
+ const maxTextLength = params.maxTextLength ?? 40;
1439
1963
  return { content: generateSnapshot(document.body, {
1440
1964
  maxDepth,
1441
1965
  viewportOnly,
1442
1966
  pruneLayout,
1967
+ maxNodes,
1968
+ maxChildren,
1969
+ maxTextLength,
1443
1970
  refStore: getActiveRefStore()
1444
1971
  }) };
1445
1972
  }
@@ -1464,7 +1991,7 @@ function createPageInfoTool() {
1464
1991
  }
1465
1992
 
1466
1993
  //#endregion
1467
- //#region src/web/navigate-tool.ts
1994
+ //#region src/web/tools/navigate-tool.ts
1468
1995
  /**
1469
1996
  * Navigate Tool — 基于 Web API 的页面导航工具。
1470
1997
  *
@@ -1544,7 +2071,7 @@ function createNavigateTool() {
1544
2071
  }
1545
2072
 
1546
2073
  //#endregion
1547
- //#region src/web/wait-tool.ts
2074
+ //#region src/web/tools/wait-tool.ts
1548
2075
  /**
1549
2076
  * Wait Tool — 基于 MutationObserver 的元素等待工具。
1550
2077
  *
@@ -1714,7 +2241,7 @@ function createWaitTool() {
1714
2241
  }
1715
2242
 
1716
2243
  //#endregion
1717
- //#region src/web/evaluate-tool.ts
2244
+ //#region src/web/tools/evaluate-tool.ts
1718
2245
  /**
1719
2246
  * Evaluate Tool — 在页面上下文中执行任意 JavaScript 表达式。
1720
2247
  *
@@ -1734,7 +2261,7 @@ function createWaitTool() {
1734
2261
  function safeEvaluate(expression) {
1735
2262
  try {
1736
2263
  return { result: new Function(`"use strict"; return (${expression});`)() };
1737
- } catch (err) {
2264
+ } catch {
1738
2265
  try {
1739
2266
  return { result: new Function(`"use strict"; ${expression}`)() };
1740
2267
  } catch (err2) {
@@ -2000,6 +2527,7 @@ var WebAgent = class {
2000
2527
  provider;
2001
2528
  model;
2002
2529
  baseURL;
2530
+ stream;
2003
2531
  dryRun;
2004
2532
  maxRounds;
2005
2533
  customSystemPrompt;
@@ -2021,8 +2549,9 @@ var WebAgent = class {
2021
2549
  this.provider = options.provider ?? "copilot";
2022
2550
  this.model = options.model ?? "gpt-4o";
2023
2551
  this.baseURL = options.baseURL;
2552
+ this.stream = options.stream ?? true;
2024
2553
  this.dryRun = options.dryRun ?? false;
2025
- this.maxRounds = options.maxRounds ?? 10;
2554
+ this.maxRounds = options.maxRounds ?? 40;
2026
2555
  this.customSystemPrompt = options.systemPrompt;
2027
2556
  this.memory = options.memory ?? false;
2028
2557
  this.autoSnapshot = options.autoSnapshot ?? true;
@@ -2065,6 +2594,14 @@ var WebAgent = class {
2065
2594
  setModel(model) {
2066
2595
  this.model = model;
2067
2596
  }
2597
+ /** 设置是否启用流式输出(SSE) */
2598
+ setStream(enabled) {
2599
+ this.stream = enabled;
2600
+ }
2601
+ /** 获取当前流式输出开关状态 */
2602
+ getStream() {
2603
+ return this.stream;
2604
+ }
2068
2605
  /** 切换干运行模式 */
2069
2606
  setDryRun(enabled) {
2070
2607
  this.dryRun = enabled;
@@ -2116,14 +2653,19 @@ var WebAgent = class {
2116
2653
  let systemPrompt = this.customSystemPrompt ?? buildSystemPrompt({ tools: this.registry.getDefinitions() });
2117
2654
  const refStore = new RefStore(globalThis.location?.href);
2118
2655
  setActiveRefStore(refStore);
2119
- if (this.autoSnapshot) try {
2656
+ let initialSnapshot;
2657
+ try {
2120
2658
  const snapshot = generateSnapshot(document.body, {
2121
2659
  maxDepth: 8,
2660
+ viewportOnly: false,
2661
+ maxNodes: 500,
2662
+ maxChildren: 30,
2122
2663
  ...this.snapshotOptions,
2123
2664
  refStore
2124
2665
  });
2125
- this.callbacks.onSnapshot?.(snapshot);
2126
- systemPrompt += wrapSnapshot(`\n\n## 当前页面 DOM 快照\n\n\`\`\`\n${snapshot}\n\`\`\``);
2666
+ initialSnapshot = snapshot;
2667
+ if (this.autoSnapshot) this.callbacks.onSnapshot?.(snapshot);
2668
+ systemPrompt += wrapSnapshot(`\n\n## DOM Snapshot\n\`\`\`\n${snapshot}\n\`\`\``);
2127
2669
  } catch {}
2128
2670
  const wrappedCallbacks = {
2129
2671
  ...this.callbacks,
@@ -2138,6 +2680,7 @@ var WebAgent = class {
2138
2680
  registry: this.registry,
2139
2681
  systemPrompt,
2140
2682
  message,
2683
+ initialSnapshot,
2141
2684
  history: this.memory ? this.history : void 0,
2142
2685
  dryRun: this.dryRun,
2143
2686
  maxRounds: this.maxRounds,
@@ -2159,7 +2702,8 @@ var WebAgent = class {
2159
2702
  provider: this.provider,
2160
2703
  model: this.model,
2161
2704
  apiKey: this.token,
2162
- baseURL: this.baseURL
2705
+ baseURL: this.baseURL,
2706
+ stream: this.stream
2163
2707
  });
2164
2708
  }
2165
2709
  };