agentpage 0.0.26 → 0.0.28

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -10,7 +10,25 @@ const DEFAULT_MAX_ROUNDS = 40;
10
10
  const DEFAULT_RECOVERY_WAIT_MS = 100;
11
11
  const DEFAULT_ACTION_RECOVERY_ROUNDS = 2;
12
12
  const DEFAULT_NOT_FOUND_RETRY_ROUNDS = 2;
13
- const DEFAULT_NOT_FOUND_RETRY_WAIT_MS = 2e3;
13
+ const DEFAULT_NOT_FOUND_RETRY_WAIT_MS = 1e3;
14
/** Default `timeout` (ms) handed to each wait step of the post-round stability barrier. */
const DEFAULT_ROUND_STABILITY_WAIT_TIMEOUT_MS = 4000;
/** Default `quietMs` (ms) handed to the `wait_for_stable` step of the barrier. */
const DEFAULT_ROUND_STABILITY_WAIT_QUIET_MS = 200;
/**
 * Default CSS selectors treated as "still loading" indicators.
 * The barrier joins them into one selector list and waits for it to become hidden.
 */
const DEFAULT_ROUND_STABILITY_WAIT_LOADING_SELECTORS = [
  ".ant-spin", ".ant-spin-spinning", ".ant-skeleton",
  ".el-loading-mask",
  ".bk-loading", ".bk-spin-loading", ".bk-skeleton", ".bk-sideslider-loading",
  ".t-loading", ".t-skeleton", ".t-skeleton__row",
  "[aria-busy=\"true\"]",
  ".skeleton", ".loading"
];
14
32
  /** 快照起始标记 — 用于在消息中识别快照边界 */
15
33
  const SNAPSHOT_START = "<!-- SNAPSHOT_START -->";
16
34
  /** 快照结束标记 */
@@ -20,15 +38,201 @@ const SNAPSHOT_OUTDATED = "[此快照已过期,请参考对话中最新的快
20
38
 
21
39
  //#endregion
22
40
  //#region src/core/agent-loop/helpers.ts
23
- /** 异步睡眠(中)/ Async sleep utility (EN). */
41
/**
 * Asynchronous sleep.
 *
 * Used for retry waits, throttling waits and similar pauses.
 *
 * @param {number} ms - Delay in milliseconds, passed straight to `setTimeout`.
 * @returns {Promise<void>} Promise that resolves once the timer fires.
 */
function sleep$1(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
27
- /** 统一内容为字符串(中)/ Normalize tool content to string (EN). */
49
/**
 * Normalize tool content to a string.
 *
 * Tool results may carry `content` as a string or as an arbitrary value; this
 * always returns a string so callers can log, search and slice it safely.
 *
 * @param {unknown} content - Raw tool content.
 * @returns {string} The content itself when it is a string, otherwise its
 *   pretty-printed JSON form; `""` when JSON has no representation for it;
 *   `String(content)` when JSON serialization throws.
 */
function toContentString(content) {
  if (typeof content === "string") return content;
  try {
    // JSON.stringify returns undefined (not a string) for undefined, functions
    // and symbols; fall back to an empty string so callers can chain string methods.
    return JSON.stringify(content, null, 2) ?? "";
  } catch {
    // Circular structures and BigInt make JSON.stringify throw. This helper is
    // best-effort normalization, so degrade to the generic string form instead.
    return String(content);
  }
}
31
- /** 元素不存在判定(中)/ Detect element-not-found failure (EN). */
58
/**
 * Parse snapshot expansion hints from model text.
 *
 * Expected line format: `SNAPSHOT_HINT: EXPAND_CHILDREN #ref1 #ref2`
 * (matched case-insensitively, one hint per line, several lines allowed).
 *
 * @param {string | null | undefined} text - Model output to scan.
 * @returns {string[]} Ref ids with the leading `#` removed, in order of
 *   appearance; duplicates are kept.
 */
function parseSnapshotExpandHints(text) {
  if (!text) return [];
  const hintLinePattern = /^\s*SNAPSHOT_HINT\s*:\s*EXPAND_CHILDREN\s+(.+)$/gim;
  const collected = [];
  for (const hintLine of String(text).matchAll(hintLinePattern)) {
    const hashTokens = (hintLine[1] ?? "").match(/#[A-Za-z0-9_-]+/g) ?? [];
    collected.push(...hashTokens.map((hashToken) => hashToken.replace(/^#/, "")));
  }
  return collected;
}
76
/**
 * Extract the ref id from a pure hash selector.
 *
 * Only a selector that is exactly `#<id>` (after trimming), e.g. `#1rv01x`,
 * is accepted. Compound CSS such as `.x #id` yields null to avoid false matches.
 *
 * @param {unknown} toolInput - Tool input; its `selector` property is inspected.
 * @returns {string | null} The id without `#`, or null when not a pure hash selector.
 */
function extractHashSelectorRef(toolInput) {
  const isObjectInput = typeof toolInput === "object" && toolInput !== null;
  if (!isObjectInput) return null;
  const { selector } = toolInput;
  if (typeof selector !== "string") return null;
  const hashOnly = /^#([A-Za-z0-9_-]+)$/.exec(selector.trim());
  return hashOnly === null ? null : hashOnly[1];
}
89
/**
 * Build the task array for one round.
 *
 * Flattens a round of tool calls into stable `name:JSON(input)` strings, used
 * for echoing the previous round's tasks and for repeated-batch detection.
 *
 * @param {Array<{name: string, input: unknown}>} toolCalls - Tool calls of one round.
 * @returns {string[]} One string per tool call, in the same order.
 */
function buildTaskArray(toolCalls) {
  const tasks = [];
  for (const call of toolCalls) {
    const serializedInput = JSON.stringify(call.input);
    tasks.push(`${call.name}:${serializedInput}`);
  }
  return tasks;
}
98
/**
 * Normalize model output for injection into the next round.
 *
 * When the text contains a `REMAINING:` marker, everything after the first
 * marker is kept as `REMAINING: <rest>`. Otherwise only the first paragraph
 * (text before the first blank line) is kept, capped at 220 characters, so
 * long planning text does not pollute the next round's context.
 *
 * @param {string | null | undefined} text - Raw model text.
 * @returns {string} The condensed summary, or `""` for empty input.
 */
function normalizeModelOutput(text) {
  const body = text ? text.trim() : "";
  if (body === "") return "";
  const protocol = /REMAINING\s*:\s*([\s\S]*)$/i.exec(body);
  if (protocol !== null) return `REMAINING: ${protocol[1].trim()}`;
  const [firstParagraph] = body.split(/\n\s*\n/);
  return (firstParagraph?.trim() ?? body).slice(0, 220);
}
113
/**
 * Parse the `REMAINING:` protocol field from model text.
 *
 * Only parsing happens here; fallback strategy is the caller's concern.
 *
 * @param {string | null | undefined} text - Raw model text.
 * @returns {string | null}
 *   - `""` when the model reported DONE,
 *   - a non-empty string holding the new remaining instruction,
 *   - `null` when the protocol marker is missing.
 */
function parseRemainingInstruction(text) {
  if (!text) return null;
  const match = text.match(/REMAINING\s*:\s*([\s\S]*)$/i);
  if (!match) return null;
  const value = match[1].trim();
  // The capture runs to the end of the text, so a summary written after
  // "REMAINING: DONE" would otherwise be mistaken for a new instruction.
  // Completion is therefore decided by the first line only, tolerating
  // markdown/quote wrappers and trailing sentence punctuation.
  const firstLine = value.split(/\r?\n/, 1)[0].trim();
  const isDone = /^[`*"']*done[`*"']*[.!。!]*$/i.test(firstLine);
  return isDone ? "" : value;
}
130
/**
 * Derive the remaining instruction for the next round.
 *
 * - Protocol present: use the value parsed from the model's `REMAINING:` field.
 * - Protocol missing: keep `currentInstruction` unchanged (the caller decides
 *   whether to advance heuristically).
 *
 * @param {string | null | undefined} text - Raw model text.
 * @param {string} currentInstruction - Remaining instruction of the current round.
 * @returns {{nextInstruction: string, hasRemainingProtocol: boolean}}
 */
function deriveNextInstruction(text, currentInstruction) {
  const fromProtocol = parseRemainingInstruction(text);
  const hasRemainingProtocol = fromProtocol !== null;
  return {
    nextInstruction: hasRemainingProtocol ? fromProtocol : currentInstruction,
    hasRemainingProtocol
  };
}
148
/**
 * Heuristically trim the remaining instruction.
 *
 * Used when the model omitted the `REMAINING:` protocol but actions were
 * executed this round. The instruction is split into linear steps on
 * sequencing markers (`->`, `=>`, `→`, "then", "and then", "next",
 * "after that", 然后, 接着, 随后, 之后, 再) and the leading step is dropped.
 *
 * This is a conservative strategy: exactly ONE step is consumed no matter how
 * many actions ran (`executedCount` only gates whether any reduction happens).
 * It is not semantically perfect but keeps the remaining text from stalling.
 *
 * @param {string} currentInstruction - Current remaining instruction.
 * @param {number} executedCount - Number of actions executed this round.
 * @returns {string} The instruction without its first step, joined with
 *   " 然后 "; the input unchanged when nothing can be reduced.
 */
function reduceRemainingHeuristically(currentInstruction, executedCount) {
  // `!(x > 0)` also rejects NaN/undefined, which `x <= 0` would let through.
  if (!currentInstruction.trim() || !(executedCount > 0)) return currentInstruction;
  const hasSequencing = /(->|=>|→|\bthen\b|\band then\b|\bnext\b|\bafter that\b|然后|接着|随后|之后|再)/i;
  if (!hasSequencing.test(currentInstruction)) return currentInstruction;
  // English keywords need the same word boundaries as the detection regex;
  // without them "authentication" is split at its embedded "then".
  const stepSeparator = /\s*(?:\b(?:and then|then|next|after that)\b|然后|接着|随后|之后|再)\s*/i;
  const parts = currentInstruction
    .replace(/\s+/g, " ")
    .replace(/(->|=>|→)/g, " 然后 ")
    .split(stepSeparator)
    .map((part) => part.trim())
    .filter(Boolean);
  if (parts.length <= 1) return currentInstruction;
  // The previous clamp `Math.min(Math.max(1, n), 1)` always evaluated to 1,
  // so dropping exactly the first step preserves the existing behavior.
  return parts.slice(1).join(" 然后 ");
}
165
/**
 * Decide whether the current round must end after this tool call.
 *
 * Meaning: after an action that may change the DOM structure, stop and wait
 * for a fresh snapshot in the next round.
 *
 * Rules implemented here:
 * - `navigate` with action `goto` / `back` / `forward` / `reload` breaks the round
 * - `dom` breaks only for action `press` whose `key` (or `value`) is exactly "Enter"
 * - `evaluate` always breaks the round
 * - every other tool does not
 *
 * @param {string} toolName - Tool name.
 * @param {unknown} toolInput - Tool input object.
 * @returns {boolean} True when the round should be cut short.
 */
function shouldForceRoundBreak(toolName, toolInput) {
  const action = getToolAction(toolInput);
  switch (toolName) {
    case "navigate":
      return ["goto", "back", "forward", "reload"].includes(action);
    case "dom": {
      if (action !== "press") return false;
      const hasFields = typeof toolInput === "object" && toolInput !== null;
      const pressedKey = hasFields ? String(toolInput.key ?? toolInput.value ?? "") : "";
      return pressedKey === "Enter";
    }
    case "evaluate":
      return true;
    default:
      return false;
  }
}
185
/**
 * Decide whether an action may change page structure or state.
 *
 * Used as the trigger for the post-round stability wait:
 * - true: after the round, run the loading-indicator wait plus the DOM-quiet wait
 * - false: skip the wait and go straight to the next round
 *
 * `navigate` and `evaluate` always count; for `dom` only the mutating or
 * interacting actions listed below count; any other tool never counts.
 *
 * @param {string} toolName - Tool name.
 * @param {unknown} toolInput - Tool input object.
 * @returns {boolean} True when the call may mutate the page.
 */
function isPotentialDomMutation(toolName, toolInput) {
  const action = getToolAction(toolInput);
  if (toolName === "navigate" || toolName === "evaluate") return true;
  if (toolName !== "dom" || !action) return false;
  switch (action) {
    case "click":
    case "fill":
    case "select_option":
    case "clear":
    case "check":
    case "uncheck":
    case "type":
    case "focus":
    case "hover":
    case "scroll":
    case "press":
    case "set_attr":
    case "add_class":
    case "remove_class":
      return true;
    default:
      return false;
  }
}
215
/**
 * Collect an element-not-found task.
 *
 * @param {string} name - Tool name of the failed call.
 * @param {unknown} input - Tool input of the failed call.
 * @param {{content: unknown, details?: unknown}} result - Tool result to classify.
 * @returns {{name: string, input: unknown, reason: string} | null} Null when
 *   the result is not an element-not-found failure; otherwise a descriptor
 *   (reason capped at 240 characters) that can enter the not-found retry flow.
 */
function collectMissingTask(name, input, result) {
  if (isElementNotFoundResult(result)) {
    const reason = toContentString(result.content).slice(0, 240);
    return { name, input, reason };
  }
  return null;
}
229
+ /**
230
+ * 元素不存在判定。
231
+ *
232
+ * 判定顺序:
233
+ * 1) 优先看结构化错误码 `ELEMENT_NOT_FOUND`
234
+ * 2) 回退看中文错误文本关键词(兼容历史结果格式)
235
+ */
32
236
  function isElementNotFoundResult(result) {
33
237
  const details = result.details;
34
238
  if (details && typeof details === "object") {
@@ -37,14 +241,19 @@ function isElementNotFoundResult(result) {
37
241
  const content = toContentString(result.content);
38
242
  return content.includes("未找到") && content.includes("元素");
39
243
  }
40
- /** 生成稳定调用键(中)/ Build stable key for a tool call (EN). */
244
/**
 * Build a stable key for a tool call.
 *
 * Used as the map key for recovery attempts: calls with the same name and the
 * same serialized input are treated as the same call.
 *
 * @param {string} name - Tool name.
 * @param {unknown} input - Tool input.
 * @returns {string} `name:JSON(input)`.
 */
function buildToolCallKey(name, input) {
  const serializedInput = JSON.stringify(input);
  return `${name}:${serializedInput}`;
}
44
252
  /**
45
- * 解析恢复等待时长(中)/ Resolve recovery wait duration (EN).
253
+ * 解析恢复等待时长。
46
254
  * 优先级:waitMs > waitSeconds > 默认值。
47
- * Priority: waitMs > waitSeconds > default value.
255
+ *
256
+ * 统一返回毫秒整数,且最小为 0。
48
257
  */
49
258
  function resolveRecoveryWaitMs(input) {
50
259
  if (!input || typeof input !== "object") return DEFAULT_RECOVERY_WAIT_MS;
@@ -55,13 +264,21 @@ function resolveRecoveryWaitMs(input) {
55
264
  if (typeof waitSeconds === "number" && Number.isFinite(waitSeconds)) return Math.max(0, Math.floor(waitSeconds * 1e3));
56
265
  return DEFAULT_RECOVERY_WAIT_MS;
57
266
  }
58
- /** 读取工具 action(中)/ Read tool action from input (EN). */
267
/**
 * Read the `action` field of a tool input.
 *
 * @param {unknown} input - Tool input.
 * @returns {string | undefined} The action when `input` is an object whose
 *   `action` is a string; undefined otherwise.
 */
function getToolAction(input) {
  const candidate = typeof input === "object" && input !== null ? input.action : undefined;
  return typeof candidate === "string" ? candidate : undefined;
}
64
- /** 判定错误标记(中)/ Check whether result is marked as error (EN). */
277
/**
 * Check whether a tool result is flagged as an error.
 *
 * A result counts as an error when `result.details` is an object whose
 * `error` property is truthy.
 *
 * @param {{details?: unknown}} result - Tool result.
 * @returns {boolean} True when the result carries an error flag.
 */
function hasToolError(result) {
  const { details } = result;
  if (!details || typeof details !== "object") return false;
  return Boolean(details.error);
}
@@ -69,25 +286,24 @@ function hasToolError(result) {
69
286
  //#endregion
70
287
  //#region src/core/agent-loop/snapshot.ts
71
288
  /**
72
- * 读取页面快照(中)/ Read current page snapshot (EN).
289
+ * 读取页面快照。
73
290
  *
74
291
  * 默认关闭 viewportOnly,优先完整性。
75
- * viewportOnly defaults to false to prioritize completeness.
76
292
  *
77
- * 步骤(中)/ Steps (EN):
293
+ * 步骤:
78
294
  * 1) 合并调用方 options 与默认值(深度/裁剪/剪枝/节点上限等)。
79
295
  * 2) 分发 `page_info.snapshot` 获取当前 DOM 文本快照。
80
296
  * 3) 使用 `toContentString` 归一化输出,避免 provider 差异导致结构不一致。
81
297
  * 4) 返回稳定字符串给 loop,供后续注入消息与统计。
82
298
  *
83
- * 默认参数意图(中)/ Default parameter rationale (EN):
299
+ * 默认参数意图:
84
300
  * - `maxDepth=8`: 保留足够层级,减少关键控件被截断。
85
301
  * - `viewportOnly=false`: 优先完整性,避免误判“元素不存在”。
86
302
  * - `pruneLayout=true`: 抑制纯布局噪声,降低 token 压力。
87
303
  * - `maxNodes=500` / `maxChildren=30`: 控制体积上限,兼顾可读性。
88
304
  * - `maxTextLength=40`: 防止长文本淹没结构信息。
89
305
  *
90
- * 压缩/剪枝是怎么做的(中)/ How compression & pruning works in practice (EN):
306
+ * 压缩/剪枝是怎么做的:
91
307
  * - `viewportOnly=true` 时:仅保留与视口相交元素(根层容器保留),完全视口外元素跳过。
92
308
  * - `pruneLayout=true` 时:无 id/无语义/无交互/无直接文本的布局容器会被“折叠”,
93
309
  * 子节点直接提升输出,减少无意义层级;当同一折叠容器提升出多个相邻节点时,
@@ -98,10 +314,10 @@ function hasToolError(result) {
98
314
  * - 交互优先排序:优先输出按钮/输入框/链接等交互元素,再输出普通元素。
99
315
  * - 属性压缩:仅保留关键属性(如 id、关键 class、交互属性、布尔状态、val),减少冗余 token。
100
316
  *
101
- * 输入/输出(中)/ I/O contract (EN):
102
- * - In: `ToolRegistry` + 可选快照参数
103
- * - Out: 归一化后的快照字符串(始终 string)
104
- * - Side effects: 无本地状态写入;仅依赖工具调用结果
317
+ * 输入/输出:
318
+ * - 输入:`ToolRegistry` + 可选快照参数
319
+ * - 输出:归一化后的快照字符串(始终 string)
320
+ * - 副作用:无本地状态写入;仅依赖工具调用结果
105
321
  */
106
322
  async function readPageSnapshot(registry, options) {
107
323
  return toContentString((await registry.dispatch("page_info", {
@@ -118,9 +334,9 @@ async function readPageSnapshot(registry, options) {
118
334
  })).content);
119
335
  }
120
336
  /**
121
- * 包裹快照(中)/ Wrap snapshot with boundary markers (EN).
337
+ * 包裹快照。
122
338
  *
123
- * 作用(中)/ Purpose (EN):
339
+ * 作用:
124
340
  * - 为快照加 `SNAPSHOT_START/END` 边界,便于后续正则定位。
125
341
  * - 支持去重与旧快照剥离,防止多轮 token 累积。
126
342
  * - 仅做纯字符串变换,不访问外部状态。
@@ -128,20 +344,20 @@ async function readPageSnapshot(registry, options) {
128
344
  function wrapSnapshot(snapshot) {
129
345
  return `${SNAPSHOT_START}\n${snapshot}\n${SNAPSHOT_END}`;
130
346
  }
131
- /** 转义正则字符(中)/ Escape regex special chars (EN). */
347
/**
 * Escape regular-expression metacharacters so the text matches literally.
 *
 * @param {string} str - Text to embed in a RegExp source.
 * @returns {string} The text with every metacharacter prefixed by a backslash.
 */
function escapeRegex(str) {
  const metaCharacters = /[.*+?^${}()|[\]\\]/g;
  return str.replace(metaCharacters, "\\$&");
}
135
- /** 快照块匹配正则(中)/ Regex for snapshot blocks (EN). */
351
/** Global regex matching one snapshot block: start marker, shortest possible body, end marker. */
const SNAPSHOT_REGEX = new RegExp(
  escapeRegex(SNAPSHOT_START) + "[\\s\\S]*?" + escapeRegex(SNAPSHOT_END),
  "g"
);
137
- /** 是否包含快照标记(中)/ Check whether text includes snapshot markers (EN). */
353
/**
 * Check whether the text contains snapshot markers.
 *
 * @param {string} text - Text to inspect.
 * @returns {boolean} True only when both the start and the end marker occur in the text.
 */
function containsSnapshot(text) {
  return [SNAPSHOT_START, SNAPSHOT_END].every((marker) => text.includes(marker));
}
141
357
  /**
142
- * 剥离旧快照(中)/ Strip outdated snapshot blocks from system prompt (EN).
358
+ * 剥离旧快照。
143
359
  *
144
- * 说明(中)/ Notes (EN):
360
+ * 说明:
145
361
  * - 当 prompt 中已有历史快照时,将其替换为过期占位文本。
146
362
  * - 让每轮真正生效的只有“最新注入快照”,减少冲突上下文。
147
363
  * - 这是 prompt 级清理;不会触碰 tool trace 中的原始结果对象。
@@ -154,7 +370,15 @@ function stripSnapshotFromPrompt(prompt) {
154
370
  //#endregion
155
371
  //#region src/core/agent-loop/messages.ts
156
372
  /**
157
- * 显式 UI 意图判定(中)/ Detect explicit intent to operate AutoPilot UI (EN).
373
+ * 显式 UI 意图判定。
374
+ *
375
+ * 用途:默认禁止模型操作 AutoPilot 自己的聊天 UI(输入框/发送按钮等),
376
+ * 只有当用户文本里“同时出现 UI 关键词 + 操作动词”时才放行。
377
+ *
378
+ * 判定逻辑:
379
+ * - `hasAgentUiKeyword`:是否提到聊天面板/输入框/发送按钮等
380
+ * - `hasActionVerb`:是否包含点击/输入/发送等动作意图
381
+ * - 二者都满足才返回 true
158
382
  */
159
383
  function isExplicitAgentUiRequest(userMessage) {
160
384
  const lower = userMessage.toLowerCase();
@@ -163,7 +387,12 @@ function isExplicitAgentUiRequest(userMessage) {
163
387
  const hasActionVerb = /(press|click|type|fill|send|input|submit|enter)/i.test(lower) || /(输入|点击|发送|填写|填入|操作|提交|回车|按下)/.test(compact);
164
388
  return hasAgentUiKeyword && hasActionVerb;
165
389
  }
166
- /** 输入摘要(中)/ Build brief text for tool input (EN). */
390
+ /**
391
+ * 输入摘要。
392
+ *
393
+ * 把工具输入压缩成一段短文本(用于轨迹展示),
394
+ * 只保留高价值字段,避免日志过长。
395
+ */
167
396
  function formatToolInputBrief(input) {
168
397
  if (!input || typeof input !== "object") return "";
169
398
  const params = input;
@@ -185,7 +414,11 @@ function formatToolInputBrief(input) {
185
414
  return ` (${parts.join(", ")})`;
186
415
  }
187
416
  /**
188
- * 结果摘要(中)/ Build one-line summary for tool result (EN).
417
+ * 结果摘要。
418
+ *
419
+ * 读取工具结果首行,拼接错误码,生成一行可读结论:
420
+ * - 成功:`✓ ...`
421
+ * - 失败:`✗ ... [CODE]`
189
422
  */
190
423
  function formatToolResultBrief(result) {
191
424
  const firstLine = toContentString(result.content).split("\n").find((l) => l.trim())?.trim().slice(0, 80) ?? "";
@@ -196,15 +429,20 @@ function formatToolResultBrief(result) {
196
429
  return `✓ ${firstLine}`;
197
430
  }
198
431
  /**
199
- * 构建紧凑消息数组(中)/ Build compact AI message array (EN).
432
+ * 构建紧凑消息数组。
200
433
  *
201
- * Round 0: task + snapshot.
202
- * Round 1+: master goal + done steps + execution context + latest snapshot.
434
+ * 两种轮次语义:
435
+ * - Round 0:发送“初始任务 + 当前快照 + 执行约束”
436
+ * - Round 1+:发送“已完成步骤 + 当前 remaining + 最新快照”
203
437
  *
204
- * 新增渐进式语义(中)/ Progressive semantics (EN):
438
+ * 渐进式语义:
205
439
  * - `remainingInstruction`:当前轮次仍待执行的文本。
206
440
  * - `previousRoundTasks`:上一轮已执行的任务数组,避免重复计划。
207
- * - 消息中要求模型输出 `REMAINING: ...` 或 `REMAINING: DONE`,供下一轮继续消费。
441
+ * - `previousRoundModelOutput`:上一轮模型输出摘要,用于 task-reduction 输入。
442
+ * - `previousRoundPlannedTasks`:上一轮计划数组,用于对齐“计划 vs 实际执行”。
443
+ * - `protocolViolationHint`:协议修复提示(当 remaining 未完成但模型无动作时)。
444
+ *
445
+ * 输出:符合 AIMessage 结构的消息数组,可直接传给 AIClient.chat。
208
446
  */
209
447
  function buildCompactMessages(userMessage, trace, latestSnapshot, currentUrl, history, remainingInstruction, previousRoundTasks, previousRoundModelOutput, previousRoundPlannedTasks, protocolViolationHint) {
210
448
  const messages = history ? [...history] : [];
@@ -241,26 +479,10 @@ function buildCompactMessages(userMessage, trace, latestSnapshot, currentUrl, hi
241
479
  content: `Done steps (do NOT repeat):\n${traceParts.join("\n")}`
242
480
  });
243
481
  const hasErrors = trace.some((e) => hasToolError(e.result));
244
- const contextParts = [
245
- "## Execution context",
246
- "Current remaining instruction:",
247
- activeInstruction,
248
- "",
249
- "Task-reduction model:",
250
- "Input: current remaining instruction + previous round executed actions + this-round actions.",
251
- "Output: new remaining instruction after removing this-round actions.",
252
- "Start from visible page state directly. Do NOT restate task. Do NOT output planning text.",
253
- "Execute all independent visible sub-tasks in one round.",
254
- "Do NOT act on elements not present in this snapshot yet.",
255
- "If action changes DOM (open modal/navigate), stop after that batch and continue next round.",
256
- "Do NOT call page_info (get_url/get_title/query_all/snapshot).",
257
- "For dropdown/select fields, use dom with action=select_option (or fill on a select).",
258
- "If a needed list shows `... (N children omitted)` under a specific container, output `SNAPSHOT_HINT: EXPAND_CHILDREN #<containerRef>` and wait for next round snapshot.",
259
- "Build the minimal action array from current snapshot to finish this remaining instruction in one round whenever possible.",
260
- "For deterministic increase/decrease controls, compute delta from current visible value and issue exactly that many clicks in one round (e.g., +2 => two increase clicks). Do not overshoot then undo.",
261
- "Stop rule: once requested state is reached, stop tool calls. If verification is needed, verify once and then output REMAINING: DONE.",
262
- allowAgentUiInteraction ? "User explicitly asked to operate AutoPilot UI. You may interact with chat input/send/dock only as requested." : "Do NOT interact with any AI chat UI elements (chat input, send button, dock). Only operate on the actual page content."
263
- ];
482
+ const needsMasterGoalAnchor = activeInstruction.trim().toLowerCase() !== userMessage.trim().toLowerCase();
483
+ const contextParts = ["## Execution context"];
484
+ if (needsMasterGoalAnchor) contextParts.push(`Master goal (reference only — do NOT restart from scratch):`, userMessage, "");
485
+ contextParts.push("Current remaining instruction:", activeInstruction, "", "Task-reduction model:", "Input: current remaining instruction + previous round executed actions + this-round actions.", "Output: new remaining instruction after removing this-round actions.", "Start from visible page state directly. Do NOT restate task. Do NOT output planning text.", "Execute all independent visible sub-tasks in one round.", "Do NOT act on elements not present in this snapshot yet.", "If action changes DOM (open modal/navigate), stop after that batch and continue next round.", "Do NOT call page_info (get_url/get_title/query_all/snapshot).", "For dropdown/select fields, use dom with action=select_option (or fill on a select).", "If a needed list shows `... (N children omitted)` under a specific container, output `SNAPSHOT_HINT: EXPAND_CHILDREN #<containerRef>` and wait for next round snapshot.", "Build the minimal action array from current snapshot to finish this remaining instruction in one round whenever possible.", "For deterministic increase/decrease controls, compute delta from current visible value and issue exactly that many clicks in one round (e.g., +2 => two increase clicks). Do not overshoot then undo.", "Stop rule: once requested state is reached, stop tool calls. If verification is needed, verify once and then output REMAINING: DONE.", allowAgentUiInteraction ? "User explicitly asked to operate AutoPilot UI. You may interact with chat input/send/dock only as requested." : "Do NOT interact with any AI chat UI elements (chat input, send button, dock). Only operate on the actual page content.");
264
486
  if (hasErrors) contextParts.push("", "The last step failed. Retry with a different approach, or skip and continue with other visible targets.");
265
487
  else contextParts.push("", "If the goal is fully done, reply with a short summary (no tool calls).");
266
488
  if (previousRoundTasks && previousRoundTasks.length > 0) contextParts.push("", "Previous round planned task array (already executed):", ...previousRoundTasks.map((task, index) => `${index + 1}. ${task}`));
@@ -284,7 +506,7 @@ function buildCompactMessages(userMessage, trace, latestSnapshot, currentUrl, hi
284
506
 
285
507
  //#endregion
286
508
  //#region src/core/agent-loop/recovery.ts
287
- /** 冗余 page_info 动作(中)/ Redundant page_info actions to intercept (EN). */
509
+ /** 冗余 page_info 动作集合。 */
288
510
  const REDUNDANT_PAGE_INFO_ACTIONS = new Set([
289
511
  "snapshot",
290
512
  "query_all",
@@ -293,7 +515,14 @@ const REDUNDANT_PAGE_INFO_ACTIONS = new Set([
293
515
  "get_viewport"
294
516
  ]);
295
517
  /**
296
- * 冗余 page_info 检查(中)/ Check whether page_info call is redundant (EN).
518
+ * 冗余 page_info 检查。
519
+ *
520
+ * 场景:模型在 loop 中频繁请求 page_info,导致“只看不做”。
521
+ * 处理:命中白名单动作时直接返回拦截结果,不真正执行工具。
522
+ *
523
+ * 示例:
524
+ * - 输入:`page_info.snapshot`
525
+ * - 输出:`REDUNDANT_PAGE_INFO_SKIPPED`
297
526
  */
298
527
  function checkRedundantSnapshot(toolName, toolInput, _latestSnapshot, round) {
299
528
  if (toolName !== "page_info") return null;
@@ -309,7 +538,14 @@ function checkRedundantSnapshot(toolName, toolInput, _latestSnapshot, round) {
309
538
  return null;
310
539
  }
311
540
  /**
312
- * 快照防抖(中)/ Debounce repeated snapshot calls (EN).
541
+ * 快照防抖。
542
+ *
543
+ * 规则:连续触发 `page_info.snapshot` 时,第 2 次起标记为冗余,
544
+ * 返回 `REDUNDANT_SNAPSHOT`,提醒模型直接使用已有快照继续执行。
545
+ *
546
+ * 返回值:
547
+ * - `result`:可能被替换成防抖后的结果
548
+ * - `consecutiveCount`:更新后的连续 snapshot 计数
313
549
  */
314
550
  function applySnapshotDebounce(toolName, toolInput, result, consecutiveCount) {
315
551
  if (toolName === "page_info" && getToolAction(toolInput) === "snapshot") {
@@ -336,10 +572,18 @@ function applySnapshotDebounce(toolName, toolInput, result, consecutiveCount) {
336
572
  };
337
573
  }
338
574
  /**
339
- * 元素未找到恢复(中)/ Recover from element-not-found failures (EN).
575
+ * 元素未找到恢复。
576
+ *
577
+ * 触发条件:
578
+ * - 工具是 `dom`
579
+ * - 结果被识别为“元素未找到”
580
+ *
581
+ * 处理流程:
582
+ * 1) 按调用键统计恢复次数(同 name + input 视为同一调用)
583
+ * 2) 在上限内:等待 -> 刷新快照 -> 返回 `ELEMENT_NOT_FOUND_RECOVERY`
584
+ * 3) 超过上限:返回 `ELEMENT_NOT_FOUND_MAX_RECOVERY_REACHED`
340
585
  *
341
- * 前两次自动恢复,超过上限后返回终止提示。
342
- * Auto-recovers for initial attempts, then returns max-recovery signal.
586
+ * 说明:函数只返回“恢复后的结果描述”,是否继续下一轮由主循环决定。
343
587
  */
344
588
  async function handleElementRecovery(toolName, toolInput, result, recoveryAttempts, registry, pageContext, callbacks) {
345
589
  if (toolName !== "dom" || !isElementNotFoundResult(result)) return null;
@@ -371,7 +615,12 @@ async function handleElementRecovery(toolName, toolInput, result, recoveryAttemp
371
615
  }
372
616
  };
373
617
  }
374
- /** 导航后快照刷新(中)/ Refresh snapshot after navigation actions (EN). */
618
+ /**
619
+ * 导航后快照刷新。
620
+ *
621
+ * 当 `navigate.goto/back/forward/reload` 成功后,立即刷新快照,
622
+ * 防止后续动作还在旧页面上下文里决策。
623
+ */
375
624
  async function handleNavigationUrlChange(toolName, toolInput, result, registry, pageContext, callbacks) {
376
625
  if (toolName !== "navigate") return;
377
626
  const action = getToolAction(toolInput);
@@ -386,7 +635,15 @@ const READ_ONLY_TOOLS = new Set(["page_info"]);
386
635
  const READ_ONLY_DOM_ACTIONS = new Set(["get_text", "get_attr"]);
387
636
  /**
388
637
  * 空转检测:识别连续只读轮次并终止。
389
- * 返回 -1 表示应终止循环。
638
+ *
639
+ * 判定口径:
640
+ * - `page_info.*` 视为只读
641
+ * - `dom.get_text/get_attr` 视为只读
642
+ *
643
+ * 返回值语义:
644
+ * - `-1`:触发停机(连续 2 轮纯只读)
645
+ * - `0`:本轮有实质操作,计数清零
646
+ * - `>0`:当前连续只读轮次
390
647
  */
391
648
  function detectIdleLoop(toolCalls, consecutiveReadOnlyRounds) {
392
649
  if (toolCalls.length > 0 && toolCalls.every(({ name, input }) => {
@@ -404,32 +661,76 @@ function detectIdleLoop(toolCalls, consecutiveReadOnlyRounds) {
404
661
  //#endregion
405
662
  //#region src/core/agent-loop/index.ts
406
663
  /**
407
- * Agent Loop 主流程
408
- *
409
- * 负责消息构建、AI 决策、工具执行、恢复保护与指标汇总。
410
- *
664
+ * Agent Loop 主流程(口语版)
411
665
  *
412
666
  * 流程图(文本):
413
667
  *
414
668
  * 轮次开始
415
669
  * │
416
- * ├─ 确保快照可用
417
- * ├─ 构建紧凑消息(目标 + 剩余任务 + 执行轨迹 + 快照)
418
- * ├─ 调用模型
419
- * ├─ 无 toolCalls ? 结束 : 执行工具
420
- * ├─ 应用保护机制(冗余拦截/恢复/导航检测/空转/防自转)
670
+ * ├─ 先看有没有最新快照
671
+ * │ └─ 没有就先拍一张(可带 expandChildrenRefs)
672
+ *
673
+ * ├─ 组装本轮上下文消息
674
+ * │ └─ remaining + 上轮任务 + 最新快照 +(必要时)重试提示
675
+ * │
676
+ * ├─ 调用模型拿决策
677
+ * │ └─ 同时解析 `REMAINING` 和 `SNAPSHOT_HINT`
678
+ * │
679
+ * ├─ 有 toolCalls 吗?
680
+ * │ ├─ 没有:走收敛/协议修复判断(必要时等待后重试)
681
+ * │ └─ 有:逐个执行工具
682
+ * │ ├─ 冗余拦截(例如 page_info 空转)
683
+ * │ ├─ 失败恢复(元素未找到重试)
684
+ * │ ├─ 导航后更新快照
685
+ * │ └─ 命中断轮条件则提前结束本轮
686
+ * │
687
+ * ├─ 更新 remaining(优先协议,缺失时启发式剔除)
688
+ * │
689
+ * ├─ 防空转 / 防自转检查
690
+ * │ └─ 连续只读或重复批次会触发停机
691
+ * │
421
692
  * ├─ 刷新快照
422
693
  * ▼
423
694
  * 下一轮或停机
695
+ *
696
+ * 停机条件(任一命中):
697
+ * - `REMAINING: DONE`(或 remaining 为空)
698
+ * - 协议修复后仍无推进
699
+ * - 连续只读(空转)
700
+ * - 重复批次(自转)
701
+ * - 达到 maxRounds
424
702
  */
425
703
  /**
426
- * 执行 Agent 循环(中)/ Execute the agent loop (EN).
704
+ * 执行 Agent 循环。
705
+ *
706
+ * 你可以把这个函数理解成“任务执行调度器”:
707
+ * - 输入:用户任务、系统提示词、工具注册表、历史消息、初始快照
708
+ * - 过程:按轮次持续执行“看页面 -> 让模型决策 -> 跑工具 -> 更新上下文”
709
+ * - 输出:最终回复、完整工具调用记录、可复用消息、结构化指标
427
710
  *
428
- * 每轮:确保快照 → 构建消息 → 调用 AI → 执行工具 → 保护处理 → 刷新快照。
429
- * Per round: ensure snapshot -> build messages -> call AI -> execute tools -> apply protections -> refresh snapshot.
711
+ * 每轮主流程(固定顺序):
712
+ * 1) Ensure Snapshot:确保当前有最新快照(必要时读取)
713
+ * 2) Build Messages:构建紧凑上下文(remaining + 上轮轨迹 + 最新快照)
714
+ * 3) Call AI:请求模型并解析协议字段(`REMAINING` / `SNAPSHOT_HINT`)
715
+ * 4) Execute Tools:执行工具调用并应用保护机制(冗余拦截、恢复、导航刷新)
716
+ * 5) Reduce Remaining:推进剩余任务(优先协议,缺失时启发式剔除)
717
+ * 6) Guard & Refresh:防空转/防自转判定,并刷新快照进入下一轮
718
+ *
719
+ * 核心状态语义:
720
+ * - `remainingInstruction`:当前轮还未消费完的任务文本
721
+ * - `previousRoundTasks`:上一轮已执行动作,防止模型原样重复
722
+ * - `previousRoundPlannedTasks`:上一轮模型计划,用于重复批次检测
723
+ * - `protocolViolationHint`:协议修复提示(remaining 未完成却无工具调用时注入)
724
+ *
725
+ * 停机条件(命中任意一条即结束):
726
+ * - 模型无工具调用且 remaining 已收敛(`REMAINING: DONE` 或空)
727
+ * - 协议修复后仍无推进
728
+ * - 连续只读轮次(防空转)
729
+ * - 连续重复计划批次(防自转)
730
+ * - 达到 `maxRounds`
430
731
  */
431
732
  async function executeAgentLoop(params) {
432
- const { client, registry, systemPrompt, message, initialSnapshot, history, dryRun = false, maxRounds = DEFAULT_MAX_ROUNDS, callbacks } = params;
733
+ const { client, registry, systemPrompt, message, initialSnapshot, history, dryRun = false, maxRounds = DEFAULT_MAX_ROUNDS, roundStabilityWait, callbacks } = params;
433
734
  const tools = registry.getDefinitions();
434
735
  const allToolCalls = [];
435
736
  const fullToolTrace = [];
@@ -450,6 +751,12 @@ async function executeAgentLoop(params) {
450
751
  let lastRoundHadError = false;
451
752
  let protocolViolationHint;
452
753
  const snapshotExpandRefIds = /* @__PURE__ */ new Set();
754
+ const effectiveRoundStabilityWait = {
755
+ enabled: roundStabilityWait?.enabled ?? true,
756
+ timeoutMs: Math.max(200, Math.floor(roundStabilityWait?.timeoutMs ?? DEFAULT_ROUND_STABILITY_WAIT_TIMEOUT_MS)),
757
+ quietMs: Math.max(50, Math.floor(roundStabilityWait?.quietMs ?? DEFAULT_ROUND_STABILITY_WAIT_QUIET_MS)),
758
+ loadingSelectors: [...new Set([...DEFAULT_ROUND_STABILITY_WAIT_LOADING_SELECTORS, ...roundStabilityWait?.loadingSelectors ?? []].map((selector) => selector.trim()).filter(Boolean))]
759
+ };
453
760
  let recoveryCount = 0;
454
761
  let redundantInterceptCount = 0;
455
762
  let pendingNotFoundRetry;
@@ -457,7 +764,7 @@ async function executeAgentLoop(params) {
457
764
  let snapshotSizeTotal = 0;
458
765
  let snapshotSizeMax = 0;
459
766
  /**
460
- * 记录快照统计(中)/ Record snapshot metrics (EN).
767
+ * 记录快照统计。
461
768
  *
462
769
  * 用于输出可观测指标:读取次数、平均长度、最大长度。
463
770
  * Used for observability metrics: read count, avg size, max size.
@@ -469,7 +776,7 @@ async function executeAgentLoop(params) {
469
776
  if (snapshot.length > snapshotSizeMax) snapshotSizeMax = snapshot.length;
470
777
  };
471
778
  /**
472
- * 刷新页面快照(中)/ Refresh page snapshot (EN).
779
+ * 刷新页面快照。
473
780
  *
474
781
  * 只做两件事:读取最新快照 + 更新快照统计。
475
782
  * Does exactly two things: read latest snapshot + update metrics.
@@ -482,33 +789,32 @@ async function executeAgentLoop(params) {
482
789
  recordSnapshotStats(pageContext.latestSnapshot);
483
790
  };
484
791
  /**
485
- * 解析模型文本中的快照放宽指令(中)/ Parse snapshot expansion hint from model text (EN).
792
+ * 轮次后稳定等待(双重等待)。
486
793
  *
487
- * 约定:
488
- * SNAPSHOT_HINT: EXPAND_CHILDREN #ref1 #ref2
794
+ * 顺序固定为:
795
+ * 1) 等待 loading 指示器隐藏
796
+ * 2) 等待 DOM quiet window
489
797
  */
490
- const parseSnapshotExpandHints = (text) => {
491
- if (!text) return [];
492
- const refs = [];
493
- const regex = /^\s*SNAPSHOT_HINT\s*:\s*EXPAND_CHILDREN\s+(.+)$/gim;
494
- let match;
495
- while ((match = regex.exec(text)) !== null) {
496
- const tokens = (match[1] ?? "").match(/#[A-Za-z0-9_-]+/g) ?? [];
497
- for (const token of tokens) refs.push(token.replace(/^#/, ""));
498
- }
499
- return refs;
500
- };
501
- /** 从工具输入提取 hash selector(如 #1rv01x),用于定向快照放宽。 */
502
- const extractHashSelectorRef = (toolInput) => {
503
- if (!toolInput || typeof toolInput !== "object") return null;
504
- const selector = toolInput.selector;
505
- if (typeof selector !== "string") return null;
506
- const m = selector.trim().match(/^#([A-Za-z0-9_-]+)$/);
507
- return m ? m[1] : null;
798
// Post-round stability barrier. Does nothing when the feature is disabled or
// no "wait" tool is registered. Otherwise it dispatches, in this fixed order:
//   1) wait_for_selector (state "hidden") on the joined loading selectors,
//      skipped when the selector list is empty
//   2) wait_for_stable with the configured quiet window
// Both dispatches receive the same timeout value.
const runRoundStabilityBarrier = async () => {
  const { enabled, timeoutMs, quietMs, loadingSelectors } = effectiveRoundStabilityWait;
  if (!enabled || !registry.has("wait")) return;
  const loadingSelector = loadingSelectors.join(", ");
  if (loadingSelector) {
    await registry.dispatch("wait", {
      action: "wait_for_selector",
      selector: loadingSelector,
      state: "hidden",
      timeout: timeoutMs
    });
  }
  await registry.dispatch("wait", {
    action: "wait_for_stable",
    timeout: timeoutMs,
    quietMs
  });
};
509
815
  if (pageContext.latestSnapshot) recordSnapshotStats(pageContext.latestSnapshot);
510
816
  /**
511
- * 追加工具轨迹(中)/ Append tool trace entry (EN).
817
+ * 追加工具轨迹。
512
818
  *
513
819
  * 同时写入:
514
820
  * - allToolCalls:对外返回结果
@@ -527,104 +833,6 @@ async function executeAgentLoop(params) {
527
833
  result
528
834
  });
529
835
  };
530
- /**
531
- * 生成任务数组(中)/ Build normalized task array (EN).
532
- *
533
- * 将本轮 toolCalls 归一化成稳定字符串数组,便于:
534
- * - 回传到下一轮消息上下文(提醒已执行计划)
535
- * - 进行“是否与上一轮完全相同”的比较
536
- */
537
- const buildTaskArray = (toolCalls) => toolCalls.map((tc) => {
538
- const inputText = JSON.stringify(tc.input);
539
- return `${tc.name}:${inputText}`;
540
- });
541
- /**
542
- * 规范化模型文本输出(中)/ Normalize model text for next-round input (EN).
543
- *
544
- * 优先保留 REMAINING 行;否则截断首段文本,避免长篇规划污染下一轮输入。
545
- * Prefer REMAINING line; otherwise keep a short excerpt to avoid long planning spillover.
546
- */
547
- const normalizeModelOutput = (text) => {
548
- if (!text) return "";
549
- const trimmed = text.trim();
550
- if (!trimmed) return "";
551
- const remainingMatch = trimmed.match(/REMAINING\s*:\s*([\s\S]*)$/i);
552
- if (remainingMatch) return `REMAINING: ${remainingMatch[1].trim()}`;
553
- return (trimmed.split(/\n\s*\n/)[0]?.trim() ?? trimmed).slice(0, 220);
554
- };
555
- /**
556
- * 判定动作是否会触发 DOM 结构变化(
557
- *
558
- * 触发后应强制断轮,等待下一轮新快照继续。
559
- *
560
- */
561
- const shouldForceRoundBreak = (toolName, toolInput) => {
562
- const action = getToolAction(toolInput);
563
- if (toolName === "navigate") return action === "goto" || action === "back" || action === "forward" || action === "reload";
564
- if (toolName === "dom") {
565
- if (action === "press") return (typeof toolInput === "object" && toolInput !== null ? String(toolInput.key ?? toolInput.value ?? "") : "") === "Enter";
566
- return false;
567
- }
568
- if (toolName === "evaluate") return true;
569
- return false;
570
- };
571
- /**
572
- * 将“找不到元素”的失败任务整理成可重试清单(中)/ Build retry task list for not-found failures (EN).
573
- */
574
- const collectMissingTask = (name, input, result) => {
575
- if (!isElementNotFoundResult(result)) return null;
576
- return {
577
- name,
578
- input,
579
- reason: toContentString(result.content).slice(0, 240)
580
- };
581
- };
582
- /**
583
- * 解析 REMAINING 协议(中)/ Parse REMAINING protocol from model text (EN).
584
- *
585
- * 支持:
586
- * - `REMAINING: <text>` → 继续下一轮消费该剩余文本
587
- * - `REMAINING: DONE` → 剩余任务为空
588
- * 返回 null 表示本轮没有提供 REMAINING 标记。
589
- */
590
- const parseRemainingInstruction = (text) => {
591
- if (!text) return null;
592
- const match = text.match(/REMAINING\s*:\s*([\s\S]*)$/i);
593
- if (!match) return null;
594
- const value = match[1].trim();
595
- return /^done$/i.test(value) ? "" : value;
596
- };
597
- /**
598
- * 推进下一轮描述(中)/ Derive next-round instruction from model text (EN).
599
- *
600
- * 优先 REMAINING 协议;若未提供,则保持当前 remaining 不变。
601
- * Priority: REMAINING protocol first; otherwise keep current remaining instruction unchanged.
602
- */
603
- const deriveNextInstruction = (text, currentInstruction) => {
604
- const parsed = parseRemainingInstruction(text);
605
- if (parsed !== null) return {
606
- nextInstruction: parsed,
607
- hasRemainingProtocol: true
608
- };
609
- return {
610
- nextInstruction: currentInstruction,
611
- hasRemainingProtocol: false
612
- };
613
- };
614
- /**
615
- * 启发式任务剔除(中)/ Heuristic remaining reduction for linear instructions (EN).
616
- *
617
- * 在 REMAINING 缺失但本轮有执行动作时,按“线性片段”剔除已执行步数,避免下一轮继续携带整段原任务。
618
- * When REMAINING is missing but actions were executed, drop executed step count from a linearized instruction.
619
- */
620
- const reduceRemainingHeuristically = (currentInstruction, executedCount) => {
621
- if (!currentInstruction.trim() || executedCount <= 0) return currentInstruction;
622
- const parts = currentInstruction.replace(/\s+/g, " ").replace(/(->|=>|→)/g, " 然后 ").replace(/[,,。;;]/g, " 然后 ").split(/\s*(?:然后|再|并且|并|接着|随后|之后)\s*/g).map((part) => part.trim()).filter(Boolean);
623
- if (parts.length <= 1) return currentInstruction;
624
- const nextParts = parts.slice(Math.min(executedCount, parts.length));
625
- if (nextParts.length === 0) return "";
626
- return nextParts.join(" -> ");
627
- };
628
836
  for (let round = 0; round < maxRounds; round++) {
629
837
  callbacks?.onRound?.(round);
630
838
  usedRounds = round + 1;
@@ -719,6 +927,7 @@ async function executeAgentLoop(params) {
719
927
  break;
720
928
  }
721
929
  let roundHasError = false;
930
+ let roundHasPotentialDomMutation = false;
722
931
  const executedTaskCalls = [];
723
932
  const roundMissingTasks = [];
724
933
  for (const tc of response.toolCalls) {
@@ -749,6 +958,7 @@ async function executeAgentLoop(params) {
749
958
  const missingTask = collectMissingTask(tc.name, tc.input, result);
750
959
  if (missingTask) roundMissingTasks.push(missingTask);
751
960
  if (result.details && typeof result.details === "object") roundHasError = roundHasError || Boolean(result.details.error);
961
+ if (!hasToolError(result) && isPotentialDomMutation(tc.name, tc.input)) roundHasPotentialDomMutation = true;
752
962
  if (tc.name === "page_info" && getToolAction(tc.input) === "snapshot") {
753
963
  pageContext.latestSnapshot = toContentString(result.content);
754
964
  recordSnapshotStats(pageContext.latestSnapshot);
@@ -764,7 +974,8 @@ async function executeAgentLoop(params) {
764
974
  else pendingNotFoundRetry = void 0;
765
975
  if (parsedInstructionState.hasRemainingProtocol) remainingInstruction = parsedInstructionState.nextInstruction;
766
976
  else {
767
- const nextByHeuristic = reduceRemainingHeuristically(remainingInstruction, executedTaskCalls.length);
977
+ const heuristicProgressUnits = executedTaskCalls.length > 0 ? 1 : 0;
978
+ const nextByHeuristic = reduceRemainingHeuristically(remainingInstruction, heuristicProgressUnits);
768
979
  if (nextByHeuristic !== remainingInstruction) remainingInstruction = nextByHeuristic;
769
980
  else roundHasError = true;
770
981
  }
@@ -782,6 +993,7 @@ async function executeAgentLoop(params) {
782
993
  break;
783
994
  }
784
995
  consecutiveReadOnlyRounds = idleResult;
996
+ if (roundHasPotentialDomMutation) await runRoundStabilityBarrier();
785
997
  await refreshSnapshot();
786
998
  }
787
999
  const resultMessages = [...history ?? [], {
@@ -828,7 +1040,9 @@ const PROVIDER_ENDPOINTS = {
828
1040
  openai: "https://api.openai.com/v1",
829
1041
  copilot: "https://models.inference.ai.azure.com",
830
1042
  anthropic: "https://api.anthropic.com",
831
- deepseek: "https://api.deepseek.com"
1043
+ deepseek: "https://api.deepseek.com",
1044
+ doubao: "https://ark.cn-beijing.volces.com/api/v3",
1045
+ qwen: "https://dashscope.aliyuncs.com/compatible-mode/v1"
832
1046
  };
833
1047
  /** 校验 provider(中)/ Validate provider support (EN). */
834
1048
  function validateProvider(provider) {
@@ -1323,6 +1537,32 @@ async function parseAnthropicStream(response) {
1323
1537
  */
1324
1538
  var DeepSeekClient = class extends OpenAIClient {};
1325
1539
 
1540
+ //#endregion
1541
+ //#region src/core/ai-client/doubao.ts
1542
+ /**
1543
+ * Doubao 客户端封装(中)/ Doubao client wrapper (EN).
1544
+ *
1545
+ * Doubao(火山引擎 Ark)与 OpenAI Chat Completions 兼容,直接复用 OpenAIClient。
1546
+ * Doubao (Volcengine Ark) is OpenAI-compatible, so it reuses OpenAIClient behavior.
1547
+ */
1548
+ /**
1549
+ * Doubao 客户端类(中)/ Doubao client class extending OpenAIClient (EN).
1550
+ */
1551
/** Doubao client: a subclass of OpenAIClient that declares no members of its own. */
var DoubaoClient = class DoubaoClient extends OpenAIClient {};
1552
+
1553
+ //#endregion
1554
+ //#region src/core/ai-client/qwen.ts
1555
+ /**
1556
+ * Qwen 客户端封装(中)/ Qwen client wrapper (EN).
1557
+ *
1558
+ * Qwen(阿里云百炼兼容模式)与 OpenAI Chat Completions 兼容,直接复用 OpenAIClient。
1559
+ * Qwen (DashScope compatible mode) is OpenAI-compatible, so it reuses OpenAIClient behavior.
1560
+ */
1561
+ /**
1562
+ * Qwen 客户端类(中)/ Qwen client class extending OpenAIClient (EN).
1563
+ */
1564
/** Qwen client: a subclass of OpenAIClient that declares no members of its own. */
var QwenClient = class QwenClient extends OpenAIClient {};
1565
+
1326
1566
  //#endregion
1327
1567
  //#region src/core/ai-client/index.ts
1328
1568
  /**
@@ -1333,9 +1573,11 @@ function createAIClient(config) {
1333
1573
  switch (config.provider) {
1334
1574
  case "openai":
1335
1575
  case "copilot": return new OpenAIClient(config);
1576
+ case "doubao": return new DoubaoClient(config);
1577
+ case "qwen": return new QwenClient(config);
1336
1578
  case "anthropic": return new AnthropicClient(config);
1337
1579
  case "deepseek": return new DeepSeekClient(config);
1338
- default: throw new Error(`Unknown AI provider: ${config.provider}. Supported: openai, copilot, anthropic, deepseek`);
1580
+ default: throw new Error(`Unknown AI provider: ${config.provider}. Supported: openai, copilot, anthropic, deepseek, doubao, qwen`);
1339
1581
  }
1340
1582
  }
1341
1583
 
@@ -1436,12 +1678,16 @@ function buildSystemPrompt(params = {}) {
1436
1678
  "- If an action will change DOM (open modal, navigate), stop after that action batch and continue next round with new snapshot.",
1437
1679
  "- Do NOT call page_info (snapshot/query/get_url/get_title). Snapshot is already provided every round.",
1438
1680
  "- For dropdown/select, use dom action=select_option (or fill on select).",
1681
+ "- Always cross-check planned actions against the original goal to avoid task drift (e.g., do not confuse create issue vs create repository).",
1439
1682
  "- If a required list shows `... (N children omitted)` under a specific container, request focused expansion by outputting `SNAPSHOT_HINT: EXPAND_CHILDREN #<containerRef>`.",
1440
1683
  "- After outputting snapshot expansion hint, wait for the next refreshed snapshot before further scrolling/clicking on that list.",
1441
1684
  "- Verification whitelist: do NOT use get_text/get_attr to verify input/select values unless the user explicitly asks for verification.",
1442
1685
  "- Stop rule: when the requested state is achieved, stop calling tools. If verification is requested, verify once and then return REMAINING: DONE (no repeated get_text/get_attr on the same target).",
1443
1686
  "- Do NOT interact with AutoPilot UI unless user explicitly asks.",
1444
1687
  "",
1688
+ "## Listener Abbrevs",
1689
+ "clk=click dbl=dblclick mdn=mousedown mup=mouseup mmv=mousemove mov=mouseover mot=mouseout men=mouseenter mlv=mouseleave pdn=pointerdown pup=pointerup pmv=pointermove tst=touchstart ted=touchend kdn=keydown kup=keyup inp=input chg=change sub=submit fcs=focus blr=blur scl=scroll whl=wheel drg=drag drs=dragstart dre=dragend drp=drop ctx=contextmenu",
1690
+ "",
1445
1691
  "## Output Contract",
1446
1692
  "- Return tool calls for this round.",
1447
1693
  "- Also include one plain text line:",
@@ -1469,25 +1715,89 @@ function buildSystemPrompt(params = {}) {
1469
1715
  }
1470
1716
 
1471
1717
  //#endregion
1472
- //#region src/web/tools/dom-tool.ts
1718
+ //#region src/web/event-listener-tracker.ts
1719
/**
 * Element -> Set of normalized event types that currently have at least one
 * tracked listener. This is the view read by getTrackedElementEvents and
 * hasTrackedElementEvents.
 */
const elementEventMap = /* @__PURE__ */ new WeakMap();
/**
 * Element -> Map<normalizedType, { count, keysByListener }>.
 *
 * `keysByListener` is a WeakMap from the listener to the Set of registration
 * keys (capture flag + raw type) it is registered under; `count` is the number
 * of live registrations. It exists so that removing ONE listener does not
 * untrack a type that still has other listeners attached.
 */
const elementListenerBook = /* @__PURE__ */ new WeakMap();
let installed = false;
let originalAddEventListener;
let originalRemoveEventListener;

/** Trim and lower-case an event type; returns null for non-strings and blank strings. */
function normalizeEventType(type) {
  if (typeof type !== "string") return null;
  return type.trim().toLowerCase() || null;
}

/** True only when a global `Element` exists and `target` is an instance of it. */
function canTrackElementTarget(target) {
  if (typeof Element === "undefined") return false;
  return target instanceof Element;
}

/** Listeners can be used as WeakMap keys only when they are functions or objects. */
function isListenerObject(listener) {
  return typeof listener === "function" || (typeof listener === "object" && listener !== null);
}

/**
 * Build the identity key of one registration.
 *
 * The platform identifies a registration by (type, callback, capture), so the
 * capture flag is derived the same way: taken from `options.capture` for
 * objects, and from the truthiness of `options` otherwise.
 */
function listenerRegistrationKey(rawType, options) {
  const isDictionary = options !== null && (typeof options === "object" || typeof options === "function");
  const capture = isDictionary ? Boolean(options.capture) : Boolean(options);
  return `${capture ? "c" : "b"}:${rawType}`;
}

/** Record one registration in the ledger; a repeated identical registration is ignored. */
function rememberListener(target, type, listener, options, rawType) {
  let byType = elementListenerBook.get(target);
  if (!byType) {
    byType = new Map();
    elementListenerBook.set(target, byType);
  }
  let entry = byType.get(type);
  if (!entry) {
    entry = { count: 0, keysByListener: new WeakMap() };
    byType.set(type, entry);
  }
  let keys = entry.keysByListener.get(listener);
  if (!keys) {
    keys = new Set();
    entry.keysByListener.set(listener, keys);
  }
  const key = listenerRegistrationKey(rawType, options);
  if (keys.has(key)) return;
  keys.add(key);
  entry.count += 1;
}

/**
 * Drop one registration from the ledger.
 *
 * @returns {number} how many registrations of `type` remain on `target`
 *   (0 when nothing is recorded for that type).
 */
function forgetListener(target, type, listener, options, rawType) {
  const byType = elementListenerBook.get(target);
  const entry = byType?.get(type);
  if (!entry) return 0;
  const keys = entry.keysByListener.get(listener);
  if (keys?.delete(listenerRegistrationKey(rawType, options))) {
    entry.count -= 1;
    if (keys.size === 0) entry.keysByListener.delete(listener);
  }
  if (entry.count > 0) return entry.count;
  byType.delete(type);
  if (byType.size === 0) elementListenerBook.delete(target);
  return 0;
}

/**
 * Mark `type` as bound on `target` (ignored unless `target` is an Element).
 *
 * @param target   event target the listener was added to
 * @param type     normalized event type
 * @param listener optional listener; when given, the registration is recorded
 *                 so a later removal of a different listener keeps the type
 * @param options  optional third argument of addEventListener (capture source)
 * @param rawType  optional original, un-normalized type (defaults to `type`)
 */
function trackElementEvent(target, type, listener, options, rawType = type) {
  if (!canTrackElementTarget(target)) return;
  if (isListenerObject(listener)) rememberListener(target, type, listener, options, rawType);
  const known = elementEventMap.get(target);
  if (known) {
    known.add(type);
    return;
  }
  elementEventMap.set(target, new Set([type]));
}

/**
 * Undo a registration of `type` on `target` (ignored unless `target` is an Element).
 *
 * With a listener, the type is only dropped once no recorded registration of
 * that type is left. Without a listener (two-argument call) the type is
 * dropped unconditionally.
 */
function untrackElementEvent(target, type, listener, options, rawType = type) {
  if (!canTrackElementTarget(target)) return;
  if (isListenerObject(listener)) {
    if (forgetListener(target, type, listener, options, rawType) > 0) return;
  } else {
    const byType = elementListenerBook.get(target);
    if (byType?.delete(type) && byType.size === 0) elementListenerBook.delete(target);
  }
  const known = elementEventMap.get(target);
  if (!known) return;
  known.delete(type);
  if (known.size === 0) elementEventMap.delete(target);
}

/**
 * Install the global listener-tracking patch (idempotent).
 *
 * Replaces EventTarget.prototype.addEventListener / removeEventListener with
 * wrappers that call the native method first and then update the tracking
 * tables. Does nothing when EventTarget or its native methods are unavailable.
 *
 * Limitation: only explicit add/remove calls are observed, so a listener that
 * goes away through `once: true` or an AbortSignal stays recorded.
 */
function installEventListenerTracking() {
  if (installed) return;
  if (typeof EventTarget === "undefined") return;
  const proto = EventTarget.prototype;
  const nativeAdd = proto.addEventListener;
  const nativeRemove = proto.removeEventListener;
  if (typeof nativeAdd !== "function" || typeof nativeRemove !== "function") return;
  originalAddEventListener = nativeAdd;
  originalRemoveEventListener = nativeRemove;
  proto.addEventListener = function patchedAddEventListener(type, listener, options) {
    originalAddEventListener?.call(this, type, listener, options);
    try {
      const normalizedType = normalizeEventType(type);
      if (!normalizedType || listener == null) return;
      trackElementEvent(this, normalizedType, listener, options, type);
    } catch {
      // Best effort: bookkeeping must never break the page's own listener calls.
    }
  };
  proto.removeEventListener = function patchedRemoveEventListener(type, listener, options) {
    originalRemoveEventListener?.call(this, type, listener, options);
    try {
      const normalizedType = normalizeEventType(type);
      if (!normalizedType || listener == null) return;
      untrackElementEvent(this, normalizedType, listener, options, type);
    } catch {
      // Best effort: bookkeeping must never break the page's own listener calls.
    }
  };
  installed = true;
}

/**
 * Return the event types recorded for `el`, sorted so output is stable.
 */
function getTrackedElementEvents(el) {
  const set = elementEventMap.get(el);
  if (!set || set.size === 0) return [];
  return Array.from(set).sort();
}

/**
 * Tell whether at least one tracked event binding is recorded for `el`.
 */
function hasTrackedElementEvents(el) {
  return (elementEventMap.get(el)?.size ?? 0) > 0;
}
1791
+
1792
+ //#endregion
1793
+ //#region src/web/tools/dom-tool/constants.ts
1794
+ /**
1795
+ * DOM Tool 常量定义。
1487
1796
  *
1488
- * 运行环境:浏览器 Content Script(直接访问 DOM,无 CDP)。
1797
+ * 包含:input 类型分类、修饰键集合、键码映射、滚动策略。
1489
1798
  */
1490
- const DEFAULT_WAIT_MS = 2e3;
1799
+ /** 默认等待超时(ms) */
1800
+ const DEFAULT_WAIT_MS = 1200;
1491
1801
  /** scrollIntoView 轮换策略(参考 Playwright dom.ts) */
1492
1802
  const SCROLL_OPTIONS = [
1493
1803
  void 0,
@@ -1547,6 +1857,9 @@ const KEY_CODE_MAP = {
1547
1857
  Alt: "AltLeft",
1548
1858
  Meta: "MetaLeft"
1549
1859
  };
1860
+
1861
+ //#endregion
1862
+ //#region src/web/tools/dom-tool/query.ts
1550
1863
  let activeRefStore;
1551
1864
  function setActiveRefStore(store) {
1552
1865
  activeRefStore = store;
@@ -1557,15 +1870,26 @@ function getActiveRefStore() {
1557
1870
  function sleep(ms) {
1558
1871
  return new Promise((r) => setTimeout(r, ms));
1559
1872
  }
1560
- /** 查询元素:优先 RefStore hash,回退 CSS 选择器 */
1873
+ /**
1874
+ * 查询元素:优先 RefStore hash,回退 CSS 选择器。
1875
+ * 支持复合 hash 选择器(如 "#hashID .child-class")——先解析 hash 根,再在其子树内 querySelector。
1876
+ */
1561
1877
  function queryElement(selector) {
1562
1878
  try {
1563
1879
  if (selector.startsWith("#") && activeRefStore) {
1564
- const id = selector.slice(1);
1565
- if (activeRefStore.has(id)) {
1566
- const el = activeRefStore.get(id);
1567
- if (!el) return `未找到 ref "${selector}" 对应的元素(可能已被移除或快照已过期)`;
1568
- return el;
1880
+ const spaceIdx = selector.indexOf(" ");
1881
+ const hashPart = spaceIdx > 0 ? selector.slice(1, spaceIdx) : selector.slice(1);
1882
+ const rest = spaceIdx > 0 ? selector.slice(spaceIdx + 1).trim() : "";
1883
+ if (activeRefStore.has(hashPart)) {
1884
+ const root = activeRefStore.get(hashPart);
1885
+ if (!root || !root.isConnected) {
1886
+ activeRefStore.delete(hashPart);
1887
+ return `未找到 ref "#${hashPart}" 对应的元素(可能已被移除或快照已过期)`;
1888
+ }
1889
+ if (!rest) return root;
1890
+ const child = root.querySelector(rest);
1891
+ if (!child) return `在 #${hashPart} 内未找到匹配 "${rest}" 的子元素`;
1892
+ return child;
1569
1893
  }
1570
1894
  }
1571
1895
  const el = document.querySelector(selector);
@@ -1593,6 +1917,30 @@ function resolveWaitMs(params) {
1593
1917
  if (typeof waitSeconds === "number" && Number.isFinite(waitSeconds)) return Math.max(0, Math.floor(waitSeconds * 1e3));
1594
1918
  return DEFAULT_WAIT_MS;
1595
1919
  }
1920
+ /** 生成元素的简洁描述字符串,用于工具调用结果的可读输出。 */
1921
/**
 * Build a short one-line description of an element for tool-result output.
 *
 * Format: `<tag#id.cls1.cls2.cls3> "text" [attr=value, ...]`
 *   - at most the first three class names are included;
 *   - text is the selected option's text for a <select>, otherwise the
 *     element's textContent, with whitespace runs collapsed to single spaces
 *     and cut to 40 characters;
 *   - hints list the non-empty type/name/placeholder/href/role attributes and,
 *     for a <select> with a value, `val=<value>`.
 *
 * @param el element to describe
 * @returns the description string
 */
function describeElement(el) {
  const tag = el.tagName.toLowerCase();
  const id = el.id ? `#${el.id}` : "";
  // SVG elements expose className as an SVGAnimatedString rather than a
  // string, so fall back to the raw attribute instead of dropping the classes.
  const classText = typeof el.className === "string" ? el.className : el.getAttribute("class") ?? "";
  const cls = classText.trim().split(/\s+/).filter(Boolean).slice(0, 3).map((c) => `.${c}`).join("");
  const rawText = el instanceof HTMLSelectElement ? el.selectedOptions[0]?.textContent : el.textContent;
  // Collapse newlines/indentation so the description really stays on one line.
  const text = (rawText ?? "").replace(/\s+/g, " ").trim().slice(0, 40);
  const textHint = text ? ` "${text}"` : "";
  const hints = [];
  for (const attr of ["type", "name", "placeholder", "href", "role"]) {
    const v = el.getAttribute(attr);
    if (v) hints.push(`${attr}=${v}`);
  }
  if (el instanceof HTMLSelectElement && el.value) hints.push(`val=${el.value}`);
  return `<${tag}${id}${cls}>${textHint}${hints.length > 0 ? ` [${hints.join(", ")}]` : ""}`;
}
1941
+
1942
+ //#endregion
1943
+ //#region src/web/tools/dom-tool/actionability.ts
1596
1944
  /** 检查元素样式可见性(处理 checkVisibility / details 折叠 / visibility) */
1597
1945
  function isStyleVisible(el, style) {
1598
1946
  style = style ?? window.getComputedStyle(el);
@@ -1673,23 +2021,6 @@ function checkElementStable(el, timeoutMs = 800) {
1673
2021
  requestAnimationFrame(check);
1674
2022
  });
1675
2023
  }
1676
- /**
1677
- * 将目标重定向到关联的交互控件。
1678
- * - button-link:非交互元素→最近 button/[role=button]/a/[role=link]
1679
- * - follow-label:label→control + 非交互→button/[role=button]/[role=checkbox]/[role=radio]
1680
- */
1681
- function retarget(el, mode) {
1682
- if (mode === "none") return el;
1683
- if (!el.matches("input, textarea, select") && !el.isContentEditable) if (mode === "button-link") el = el.closest("button, [role=button], a, [role=link]") || el;
1684
- else el = el.closest("button, [role=button], [role=checkbox], [role=radio]") || el;
1685
- if (mode === "follow-label") {
1686
- if (!el.matches("a, input, textarea, button, select, [role=link], [role=button], [role=checkbox], [role=radio]") && !el.isContentEditable) {
1687
- const label = el.closest("label");
1688
- if (label?.control) el = label.control;
1689
- }
1690
- }
1691
- return el;
1692
- }
1693
2024
  function scrollIntoViewIfNeeded(el, retry = 0) {
1694
2025
  if (retry === 0 && "scrollIntoViewIfNeeded" in el) {
1695
2026
  el.scrollIntoViewIfNeeded(true);
@@ -1711,7 +2042,7 @@ function checkHitTarget(el) {
1711
2042
  if (topEl === el || el.contains(topEl) || topEl.contains(el)) return null;
1712
2043
  const sharedLabel = topEl.closest("label");
1713
2044
  if (sharedLabel && sharedLabel.contains(el)) return null;
1714
- return describeElement(topEl);
2045
+ return `<${topEl.tagName.toLowerCase()}${topEl.id ? `#${topEl.id}` : ""}>`;
1715
2046
  }
1716
2047
  function ensureActionable(el, action, selector, force) {
1717
2048
  if (force) return null;
@@ -1766,6 +2097,15 @@ function ensureActionable(el, action, selector, force) {
1766
2097
  };
1767
2098
  return null;
1768
2099
  }
2100
+
2101
+ //#endregion
2102
+ //#region src/web/tools/dom-tool/events.ts
2103
+ /**
2104
+ * DOM Tool — 事件派发与键盘操作。
2105
+ *
2106
+ * 包含:完整点击事件链、hover 事件链、input/change 派发、
2107
+ * 原生 setter 写入、selectText、组合键 press。
2108
+ */
1769
2109
  function getClickPoint(el) {
1770
2110
  const r = el.getBoundingClientRect();
1771
2111
  return {
@@ -1774,7 +2114,7 @@ function getClickPoint(el) {
1774
2114
  };
1775
2115
  }
1776
2116
  /**
1777
- * 完整点击事件链(参考 Playwright Mouse.click):
2117
+ * 完整点击事件链:
1778
2118
  * pointermove → mousemove → (per clickCount) pointerdown → mousedown → focus → pointerup → mouseup → click
1779
2119
  */
1780
2120
  function dispatchClickEvents(el, clickCount = 1) {
@@ -1942,25 +2282,31 @@ function executePress(el, key) {
1942
2282
  ...modState
1943
2283
  }));
1944
2284
  }
1945
- function describeElement(el) {
1946
- const tag = el.tagName.toLowerCase();
1947
- const id = el.id ? `#${el.id}` : "";
1948
- const cls = el.className && typeof el.className === "string" ? el.className.trim().split(/\s+/).filter(Boolean).slice(0, 3).map((c) => `.${c}`).join("") : "";
1949
- const text = el instanceof HTMLSelectElement ? el.selectedOptions[0]?.textContent?.trim().slice(0, 40) ?? "" : el.textContent?.trim().slice(0, 40) ?? "";
1950
- const textHint = text ? ` "${text}"` : "";
1951
- const hints = [];
1952
- for (const attr of [
1953
- "type",
1954
- "name",
1955
- "placeholder",
1956
- "href",
1957
- "role"
1958
- ]) {
1959
- const v = el.getAttribute(attr);
1960
- if (v) hints.push(`${attr}=${v}`);
2285
+
2286
+ //#endregion
2287
+ //#region src/web/tools/dom-tool/resolve.ts
2288
+ /**
2289
+ * DOM Tool 目标解析与归一化。
2290
+ *
2291
+ * 包含:retarget、checkable 目标归一化、pointer action 代理、
2292
+ * 表单项控件重定向、editable 穿透。
2293
+ */
2294
+ /**
2295
+ * 将目标重定向到关联的交互控件。
2296
+ * - button-link:非交互元素→最近 button/[role=button]/a/[role=link]
2297
+ * - follow-label:label→control + 非交互→button/[role=button]/[role=checkbox]/[role=radio]
2298
+ */
2299
/**
 * Redirect a target element to the interactive control associated with it.
 *
 * - "none": the element is returned untouched.
 * - "button-link": unless the element is an input/textarea/select or is
 *   content-editable, use the closest button / [role=button] / a / [role=link]
 *   ancestor when there is one.
 * - any other mode: same, but the ancestor search looks for button /
 *   [role=button] / [role=checkbox] / [role=radio].
 * - "follow-label" additionally: when the result is still not a control,
 *   follow the closest <label> to its `control`, if it has one.
 */
function retarget(el, mode) {
  if (mode === "none") return el;

  let target = el;
  const isNativeField = el.matches("input, textarea, select") || el.isContentEditable;
  if (!isNativeField) {
    const ancestorSelector = mode === "button-link"
      ? "button, [role=button], a, [role=link]"
      : "button, [role=button], [role=checkbox], [role=radio]";
    target = el.closest(ancestorSelector) || el;
  }

  if (mode !== "follow-label") return target;

  const isControl = target.matches("a, input, textarea, button, select, [role=link], [role=button], [role=checkbox], [role=radio]") || target.isContentEditable;
  if (isControl) return target;

  const owningLabel = target.closest("label");
  return owningLabel?.control ? owningLabel.control : target;
}
1965
2311
  function getChecked(el) {
1966
2312
  if (el instanceof HTMLInputElement && (el.type === "checkbox" || el.type === "radio")) return el.checked;
@@ -2021,6 +2367,35 @@ function resolveFormItemControlTarget(el) {
2021
2367
  if (control && isElementVisible(control)) return control;
2022
2368
  return el;
2023
2369
  }
2370
+ /**
2371
+ * 穿透包裹容器,查找内部可编辑子元素。
2372
+ * 覆盖 UI 框架常见模式:wrapper div 包裹真实 input/textarea。
2373
+ * 若自身已可编辑则直接返回;否则在子树中搜索第一个可编辑且可见的控件。
2374
+ * 对 role=slider/spinbutton 等 ARIA widget:向上逐级查找最近容器中的关联 input。
2375
+ */
2376
/**
 * Look through a wrapper element for the editable control it contains.
 *
 * Resolution order:
 *   1) `el` itself when isEditableElement(el) is true;
 *   2) the first descendant input/textarea/select/[contenteditable="true"]
 *      that is both editable and visible. Every match is examined, so a
 *      hidden or non-editable first match no longer ends the search;
 *   3) for role=slider / role=spinbutton only: walk up to five ancestors and
 *      take the first editable, visible HTMLInputElement found inside one;
 *   4) otherwise `el` unchanged.
 *
 * @param el element the caller targeted
 * @returns the resolved editable element, or `el` when none qualifies
 */
function resolveEditableTarget(el) {
  if (isEditableElement(el)) return el;
  const isUsable = (node) => isEditableElement(node) && isElementVisible(node);
  const inner = Array.from(el.querySelectorAll("input:not([type=\"hidden\"]), textarea, select, [contenteditable=\"true\"]")).find(isUsable);
  if (inner) return inner;
  const role = el.getAttribute("role");
  if (role === "slider" || role === "spinbutton") {
    let ancestor = el.parentElement;
    for (let depth = 0; ancestor && depth < 5; depth++, ancestor = ancestor.parentElement) {
      const candidates = ancestor.querySelectorAll("input[type=\"number\"], input[role=\"spinbutton\"], input:not([type=\"hidden\"])");
      const input = Array.from(candidates).find((node) => node instanceof HTMLInputElement && isUsable(node));
      if (input) return input;
    }
  }
  return el;
}
2390
+
2391
+ //#endregion
2392
+ //#region src/web/tools/dom-tool/dropdown.ts
2393
+ /**
2394
+ * DOM Tool — 自定义下拉增强。
2395
+ *
2396
+ * 包含:全局可见 option 查找、下拉弹出等待。
2397
+ */
2398
+ /** 在全局可见 option 节点中按文本匹配(精确 → 包含) */
2024
2399
  function findVisibleOptionByText(text) {
2025
2400
  const target = text.trim().toLowerCase();
2026
2401
  if (!target) return null;
@@ -2041,6 +2416,7 @@ function findVisibleOptionByText(text) {
2041
2416
  for (const n of visible) if (n.textContent?.trim().toLowerCase().includes(target)) return n;
2042
2417
  return null;
2043
2418
  }
2419
+ /** 轮询等待下拉弹出层出现 */
2044
2420
  async function waitForDropdownPopup(maxWait = 500) {
2045
2421
  const start = Date.now();
2046
2422
  while (Date.now() - start < maxWait) {
@@ -2049,22 +2425,33 @@ async function waitForDropdownPopup(maxWait = 500) {
2049
2425
  await sleep(50);
2050
2426
  }
2051
2427
  }
2428
+
2429
+ //#endregion
2430
+ //#region src/web/tools/dom-tool/index.ts
2431
+ /**
2432
+ * DOM Tool — 浏览器 DOM 操作工具入口(结合 Playwright 核心交互模式增强)。
2433
+ *
2434
+ * 关键能力:
2435
+ * 1. retarget — 点击时自动重定向到 button/link/label.control
2436
+ * 2. scrollIntoView 多策略 — 4 种 block 对齐轮换,解决 sticky 遮挡
2437
+ * 3. stable 检查 — rAF 逐帧检测元素位置稳定后再操作
2438
+ * 4. hit-target 验证 — elementsFromPoint 检查是否被遮挡
2439
+ * 5. 完整点击事件链 — pointermove→pointerdown→mousedown→pointerup→mouseup→click
2440
+ * 6. check/uncheck 通过 click — 先检查→click 切换→验证状态
2441
+ * 7. press 组合键 — 支持 Control+a, Shift+Enter 等修饰键
2442
+ * 8. fill 分类型 — date/color/range 走 setValue,text 类走 selectAll+原生写入
2443
+ * 9. 自定义下拉增强 — 更广泛的 option 选择器 + 等待弹出
2444
+ * 10. ARIA disabled — 检查祖先链 aria-disabled
2445
+ *
2446
+ * 运行环境:浏览器 Content Script(直接访问 DOM,无 CDP)。
2447
+ */
2052
2448
  function createDomTool() {
2053
2449
  return {
2054
2450
  name: "dom",
2055
2451
  description: [
2056
2452
  "Perform DOM operations on the current page.",
2057
2453
  "Actions: click, fill, select_option, clear, check, uncheck, type, focus, hover, scroll, press, get_text, get_attr, set_attr, add_class, remove_class.",
2058
- "Input/Select rule: before each fill/type/select_option, click or focus the same target immediately in the same round.",
2059
- "For multiple fields, use alternating pairs in one batch: focus/click A -> fill/type A -> focus/click B -> fill/type B.",
2060
- "Use the hash ID from DOM snapshot (e.g. #a1b2c) as selector.",
2061
- "press supports combo keys like 'Control+a', 'Shift+Enter'.",
2062
- "check/uncheck is done via click — state change is verified after action.",
2063
- "Ordinal/index rule: treat visual order as 1-based when the instruction says 'the Nth item' (e.g. 4th star = 4th visible icon from left to right), and avoid off-by-one mistakes.",
2064
- "Disambiguation rule: distinguish descriptive text/labels from actionable options. Do not click nearby label/help text; click the actual interactive option/control item (icon/button/option) that changes state.",
2065
- "Unknown/complex components: if a container element (e.g. role=slider, rating, custom widget) has multiple child icons/items in the snapshot but you don't know how to operate it directly, try clicking the appropriate child element instead. For example, a rating component with 5 star icon children — click the 4th icon child to set 4 stars. A slider with a runway — clicking the runway at the right position may work. Always prefer interacting with visible children when the parent container doesn't respond to fill/click as expected.",
2066
- "fill supports role=slider elements: use fill with a numeric value on a role=slider container (rating/slider) to set its value programmatically.",
2067
- "For wheel/virtualized pickers where target option is not visible yet, use scroll on the picker column first, then click/select the newly visible option. scroll supports steps for repeated scrolling in one call."
2454
+ "fill auto-resolves wrapper inner input. check/uncheck toggles via click. press supports combos (Control+a). scroll supports steps for repeated scrolling."
2068
2455
  ].join(" "),
2069
2456
  schema: Type.Object({
2070
2457
  action: Type.String({ description: "DOM action: click | fill | select_option | clear | check | uncheck | type | focus | hover | scroll | press | get_text | get_attr | set_attr | add_class | remove_class." }),
@@ -2079,7 +2466,7 @@ function createDomTool() {
2079
2466
  deltaY: Type.Optional(Type.Number({ description: "Vertical scroll delta for scroll action. Positive = down, negative = up." })),
2080
2467
  deltaX: Type.Optional(Type.Number({ description: "Horizontal scroll delta for scroll action." })),
2081
2468
  steps: Type.Optional(Type.Number({ description: "Repeat count for scroll action (default 1, max 20)." })),
2082
- waitMs: Type.Optional(Type.Number({ description: "Wait timeout in ms before action (default: 2000)." })),
2469
+ waitMs: Type.Optional(Type.Number({ description: "Wait timeout in ms before action (default: 1200)." })),
2083
2470
  waitSeconds: Type.Optional(Type.Number({ description: "Wait timeout in seconds (fallback for waitMs)." })),
2084
2471
  force: Type.Optional(Type.Boolean({ description: "Skip actionability checks (default false)." }))
2085
2472
  }),
@@ -2127,6 +2514,11 @@ function createDomTool() {
2127
2514
  el = r;
2128
2515
  }
2129
2516
  if (action === "check" || action === "uncheck") el = resolveCheckableTarget(el);
2517
+ if ([
2518
+ "fill",
2519
+ "type",
2520
+ "clear"
2521
+ ].includes(action)) el = resolveEditableTarget(retarget(el, "follow-label"));
2130
2522
  const actionabilityTarget = action === "click" || action === "check" || action === "uncheck" ? resolvePointerActionTarget(resolveFormItemControlTarget(el)) : el;
2131
2523
  try {
2132
2524
  const checkResult = ensureActionable(actionabilityTarget, action, selector, force);
@@ -2160,7 +2552,7 @@ function createDomTool() {
2160
2552
  case "fill": {
2161
2553
  const value = params.value;
2162
2554
  if (value === void 0) return { content: "缺少 value 参数" };
2163
- const target = retarget(el, "follow-label");
2555
+ const target = el;
2164
2556
  if (target instanceof HTMLInputElement) {
2165
2557
  const type = target.type.toLowerCase();
2166
2558
  if (INPUT_BLOCKED_TYPES.has(type)) return {
@@ -2303,7 +2695,7 @@ function createDomTool() {
2303
2695
  return { content: `已选择 ${describeElement(target)}: value="${selected.value}", label="${selected.text.trim()}"` };
2304
2696
  }
2305
2697
  case "clear": {
2306
- const target = retarget(el, "follow-label");
2698
+ const target = el;
2307
2699
  if (target instanceof HTMLInputElement || target instanceof HTMLTextAreaElement) {
2308
2700
  scrollIntoViewIfNeeded(target);
2309
2701
  target.focus();
@@ -2363,7 +2755,7 @@ function createDomTool() {
2363
2755
  case "type": {
2364
2756
  const value = params.value;
2365
2757
  if (value === void 0) return { content: "缺少 value 参数" };
2366
- const target = retarget(el, "follow-label");
2758
+ const target = el;
2367
2759
  scrollIntoViewIfNeeded(target);
2368
2760
  if (target instanceof HTMLElement) target.focus();
2369
2761
  for (const char of value) {
@@ -2530,6 +2922,50 @@ const MAX_EXPANDED_LIST_CHILDREN = 120;
2530
2922
  /** 定向放宽 children 的硬上限。 */
2531
2923
  const MAX_EXPANDED_CHILDREN_LIMIT = 300;
2532
2924
  /**
2925
+ * 事件名 → 快照简写映射。
2926
+ * 目的:大幅压缩 listeners="..." 占用的 token,同时保留可读性。
2927
+ * 简写规则在 system-prompt 中向模型说明。
2928
+ */
2929
/**
 * Event name -> three-letter abbreviation used in snapshot `listeners="..."`.
 *
 * NOTE(review): the "Listener Abbrevs" line of the system prompt should list
 * the same pairs; it currently appears to omit pen, plv, tmv, kpr and rsz —
 * confirm and keep both in sync.
 */
const EVENT_ABBREV = {
  click: "clk",
  dblclick: "dbl",
  mousedown: "mdn",
  mouseup: "mup",
  mousemove: "mmv",
  mouseover: "mov",
  mouseout: "mot",
  mouseenter: "men",
  mouseleave: "mlv",
  pointerdown: "pdn",
  pointerup: "pup",
  pointermove: "pmv",
  pointerenter: "pen",
  pointerleave: "plv",
  touchstart: "tst",
  touchend: "ted",
  touchmove: "tmv",
  keydown: "kdn",
  keyup: "kup",
  keypress: "kpr",
  input: "inp",
  change: "chg",
  submit: "sub",
  focus: "fcs",
  blur: "blr",
  scroll: "scl",
  wheel: "whl",
  drag: "drg",
  dragstart: "drs",
  dragend: "dre",
  drop: "drp",
  contextmenu: "ctx",
  resize: "rsz"
};
/**
 * Convert a full event name to its snapshot abbreviation.
 *
 * Names listed in EVENT_ABBREV use their mapped abbreviation; every other
 * name falls back to its first three characters. Only own keys of the table
 * are consulted, so names that collide with Object.prototype members (for
 * example "constructor") still yield a string.
 *
 * @param name full event name
 * @returns the abbreviation string
 */
function abbrevEvent(name) {
  const isListed = Object.prototype.hasOwnProperty.call(EVENT_ABBREV, name);
  return isListed ? EVENT_ABBREV[name] : name.slice(0, 3);
}
2968
+ /**
2533
2969
  * 规整快照属性值,避免把长 base64/data URL 原样注入快照。
2534
2970
  */
2535
2971
  function sanitizeSnapshotAttrValue(value) {
@@ -2581,6 +3017,7 @@ function generateSnapshot(root = document.body, options = {}) {
2581
3017
  const expandChildrenRefSet = new Set((opts.expandChildrenRefs ?? []).map((ref) => ref.trim().replace(/^#/, "")).filter(Boolean));
2582
3018
  let emittedNodes = 0;
2583
3019
  let truncatedByNodeBudget = false;
3020
+ const emittedRefIds = /* @__PURE__ */ new Set();
2584
3021
  const refStore = opts.refStore;
2585
3022
  const SKIP_TAGS = new Set([
2586
3023
  "SCRIPT",
@@ -2617,6 +3054,9 @@ function generateSnapshot(root = document.body, options = {}) {
2617
3054
  "name",
2618
3055
  "role",
2619
3056
  "aria-label",
3057
+ "aria-valuenow",
3058
+ "aria-valuemin",
3059
+ "aria-valuemax",
2620
3060
  "src",
2621
3061
  "alt",
2622
3062
  "title",
@@ -2634,6 +3074,25 @@ function generateSnapshot(root = document.body, options = {}) {
2634
3074
  "LABEL",
2635
3075
  "SUMMARY"
2636
3076
  ]);
3077
+ /** 常见可交互事件(用于提升元素交互优先级)。 */
3078
+ const INTERACTIVE_EVENTS = new Set([
3079
+ "click",
3080
+ "dblclick",
3081
+ "mousedown",
3082
+ "mouseup",
3083
+ "pointerdown",
3084
+ "pointerup",
3085
+ "touchstart",
3086
+ "touchend",
3087
+ "input",
3088
+ "change",
3089
+ "keydown",
3090
+ "keyup",
3091
+ "keypress",
3092
+ "submit",
3093
+ "focus",
3094
+ "blur"
3095
+ ]);
2637
3096
  /** 布尔状态属性 — 只在存在时输出(无值),如 disabled、checked */
2638
3097
  const BOOLEAN_ATTRS = [
2639
3098
  "disabled",
@@ -2682,15 +3141,22 @@ function generateSnapshot(root = document.body, options = {}) {
2682
3141
  if (el.getAttribute("id")) return false;
2683
3142
  if (el.getAttribute("role") || el.getAttribute("aria-label")) return false;
2684
3143
  for (const attr of Array.from(el.attributes)) if (attr.name.startsWith("on")) return false;
3144
+ if (hasTrackedElementEvents(el)) return false;
2685
3145
  if (directText) return false;
2686
3146
  return true;
2687
3147
  }
3148
/**
 * Report whether at least one event tracked for the element is listed in
 * INTERACTIVE_EVENTS. An element with no tracked events yields false.
 */
function hasInteractiveTrackedEvents(el) {
  for (const eventName of getTrackedElementEvents(el)) {
    if (INTERACTIVE_EVENTS.has(eventName)) return true;
  }
  return false;
}
2688
3153
  function isInteractiveElement(el) {
2689
3154
  if (INTERACTIVE_TAGS.has(el.tagName)) return true;
2690
3155
  if (el.hasAttribute("onclick")) return true;
2691
3156
  if (el.hasAttribute("role")) return true;
2692
3157
  if (el.hasAttribute("tabindex")) return true;
2693
3158
  if (el.hasAttribute("aria-label")) return true;
3159
+ if (hasInteractiveTrackedEvents(el)) return true;
2694
3160
  return false;
2695
3161
  }
2696
3162
  /** 判断是否为“选项列表”容器(时间/下拉/listbox 等)。 */
@@ -2751,6 +3217,12 @@ function generateSnapshot(root = document.body, options = {}) {
2751
3217
  if (!attrs.includes("readonly")) attrs.push("readonly");
2752
3218
  }
2753
3219
  if (el.hasAttribute("onclick")) attrs.push("onclick");
3220
+ const trackedEvents = getTrackedElementEvents(el);
3221
+ if (trackedEvents.length > 0) {
3222
+ const preview = trackedEvents.slice(0, 6).map(abbrevEvent).join(",");
3223
+ const suffix = trackedEvents.length > 6 ? ",..." : "";
3224
+ attrs.push(`listeners="${preview}${suffix}"`);
3225
+ }
2754
3226
  const testId = el.getAttribute("data-testid") || el.getAttribute("data-test-id");
2755
3227
  if (testId) {
2756
3228
  const safeTestId = sanitizeSnapshotAttrValue(testId).slice(0, 25);
@@ -2800,8 +3272,10 @@ function generateSnapshot(root = document.body, options = {}) {
2800
3272
  let line = `${indent}[${tag}]`;
2801
3273
  if (directText) line += ` "${directText.slice(0, maxTextLength)}"`;
2802
3274
  if (attrs.length) line += ` ${attrs.join(" ")}`;
2803
- if (hashId) line += ` #${hashId}`;
2804
- else line += ` ref="${currentPath}"`;
3275
+ if (hashId) {
3276
+ line += ` #${hashId}`;
3277
+ emittedRefIds.add(hashId);
3278
+ } else line += ` ref="${currentPath}"`;
2805
3279
  const lines = [line];
2806
3280
  emittedNodes++;
2807
3281
  const allChildren = Array.from(el.children);
@@ -2819,6 +3293,7 @@ function generateSnapshot(root = document.body, options = {}) {
2819
3293
  return lines.join("\n");
2820
3294
  }
2821
3295
  const output = walk(root, 0, "") || "(空页面)";
3296
+ refStore?.prune(emittedRefIds);
2822
3297
  if (!truncatedByNodeBudget) return output;
2823
3298
  return `${output}\n... (snapshot truncated: maxNodes=${maxNodes})`;
2824
3299
  }
@@ -3046,7 +3521,7 @@ function createNavigateTool() {
3046
3521
  * - hash selector(如 #abc123)优先通过 RefStore 解析。
3047
3522
  * - 可见性语义与 dom-tool 保持一致(参考 Playwright 风格)。
3048
3523
  */
3049
- const DEFAULT_TIMEOUT = 1e4;
3524
+ const DEFAULT_TIMEOUT = 6e3;
3050
3525
  const POLL_INTERVAL_MS = 80;
3051
3526
  const STABLE_TICK_MS = 50;
3052
3527
  const OBSERVER_OPTIONS = {
@@ -3100,7 +3575,14 @@ function resolveSelector(selector) {
3100
3575
  const store = getActiveRefStore();
3101
3576
  if (store) {
3102
3577
  const id = selector.slice(1);
3103
- if (store.has(id)) return store.get(id) ?? null;
3578
+ if (store.has(id)) {
3579
+ const el = store.get(id);
3580
+ if (!el || !el.isConnected) {
3581
+ store.delete(id);
3582
+ return null;
3583
+ }
3584
+ return el;
3585
+ }
3104
3586
  }
3105
3587
  }
3106
3588
  try {
@@ -3238,7 +3720,7 @@ function createWaitTool() {
3238
3720
  selector: Type.Optional(Type.String({ description: "CSS selector for wait_for_selector/wait_for_hidden" })),
3239
3721
  state: Type.Optional(Type.String({ description: "Selector state for wait_for_selector: attached | visible | hidden | detached (default: attached)" })),
3240
3722
  text: Type.Optional(Type.String({ description: "Text to wait for in wait_for_text" })),
3241
- timeout: Type.Optional(Type.Number({ description: "Timeout in milliseconds (default: 10000)" })),
3723
+ timeout: Type.Optional(Type.Number({ description: "Timeout in milliseconds (default: 6000)" })),
3242
3724
  quietMs: Type.Optional(Type.Number({ description: "Quiet window for wait_for_stable in milliseconds (default: 300)" }))
3243
3725
  }),
3244
3726
  execute: async (params) => {
@@ -3444,6 +3926,29 @@ var RefStore = class {
3444
3926
  has(id) {
3445
3927
  return this.map.has(id);
3446
3928
  }
3929
+ /** Remove the mapping for the given hash ID; returns the result of the underlying map's delete (whether an entry was removed). */
3930
+ delete(id) {
3931
+ return this.map.delete(id);
3932
+ }
3933
+ /**
3934
+ * 清理失效引用:
3935
+ * - 仅保留 keepIds 中的映射(若提供)
3936
+ * - 自动移除已脱离文档(isConnected=false)的元素
3937
+ *
3938
+ * @returns 被移除的映射数量
3939
+ */
3940
+ prune(keepIds) {
3941
+ let removed = 0;
3942
+ for (const [id, el] of this.map.entries()) {
3943
+ const shouldKeepById = keepIds ? keepIds.has(id) : true;
3944
+ const isConnected = el.isConnected;
3945
+ if (!shouldKeepById || !isConnected) {
3946
+ this.map.delete(id);
3947
+ removed++;
3948
+ }
3949
+ }
3950
+ return removed;
3951
+ }
3447
3952
  /** 清空所有映射 */
3448
3953
  clear() {
3449
3954
  this.map.clear();
@@ -3575,6 +4080,7 @@ function registerToolHandler(executors) {
3575
4080
  * │ └──────────┘ └────────────┘ └──────────────┘ │
3576
4081
  * └──────────────────────────────────────────────────┘
3577
4082
  */
4083
+ installEventListenerTracking();
3578
4084
  var WebAgent = class WebAgent {
3579
4085
  /** 默认系统提示词 key(兼容旧版 setSystemPrompt(prompt))。 */
3580
4086
  static DEFAULT_SYSTEM_PROMPT_KEY = "default";
@@ -3607,6 +4113,8 @@ var WebAgent = class WebAgent {
3607
4113
  autoSnapshot;
3608
4114
  /** 快照选项 */
3609
4115
  snapshotOptions;
4116
+ /** 轮次后稳定等待配置 */
4117
+ roundStabilityWait;
3610
4118
  /** 工具注册表实例 — 每个 WebAgent 拥有独立的工具集 */
3611
4119
  registry = new ToolRegistry();
3612
4120
  /** 事件回调 — 绑定后可实时获取 Agent 进度,用于 UI 展示 */
@@ -3623,6 +4131,7 @@ var WebAgent = class WebAgent {
3623
4131
  this.memory = options.memory ?? false;
3624
4132
  this.autoSnapshot = options.autoSnapshot ?? true;
3625
4133
  this.snapshotOptions = options.snapshotOptions ?? {};
4134
+ this.roundStabilityWait = options.roundStabilityWait;
3626
4135
  if (typeof options.systemPrompt === "string") this.setSystemPrompt(options.systemPrompt);
3627
4136
  else if (options.systemPrompt && typeof options.systemPrompt === "object") this.setSystemPrompts(options.systemPrompt);
3628
4137
  }
@@ -3815,6 +4324,7 @@ var WebAgent = class WebAgent {
3815
4324
  history: this.memory ? this.history : void 0,
3816
4325
  dryRun: this.dryRun,
3817
4326
  maxRounds: this.maxRounds,
4327
+ roundStabilityWait: this.roundStabilityWait,
3818
4328
  callbacks: wrappedCallbacks
3819
4329
  });
3820
4330
  if (this.memory) this.history = result.messages;