agentpage 0.0.1 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -791,7 +791,7 @@ var ToolRegistry = class {
791
791
  */
792
792
  function buildSystemPrompt(params = {}) {
793
793
  const sections = [];
794
- sections.push("You are AutoPilot, an AI agent embedded in the user's web page.\nYou can interact with the page by clicking elements, filling forms, reading content, and executing JavaScript.\nAlways confirm destructive actions with the user before executing.\n\n## 操作策略\n\n每次用户请求操作页面时,系统会自动附上当前页面的 DOM 快照。\n快照中每个元素都带有 ref 属性(基于层级位置的唯一路径,如 /body/main/form/button)。\n请严格遵循以下流程:\n1. 分析快照,理解页面结构和元素层级关系。\n2. 从快照中找到目标元素,复制其 ref 路径。\n3. ref 路径作为 dom 工具的 selector 参数传入。\n4. **禁止**猜测 CSS 选择器(如 'button'、'#id'、'.class'),必须使用快照中的 ref 路径。\n5. 规划操作步骤后,按顺序逐步执行。\n\n## 元素选择原则(语义优先)\n\n页面中可能存在多个文本相似的元素(如多个「发送」按钮、多个输入框)。\n**严禁仅凭元素文本匹配来选择操作对象**,必须结合以下语义上下文综合判断:\n1. **层级归属**:元素属于哪个区域/表单/卡片?从 ref 路径的父级结构判断(如 /body/main/form 下的按钮属于该表单)。\n2. **功能关联**:元素与用户意图的功能是否匹配?一个「发送」按钮在聊天区域,另一个在表单区域,要根据用户想操作的功能区来选择。\n3. **周围元素**:查看目标元素的兄弟节点和父级容器,理解它所在的功能模块。\n4. **属性辅助**:利用 id、class、placeholder、aria-label、name 等属性辅助确认元素的用途。\n5. **操作上下文**:如果用户在一系列操作中(如先填写表单再点提交),选择与前序操作同区域的元素。\n\n示例:用户说「点击发送按钮」,页面有两个按钮都叫「发送」:\n- /body/div[1]/div/chat-area/button → 聊天发送按钮\n- /body/div[1]/div/form/button → 表单提交按钮\n你必须根据用户意图和对话上下文判断应该点击哪个,而不是随意选择。");
794
+ sections.push("You are AutoPilot, an AI agent embedded in the user's web page.\nYou can click, fill forms, read content, navigate, and execute JavaScript.\n\n## 操作规则\n\n1. 快照中每个元素末尾的 `#xxxx` hash ID。操作时**必须**用 `#xxxx` 作为 dom 工具的 selector 参数。\n2. **禁止**猜测 CSS 选择器,只用快照中的 hash ID。\n3. 多个相似元素时,根据层级结构、所在功能区域和用户意图判断目标。\n4. 快照看不到目标时,先滚动页面或用 snapshot 获取更深层级。\n5. 破坏性操作前先与用户确认。");
795
795
  const tools = params.tools ?? [];
796
796
  if (tools.length > 0) {
797
797
  const toolLines = tools.map((t) => `- **${t.name}**: ${t.description}`);
@@ -800,6 +800,271 @@ function buildSystemPrompt(params = {}) {
800
800
  return sections.join("\n\n");
801
801
  }
802
802
 
803
+ //#endregion
804
+ //#region src/web/dom-tool.ts
805
/**
 * DOM Tool — DOM manipulation built on plain Web APIs.
 *
 * Stands in for Playwright-style click/fill/type operations by acting directly
 * inside the page context. Runtime environment: browser content script.
 *
 * Eight actions are supported:
 *   click        — click an element
 *   fill         — fill an input (the existing value is replaced)
 *   type         — simulate typing one character at a time
 *   get_text     — read an element's text content
 *   get_attr     — read an attribute value
 *   set_attr     — set an attribute
 *   add_class    — add a CSS class name
 *   remove_class — remove a CSS class name
 */
/** Fallback element wait, in milliseconds, used when the caller supplies no usable timeout. */
const DEFAULT_WAIT_MS = 1000;
/** RefStore currently in effect; stays undefined until setActiveRefStore() installs one. */
let activeRefStore = undefined;
824
/**
 * Pause for a number of milliseconds.
 *
 * @param {number} ms Delay passed straight to setTimeout.
 * @returns {Promise<void>} Promise that resolves (with no value) once the timer fires.
 */
function sleep(ms) {
	return new Promise((wake) => {
		setTimeout(wake, ms);
	});
}
827
/**
 * Safely resolve a selector to a DOM element.
 *
 * Two addressing modes are tried in this order:
 *   1. Hash ID — the selector starts with "#" and the remainder is a key of the
 *      active RefStore; the stored element is returned without touching the DOM query API.
 *   2. CSS selector — everything else (including "#id" values the RefStore does not
 *      know) goes through document.querySelector.
 *
 * @param {string} selector Snapshot hash ID (e.g. "#a1b2c") or a CSS selector.
 * @returns {Element|string} The matched element, or a human-readable error message.
 *   Messages starting with "未找到" mean "not found"; messages starting with
 *   "选择器语法错误" mean the lookup threw (callers branch on these prefixes).
 */
function queryElement(selector) {
	try {
		if (selector.startsWith("#") && activeRefStore) {
			const id = selector.slice(1);
			if (activeRefStore.has(id)) {
				const el = activeRefStore.get(id);
				// A stored node that has been detached since the snapshot was taken is stale:
				// acting on it would silently do nothing on the live page, so report it as
				// missing. The strict `=== false` keeps entries without the property usable.
				if (!el || el.isConnected === false) return `未找到 ref "${selector}" 对应的元素(可能已被移除或快照已过期)`;
				return el;
			}
		}
		const el = document.querySelector(selector);
		if (!el) return `未找到匹配 "${selector}" 的元素`;
		return el;
	} catch (e) {
		// querySelector throws on malformed selectors; surface that as a message, not an exception.
		return `选择器语法错误: ${selector}`;
	}
}
851
/**
 * Install the RefStore that hash-ID selectors are resolved against.
 * (Per the original note, WebAgent calls this when a chat starts.)
 *
 * @param store RefStore to make active; whatever was active before is replaced.
 * @returns {void}
 */
function setActiveRefStore(store) {
	activeRefStore = store;
}
857
/**
 * Expose the RefStore that is currently active so other tools can reuse it.
 *
 * @returns The active RefStore, or undefined when none has been installed.
 */
function getActiveRefStore() {
	return activeRefStore;
}
861
/**
 * Poll for an element until it appears or the time budget runs out.
 *
 * The selector is looked up immediately and then again after every 100 ms pause
 * for as long as the elapsed time is still within `timeoutMs`.
 *
 * @param {string} selector Hash ID or CSS selector, forwarded to queryElement.
 * @param {number} timeoutMs Time budget in milliseconds.
 * @returns {Promise<Element|string|null>}
 *   - Element: the element was found
 *   - string: the lookup reported a selector syntax error (polling stops at once)
 *   - null: the budget expired without a match
 */
async function waitForElement(selector, timeoutMs) {
	const startedAt = Date.now();
	const withinBudget = () => Date.now() - startedAt <= timeoutMs;
	for (; withinBudget(); await sleep(100)) {
		const outcome = queryElement(selector);
		const isMessage = typeof outcome === "string";
		// Only "not found" messages are worth retrying; anything else ends the wait.
		if (!isMessage || outcome.startsWith("选择器语法错误")) return outcome;
	}
	return null;
}
877
/**
 * Work out how long to wait for the target element, in whole milliseconds.
 *
 * Precedence: `params.waitMs`, then `params.waitSeconds` (converted to ms), then
 * DEFAULT_WAIT_MS. A candidate only counts when it is a finite number; the chosen
 * value is floored and clamped so it is never negative.
 *
 * @param {{waitMs?: unknown, waitSeconds?: unknown}} params Tool parameters.
 * @returns {number} Wait time in milliseconds.
 */
function resolveWaitMs(params) {
	const isUsable = (candidate) => typeof candidate === "number" && Number.isFinite(candidate);
	const toWholeMs = (ms) => Math.max(0, Math.floor(ms));
	const fromMs = params.waitMs;
	if (isUsable(fromMs)) return toWholeMs(fromMs);
	const fromSeconds = params.waitSeconds;
	if (isUsable(fromSeconds)) return toWholeMs(fromSeconds * 1e3);
	return DEFAULT_WAIT_MS;
}
884
/**
 * Announce a programmatic value change the way user input would.
 *
 * Fires a bubbling, cancelable `input` event followed by a bubbling, cancelable
 * `change` event on the element. Per the original note this is meant to keep
 * framework bindings (React, Vue, ...) in sync with the new value.
 *
 * @param {EventTarget} el Element whose value was just changed.
 * @returns {void}
 */
function dispatchInputEvents(el) {
	for (const eventName of ["input", "change"]) {
		el.dispatchEvent(new Event(eventName, {
			bubbles: true,
			cancelable: true
		}));
	}
}
897
/**
 * Produce a short human-readable description of an element, used in action
 * results to show which DOM node was actually hit.
 *
 * Format: <tag#id.class> "text" [attr=val, ...]
 *   - at most the first three class names are listed (string classNames only);
 *   - the text is the trimmed textContent cut to 40 characters, omitted when empty;
 *   - the bracketed list holds whichever of type/name/placeholder/href/role are
 *     present and non-empty, and is omitted when there are none.
 *
 * @param {Element} el Element to describe.
 * @returns {string} The description.
 */
function describeElement(el) {
	let label = el.tagName.toLowerCase();
	if (el.id) label += `#${el.id}`;
	const rawClass = el.className;
	// Non-string classNames (e.g. objects) are skipped by the typeof guard.
	if (rawClass && typeof rawClass === "string") {
		const firstThree = rawClass.trim().split(/\s+/).filter(Boolean).slice(0, 3);
		for (const name of firstThree) label += `.${name}`;
	}
	let description = `<${label}>`;
	const snippet = el.textContent?.trim().slice(0, 40) ?? "";
	if (snippet) description += ` "${snippet}"`;
	const hints = ["type", "name", "placeholder", "href", "role"].flatMap((attr) => {
		const val = el.getAttribute(attr);
		return val ? [`${attr}=${val}`] : [];
	});
	if (hints.length > 0) description += ` [${hints.join(", ")}]`;
	return description;
}
920
/**
 * Build the `dom` tool: a tool definition (name, description, parameter schema,
 * async `execute`) that performs one DOM action on one element of the current page.
 *
 * `execute(params)` flow:
 *   1. Resolve the wait budget (resolveWaitMs) and require `selector`.
 *   2. Locate the element — polling via waitForElement when the budget is > 0,
 *      otherwise a single queryElement call.
 *   3. Run the requested action and return `{ content }` describing the outcome.
 *
 * Every failure is returned (never thrown) as `{ content, details: { error: true, ... } }`.
 * `details.code` is one of ELEMENT_NOT_FOUND, INVALID_SELECTOR, MISSING_PARAM,
 * NOT_EDITABLE or UNKNOWN_ACTION; failures caught from the DOM call itself carry no code.
 *
 * @returns {{name: string, description: string, schema: object, execute: Function}}
 */
function createDomTool() {
	/** Failure result using the same `details` envelope as the element-lookup errors. */
	const reject = (content, code, action, selector) => ({
		content,
		details: {
			error: true,
			code,
			action,
			selector
		}
	});
	/**
	 * Write an input/textarea value through the prototype's native `value` setter.
	 *
	 * Frameworks such as React shadow `value` with an own accessor on the node to
	 * track programmatic writes; a plain `el.value = x` updates that tracker, so the
	 * `input` event dispatched afterwards looks like "no change" and is ignored.
	 * Going through the prototype setter bypasses the shadow. Falls back to plain
	 * assignment when no native setter can be found.
	 */
	const writeFieldValue = (el, value) => {
		const proto = el instanceof HTMLTextAreaElement ? HTMLTextAreaElement.prototype : HTMLInputElement.prototype;
		const nativeSetter = Object.getOwnPropertyDescriptor(proto, "value")?.set;
		if (nativeSetter) nativeSetter.call(el, value);
		else el.value = value;
	};
	return {
		name: "dom",
		description: [
			"Perform DOM operations on the current page.",
			"Actions: click, fill, type, get_text, get_attr, set_attr, add_class, remove_class.",
			"Use the hash ID from DOM snapshot (e.g. #a1b2c) as selector."
		].join(" "),
		schema: Type.Object({
			action: Type.String({ description: "DOM action: click | fill | type | get_text | get_attr | set_attr | add_class | remove_class" }),
			// Example kept in line with the tool description and the snapshot format.
			selector: Type.String({ description: "Element hash ID from snapshot (e.g. #a1b2c) or CSS selector" }),
			value: Type.Optional(Type.String({ description: "Value for fill/type/set_attr actions" })),
			attribute: Type.Optional(Type.String({ description: "Attribute name for get_attr/set_attr actions" })),
			className: Type.Optional(Type.String({ description: "CSS class name for add_class/remove_class" })),
			waitMs: Type.Optional(Type.Number({ description: "Optional wait timeout in ms before action (default: 1000). Use 0 to disable waiting." })),
			waitSeconds: Type.Optional(Type.Number({ description: "Optional wait timeout in seconds before action. Used when waitMs is not provided." }))
		}),
		execute: async (params) => {
			const action = params.action;
			const selector = params.selector;
			const waitMs = resolveWaitMs(params);
			if (!selector) return reject("缺少 selector 参数", "MISSING_PARAM", action, selector);
			let el;
			if (waitMs > 0) {
				const found = await waitForElement(selector, waitMs);
				// A string from waitForElement is always a selector syntax error.
				if (typeof found === "string") return {
					content: found,
					details: {
						error: true,
						code: "INVALID_SELECTOR",
						action,
						selector
					}
				};
				if (!found) return {
					content: `未找到匹配 "${selector}" 的元素`,
					details: {
						error: true,
						code: "ELEMENT_NOT_FOUND",
						action,
						selector,
						waitMs
					}
				};
				el = found;
			} else {
				const elOrError = queryElement(selector);
				if (typeof elOrError === "string") return {
					content: elOrError,
					details: {
						error: true,
						code: elOrError.startsWith("未找到") ? "ELEMENT_NOT_FOUND" : "INVALID_SELECTOR",
						action,
						selector,
						waitMs
					}
				};
				el = elOrError;
			}
			try {
				switch (action) {
					case "click":
						if (el instanceof HTMLElement) {
							el.focus();
							el.click();
						} else el.dispatchEvent(new MouseEvent("click", { bubbles: true }));
						return { content: `已点击 ${describeElement(el)}` };
					case "fill": {
						const value = params.value;
						if (value === void 0) return reject("缺少 value 参数", "MISSING_PARAM", action, selector);
						if (el instanceof HTMLInputElement || el instanceof HTMLTextAreaElement) {
							el.focus();
							writeFieldValue(el, value);
							dispatchInputEvents(el);
						} else if (el instanceof HTMLElement && el.isContentEditable) {
							el.focus();
							el.textContent = value;
							el.dispatchEvent(new Event("input", { bubbles: true }));
						} else return reject(`"${selector}" 不是可编辑元素`, "NOT_EDITABLE", action, selector);
						return { content: `已填写 ${describeElement(el)}: "${value}"` };
					}
					case "type": {
						const value = params.value;
						if (value === void 0) return reject("缺少 value 参数", "MISSING_PARAM", action, selector);
						if (el instanceof HTMLElement) el.focus();
						const isFormField = el instanceof HTMLInputElement || el instanceof HTMLTextAreaElement;
						const isRichText = !isFormField && el instanceof HTMLElement && el.isContentEditable;
						for (const char of value) {
							el.dispatchEvent(new KeyboardEvent("keydown", {
								key: char,
								bubbles: true
							}));
							el.dispatchEvent(new KeyboardEvent("keypress", {
								key: char,
								bubbles: true
							}));
							// Synthetic key events never insert text themselves, so append the
							// character explicitly. Other elements only receive the events
							// (useful for keyboard handlers) and are left untouched.
							if (isFormField) writeFieldValue(el, el.value + char);
							else if (isRichText) el.textContent += char;
							el.dispatchEvent(new Event("input", { bubbles: true }));
							el.dispatchEvent(new KeyboardEvent("keyup", {
								key: char,
								bubbles: true
							}));
						}
						return { content: `已逐字输入到 ${describeElement(el)}: "${value}"` };
					}
					case "get_text": {
						const text = el.textContent?.trim() ?? "";
						return { content: `${describeElement(el)} 的文本内容:${text || "(空)"}` };
					}
					case "get_attr": {
						const attribute = params.attribute;
						if (!attribute) return reject("缺少 attribute 参数", "MISSING_PARAM", action, selector);
						const attrValue = el.getAttribute(attribute);
						return { content: `${describeElement(el)} 的 ${attribute} = ${attrValue ?? "(不存在)"}` };
					}
					case "set_attr": {
						const attribute = params.attribute;
						const value = params.value;
						if (!attribute || value === void 0) return reject("缺少 attribute 或 value 参数", "MISSING_PARAM", action, selector);
						el.setAttribute(attribute, value);
						return { content: `已设置 ${describeElement(el)} 的 ${attribute}="${value}"` };
					}
					case "add_class": {
						const className = params.className;
						if (!className) return reject("缺少 className 参数", "MISSING_PARAM", action, selector);
						el.classList.add(className);
						return { content: `已添加 class "${className}" 到 ${describeElement(el)}` };
					}
					case "remove_class": {
						const className = params.className;
						if (!className) return reject("缺少 className 参数", "MISSING_PARAM", action, selector);
						el.classList.remove(className);
						return { content: `已移除 ${describeElement(el)} 的 class "${className}"` };
					}
					default: return reject(`未知的 DOM 动作: ${action}`, "UNKNOWN_ACTION", action, selector);
				}
			} catch (err) {
				// DOM calls can throw (e.g. invalid attribute or class names); report instead of propagating.
				return {
					content: `DOM 操作 "${action}" 失败: ${err instanceof Error ? err.message : String(err)}`,
					details: {
						error: true,
						action,
						selector
					}
				};
			}
		}
	};
}
1067
+
803
1068
  //#endregion
804
1069
  //#region src/web/page-info-tool.ts
805
1070
  /**
@@ -819,27 +1084,19 @@ function buildSystemPrompt(params = {}) {
819
1084
  /**
820
1085
  * 生成页面 DOM 快照 — 将 DOM 树转为 AI 可理解的文本描述。
821
1086
  *
822
- * 类似 Playwright ariaSnapshot(),但基于 Web API 实现。
823
- * 只遍历可见元素,跳过 script/style/svg 等无意义节点。
824
- *
825
- * 每个元素自动生成基于层级位置的 XPath 引用(ref),
826
- * AI 可以通过 ref 精确定位元素,无需猜测 CSS 选择器。
1087
+ * 基于 Web API 实现,只遍历可见元素,跳过 script/style/svg 等无意义节点。
1088
+ * 传入 RefStore 时,每个元素生成确定性 hash ID(如 #a1b2c),
1089
+ * AI 通过 hash ID 精确定位元素,无需猜测 CSS 选择器。
827
1090
  *
828
1091
  * 输出格式示例:
829
- * [header] ref="/body/header"
830
- * [nav] ref="/body/header/nav"
831
- * [a] "首页" href="/" ref="/body/header/nav/a[1]"
832
- * [a] "关于" href="/about" ref="/body/header/nav/a[2]"
833
- * [main] ref="/body/main"
834
- * [h1] "欢迎来到示例网站" ref="/body/main/h1"
835
- * [input] type="text" placeholder="搜索..." ref="/body/main/input"
836
- * [button] "搜索" id="search-btn" onclick ref="/body/main/button"
837
- *
838
- * 增强信息:
839
- * - id:元素的 id 属性
840
- * - placeholder:输入框的占位文本
841
- * - 事件绑定:onclick/onchange 等内联事件处理器
842
- * - 状态属性:disabled/checked/readonly/required 等
1092
+ * [header] #k9f2a
1093
+ * [nav] #m3d7e
1094
+ * [a] "首页" href="/" #p1c4b
1095
+ * [a] "关于" href="/about" #q8e5f
1096
+ * [main] #r2a6d
1097
+ * [h1] "欢迎" #s7g3h
1098
+ * [input] type="text" placeholder="搜索..." #t4j8k
1099
+ * [button] "搜索" id="search-btn" onclick #u5n2m
843
1100
  *
844
1101
  * @param root - 快照根元素(默认 document.body)
845
1102
  * @param options - 快照选项对象,或传入数字作为 maxDepth(向后兼容)
@@ -849,6 +1106,7 @@ function generateSnapshot(root = document.body, options = {}) {
849
1106
  const maxDepth = opts.maxDepth ?? 6;
850
1107
  const viewportOnly = opts.viewportOnly ?? true;
851
1108
  const pruneLayout = opts.pruneLayout ?? true;
1109
+ const refStore = opts.refStore;
852
1110
  const SKIP_TAGS = new Set([
853
1111
  "SCRIPT",
854
1112
  "STYLE",
@@ -967,7 +1225,7 @@ function generateSnapshot(root = document.body, options = {}) {
967
1225
  if (elId) attrs.push(`id="${elId}"`);
968
1226
  const className = el.getAttribute("class")?.trim();
969
1227
  if (className) {
970
- const classes = className.split(/\s+/).filter(Boolean).slice(0, 3).join(" ");
1228
+ const classes = className.split(/\s+/).filter((c) => c && !c.startsWith("data-v-") && c.length < 30).slice(0, 2).join(" ");
971
1229
  if (classes) attrs.push(`class="${classes}"`);
972
1230
  }
973
1231
  for (const attr of INTERACTIVE_ATTRS) {
@@ -979,11 +1237,11 @@ function generateSnapshot(root = document.body, options = {}) {
979
1237
  for (const attrObj of Array.from(el.attributes)) if (attrObj.name.startsWith(EVENT_PREFIX)) events.push(attrObj.name);
980
1238
  if (events.length > 0) attrs.push(`events=[${events.join(",")}]`);
981
1239
  const dataAttrs = [];
982
- for (const attrObj of Array.from(el.attributes)) if (attrObj.name.startsWith("data-") && dataAttrs.length < 3) dataAttrs.push(`${attrObj.name}="${attrObj.value.slice(0, 30)}"`);
1240
+ for (const attrObj of Array.from(el.attributes)) if (attrObj.name.startsWith("data-") && !attrObj.name.match(/^data-v-/) && dataAttrs.length < 2) dataAttrs.push(`${attrObj.name}="${attrObj.value.slice(0, 30)}"`);
983
1241
  if (dataAttrs.length > 0) attrs.push(...dataAttrs);
984
1242
  if ((el instanceof HTMLInputElement || el instanceof HTMLTextAreaElement) && el.value) {
985
1243
  const currentVal = el.value.slice(0, 60);
986
- if (el.getAttribute("value") !== currentVal) attrs.push(`current-value="${currentVal}"`);
1244
+ if (el.getAttribute("value") !== currentVal) attrs.push(`val="${currentVal}"`);
987
1245
  }
988
1246
  let directText = "";
989
1247
  for (let i = 0; i < el.childNodes.length; i++) {
@@ -1003,376 +1261,101 @@ function generateSnapshot(root = document.body, options = {}) {
1003
1261
  return childLines.join("\n");
1004
1262
  }
1005
1263
  let line = `${indent}[${tag}]`;
1006
- if (directText) line += ` "${directText.slice(0, 80)}"`;
1007
- if (attrs.length) line += ` ${attrs.join(" ")}`;
1008
- line += ` ref="${currentPath}"`;
1009
- const lines = [line];
1010
- for (let i = 0; i < el.children.length; i++) {
1011
- const childResult = walk(el.children[i], depth + 1, currentPath);
1012
- if (childResult) lines.push(childResult);
1013
- }
1014
- return lines.join("\n");
1015
- }
1016
- return walk(root, 0, "") || "(空页面)";
1017
- }
1018
- /**
1019
- * 查询所有匹配元素并返回摘要信息(标签、文本、关键属性)。
1020
- */
1021
- function queryAllElements(selector, limit = 20) {
1022
- try {
1023
- const elements = document.querySelectorAll(selector);
1024
- if (elements.length === 0) return `未找到匹配 "${selector}" 的元素`;
1025
- const results = [`找到 ${elements.length} 个元素:`];
1026
- const count = Math.min(elements.length, limit);
1027
- for (let i = 0; i < count; i++) {
1028
- const el = elements[i];
1029
- const tag = el.tagName.toLowerCase();
1030
- const text = el.textContent?.trim().slice(0, 60) ?? "";
1031
- const id = el.id ? `#${el.id}` : "";
1032
- const cls = el.className && typeof el.className === "string" ? `.${el.className.split(" ").filter(Boolean).join(".")}` : "";
1033
- results.push(` ${i + 1}. <${tag}${id}${cls}> "${text}"`);
1034
- }
1035
- if (elements.length > limit) results.push(` ...还有 ${elements.length - limit} 个元素`);
1036
- return results.join("\n");
1037
- } catch (e) {
1038
- return `选择器语法错误: ${selector}`;
1039
- }
1040
- }
1041
- function createPageInfoTool() {
1042
- return {
1043
- name: "page_info",
1044
- description: [
1045
- "Get information about the current page.",
1046
- "Actions: get_url, get_title, get_selection (selected text),",
1047
- "get_viewport (size & scroll), snapshot (DOM structure), query_all (find all matching elements)."
1048
- ].join(" "),
1049
- schema: Type.Object({
1050
- action: Type.String({ description: "Info action: get_url | get_title | get_selection | get_viewport | snapshot | query_all" }),
1051
- selector: Type.Optional(Type.String({ description: "CSS selector for query_all action" })),
1052
- maxDepth: Type.Optional(Type.Number({ description: "Max depth for snapshot (default: 6)" })),
1053
- viewportOnly: Type.Optional(Type.Boolean({ description: "Only snapshot elements visible in viewport (default: true)" })),
1054
- pruneLayout: Type.Optional(Type.Boolean({ description: "Collapse empty layout containers like div/span (default: true)" }))
1055
- }),
1056
- execute: async (params) => {
1057
- const action = params.action;
1058
- try {
1059
- switch (action) {
1060
- case "get_url": return { content: window.location.href };
1061
- case "get_title": return { content: document.title || "(无标题)" };
1062
- case "get_selection": return { content: (window.getSelection()?.toString().trim() ?? "") || "(未选中任何文本)" };
1063
- case "get_viewport": {
1064
- const info = {
1065
- viewportWidth: window.innerWidth,
1066
- viewportHeight: window.innerHeight,
1067
- scrollX: window.scrollX,
1068
- scrollY: window.scrollY,
1069
- pageWidth: document.documentElement.scrollWidth,
1070
- pageHeight: document.documentElement.scrollHeight
1071
- };
1072
- return { content: JSON.stringify(info, null, 2) };
1073
- }
1074
- case "snapshot": {
1075
- const maxDepth = params.maxDepth ?? 6;
1076
- const viewportOnly = params.viewportOnly ?? true;
1077
- const pruneLayout = params.pruneLayout ?? true;
1078
- return { content: generateSnapshot(document.body, {
1079
- maxDepth,
1080
- viewportOnly,
1081
- pruneLayout
1082
- }) };
1083
- }
1084
- case "query_all": {
1085
- const selector = params.selector;
1086
- if (!selector) return { content: "缺少 selector 参数" };
1087
- return { content: queryAllElements(selector) };
1088
- }
1089
- default: return { content: `未知的页面信息动作: ${action}` };
1090
- }
1091
- } catch (err) {
1092
- return {
1093
- content: `页面信息操作 "${action}" 失败: ${err instanceof Error ? err.message : String(err)}`,
1094
- details: {
1095
- error: true,
1096
- action
1097
- }
1098
- };
1099
- }
1100
- }
1101
- };
1102
- }
1103
-
1104
- //#endregion
1105
- //#region src/web/dom-tool.ts
1106
- /**
1107
- * DOM Tool — 基于 Web API 的 DOM 操作工具。
1108
- *
1109
- * 替代 Playwright 的 click/fill/type 等操作,直接在页面上下文中执行。
1110
- * 运行环境:浏览器 Content Script。
1111
- *
1112
- * 支持 8 种动作:
1113
- * click — 点击元素
1114
- * fill — 填写输入框(清空后设值)
1115
- * type — 逐字符模拟键入
1116
- * get_text — 获取元素文本内容
1117
- * get_attr — 获取元素属性值
1118
- * set_attr — 设置元素属性
1119
- * add_class — 添加 CSS 类名
1120
- * remove_class — 移除 CSS 类名
1121
- */
1122
- const DEFAULT_WAIT_MS = 1e3;
1123
- function sleep(ms) {
1124
- return new Promise((resolve) => setTimeout(resolve, ms));
1125
- }
1126
- /**
1127
- * 通过快照 ref(XPath 路径)解析到 DOM 元素。
1128
- *
1129
- * ref 格式示例:/body/div[1]/main/form/input[2]
1130
- * 每段为 tagName,可选 [n] 表示同标签兄弟中第 n 个(1-based)。
1131
- */
1132
- function resolveRef(ref) {
1133
- const segments = ref.split("/").filter(Boolean);
1134
- let current = document.documentElement;
1135
- for (let i = 0; i < segments.length; i++) {
1136
- const seg = segments[i];
1137
- if (!current) return null;
1138
- const match = seg.match(/^([a-z0-9-]+)(?:\[(\d+)\])?$/i);
1139
- if (!match) return null;
1140
- const tag = match[1].toUpperCase();
1141
- const index = match[2] ? parseInt(match[2], 10) : 1;
1142
- if (i === 0 && current.tagName === tag) continue;
1143
- const children = Array.from(current.children).filter((c) => c.tagName === tag);
1144
- const sameTagCount = children.length;
1145
- if (sameTagCount === 0) return null;
1146
- if (sameTagCount === 1) current = children[0];
1147
- else {
1148
- if (index < 1 || index > sameTagCount) return null;
1149
- current = children[index - 1];
1150
- }
1151
- }
1152
- return current;
1153
- }
1154
- /**
1155
- * 安全地查询 DOM 元素。
1156
- *
1157
- * 支持两种定位方式:
1158
- * - ref 路径(以 "/" 开头):使用快照生成的 XPath 精确定位
1159
- * - CSS 选择器(其他):传统 querySelector
1160
- */
1161
- function queryElement(selector) {
1162
- try {
1163
- if (selector.startsWith("/")) {
1164
- const el = resolveRef(selector);
1165
- if (!el) return `未找到 ref "${selector}" 对应的元素`;
1166
- return el;
1167
- }
1168
- const el = document.querySelector(selector);
1169
- if (!el) return `未找到匹配 "${selector}" 的元素`;
1170
- return el;
1171
- } catch (e) {
1172
- return `选择器语法错误: ${selector}`;
1173
- }
1174
- }
1175
- /**
1176
- * 在给定超时时间内轮询查找元素。
1177
- * - 返回 Element:找到元素
1178
- * - 返回 string:选择器语法错误
1179
- * - 返回 null:超时未找到
1180
- */
1181
- async function waitForElement(selector, timeoutMs) {
1182
- const start = Date.now();
1183
- while (Date.now() - start <= timeoutMs) {
1184
- const elOrError = queryElement(selector);
1185
- if (typeof elOrError !== "string") return elOrError;
1186
- if (elOrError.startsWith("选择器语法错误")) return elOrError;
1187
- await sleep(100);
1264
+ if (directText) line += ` "${directText.slice(0, 60)}"`;
1265
+ if (attrs.length) line += ` ${attrs.join(" ")}`;
1266
+ if (refStore) {
1267
+ const hashId = refStore.set(el, currentPath);
1268
+ line += ` #${hashId}`;
1269
+ } else line += ` ref="${currentPath}"`;
1270
+ const lines = [line];
1271
+ for (let i = 0; i < el.children.length; i++) {
1272
+ const childResult = walk(el.children[i], depth + 1, currentPath);
1273
+ if (childResult) lines.push(childResult);
1274
+ }
1275
+ return lines.join("\n");
1188
1276
  }
1189
- return null;
1190
- }
1191
- function resolveWaitMs(params) {
1192
- const waitMs = params.waitMs;
1193
- if (typeof waitMs === "number" && Number.isFinite(waitMs)) return Math.max(0, Math.floor(waitMs));
1194
- const waitSeconds = params.waitSeconds;
1195
- if (typeof waitSeconds === "number" && Number.isFinite(waitSeconds)) return Math.max(0, Math.floor(waitSeconds * 1e3));
1196
- return DEFAULT_WAIT_MS;
1197
- }
1198
- /**
1199
- * 模拟真实用户输入:触发 input、change 事件,兼容 React/Vue 等框架。
1200
- */
1201
- function dispatchInputEvents(el) {
1202
- el.dispatchEvent(new Event("input", {
1203
- bubbles: true,
1204
- cancelable: true
1205
- }));
1206
- el.dispatchEvent(new Event("change", {
1207
- bubbles: true,
1208
- cancelable: true
1209
- }));
1277
+ return walk(root, 0, "") || "(空页面)";
1210
1278
  }
1211
1279
  /**
1212
- * 生成元素的可读描述,用于在操作结果中展示实际命中的 DOM 节点。
1213
- * 格式:<tag#id.class> "文本" [attr=val, ...]
1280
+ * 查询所有匹配元素并返回摘要信息(标签、文本、关键属性)。
1214
1281
  */
1215
- function describeElement(el) {
1216
- const tag = el.tagName.toLowerCase();
1217
- const id = el.id ? `#${el.id}` : "";
1218
- const cls = el.className && typeof el.className === "string" ? el.className.trim().split(/\s+/).filter(Boolean).slice(0, 3).map((c) => `.${c}`).join("") : "";
1219
- const text = el.textContent?.trim().slice(0, 40) ?? "";
1220
- const textHint = text ? ` "${text}"` : "";
1221
- const hints = [];
1222
- for (const attr of [
1223
- "type",
1224
- "name",
1225
- "placeholder",
1226
- "href",
1227
- "role"
1228
- ]) {
1229
- const val = el.getAttribute(attr);
1230
- if (val) hints.push(`${attr}=${val}`);
1282
/**
 * List every element matching a CSS selector as a short text summary.
 *
 * Output: a header line with the total match count, then one numbered line per
 * element (`<tag#id.class1.class2> "text"`, text trimmed and cut to 60 characters),
 * and a trailing line with the number of omitted matches when the list was capped.
 *
 * @param {string} selector CSS selector passed to document.querySelectorAll.
 * @param {number} [limit=20] Maximum number of elements to list. Floored to an
 *   integer and clamped to >= 0; falls back to 20 when it is not a number.
 * @returns {string} The summary, a "not found" message, or a selector syntax
 *   error message when querySelectorAll throws.
 */
function queryAllElements(selector, limit = 20) {
	try {
		const elements = document.querySelectorAll(selector);
		if (elements.length === 0) return `未找到匹配 "${selector}" 的元素`;
		const results = [`找到 ${elements.length} 个元素:`];
		// Normalize the cap so the loop bound and the "remaining" figure always agree.
		const requested = Math.floor(Number(limit));
		const cap = Number.isNaN(requested) ? 20 : Math.max(0, requested);
		const count = Math.min(elements.length, cap);
		for (let i = 0; i < count; i++) {
			const el = elements[i];
			const tag = el.tagName.toLowerCase();
			const text = el.textContent?.trim().slice(0, 60) ?? "";
			const id = el.id ? `#${el.id}` : "";
			// Class names may be separated by any whitespace (tabs, newlines), and a
			// whitespace-only attribute must not produce a dangling ".".
			const classNames = typeof el.className === "string" ? el.className.trim().split(/\s+/).filter(Boolean) : [];
			const cls = classNames.map((c) => `.${c}`).join("");
			results.push(` ${i + 1}. <${tag}${id}${cls}> "${text}"`);
		}
		if (elements.length > count) results.push(` ...还有 ${elements.length - count} 个元素`);
		return results.join("\n");
	} catch (e) {
		return `选择器语法错误: ${selector}`;
	}
}
1234
- function createDomTool() {
1302
+ function createPageInfoTool() {
1235
1303
  return {
1236
- name: "dom",
1304
+ name: "page_info",
1237
1305
  description: [
1238
- "Perform DOM operations on the current page.",
1239
- "Actions: click, fill, type, get_text, get_attr, set_attr, add_class, remove_class.",
1240
- "Use the ref path from the DOM snapshot (e.g. /body/main/button) as selector to precisely target elements.",
1241
- "CSS selectors are also supported but ref paths are preferred for accuracy."
1306
+ "Get information about the current page.",
1307
+ "Actions: get_url, get_title, get_selection (selected text),",
1308
+ "get_viewport (size & scroll), snapshot (DOM structure), query_all (find all matching elements)."
1242
1309
  ].join(" "),
1243
1310
  schema: Type.Object({
1244
- action: Type.String({ description: "DOM action: click | fill | type | get_text | get_attr | set_attr | add_class | remove_class" }),
1245
- selector: Type.String({ description: "Element ref path from snapshot (e.g. /body/main/button[2]) or CSS selector" }),
1246
- value: Type.Optional(Type.String({ description: "Value for fill/type/set_attr actions" })),
1247
- attribute: Type.Optional(Type.String({ description: "Attribute name for get_attr/set_attr actions" })),
1248
- className: Type.Optional(Type.String({ description: "CSS class name for add_class/remove_class" })),
1249
- waitMs: Type.Optional(Type.Number({ description: "Optional wait timeout in ms before action (default: 1000). Use 0 to disable waiting." })),
1250
- waitSeconds: Type.Optional(Type.Number({ description: "Optional wait timeout in seconds before action. Used when waitMs is not provided." }))
1311
+ action: Type.String({ description: "Info action: get_url | get_title | get_selection | get_viewport | snapshot | query_all" }),
1312
+ selector: Type.Optional(Type.String({ description: "CSS selector for query_all action" })),
1313
+ maxDepth: Type.Optional(Type.Number({ description: "Max depth for snapshot (default: 6)" })),
1314
+ viewportOnly: Type.Optional(Type.Boolean({ description: "Only snapshot elements visible in viewport (default: true)" })),
1315
+ pruneLayout: Type.Optional(Type.Boolean({ description: "Collapse empty layout containers like div/span (default: true)" }))
1251
1316
  }),
1252
1317
  execute: async (params) => {
1253
1318
  const action = params.action;
1254
- const selector = params.selector;
1255
- const waitMs = resolveWaitMs(params);
1256
- if (!selector) return { content: "缺少 selector 参数" };
1257
- let el;
1258
- if (waitMs > 0) {
1259
- const found = await waitForElement(selector, waitMs);
1260
- if (typeof found === "string") return {
1261
- content: found,
1262
- details: {
1263
- error: true,
1264
- code: "INVALID_SELECTOR",
1265
- action,
1266
- selector
1267
- }
1268
- };
1269
- if (!found) return {
1270
- content: `未找到匹配 "${selector}" 的元素`,
1271
- details: {
1272
- error: true,
1273
- code: "ELEMENT_NOT_FOUND",
1274
- action,
1275
- selector,
1276
- waitMs
1277
- }
1278
- };
1279
- el = found;
1280
- } else {
1281
- const elOrError = queryElement(selector);
1282
- if (typeof elOrError === "string") return {
1283
- content: elOrError,
1284
- details: {
1285
- error: true,
1286
- code: elOrError.startsWith("未找到") ? "ELEMENT_NOT_FOUND" : "INVALID_SELECTOR",
1287
- action,
1288
- selector,
1289
- waitMs
1290
- }
1291
- };
1292
- el = elOrError;
1293
- }
1294
1319
  try {
1295
1320
  switch (action) {
1296
- case "click":
1297
- if (el instanceof HTMLElement) {
1298
- el.focus();
1299
- el.click();
1300
- } else el.dispatchEvent(new MouseEvent("click", { bubbles: true }));
1301
- return { content: `已点击 ${describeElement(el)}` };
1302
- case "fill": {
1303
- const value = params.value;
1304
- if (value === void 0) return { content: "缺少 value 参数" };
1305
- if (el instanceof HTMLInputElement || el instanceof HTMLTextAreaElement) {
1306
- el.focus();
1307
- el.value = value;
1308
- dispatchInputEvents(el);
1309
- } else if (el instanceof HTMLElement && el.isContentEditable) {
1310
- el.focus();
1311
- el.textContent = value;
1312
- el.dispatchEvent(new Event("input", { bubbles: true }));
1313
- } else return { content: `"${selector}" 不是可编辑元素` };
1314
- return { content: `已填写 ${describeElement(el)}: "${value}"` };
1315
- }
1316
- case "type": {
1317
- const value = params.value;
1318
- if (value === void 0) return { content: "缺少 value 参数" };
1319
- if (el instanceof HTMLElement) el.focus();
1320
- for (const char of value) {
1321
- el.dispatchEvent(new KeyboardEvent("keydown", {
1322
- key: char,
1323
- bubbles: true
1324
- }));
1325
- el.dispatchEvent(new KeyboardEvent("keypress", {
1326
- key: char,
1327
- bubbles: true
1328
- }));
1329
- if (el instanceof HTMLInputElement || el instanceof HTMLTextAreaElement) el.value += char;
1330
- el.dispatchEvent(new Event("input", { bubbles: true }));
1331
- el.dispatchEvent(new KeyboardEvent("keyup", {
1332
- key: char,
1333
- bubbles: true
1334
- }));
1335
- }
1336
- return { content: `已逐字输入到 ${describeElement(el)}: "${value}"` };
1337
- }
1338
- case "get_text": {
1339
- const text = el.textContent?.trim() ?? "";
1340
- return { content: `${describeElement(el)} 的文本内容:${text || "(空)"}` };
1341
- }
1342
- case "get_attr": {
1343
- const attribute = params.attribute;
1344
- if (!attribute) return { content: "缺少 attribute 参数" };
1345
- const attrValue = el.getAttribute(attribute);
1346
- return { content: `${describeElement(el)} 的 ${attribute} = ${attrValue ?? "(不存在)"}` };
1347
- }
1348
- case "set_attr": {
1349
- const attribute = params.attribute;
1350
- const value = params.value;
1351
- if (!attribute || value === void 0) return { content: "缺少 attribute 或 value 参数" };
1352
- el.setAttribute(attribute, value);
1353
- return { content: `已设置 ${describeElement(el)} 的 ${attribute}="${value}"` };
1321
+ case "get_url": return { content: window.location.href };
1322
+ case "get_title": return { content: document.title || "(无标题)" };
1323
+ case "get_selection": return { content: (window.getSelection()?.toString().trim() ?? "") || "(未选中任何文本)" };
1324
+ case "get_viewport": {
1325
+ const info = {
1326
+ viewportWidth: window.innerWidth,
1327
+ viewportHeight: window.innerHeight,
1328
+ scrollX: window.scrollX,
1329
+ scrollY: window.scrollY,
1330
+ pageWidth: document.documentElement.scrollWidth,
1331
+ pageHeight: document.documentElement.scrollHeight
1332
+ };
1333
+ return { content: JSON.stringify(info, null, 2) };
1354
1334
  }
1355
- case "add_class": {
1356
- const className = params.className;
1357
- if (!className) return { content: "缺少 className 参数" };
1358
- el.classList.add(className);
1359
- return { content: `已添加 class "${className}" 到 ${describeElement(el)}` };
1335
+ case "snapshot": {
1336
+ const maxDepth = params.maxDepth ?? 6;
1337
+ const viewportOnly = params.viewportOnly ?? true;
1338
+ const pruneLayout = params.pruneLayout ?? true;
1339
+ return { content: generateSnapshot(document.body, {
1340
+ maxDepth,
1341
+ viewportOnly,
1342
+ pruneLayout,
1343
+ refStore: getActiveRefStore()
1344
+ }) };
1360
1345
  }
1361
- case "remove_class": {
1362
- const className = params.className;
1363
- if (!className) return { content: "缺少 className 参数" };
1364
- el.classList.remove(className);
1365
- return { content: `已移除 ${describeElement(el)} 的 class "${className}"` };
1346
+ case "query_all": {
1347
+ const selector = params.selector;
1348
+ if (!selector) return { content: "缺少 selector 参数" };
1349
+ return { content: queryAllElements(selector) };
1366
1350
  }
1367
- default: return { content: `未知的 DOM 动作: ${action}` };
1351
+ default: return { content: `未知的页面信息动作: ${action}` };
1368
1352
  }
1369
1353
  } catch (err) {
1370
1354
  return {
1371
- content: `DOM 操作 "${action}" 失败: ${err instanceof Error ? err.message : String(err)}`,
1355
+ content: `页面信息操作 "${action}" 失败: ${err instanceof Error ? err.message : String(err)}`,
1372
1356
  details: {
1373
1357
  error: true,
1374
- action,
1375
- selector
1358
+ action
1376
1359
  }
1377
1360
  };
1378
1361
  }
@@ -1701,6 +1684,94 @@ function createEvaluateTool() {
1701
1684
  };
1702
1685
  }
1703
1686
 
1687
+ //#endregion
1688
+ //#region src/web/ref-store.ts
1689
+ /**
1690
+ * RefStore — 快照 hash ID 与 DOM 元素的映射表。
1691
+ *
1692
+ * 快照生成时,根据元素的 DOM 路径 + 页面 URL 生成确定性 hash ID,
1693
+ * 同时保存 ID → Element 的映射。AI 使用 hash ID 作为 selector 定位元素,
1694
+ * 免去超长 XPath 路径,大幅减少 token 消耗。
1695
+ *
1696
+ * 优势:
1697
+ * - **确定性**:同一元素无论快照顺序,始终得到相同 ID
1698
+ * - **并发安全**:多次快照不会产生 ID 冲突
1699
+ * - **跨页面隔离**:URL hash 作为命名空间,不同页面元素 ID 互不碰撞
1700
+ *
1701
+ * 生命周期:每次 WebAgent.chat() 调用时创建,对话结束后清空。
1702
+ *
1703
+ * 使用方:
1704
+ * page-info-tool.ts — generateSnapshot() 写入映射
1705
+ * dom-tool.ts — queryElement() 读取映射
1706
+ * index.ts — WebAgent 持有实例,管理生命周期
1707
+ */
1708
/**
 * Computes a 32-bit FNV-1a hash of a string.
 *
 * The string is consumed one UTF-16 code unit at a time (via `charCodeAt`),
 * so the result is deterministic for any given JavaScript string.
 *
 * @param {string} str Text to hash.
 * @returns {number} Unsigned 32-bit integer hash.
 */
function fnv1a(str) {
	const OFFSET_BASIS = 0x811c9dc5; // 2166136261
	const PRIME = 0x01000193; // 16777619
	let hash = OFFSET_BASIS;
	let pos = 0;
	while (pos < str.length) {
		// xor the code unit in, then multiply with 32-bit wrap-around semantics.
		hash = Math.imul(hash ^ str.charCodeAt(pos), PRIME);
		pos += 1;
	}
	// Reinterpret the signed 32-bit value as unsigned.
	return hash >>> 0;
}
1720
/**
 * Lookup table from short hash IDs to the objects (DOM elements) registered
 * under them.
 *
 * - `set(el, path)` registers an element and returns its hash ID
 * - `get(id)` returns the element stored under an ID
 * - `has(id)` reports whether an ID is registered
 * - `clear()` drops every entry
 * - `size` is the number of registered IDs
 */
var RefStore = class {
	/** Backing store: hash ID -> registered element. */
	map = /* @__PURE__ */ new Map();
	/** String prepended to every path before hashing (the page URL, or ""). */
	urlKey;
	/**
	 * @param {string} [url] Namespace prefix mixed into every hash so the same
	 *   path hashes differently under different URLs. A nullish value is
	 *   stored as the empty string.
	 */
	constructor(url) {
		this.urlKey = url === null || url === undefined ? "" : url;
	}
	/**
	 * Registers `el` under an ID derived from `urlKey + path` and returns that ID.
	 *
	 * The base ID is the base-36 rendering of the hash. If that ID is already
	 * held by a different element, a numeric suffix (2, 3, ...) is appended
	 * until a free slot, or the slot already holding `el`, is found.
	 *
	 * @param {*} el Element reference to store.
	 * @param {string} path XPath-like path of the element.
	 * @returns {string} ID under which `el` is now stored.
	 */
	set(el, path) {
		const stem = fnv1a(this.urlKey + path).toString(36);
		let candidate = stem;
		for (let n = 2; this.map.has(candidate) && this.map.get(candidate) !== el; n += 1) {
			candidate = stem + n;
		}
		this.map.set(candidate, el);
		return candidate;
	}
	/**
	 * Returns the element stored under `id`, or `undefined` when the ID is
	 * unknown. No check is made that the element is still attached to a
	 * document.
	 *
	 * @param {string} id Hash ID previously returned by `set`.
	 * @returns {*} Stored element or `undefined`.
	 */
	get(id) {
		return this.map.get(id);
	}
	/**
	 * @param {string} id Hash ID to test.
	 * @returns {boolean} Whether `id` is currently registered.
	 */
	has(id) {
		return this.map.has(id);
	}
	/** Removes every registered ID. */
	clear() {
		this.map.clear();
	}
	/** Number of registered IDs. */
	get size() {
		return this.map.size;
	}
};
1774
+
1704
1775
  //#endregion
1705
1776
  //#region src/web/messaging.ts
1706
1777
  /**
@@ -1931,27 +2002,16 @@ var WebAgent = class {
1931
2002
  async chat(message) {
1932
2003
  const client = this.client ?? this.createBuiltinClient();
1933
2004
  let systemPrompt = this.customSystemPrompt ?? buildSystemPrompt({ tools: this.registry.getDefinitions() });
2005
+ const refStore = new RefStore(globalThis.location?.href);
2006
+ setActiveRefStore(refStore);
1934
2007
  if (this.autoSnapshot) try {
1935
2008
  const snapshot = generateSnapshot(document.body, {
1936
2009
  maxDepth: 8,
1937
- ...this.snapshotOptions
2010
+ ...this.snapshotOptions,
2011
+ refStore
1938
2012
  });
1939
2013
  this.callbacks.onSnapshot?.(snapshot);
1940
- systemPrompt += [
1941
- "\n\n## 当前页面 DOM 快照(实时生成)\n",
1942
- "每个元素末尾的 ref=\"...\" 是基于层级位置生成的唯一路径。",
1943
- "操作元素时,必须使用 ref 路径作为 selector 参数(如 /body/main/form/input)。\n",
1944
- "```",
1945
- snapshot,
1946
- "```\n",
1947
- "## 操作规则\n",
1948
- "1. 从快照中找到目标元素,复制其 ref 路径。",
1949
- "2. 将 ref 路径作为 dom 工具的 selector 参数传入。",
1950
- "3. 禁止猜测 CSS 选择器(如 \"button\"、\"#send\"),必须使用快照中的 ref。",
1951
- "4. 如果快照中看不到目标元素,先滚动页面或调整 maxDepth 获取更深的快照。",
1952
- "5. 先规划操作步骤,再按顺序逐步执行。",
1953
- "6. **语义选择**:当页面存在多个文本相同的元素时,严禁随意选择。必须根据 ref 路径的父级结构、周围兄弟元素、所在功能区域来判断哪个才是用户意图中的目标。例如聊天区的\"发送\"和表单的\"提交\"是不同功能,要结合用户当前操作的上下文来选择正确的元素。"
1954
- ].join("\n");
2014
+ systemPrompt += `\n\n## 当前页面 DOM 快照\n\n\`\`\`\n${snapshot}\n\`\`\``;
1955
2015
  } catch {}
1956
2016
  const result = await executeAgentLoop({
1957
2017
  client,
@@ -1964,6 +2024,8 @@ var WebAgent = class {
1964
2024
  callbacks: this.callbacks
1965
2025
  });
1966
2026
  if (this.memory) this.history = result.messages;
2027
+ refStore.clear();
2028
+ setActiveRefStore(void 0);
1967
2029
  return result;
1968
2030
  }
1969
2031
  /**