@roll-agent/browser-use-agent 0.11.0 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,8 @@
1
+ import type { BrowserDomActionHint, NativeCdpController } from "@roll-agent/browser";
2
+ type DomActionCandidateController = Pick<NativeCdpController, "createIsolatedWorld" | "describeNode" | "evaluateJson" | "getDocument" | "querySelectorAllByNodeId">;
3
+ type DomActionCandidateOptions = {
4
+ readonly frameId?: string;
5
+ readonly maxCandidates?: number;
6
+ };
7
+ export declare function collectDomActionHints(controller: DomActionCandidateController, optionsOrMaxCandidates?: DomActionCandidateOptions | number): Promise<readonly BrowserDomActionHint[]>;
8
+ export {};
@@ -0,0 +1,6 @@
1
+ import type { BrowserContextManager, BrowserInspectablePage, BrowserRuntime } from "@roll-agent/browser";
2
+ export declare function resolveNativePageForBrowserTool(input: {
3
+ readonly ctxManager: BrowserContextManager;
4
+ readonly runtime: BrowserRuntime;
5
+ readonly pageId?: string;
6
+ }): Promise<BrowserInspectablePage>;
@@ -0,0 +1,99 @@
1
+ import { z } from "zod";
2
+ export declare const BrowserElementRefResolveStrategySchema: z.ZodEnum<["backend_node_id", "role_name_nth"]>;
3
+ export declare const BrowserElementRefTargetSchema: z.ZodObject<{
4
+ ref: z.ZodString;
5
+ role: z.ZodString;
6
+ name: z.ZodString;
7
+ x: z.ZodNumber;
8
+ y: z.ZodNumber;
9
+ resolvedBy: z.ZodEnum<["backend_node_id", "role_name_nth"]>;
10
+ backendNodeId: z.ZodOptional<z.ZodNumber>;
11
+ frameId: z.ZodOptional<z.ZodString>;
12
+ disabled: z.ZodBoolean;
13
+ }, "strip", z.ZodTypeAny, {
14
+ ref: string;
15
+ role: string;
16
+ name: string;
17
+ disabled: boolean;
18
+ x: number;
19
+ y: number;
20
+ resolvedBy: "backend_node_id" | "role_name_nth";
21
+ backendNodeId?: number | undefined;
22
+ frameId?: string | undefined;
23
+ }, {
24
+ ref: string;
25
+ role: string;
26
+ name: string;
27
+ disabled: boolean;
28
+ x: number;
29
+ y: number;
30
+ resolvedBy: "backend_node_id" | "role_name_nth";
31
+ backendNodeId?: number | undefined;
32
+ frameId?: string | undefined;
33
+ }>;
34
+ export declare const BrowserElementRefActionOutputSchema: z.ZodObject<{
35
+ success: z.ZodLiteral<true>;
36
+ ref: z.ZodString;
37
+ resolvedBy: z.ZodEnum<["backend_node_id", "role_name_nth"]>;
38
+ target: z.ZodObject<{
39
+ ref: z.ZodString;
40
+ role: z.ZodString;
41
+ name: z.ZodString;
42
+ x: z.ZodNumber;
43
+ y: z.ZodNumber;
44
+ resolvedBy: z.ZodEnum<["backend_node_id", "role_name_nth"]>;
45
+ backendNodeId: z.ZodOptional<z.ZodNumber>;
46
+ frameId: z.ZodOptional<z.ZodString>;
47
+ disabled: z.ZodBoolean;
48
+ }, "strip", z.ZodTypeAny, {
49
+ ref: string;
50
+ role: string;
51
+ name: string;
52
+ disabled: boolean;
53
+ x: number;
54
+ y: number;
55
+ resolvedBy: "backend_node_id" | "role_name_nth";
56
+ backendNodeId?: number | undefined;
57
+ frameId?: string | undefined;
58
+ }, {
59
+ ref: string;
60
+ role: string;
61
+ name: string;
62
+ disabled: boolean;
63
+ x: number;
64
+ y: number;
65
+ resolvedBy: "backend_node_id" | "role_name_nth";
66
+ backendNodeId?: number | undefined;
67
+ frameId?: string | undefined;
68
+ }>;
69
+ }, "strip", z.ZodTypeAny, {
70
+ ref: string;
71
+ target: {
72
+ ref: string;
73
+ role: string;
74
+ name: string;
75
+ disabled: boolean;
76
+ x: number;
77
+ y: number;
78
+ resolvedBy: "backend_node_id" | "role_name_nth";
79
+ backendNodeId?: number | undefined;
80
+ frameId?: string | undefined;
81
+ };
82
+ success: true;
83
+ resolvedBy: "backend_node_id" | "role_name_nth";
84
+ }, {
85
+ ref: string;
86
+ target: {
87
+ ref: string;
88
+ role: string;
89
+ name: string;
90
+ disabled: boolean;
91
+ x: number;
92
+ y: number;
93
+ resolvedBy: "backend_node_id" | "role_name_nth";
94
+ backendNodeId?: number | undefined;
95
+ frameId?: string | undefined;
96
+ };
97
+ success: true;
98
+ resolvedBy: "backend_node_id" | "role_name_nth";
99
+ }>;
@@ -0,0 +1,5 @@
1
+ import type { BrowserElementRefTarget, NativeCdpController } from "@roll-agent/browser";
2
+ import { NativeVisualActivitySession } from "../native-visual-activity-session.ts";
3
+ export type BrowserRefVisualSession = Pick<NativeVisualActivitySession, "begin" | "succeed" | "fail" | "previewMouseMotion" | "previewMouseClick">;
4
+ export declare function createBrowserRefVisualSession(controller: Pick<NativeCdpController, "evaluateJson">): BrowserRefVisualSession;
5
+ export declare function clickBrowserRefVisualTarget(controller: Pick<NativeCdpController, "dispatchMouseEvent">, session: BrowserRefVisualSession, target: BrowserElementRefTarget): Promise<void>;
@@ -0,0 +1,33 @@
1
+ import { type BrowserAxNode } from "@roll-agent/browser";
2
+ export declare const browserSnapshot: import("@roll-agent/sdk").ToolDefinition<{
3
+ pageId?: string | undefined;
4
+ maxNodes?: number | undefined;
5
+ interactiveOnly?: boolean | undefined;
6
+ maxDepth?: number | undefined;
7
+ }, {
8
+ page: {
9
+ pageId: string;
10
+ url: string;
11
+ title: string;
12
+ boundPlatform: "zhipin" | "yupao" | null;
13
+ detectedPlatform: "zhipin" | "yupao" | null;
14
+ isSelectedForPlatform: boolean;
15
+ };
16
+ snapshot: {
17
+ truncated: boolean;
18
+ nodes: BrowserAxNode[];
19
+ refs: {
20
+ ref: string;
21
+ role: string;
22
+ name: string;
23
+ nth: number;
24
+ disabled: boolean;
25
+ backendNodeId?: number | undefined;
26
+ frameId?: string | undefined;
27
+ }[];
28
+ nodeCount: number;
29
+ maxNodes: number;
30
+ interactiveOnly: boolean;
31
+ maxDepth?: number | undefined;
32
+ };
33
+ }>;
@@ -0,0 +1,22 @@
1
+ export declare const clickRef: import("@roll-agent/sdk").ToolDefinition<{
2
+ ref: string;
3
+ pageId?: string | undefined;
4
+ browserActionApproval?: {
5
+ id: string;
6
+ } | undefined;
7
+ }, {
8
+ ref: string;
9
+ target: {
10
+ ref: string;
11
+ role: string;
12
+ name: string;
13
+ disabled: boolean;
14
+ x: number;
15
+ y: number;
16
+ resolvedBy: "backend_node_id" | "role_name_nth";
17
+ backendNodeId?: number | undefined;
18
+ frameId?: string | undefined;
19
+ };
20
+ success: true;
21
+ resolvedBy: "backend_node_id" | "role_name_nth";
22
+ }>;
@@ -0,0 +1,24 @@
1
+ export declare const typeRef: import("@roll-agent/sdk").ToolDefinition<{
2
+ ref: string;
3
+ text: string;
4
+ pageId?: string | undefined;
5
+ clear?: boolean | undefined;
6
+ browserActionApproval?: {
7
+ id: string;
8
+ } | undefined;
9
+ }, {
10
+ ref: string;
11
+ target: {
12
+ ref: string;
13
+ role: string;
14
+ name: string;
15
+ disabled: boolean;
16
+ x: number;
17
+ y: number;
18
+ resolvedBy: "backend_node_id" | "role_name_nth";
19
+ backendNodeId?: number | undefined;
20
+ frameId?: string | undefined;
21
+ };
22
+ success: true;
23
+ resolvedBy: "backend_node_id" | "role_name_nth";
24
+ }>;
@@ -35,8 +35,8 @@ export declare const zhipinGetCandidateInfo: import("@roll-agent/sdk").ToolDefin
35
35
  };
36
36
  chatMessages: {
37
37
  index: number;
38
- time: string;
39
38
  content: string;
39
+ time: string;
40
40
  sender: "candidate" | "recruiter" | "system";
41
41
  messageType: "text" | "resume" | "system" | "wechat-exchange";
42
42
  }[];
@@ -14,8 +14,8 @@ export declare const zhipinGetCandidateList: import("@roll-agent/sdk").ToolDefin
14
14
  }, {
15
15
  success: boolean;
16
16
  candidates: {
17
- index: number;
18
17
  name: string;
18
+ index: number;
19
19
  candidateId: string;
20
20
  age: string;
21
21
  experience: string;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@roll-agent/browser-use-agent",
3
- "version": "0.11.0",
3
+ "version": "0.12.0",
4
4
  "type": "module",
5
5
  "repository": {
6
6
  "type": "git",
@@ -47,7 +47,7 @@
47
47
  "zod": "^3.25.76",
48
48
  "@roll-agent/reply-authority-client": "0.1.2",
49
49
  "@roll-agent/sdk": "0.2.0",
50
- "@roll-agent/browser": "0.4.0"
50
+ "@roll-agent/browser": "0.5.0"
51
51
  },
52
52
  "devDependencies": {
53
53
  "@types/node": "^22.0.0"
@@ -0,0 +1,151 @@
1
+ # Generic Browser Refs
2
+
3
+ Use this reference when an orchestrator needs to operate a browser page element that is not covered
4
+ by a platform-specific tool.
5
+
6
+ ## Purpose
7
+
8
+ `browser_snapshot`, `click_ref`, and `type_ref` provide a generic accessibility-driven observe/action
9
+ loop:
10
+
11
+ ```text
12
+ AX snapshot -> select emitted @eN -> click/type -> re-observe or read back
13
+ ```
14
+
15
+ This is a fallback layer for unmodeled page operations. For BOSS workflows, prefer `zhipin_*` tools
16
+ when they already express the business action.
17
+
18
+ ## Tool Summary
19
+
20
+ | Tool | Input | Output | Use it for |
21
+ | --- | --- | --- | --- |
22
+ | `browser_snapshot` | `pageId?`, `maxDepth?`, `maxNodes?`, `interactiveOnly?` | `page`, `snapshot` | Observe the current page's AX tree and get `@eN` refs. It also merges limited DOM-action hints for non-semantic controls such as clickable tabs rendered as `span`, including same-target iframes when Chrome exposes child `frameId` values. |
23
+ | `click_ref` | `ref`, `pageId?`, `browserActionApproval?` | `success`, `ref`, `resolvedBy`, `target` | Click a ref returned by `browser_snapshot`. |
24
+ | `type_ref` | `ref`, `text`, `clear?`, `pageId?`, `browserActionApproval?` | `success`, `ref`, `resolvedBy`, `target` | Focus a ref, optionally clear it, then insert text. |
25
+
26
+ `browser_snapshot.snapshot` contains:
27
+
28
+ | Field | Meaning |
29
+ | --- | --- |
30
+ | `nodes` | AX nodes returned to the orchestrator. With `interactiveOnly:true`, this is a flat list of interactive nodes. |
31
+ | `refs` | Current-snapshot handles, shaped as `@e1`, `@e2`, ... |
32
+ | `nodeCount` | Number of returned nodes. |
33
+ | `truncated` | `true` when traversal stopped because the `maxNodes` cap was reached. |
34
+ | `maxNodes` | Effective node cap after applying `BROWSER_SECURITY_JSON.maxSnapshotNodes`. |
35
+ | `interactiveOnly` | Whether non-interactive context nodes were omitted. |
36
+ | `maxDepth` | Optional AX tree depth cap used for this snapshot. |
37
+
38
+ Each `refs[]` item contains `ref`, optional `backendNodeId`, optional `frameId`, `role`, `name`,
39
+ `nth`, and `disabled`. AX-native refs use their AX role. DOM-action refs use `role:"clickable"`,
40
+ `role:"focusable"`, or `role:"editable"`, preserve a real `backendNodeId` when Chrome exposes one,
41
+ and appear in `nodes[]` with `properties.domActionable:true`, `properties.domActionKind`, and
42
+ `properties.domActionHints`.
43
+
44
+ Iframe handling:
45
+
46
+ ```text
47
+ main AX tree -> iframe node backendNodeId -> DOM.describeNode -> child frameId
48
+ -> Page.createIsolatedWorld({ frameId }) + Runtime.evaluate in that frame
49
+ -> DOM.getDocument({ pierce:true }) maps marker attributes back to backendNodeId
50
+ -> Accessibility.getFullAXTree({ frameId })
51
+ -> repeat for nested same-target iframes until maxNodes or frame de-duplication
52
+ -> child refs carry frameId
53
+ ```
54
+
55
+ When an iframe child ref is clicked or typed into, `click_ref` / `type_ref` keep the `frameId` in the
56
+ resolved `target`. If the original `backendNodeId` is stale, the fallback re-queries that same frame's
57
+ AX tree before dispatching the viewport action.
58
+
59
+ ## Selection Logic
60
+
61
+ Choose a target in this order:
62
+
63
+ 1. Match the user's intent against `role` and `name`, for example `role:"button"` and `name:"交换电话"`.
64
+ 2. Reject refs with `disabled:true`.
65
+ 3. For non-semantic clickable text, match `role:"clickable"`, the visible label, and
66
+ `properties.domActionable:true`; examples include tab/filter labels such as `未读`.
67
+ 4. If several nodes match, disambiguate them using the surrounding context in `nodes`, or take
68
+ another snapshot with `interactiveOnly:false` and a small `maxDepth` to see that context.
69
+ 5. If a matching ref includes `frameId`, pass the ref normally; do not pass `frameId` manually.
70
+ 6. Keep the selected `page.pageId` and pass it back to `click_ref` / `type_ref` when multiple pages are open.
71
+
72
+ Do not construct refs manually. Only pass refs emitted by the most recent snapshot for that page.
73
+
74
+ ## Action Flow
75
+
76
+ ```bash
77
+ roll run browser-use-agent browser_snapshot --input-json '{"interactiveOnly":true}' --json
78
+ roll run browser-use-agent click_ref --input-json '{"ref":"@e3"}' --json
79
+ roll run browser-use-agent browser_snapshot --input-json '{"interactiveOnly":true}' --json
80
+ ```
81
+
82
+ For text input:
83
+
84
+ ```bash
85
+ roll run browser-use-agent type_ref --input-json '{"ref":"@e5","text":"hello","clear":true}' --json
86
+ ```
87
+
88
+ When `BROWSER_SECURITY_JSON.actionPolicy` is `confirm`, the first side-effecting action can return
89
+ `needs_confirmation`. In that case, take `details.approvalRequest.retryInput` from the structured
90
+ error and merge it into the retry input unchanged.
91
+
92
+ ## Visual Feedback
93
+
94
+ The generic ref tools use the native CDP visual feedback path:
95
+
96
+ | Tool | Activity capsule | Visual cursor |
97
+ | --- | --- | --- |
98
+ | `browser_snapshot` | Shows reading and completion/failure state. | Not applicable because it is read-only. |
99
+ | `click_ref` | Shows click progress and completion/failure state. | Shows pointer placement and click pulse for the resolved target point. |
100
+ | `type_ref` | Shows input progress and completion/failure state. | Shows pointer placement and click pulse before text insertion. |
101
+
102
+ `BROWSER_VISUAL_ACTIVITY=false` disables the capsule. `BROWSER_VISUAL_CURSOR=false` disables the
103
+ pointer and click pulse.
104
+
105
+ ## Staleness Rules
106
+
107
+ Refresh the snapshot before reusing `@eN` after any of these events:
108
+
109
+ - navigation, reload, redirect, or platform switch
110
+ - modal open/close
111
+ - list filtering, search, sort, or virtual-scroll loading
112
+ - a prior `click_ref` that may re-render the target area
113
+ - a `click_ref` / `type_ref` call that returned a stale-ref or not-found error
114
+
115
+ `click_ref` and `type_ref` first resolve by `backendNodeId` when the ref has one. If that fails, they
116
+ fall back to `role/name/nth`. The fallback is useful for small re-renders, but it matches only on role,
117
+ name, and ordinal, so it is not a stable business identity.
118
+
119
+ ## Relationship To BOSS Refs
120
+
121
+ | Ref family | Produced by | Consumed by | Meaning |
122
+ | --- | --- | --- | --- |
123
+ | `@eN` | `browser_snapshot` | `click_ref`, `type_ref` | Generic AX element handle for the current page snapshot. |
124
+ | `@cN` | `zhipin_get_candidate_list` | `zhipin_say_hello`, `zhipin_open_resume` | BOSS recommendation candidate handle. |
125
+ | `@jN` | `zhipin_list_recommend_jobs` | `zhipin_select_recommend_job` | BOSS recommendation job-filter handle. |
126
+
127
+ Do not pass one ref family into another tool family.
128
+
129
+ ## Boundary Conditions
130
+
131
+ - This is an Accessibility Tree snapshot, not a full HTML dump, screenshot, network log, or page state database.
132
+ - DOM-action refs are intentionally narrow: short visible non-semantic elements with click hints such as
133
+ `cursor:pointer`, `onclick`, `tabIndex`, or nearby class names like `filter`, `tab`, `menu`, `dropdown`,
134
+ `button`, or `toggle`. Plain article text is not exposed as clickable.
135
+ - DOM-action augmentation is collected from the active document and same-target iframe execution contexts.
136
+ Non-semantic iframe controls can be promoted when they are visible short-label `span`/`div`/`li`-style
137
+ elements with action hints and Chrome can map their marker attributes back to `backendNodeId`.
138
+ - Composite dropdown option rows such as `li.company-item` are promoted by using their visible descendant
139
+ text only inside dropdown/menu/select/option contexts, so large page containers are still filtered out.
140
+ - Canvas, image-map hotspots, non-accessible custom widgets, and deeply nested iframe/Shadow DOM flows may not expose
141
+ enough AX semantics for reliable operation.
142
+ - Same-target iframe refs are recursively inlined while Chrome's normal page-scoped CDP session can
143
+ resolve child `frameId` values. The recursion stops at `maxNodes`, at frames skipped due to errors, or at repeated
144
+ frame IDs. Cross-target/OOPIF iframe traversal is not implemented here because it requires flattened
145
+ CDP `sessionId` routing, which this native controller intentionally does not expose yet.
146
+ - Only click and text input are covered. Drag, hover, keyboard shortcuts, file upload, and complex gestures still need
147
+ dedicated tools.
148
+ - The implementation does not call `Runtime.enable()` for the native CDP path. Fallback matching may use
149
+ `Runtime.evaluate`, but it does not enable the Runtime domain. This avoids that specific detection point, not every
150
+ possible anti-automation signal.
151
+ - `domainAllowlist`, `maxSnapshotNodes`, and `actionPolicy` from `BROWSER_SECURITY_JSON` still apply.