illuma-agents 1.0.27 → 1.0.29

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. package/dist/cjs/graphs/Graph.cjs +28 -2
  2. package/dist/cjs/graphs/Graph.cjs.map +1 -1
  3. package/dist/cjs/graphs/MultiAgentGraph.cjs +108 -0
  4. package/dist/cjs/graphs/MultiAgentGraph.cjs.map +1 -1
  5. package/dist/cjs/messages/format.cjs.map +1 -1
  6. package/dist/cjs/stream.cjs +27 -0
  7. package/dist/cjs/stream.cjs.map +1 -1
  8. package/dist/cjs/tools/BrowserTools.cjs +109 -32
  9. package/dist/cjs/tools/BrowserTools.cjs.map +1 -1
  10. package/dist/esm/graphs/Graph.mjs +28 -2
  11. package/dist/esm/graphs/Graph.mjs.map +1 -1
  12. package/dist/esm/graphs/MultiAgentGraph.mjs +108 -0
  13. package/dist/esm/graphs/MultiAgentGraph.mjs.map +1 -1
  14. package/dist/esm/messages/format.mjs.map +1 -1
  15. package/dist/esm/stream.mjs +27 -0
  16. package/dist/esm/stream.mjs.map +1 -1
  17. package/dist/esm/tools/BrowserTools.mjs +109 -32
  18. package/dist/esm/tools/BrowserTools.mjs.map +1 -1
  19. package/dist/types/graphs/Graph.d.ts +14 -0
  20. package/dist/types/graphs/MultiAgentGraph.d.ts +41 -0
  21. package/dist/types/messages/format.d.ts +1 -1
  22. package/dist/types/tools/BrowserTools.d.ts +4 -1
  23. package/dist/types/types/stream.d.ts +13 -0
  24. package/package.json +4 -2
  25. package/src/graphs/Graph.ts +30 -2
  26. package/src/graphs/MultiAgentGraph.ts +119 -0
  27. package/src/messages/format.ts +2 -2
  28. package/src/scripts/multi-agent-chain.ts +59 -6
  29. package/src/scripts/multi-agent-parallel-start.ts +265 -0
  30. package/src/scripts/multi-agent-parallel.ts +61 -10
  31. package/src/scripts/multi-agent-sequence.ts +6 -1
  32. package/src/scripts/parallel-asymmetric-tools-test.ts +274 -0
  33. package/src/scripts/parallel-full-metadata-test.ts +240 -0
  34. package/src/scripts/parallel-tools-test.ts +340 -0
  35. package/src/scripts/sequential-full-metadata-test.ts +197 -0
  36. package/src/scripts/single-agent-metadata-test.ts +198 -0
  37. package/src/scripts/test-thinking-handoff.ts +8 -0
  38. package/src/scripts/tools.ts +31 -11
  39. package/src/stream.ts +32 -0
  40. package/src/tools/BrowserTools.ts +424 -350
  41. package/src/tools/__tests__/BrowserTools.test.ts +263 -257
  42. package/src/types/stream.ts +15 -0
@@ -1,350 +1,424 @@
1
- import { z } from 'zod';
2
- import { tool, DynamicStructuredTool } from '@langchain/core/tools';
3
- import type * as t from '@/types';
4
-
5
- /**
6
- * Browser tool names - keep in sync with ranger-browser extension
7
- * These tools execute locally in the browser extension, NOT on the server
8
- */
9
- export const EBrowserTools = {
10
- CLICK: 'browser_click',
11
- TYPE: 'browser_type',
12
- NAVIGATE: 'browser_navigate',
13
- SCROLL: 'browser_scroll',
14
- EXTRACT: 'browser_extract',
15
- HOVER: 'browser_hover',
16
- WAIT: 'browser_wait',
17
- BACK: 'browser_back',
18
- SCREENSHOT: 'browser_screenshot',
19
- GET_PAGE_STATE: 'browser_get_page_state',
20
- } as const;
21
-
22
- export type BrowserToolName = typeof EBrowserTools[keyof typeof EBrowserTools];
23
-
24
- /**
25
- * Callback function type for waiting on browser action results
26
- * This allows the server (Ranger) to provide a callback that waits for the extension
27
- * to POST results back to the server before returning to the LLM.
28
- *
29
- * @param action - The browser action (click, type, navigate, etc.)
30
- * @param args - Arguments for the action
31
- * @param toolCallId - Unique ID for this tool call (from config.toolCall.id)
32
- * @returns Promise that resolves with the actual browser result (page state, etc.)
33
- */
34
- export type BrowserToolCallback = (
35
- action: string,
36
- args: Record<string, unknown>,
37
- toolCallId: string
38
- ) => Promise<BrowserActionResult>;
39
-
40
- /**
41
- * Result returned from browser action execution
42
- */
43
- export interface BrowserActionResult {
44
- success: boolean;
45
- url?: string;
46
- title?: string;
47
- elementList?: string; // Text-based element list
48
- error?: string;
49
- screenshot?: string; // Base64 screenshot (if requested)
50
- }
51
-
52
- /**
53
- * Check if browser capability is available based on request headers or context
54
- * The browser extension sets these headers when connected:
55
- * - X-Ranger-Browser-Extension: true
56
- * - X-Ranger-Browser-Capable: true
57
- */
58
- export function hasBrowserCapability(req?: { headers?: Record<string, string | string[] | undefined> }): boolean {
59
- if (!req?.headers) {
60
- return false;
61
- }
62
-
63
- const browserExtension = req.headers['x-ranger-browser-extension'];
64
- const browserCapable = req.headers['x-ranger-browser-capable'];
65
-
66
- return browserExtension === 'true' || browserCapable === 'true';
67
- }
68
-
69
- // Tool schemas
70
- const BrowserClickSchema = z.object({
71
- index: z.number().describe('The index number [0], [1], etc. of the element to click from the page state element list'),
72
- });
73
-
74
- const BrowserTypeSchema = z.object({
75
- index: z.number().describe('The index number of the input element to type into'),
76
- text: z.string().describe('The text to type into the element'),
77
- pressEnter: z.boolean().optional().describe('Whether to press Enter after typing (useful for search forms)'),
78
- });
79
-
80
- const BrowserNavigateSchema = z.object({
81
- url: z.string().describe('The full URL to navigate to (must include https://)'),
82
- });
83
-
84
- const BrowserScrollSchema = z.object({
85
- direction: z.enum(['up', 'down', 'left', 'right']).describe('Direction to scroll'),
86
- amount: z.number().optional().describe('Pixels to scroll (default: one viewport height)'),
87
- });
88
-
89
- const BrowserExtractSchema = z.object({
90
- query: z.string().optional().describe('Optional: specific content to extract from the page'),
91
- });
92
-
93
- const BrowserHoverSchema = z.object({
94
- index: z.number().describe('The index number of the element to hover over'),
95
- });
96
-
97
- const BrowserWaitSchema = z.object({
98
- duration: z.number().optional().describe('Milliseconds to wait (default: 1000)'),
99
- });
100
-
101
- const BrowserBackSchema = z.object({});
102
-
103
- const BrowserScreenshotSchema = z.object({});
104
-
105
- const BrowserGetPageStateSchema = z.object({});
106
-
107
- /**
108
- * Browser tool response interface
109
- * This is what the extension returns after executing the action
110
- */
111
- export interface BrowserToolResponse {
112
- requiresBrowserExecution: true;
113
- action: string;
114
- args: Record<string, unknown>;
115
- toolCallId?: string; // Added to help extension correlate with callback
116
- }
117
-
118
- /**
119
- * Options for creating browser tools
120
- */
121
- export interface CreateBrowserToolsOptions {
122
- /**
123
- * Optional callback that waits for browser action results.
124
- * When provided, tools will await this callback to get actual results from the extension.
125
- * When not provided, tools return markers immediately (for non-server contexts).
126
- */
127
- waitForResult?: BrowserToolCallback;
128
- }
129
-
130
- /**
131
- * Format browser action result for LLM consumption
132
- */
133
- function formatResultForLLM(result: BrowserActionResult, action: string): string {
134
- if (!result.success && result.error) {
135
- return `Browser action "${action}" failed: ${result.error}`;
136
- }
137
-
138
- const parts: string[] = [];
139
-
140
- if (result.url) {
141
- parts.push(`**Current URL:** ${result.url}`);
142
- }
143
- if (result.title) {
144
- parts.push(`**Page Title:** ${result.title}`);
145
- }
146
- if (result.elementList) {
147
- parts.push(`\n**Interactive Elements:**\n${result.elementList}`);
148
- }
149
- if (result.screenshot) {
150
- parts.push(`\n[Screenshot captured and displayed to user]`);
151
- }
152
-
153
- if (parts.length === 0) {
154
- return `Browser action "${action}" completed successfully.`;
155
- }
156
-
157
- return parts.join('\n');
158
- }
159
-
160
- /**
161
- * Create browser tools with optional callback for waiting on results
162
- *
163
- * When waitForResult callback is provided:
164
- * 1. Tool returns marker that triggers extension
165
- * 2. Tool then awaits callback to get actual results
166
- * 3. Returns real page state to LLM
167
- *
168
- * When no callback:
169
- * 1. Tool returns marker only (for non-server contexts)
170
- *
171
- * NOTE: These tools use TEXT-BASED element lists, NOT screenshots
172
- * Screenshots would be 100K+ tokens each - element lists are ~100 tokens
173
- */
174
- export function createBrowserTools(options?: CreateBrowserToolsOptions): DynamicStructuredTool[] {
175
- const { waitForResult } = options || {};
176
- const tools: DynamicStructuredTool[] = [];
177
-
178
- /**
179
- * Helper to create tool function that optionally waits for results
180
- * The toolCallId is extracted from the RunnableConfig passed by LangChain
181
- */
182
- const createToolFunction = (action: string) => {
183
- return async (args: Record<string, unknown>, config?: { toolCall?: { id?: string } }): Promise<string> => {
184
- const toolCallId = config?.toolCall?.id || `tool_${Date.now()}_${Math.random().toString(36).slice(2)}`;
185
-
186
- // Create marker for extension
187
- const marker: BrowserToolResponse = {
188
- requiresBrowserExecution: true,
189
- action,
190
- args,
191
- toolCallId,
192
- };
193
-
194
- // If no callback, return marker immediately (extension handles via SSE interception)
195
- if (!waitForResult) {
196
- return JSON.stringify(marker);
197
- }
198
-
199
- // With callback: wait for actual results from extension
200
- // The marker is still returned initially via SSE, but we wait for the callback
201
- try {
202
- const result = await waitForResult(action, args, toolCallId);
203
- return formatResultForLLM(result, action);
204
- } catch (error) {
205
- const errorMessage = error instanceof Error ? error.message : String(error);
206
- return `Browser action "${action}" failed: ${errorMessage}`;
207
- }
208
- };
209
- };
210
-
211
- // browser_click
212
- tools.push(
213
- tool(
214
- createToolFunction('click'),
215
- {
216
- name: EBrowserTools.CLICK,
217
- description: `Click an element on the current web page by its index number.
218
- The element list shows clickable items like: [0]<button>Submit</button> [1]<a href="/home">Home</a>
219
- Use the index number in brackets to click that element.
220
- After clicking, you receive an updated element list showing the new page state.`,
221
- schema: BrowserClickSchema,
222
- }
223
- )
224
- );
225
-
226
- // browser_type
227
- tools.push(
228
- tool(
229
- createToolFunction('type'),
230
- {
231
- name: EBrowserTools.TYPE,
232
- description: `Type text into an input element on the page.
233
- Find the input element in the list by its index (e.g., [5]<input placeholder="Search">).
234
- Set pressEnter: true to submit forms after typing.
235
- After typing, you receive an updated element list.`,
236
- schema: BrowserTypeSchema,
237
- }
238
- )
239
- );
240
-
241
- // browser_navigate
242
- tools.push(
243
- tool(
244
- createToolFunction('navigate'),
245
- {
246
- name: EBrowserTools.NAVIGATE,
247
- description: `Navigate to a URL. Always include the full URL with https://.
248
- After navigation, you receive the new page's element list.`,
249
- schema: BrowserNavigateSchema,
250
- }
251
- )
252
- );
253
-
254
- // browser_scroll
255
- tools.push(
256
- tool(
257
- createToolFunction('scroll'),
258
- {
259
- name: EBrowserTools.SCROLL,
260
- description: `Scroll the page to reveal more content.
261
- Use 'down' to scroll down, 'up' to scroll up.
262
- After scrolling, you receive an updated element list with newly visible elements.`,
263
- schema: BrowserScrollSchema,
264
- }
265
- )
266
- );
267
-
268
- // browser_extract
269
- tools.push(
270
- tool(
271
- createToolFunction('extract'),
272
- {
273
- name: EBrowserTools.EXTRACT,
274
- description: `Extract content from the current page.
275
- Returns page URL, title, and element list.`,
276
- schema: BrowserExtractSchema,
277
- }
278
- )
279
- );
280
-
281
- // browser_hover
282
- tools.push(
283
- tool(
284
- createToolFunction('hover'),
285
- {
286
- name: EBrowserTools.HOVER,
287
- description: `Hover over an element to reveal tooltips, dropdowns, or other hover-triggered content.
288
- After hovering, you receive an updated element list with any newly revealed elements.`,
289
- schema: BrowserHoverSchema,
290
- }
291
- )
292
- );
293
-
294
- // browser_wait
295
- tools.push(
296
- tool(
297
- createToolFunction('wait'),
298
- {
299
- name: EBrowserTools.WAIT,
300
- description: `Wait for a specified duration for page content to load.
301
- Use this after actions that trigger async content loading.
302
- After waiting, you receive an updated element list.`,
303
- schema: BrowserWaitSchema,
304
- }
305
- )
306
- );
307
-
308
- // browser_back
309
- tools.push(
310
- tool(
311
- createToolFunction('back'),
312
- {
313
- name: EBrowserTools.BACK,
314
- description: `Go back to the previous page in browser history.
315
- After going back, you receive the previous page's element list.`,
316
- schema: BrowserBackSchema,
317
- }
318
- )
319
- );
320
-
321
- // browser_screenshot
322
- tools.push(
323
- tool(
324
- createToolFunction('screenshot'),
325
- {
326
- name: EBrowserTools.SCREENSHOT,
327
- description: `Capture a screenshot of the current page.
328
- Returns the page state with a note that screenshot was displayed to the user.
329
- Use browser_get_page_state to get the element list for automation.`,
330
- schema: BrowserScreenshotSchema,
331
- }
332
- )
333
- );
334
-
335
- // browser_get_page_state
336
- tools.push(
337
- tool(
338
- createToolFunction('get_page_state'),
339
- {
340
- name: EBrowserTools.GET_PAGE_STATE,
341
- description: `Get the current page state including URL, title, and all interactive elements.
342
- Use this at the start of a task to see what elements are available.
343
- Returns a text list of elements with their index numbers for interaction.`,
344
- schema: BrowserGetPageStateSchema,
345
- }
346
- )
347
- );
348
-
349
- return tools;
350
- }
1
+ import { z } from 'zod';
2
+ import { tool, DynamicStructuredTool } from '@langchain/core/tools';
3
+ import type * as _t from '@/types';
4
+
5
+ /**
6
+ * Browser tool names - keep in sync with ranger-browser extension
7
+ * These tools execute locally in the browser extension, NOT on the server
8
+ */
9
+ export const EBrowserTools = {
10
+ CLICK: 'browser_click',
11
+ TYPE: 'browser_type',
12
+ NAVIGATE: 'browser_navigate',
13
+ SCROLL: 'browser_scroll',
14
+ EXTRACT: 'browser_extract',
15
+ HOVER: 'browser_hover',
16
+ WAIT: 'browser_wait',
17
+ BACK: 'browser_back',
18
+ SCREENSHOT: 'browser_screenshot',
19
+ GET_PAGE_STATE: 'browser_get_page_state',
20
+ // Skyvern-inspired additions for robust form handling
21
+ SELECT_OPTION: 'browser_select_option',
22
+ UPLOAD_FILE: 'browser_upload_file',
23
+ KEYPRESS: 'browser_keypress',
24
+ } as const;
25
+
26
+ export type BrowserToolName =
27
+ (typeof EBrowserTools)[keyof typeof EBrowserTools];
28
+
29
+ /**
30
+ * Callback function type for waiting on browser action results
31
+ * This allows the server (Ranger) to provide a callback that waits for the extension
32
+ * to POST results back to the server before returning to the LLM.
33
+ *
34
+ * @param action - The browser action (click, type, navigate, etc.)
35
+ * @param args - Arguments for the action
36
+ * @param toolCallId - Unique ID for this tool call (from config.toolCall.id)
37
+ * @returns Promise that resolves with the actual browser result (page state, etc.)
38
+ */
39
+ export type BrowserToolCallback = (
40
+ action: string,
41
+ args: Record<string, unknown>,
42
+ toolCallId: string
43
+ ) => Promise<BrowserActionResult>;
44
+
45
+ /**
46
+ * Result returned from browser action execution
47
+ */
48
+ export interface BrowserActionResult {
49
+ success: boolean;
50
+ url?: string;
51
+ title?: string;
52
+ elementList?: string; // Text-based element list
53
+ error?: string;
54
+ screenshot?: string; // Base64 screenshot (if requested)
55
+ }
56
+
57
+ /**
58
+ * Check if browser capability is available based on request headers or context
59
+ * The browser extension sets these headers when connected:
60
+ * - X-Ranger-Browser-Extension: true
61
+ * - X-Ranger-Browser-Capable: true
62
+ */
63
+ export function hasBrowserCapability(req?: {
64
+ headers?: Record<string, string | string[] | undefined>;
65
+ }): boolean {
66
+ if (!req?.headers) {
67
+ return false;
68
+ }
69
+
70
+ const browserExtension = req.headers['x-ranger-browser-extension'];
71
+ const browserCapable = req.headers['x-ranger-browser-capable'];
72
+
73
+ return browserExtension === 'true' || browserCapable === 'true';
74
+ }
75
+
76
+ // Tool schemas
77
+ const BrowserClickSchema = z.object({
78
+ index: z
79
+ .number()
80
+ .describe(
81
+ 'The index number [0], [1], etc. of the element to click from the page state element list'
82
+ ),
83
+ });
84
+
85
+ const BrowserTypeSchema = z.object({
86
+ index: z
87
+ .number()
88
+ .describe('The index number of the input element to type into'),
89
+ text: z.string().describe('The text to type into the element'),
90
+ pressEnter: z
91
+ .boolean()
92
+ .optional()
93
+ .describe('Whether to press Enter after typing (useful for search forms)'),
94
+ });
95
+
96
+ const BrowserNavigateSchema = z.object({
97
+ url: z
98
+ .string()
99
+ .describe('The full URL to navigate to (must include https://)'),
100
+ });
101
+
102
+ const BrowserScrollSchema = z.object({
103
+ direction: z
104
+ .enum(['up', 'down', 'left', 'right'])
105
+ .describe('Direction to scroll'),
106
+ amount: z
107
+ .number()
108
+ .optional()
109
+ .describe('Pixels to scroll (default: one viewport height)'),
110
+ });
111
+
112
+ const BrowserExtractSchema = z.object({
113
+ query: z
114
+ .string()
115
+ .optional()
116
+ .describe('Optional: specific content to extract from the page'),
117
+ });
118
+
119
+ const BrowserHoverSchema = z.object({
120
+ index: z.number().describe('The index number of the element to hover over'),
121
+ });
122
+
123
+ const BrowserWaitSchema = z.object({
124
+ duration: z
125
+ .number()
126
+ .optional()
127
+ .describe('Milliseconds to wait (default: 1000)'),
128
+ });
129
+
130
+ const BrowserBackSchema = z.object({});
131
+
132
+ const BrowserScreenshotSchema = z.object({});
133
+
134
+ const BrowserGetPageStateSchema = z.object({});
135
+
136
+ // Skyvern-inspired schemas for robust form handling
137
+ const BrowserSelectOptionSchema = z.object({
138
+ index: z
139
+ .number()
140
+ .describe('The index number of the select/dropdown element'),
141
+ value: z
142
+ .string()
143
+ .optional()
144
+ .describe('The value or label of the option to select. For native <select>, use the option text. For custom dropdowns, this is the option label to click.'),
145
+ });
146
+
147
+ const BrowserUploadFileSchema = z.object({
148
+ index: z
149
+ .number()
150
+ .describe('The index number of the file input element'),
151
+ fileUrl: z
152
+ .string()
153
+ .describe('URL of the file to upload (the system will download and upload it)'),
154
+ });
155
+
156
+ const BrowserKeypressSchema = z.object({
157
+ keys: z
158
+ .string()
159
+ .describe('Key(s) to press. Single key: "Enter", "Escape", "Tab", "ArrowDown". Combo: "Control+A", "Shift+Enter"'),
160
+ });
161
+
162
+ /**
163
+ * Browser tool response interface
164
+ * This is what the extension returns after executing the action
165
+ */
166
+ export interface BrowserToolResponse {
167
+ requiresBrowserExecution: true;
168
+ action: string;
169
+ args: Record<string, unknown>;
170
+ toolCallId?: string; // Added to help extension correlate with callback
171
+ }
172
+
173
+ /**
174
+ * Options for creating browser tools
175
+ */
176
+ export interface CreateBrowserToolsOptions {
177
+ /**
178
+ * Optional callback that waits for browser action results.
179
+ * When provided, tools will await this callback to get actual results from the extension.
180
+ * When not provided, tools return markers immediately (for non-server contexts).
181
+ */
182
+ waitForResult?: BrowserToolCallback;
183
+ }
184
+
185
+ /**
186
+ * Format browser action result for LLM consumption
187
+ */
188
+ function formatResultForLLM(
189
+ result: BrowserActionResult,
190
+ action: string
191
+ ): string {
192
+ if (!result.success && result.error) {
193
+ return `Browser action "${action}" failed: ${result.error}`;
194
+ }
195
+
196
+ const parts: string[] = [];
197
+
198
+ if (result.url != null && result.url !== '') {
199
+ parts.push(`**Current URL:** ${result.url}`);
200
+ }
201
+ if (result.title != null && result.title !== '') {
202
+ parts.push(`**Page Title:** ${result.title}`);
203
+ }
204
+ if (result.elementList != null && result.elementList !== '') {
205
+ parts.push(`\n**Interactive Elements:**\n${result.elementList}`);
206
+ }
207
+ if (result.screenshot != null && result.screenshot !== '') {
208
+ parts.push('\n[Screenshot captured and displayed to user]');
209
+ }
210
+
211
+ if (parts.length === 0) {
212
+ return `Browser action "${action}" completed successfully.`;
213
+ }
214
+
215
+ return parts.join('\n');
216
+ }
217
+
218
+ /**
219
+ * Create browser tools with optional callback for waiting on results
220
+ *
221
+ * When waitForResult callback is provided:
222
+ * 1. Tool returns marker that triggers extension
223
+ * 2. Tool then awaits callback to get actual results
224
+ * 3. Returns real page state to LLM
225
+ *
226
+ * When no callback:
227
+ * 1. Tool returns marker only (for non-server contexts)
228
+ *
229
+ * NOTE: These tools use TEXT-BASED element lists, NOT screenshots
230
+ * Screenshots would be 100K+ tokens each - element lists are ~100 tokens
231
+ */
232
+ export function createBrowserTools(
233
+ options?: CreateBrowserToolsOptions
234
+ ): DynamicStructuredTool[] {
235
+ const { waitForResult } = options || {};
236
+ const tools: DynamicStructuredTool[] = [];
237
+
238
+ /**
239
+ * Helper to create tool function that optionally waits for results
240
+ * The toolCallId is extracted from the RunnableConfig passed by LangChain
241
+ */
242
+ const createToolFunction = (action: string) => {
243
+ return async (
244
+ args: Record<string, unknown>,
245
+ config?: { toolCall?: { id?: string } }
246
+ ): Promise<string> => {
247
+ const toolCallId =
248
+ config?.toolCall?.id ??
249
+ `tool_${Date.now()}_${Math.random().toString(36).slice(2)}`;
250
+
251
+ // Create marker for extension
252
+ const marker: BrowserToolResponse = {
253
+ requiresBrowserExecution: true,
254
+ action,
255
+ args,
256
+ toolCallId,
257
+ };
258
+
259
+ // If no callback, return marker immediately (extension handles via SSE interception)
260
+ if (!waitForResult) {
261
+ return JSON.stringify(marker);
262
+ }
263
+
264
+ // With callback: wait for actual results from extension
265
+ // The marker is still returned initially via SSE, but we wait for the callback
266
+ try {
267
+ const result = await waitForResult(action, args, toolCallId);
268
+ return formatResultForLLM(result, action);
269
+ } catch (error) {
270
+ const errorMessage =
271
+ error instanceof Error ? error.message : String(error);
272
+ return `Browser action "${action}" failed: ${errorMessage}`;
273
+ }
274
+ };
275
+ };
276
+
277
+ // browser_click
278
+ tools.push(
279
+ tool(createToolFunction('click'), {
280
+ name: EBrowserTools.CLICK,
281
+ description: `Click an element on the current web page by its index number.
282
+ The element list shows clickable items like: [0]<button>Submit</button> [1]<a href="/home">Home</a>
283
+ Use the index number in brackets to click that element.
284
+ After clicking, you receive an updated element list showing the new page state.`,
285
+ schema: BrowserClickSchema,
286
+ })
287
+ );
288
+
289
+ // browser_type
290
+ tools.push(
291
+ tool(createToolFunction('type'), {
292
+ name: EBrowserTools.TYPE,
293
+ description: `Type text into an input element on the page.
294
+ Find the input element in the list by its index (e.g., [5]<input placeholder="Search">).
295
+ Set pressEnter: true to submit forms after typing.
296
+ After typing, you receive an updated element list.`,
297
+ schema: BrowserTypeSchema,
298
+ })
299
+ );
300
+
301
+ // browser_navigate
302
+ tools.push(
303
+ tool(createToolFunction('navigate'), {
304
+ name: EBrowserTools.NAVIGATE,
305
+ description: `Navigate to a URL. Always include the full URL with https://.
306
+ After navigation, you receive the new page's element list.`,
307
+ schema: BrowserNavigateSchema,
308
+ })
309
+ );
310
+
311
+ // browser_scroll
312
+ tools.push(
313
+ tool(createToolFunction('scroll'), {
314
+ name: EBrowserTools.SCROLL,
315
+ description: `Scroll the page to reveal more content.
316
+ Use 'down' to scroll down, 'up' to scroll up.
317
+ After scrolling, you receive an updated element list with newly visible elements.`,
318
+ schema: BrowserScrollSchema,
319
+ })
320
+ );
321
+
322
+ // browser_extract
323
+ tools.push(
324
+ tool(createToolFunction('extract'), {
325
+ name: EBrowserTools.EXTRACT,
326
+ description: `Extract content from the current page.
327
+ Returns page URL, title, and element list.`,
328
+ schema: BrowserExtractSchema,
329
+ })
330
+ );
331
+
332
+ // browser_hover
333
+ tools.push(
334
+ tool(createToolFunction('hover'), {
335
+ name: EBrowserTools.HOVER,
336
+ description: `Hover over an element to reveal tooltips, dropdowns, or other hover-triggered content.
337
+ After hovering, you receive an updated element list with any newly revealed elements.`,
338
+ schema: BrowserHoverSchema,
339
+ })
340
+ );
341
+
342
+ // browser_wait
343
+ tools.push(
344
+ tool(createToolFunction('wait'), {
345
+ name: EBrowserTools.WAIT,
346
+ description: `Wait for a specified duration for page content to load.
347
+ Use this after actions that trigger async content loading.
348
+ After waiting, you receive an updated element list.`,
349
+ schema: BrowserWaitSchema,
350
+ })
351
+ );
352
+
353
+ // browser_back
354
+ tools.push(
355
+ tool(createToolFunction('back'), {
356
+ name: EBrowserTools.BACK,
357
+ description: `Go back to the previous page in browser history.
358
+ After going back, you receive the previous page's element list.`,
359
+ schema: BrowserBackSchema,
360
+ })
361
+ );
362
+
363
+ // browser_screenshot
364
+ tools.push(
365
+ tool(createToolFunction('screenshot'), {
366
+ name: EBrowserTools.SCREENSHOT,
367
+ description: `Capture a screenshot of the current page.
368
+ Returns the page state with a note that screenshot was displayed to the user.
369
+ Use browser_get_page_state to get the element list for automation.`,
370
+ schema: BrowserScreenshotSchema,
371
+ })
372
+ );
373
+
374
+ // browser_get_page_state
375
+ tools.push(
376
+ tool(createToolFunction('get_page_state'), {
377
+ name: EBrowserTools.GET_PAGE_STATE,
378
+ description: `Get the current page state including URL, title, and all interactive elements.
379
+ Use this at the start of a task to see what elements are available.
380
+ Returns a text list of elements with their index numbers for interaction.`,
381
+ schema: BrowserGetPageStateSchema,
382
+ })
383
+ );
384
+
385
+ // browser_select_option - Skyvern-inspired for robust dropdown handling
386
+ tools.push(
387
+ tool(createToolFunction('select_option'), {
388
+ name: EBrowserTools.SELECT_OPTION,
389
+ description: `Select an option from a dropdown or select element.
390
+ For native <select> elements: finds and selects the option by value/label.
391
+ For custom dropdowns: clicks to open, then clicks the matching option.
392
+ Use this instead of click for dropdowns - it handles both native and custom selects.
393
+ After selection, you receive an updated element list.`,
394
+ schema: BrowserSelectOptionSchema,
395
+ })
396
+ );
397
+
398
+ // browser_upload_file - Skyvern-inspired for file input handling
399
+ tools.push(
400
+ tool(createToolFunction('upload_file'), {
401
+ name: EBrowserTools.UPLOAD_FILE,
402
+ description: `Upload a file to a file input element.
403
+ Provide the index of the file input and the URL of the file to upload.
404
+ The system will download the file and attach it to the input.
405
+ After upload, you receive an updated element list.`,
406
+ schema: BrowserUploadFileSchema,
407
+ })
408
+ );
409
+
410
+ // browser_keypress - For keyboard shortcuts and special keys
411
+ tools.push(
412
+ tool(createToolFunction('keypress'), {
413
+ name: EBrowserTools.KEYPRESS,
414
+ description: `Press keyboard key(s) on the page.
415
+ Single keys: "Enter", "Escape", "Tab", "ArrowDown", "ArrowUp", "Backspace", "Delete"
416
+ Key combos: "Control+A" (select all), "Control+C" (copy), "Shift+Enter" (newline)
417
+ Use this for form submission, closing modals, navigating dropdowns.
418
+ After keypress, you receive an updated element list.`,
419
+ schema: BrowserKeypressSchema,
420
+ })
421
+ );
422
+
423
+ return tools;
424
+ }