illuma-agents 1.0.23 → 1.0.25

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,650 +0,0 @@
1
- /**
2
- * Browser Automation Tools for Ranger Browser Extension
3
- *
4
- * These tools allow the LLM to interact with the browser through the
5
- * ranger-browser extension. They generate structured actions that are
6
- * sent to the extension via SSE streaming for execution.
7
- *
8
- * The extension handles:
9
- * - DOM extraction with element indexing
10
- * - Click, type, hover, scroll actions
11
- * - Navigation and page context
12
- * - Visual element highlighting
13
- */
14
-
15
- import { z } from 'zod';
16
- import { tool, DynamicStructuredTool } from '@langchain/core/tools';
17
-
18
- // ============================================
19
- // Tool Schemas
20
- // ============================================
21
-
22
/**
 * Input schema for the click tool.
 *
 * An element can be targeted by its `index` in the page context, by viewport
 * `coordinates`, or both. Every field is optional at the schema level.
 */
const BrowserClickSchema = z.object({
  index: z
    .number()
    .optional()
    .describe(
      [
        'The index of the element to click, as shown in the page context (e.g., [0], [1], [2]). ',
        'Use the element index from the interactive elements list provided in the page context. ',
        'Either index OR coordinates must be provided.',
      ].join('')
    ),
  coordinates: z
    .object({
      x: z.number().describe('X coordinate in viewport pixels'),
      y: z.number().describe('Y coordinate in viewport pixels'),
    })
    .optional()
    .describe(
      [
        'Coordinates for clicking elements that lack semantic info (marked with ⚠️). ',
        'The coordinates are provided in the element listing as coords:(x,y). ',
        'Either index OR coordinates must be provided.',
      ].join('')
    ),
  visualDescription: z
    .string()
    .optional()
    .describe(
      [
        'Description of what the element looks like visually. Used when clicking by appearance ',
        '(e.g., "blue button in top right corner", "hamburger menu icon")',
      ].join('')
    ),
  reason: z
    .string()
    .optional()
    .describe('Brief explanation of why you are clicking this element (for user transparency)'),
});
47
-
48
/**
 * Input schema for the type tool: a required target `index` and `text`,
 * plus optional `clear` and `pressEnter` switches.
 */
const BrowserTypeSchema = z.object({
  index: z
    .number()
    .describe('The index of the input element to type into, as shown in the page context'),
  text: z.string().describe('The text to type into the input field'),
  clear: z
    .boolean()
    .optional()
    .describe('Whether to clear the existing content before typing (default: false)'),
  pressEnter: z
    .boolean()
    .optional()
    .describe('Whether to press Enter after typing (useful for search fields, default: false)'),
});
62
-
63
/**
 * Input schema for the navigate tool: a required `url` and an optional `reason`.
 */
const BrowserNavigateSchema = z.object({
  url: z
    .string()
    .describe('The URL to navigate to. Can be a full URL or a relative path.'),
  reason: z
    .string()
    .optional()
    .describe('Brief explanation of why you are navigating to this URL'),
});
71
-
72
/**
 * Input schema for the scroll tool: one of four directions and an optional
 * pixel `amount`.
 */
const BrowserScrollSchema = z.object({
  direction: z
    .enum(['up', 'down', 'left', 'right'])
    .describe('The direction to scroll'),
  amount: z
    .number()
    .optional()
    .describe('The amount to scroll in pixels (default: 500)'),
});
80
-
81
/**
 * Input schema for the extract tool. Both the free-text `query` and the CSS
 * `selector` are optional.
 */
const BrowserExtractSchema = z.object({
  query: z
    .string()
    .optional()
    .describe(
      'Optional query to filter extracted content. If provided, only content related to the query will be extracted.'
    ),
  selector: z
    .string()
    .optional()
    .describe('Optional CSS selector to extract content from a specific element'),
});
89
-
90
/**
 * Input schema for the hover tool: the required `index` of the element to hover.
 */
const BrowserHoverSchema = z.object({
  index: z
    .number()
    .describe('The index of the element to hover over, as shown in the page context'),
});
95
-
96
/**
 * Input schema for the wait tool: optional `duration` in milliseconds and an
 * optional `reason`.
 */
const BrowserWaitSchema = z.object({
  duration: z
    .number()
    .optional()
    .describe('Duration to wait in milliseconds (default: 1000)'),
  reason: z
    .string()
    .optional()
    .describe('Why we are waiting (e.g., "for page to load", "for animation to complete")'),
});
104
-
105
/**
 * Input schema for the go-back tool: only an optional `reason`.
 */
const BrowserGoBackSchema = z.object({
  reason: z
    .string()
    .optional()
    .describe('Brief explanation of why you are going back'),
});
110
-
111
/**
 * Input schema for the screenshot tool: optional `fullPage` switch and an
 * optional `reason`.
 */
const BrowserScreenshotSchema = z.object({
  fullPage: z
    .boolean()
    .optional()
    .describe('Whether to capture the full page or just the viewport (default: viewport only)'),
  reason: z
    .string()
    .optional()
    .describe(
      'Why you need a screenshot (e.g., "to identify visual elements", "to analyze page layout")'
    ),
});
119
-
120
/**
 * Input schema for the get-page-state tool: only an optional `reason`.
 */
const BrowserGetPageStateSchema = z.object({
  reason: z
    .string()
    .optional()
    .describe(
      'Why you need fresh page state (e.g., "after navigation", "to see updated elements")'
    ),
});
125
-
126
- // ============================================
127
- // Tool Implementations
128
- // ============================================
129
-
130
/**
 * Builds the click tool.
 *
 * The handler never touches a browser. It returns a JSON string describing a
 * `click` action, flagged with `requiresBrowserExecution`, for the browser
 * extension to carry out. The target may be an element index, viewport
 * coordinates, or both; when neither is supplied a JSON error payload is
 * returned instead.
 *
 * @returns A structured tool named `EBrowserTools.CLICK`.
 */
export function createBrowserClickTool(): DynamicStructuredTool<typeof BrowserClickSchema> {
  const description = `Click an interactive element on the current page.

**Two ways to target elements:**

1. **By index (preferred)**: Use the element's index number from the interactive elements list
- Format: [index] {semantic role} <tag>text</tag>
- Example: browser_click({ index: 5 }) to click element [5]

2. **By coordinates (vision fallback)**: For elements marked with ⚠️ that lack semantic info
- Use the coords:(x,y) shown after the ⚠️ marker
- Example: browser_click({ coordinates: { x: 150, y: 200 } })

**When to use coordinates:**
- Elements marked with ⚠️ have poor semantic understanding
- Icon-only buttons without labels
- Custom canvas/SVG elements
- When you identify an element visually in a screenshot

Example: If element shows \`[12] {button} <div>⚠️ [left side, small, clickable] coords:(45,120)\`
Use either: browser_click({ index: 12 }) or browser_click({ coordinates: { x: 45, y: 120 } })`;

  return tool<typeof BrowserClickSchema>(
    async ({ index, coordinates, visualDescription, reason }) => {
      // At least one targeting method is required.
      const hasTarget = index !== undefined || Boolean(coordinates);
      if (!hasTarget) {
        return JSON.stringify({
          type: 'error',
          error: 'Either index or coordinates must be provided to click an element',
        });
      }

      // Keys are added in a fixed order so the serialised payload is stable;
      // optional targeting fields are only present when they were supplied.
      const action: Record<string, unknown> = { type: 'click' };
      if (index !== undefined) {
        action.index = index;
      }
      if (coordinates) {
        action.coordinates = coordinates;
      }
      if (visualDescription) {
        action.visualDescription = visualDescription;
      }
      action.reason = reason;

      // The click itself is executed by the browser extension, not here.
      return JSON.stringify({
        type: 'browser_action',
        action,
        requiresBrowserExecution: true,
      });
    },
    {
      name: EBrowserTools.CLICK,
      description,
      schema: BrowserClickSchema,
    }
  );
}
186
-
187
/**
 * Builds the type tool.
 *
 * The handler returns a JSON string describing a `type` action for the given
 * element index and text. `clear` and `pressEnter` fall back to `false` when
 * omitted.
 *
 * @returns A structured tool named `EBrowserTools.TYPE`.
 */
export function createBrowserTypeTool(): DynamicStructuredTool<typeof BrowserTypeSchema> {
  const description = `Type text into an input field on the current page.

Use this tool when you need to:
- Fill in a text input or textarea
- Enter a search query
- Fill out form fields

The element index comes from the page context's interactive elements list.
Set 'clear: true' to clear existing content before typing.
Set 'pressEnter: true' to submit after typing (useful for search fields).

Example: To type "hello world" into a search field shown as "[2]<input>Search...</input>",
use index: 2, text: "hello world"`;

  return tool<typeof BrowserTypeSchema>(
    async ({ index, text, clear, pressEnter }) => {
      const action = {
        type: 'type',
        index,
        text,
        clear: clear ?? false,
        pressEnter: pressEnter ?? false,
      };
      const payload = {
        type: 'browser_action',
        action,
        requiresBrowserExecution: true,
      };
      return JSON.stringify(payload);
    },
    {
      name: EBrowserTools.TYPE,
      description,
      schema: BrowserTypeSchema,
    }
  );
}
224
-
225
/**
 * Builds the navigate tool.
 *
 * The handler returns a JSON string describing a `navigate` action carrying
 * the target `url` and the optional `reason`.
 *
 * @returns A structured tool named `EBrowserTools.NAVIGATE`.
 */
export function createBrowserNavigateTool(): DynamicStructuredTool<typeof BrowserNavigateSchema> {
  const description = `Navigate to a specific URL in the browser.

Use this tool when you need to:
- Go to a specific website
- Navigate to a different page
- Open a new URL

**IMPORTANT**: After calling browser_navigate, you MUST call browser_get_page_state
before using browser_click or browser_type. This is because navigation changes the page,
and you need to see the new page's elements before you can interact with them.

Provide the full URL including the protocol (https://).

**Correct workflow**:
1. browser_navigate({ url: "https://www.amazon.com" })
2. browser_get_page_state({ reason: "see elements on Amazon" })
3. Now find the search input's [index] in the returned state
4. browser_type({ index: <search_input_index>, text: "query", pressEnter: true })

Example: browser_navigate({ url: "https://www.google.com" })`;

  return tool<typeof BrowserNavigateSchema>(
    async ({ url, reason }) => {
      const action = { type: 'navigate', url, reason };
      const payload = {
        type: 'browser_action',
        action,
        requiresBrowserExecution: true,
      };
      return JSON.stringify(payload);
    },
    {
      name: EBrowserTools.NAVIGATE,
      description,
      schema: BrowserNavigateSchema,
    }
  );
}
267
-
268
/**
 * Builds the scroll tool.
 *
 * The handler returns a JSON string describing a `scroll` action. The
 * direction and amount are nested under a `scroll` key, and the amount falls
 * back to 500 when omitted.
 *
 * @returns A structured tool named `EBrowserTools.SCROLL`.
 */
export function createBrowserScrollTool(): DynamicStructuredTool<typeof BrowserScrollSchema> {
  const description = `Scroll the current page in a specified direction.

Use this tool when you need to:
- See more content on the page
- Scroll to find elements not currently visible
- Navigate long pages

Default scroll amount is 500 pixels. Adjust as needed.

Example: browser_scroll({ direction: "down", amount: 800 })`;

  return tool<typeof BrowserScrollSchema>(
    async ({ direction, amount }) => {
      const scroll = { direction, amount: amount ?? 500 };
      const payload = {
        type: 'browser_action',
        action: { type: 'scroll', scroll },
        requiresBrowserExecution: true,
      };
      return JSON.stringify(payload);
    },
    {
      name: EBrowserTools.SCROLL,
      description,
      schema: BrowserScrollSchema,
    }
  );
}
302
-
303
/**
 * Builds the extract tool.
 *
 * The handler returns a JSON string describing an `extract` action with the
 * optional `query` and `selector` passed through unchanged.
 *
 * @returns A structured tool named `EBrowserTools.EXTRACT`.
 */
export function createBrowserExtractTool(): DynamicStructuredTool<typeof BrowserExtractSchema> {
  const description = `Extract text content from the current page.

Use this tool when you need to:
- Get specific information from the page
- Extract text that matches a query
- Read content from a specific element

If no query or selector is provided, extracts the main page content.
Use a CSS selector to extract from a specific element.
Use a query to filter for relevant content.

Example: browser_extract({ query: "price" }) - extracts content related to pricing`;

  return tool<typeof BrowserExtractSchema>(
    async ({ query, selector }) => {
      const action = { type: 'extract', query, selector };
      const payload = {
        type: 'browser_action',
        action,
        requiresBrowserExecution: true,
      };
      return JSON.stringify(payload);
    },
    {
      name: EBrowserTools.EXTRACT,
      description,
      schema: BrowserExtractSchema,
    }
  );
}
337
-
338
/**
 * Builds the hover tool.
 *
 * The handler returns a JSON string describing a `hover` action for the given
 * element index.
 *
 * @returns A structured tool named `EBrowserTools.HOVER`.
 */
export function createBrowserHoverTool(): DynamicStructuredTool<typeof BrowserHoverSchema> {
  const description = `Hover over an element to reveal tooltips or dropdown menus.

Use this tool when you need to:
- Reveal a dropdown menu
- Show a tooltip
- Trigger hover effects

Example: browser_hover({ index: 3 }) - hovers over element at index 3`;

  return tool<typeof BrowserHoverSchema>(
    async ({ index }) => {
      const payload = {
        type: 'browser_action',
        action: { type: 'hover', index },
        requiresBrowserExecution: true,
      };
      return JSON.stringify(payload);
    },
    {
      name: EBrowserTools.HOVER,
      description,
      schema: BrowserHoverSchema,
    }
  );
}
367
-
368
/**
 * Builds the wait tool.
 *
 * The handler does not sleep. It returns a JSON string describing a `wait`
 * action whose `duration` falls back to 1000 when omitted, together with the
 * optional `reason`.
 *
 * @returns A structured tool named `EBrowserTools.WAIT`.
 */
export function createBrowserWaitTool(): DynamicStructuredTool<typeof BrowserWaitSchema> {
  const description = `Wait for a specified duration before the next action.

Use this tool when you need to:
- Wait for a page to load
- Wait for an animation to complete
- Add delay between actions

Default wait time is 1000ms (1 second).

Example: browser_wait({ duration: 2000, reason: "waiting for page to load" })`;

  return tool<typeof BrowserWaitSchema>(
    async ({ duration, reason }) => {
      const action = { type: 'wait', duration: duration ?? 1000, reason };
      const payload = {
        type: 'browser_action',
        action,
        requiresBrowserExecution: true,
      };
      return JSON.stringify(payload);
    },
    {
      name: EBrowserTools.WAIT,
      description,
      schema: BrowserWaitSchema,
    }
  );
}
400
-
401
/**
 * Builds the go-back tool.
 *
 * The handler returns a JSON string describing a `back` action with the
 * optional `reason`.
 *
 * @returns A structured tool named `EBrowserTools.BACK`.
 */
export function createBrowserGoBackTool(): DynamicStructuredTool<typeof BrowserGoBackSchema> {
  const description = `Navigate back to the previous page in browser history.

Use this tool when you need to:
- Return to a previous page
- Undo a navigation

Example: browser_back({ reason: "returning to search results" })`;

  return tool<typeof BrowserGoBackSchema>(
    async ({ reason }) => {
      const payload = {
        type: 'browser_action',
        action: { type: 'back', reason },
        requiresBrowserExecution: true,
      };
      return JSON.stringify(payload);
    },
    {
      name: EBrowserTools.BACK,
      description,
      schema: BrowserGoBackSchema,
    }
  );
}
429
-
430
/**
 * Builds the screenshot tool.
 *
 * The handler returns a JSON string describing a `screenshot` action.
 * `fullPage` falls back to `false` (viewport only) when omitted. The optional
 * `reason` declared by the schema is forwarded in the action, as the other
 * tools with a `reason` field do; it is left out of the serialised payload
 * when the caller did not supply one.
 *
 * @returns A structured tool named `EBrowserTools.SCREENSHOT`.
 */
export function createBrowserScreenshotTool(): DynamicStructuredTool<typeof BrowserScreenshotSchema> {
  return tool<typeof BrowserScreenshotSchema>(
    async ({ fullPage, reason }) => {
      return JSON.stringify({
        type: 'browser_action',
        action: {
          type: 'screenshot',
          fullPage: fullPage ?? false,
          // Previously dropped even though the schema accepts it; an undefined
          // value is omitted by JSON.stringify, so old payloads are unchanged.
          reason,
        },
        requiresBrowserExecution: true,
      });
    },
    {
      name: EBrowserTools.SCREENSHOT,
      description: `Capture a screenshot of the current page.

Use this tool when you need to:
- Capture the current state of a page
- Document visual elements
- Verify page appearance

Set fullPage: true to capture the entire page (may be large).
Default captures only the visible viewport.

Example: browser_screenshot({ fullPage: false })`,
      schema: BrowserScreenshotSchema,
    }
  );
}
462
-
463
/**
 * Builds the get-page-state tool.
 *
 * The handler returns a JSON string describing a `get_page_state` action.
 * Besides `requiresBrowserExecution`, the payload sets
 * `requiresContextRefresh` and carries a `message` telling the agent to wait
 * for the element list before issuing click or type actions.
 *
 * @returns A structured tool named `EBrowserTools.GET_PAGE_STATE`.
 */
export function createBrowserGetPageStateTool(): DynamicStructuredTool<typeof BrowserGetPageStateSchema> {
  const description = `Get fresh page state showing current interactive elements.

**CRITICAL WORKFLOW**: After calling this tool, you MUST STOP and WAIT. The browser extension will capture the page state and return the element list. DO NOT plan any browser_click or browser_type actions in the same response - you don't have the element indices yet!

**When to use**:
- After browser_navigate (to see elements on the new page)
- After browser_click (if it caused navigation or page changes)
- Any time you need to see what elements are currently on the page

**IMPORTANT**: This tool captures the page state asynchronously. The actual element list will be provided AFTER this tool completes. You should:
1. Call this tool
2. STOP and wait for the response with the element list
3. In your NEXT response, use the element indices for click/type actions

Example workflow:
- Turn 1: browser_navigate to amazon.com, then browser_get_page_state
- Turn 2: (After receiving element list) browser_type with the correct search input index

Example: browser_get_page_state({ reason: "to see elements after navigation" })`;

  // Instruction returned to the agent alongside the action.
  const waitNotice =
    'Page state is being captured by the browser extension. The element list will be provided in the next message. DO NOT proceed with click or type actions until you receive the actual element list.';

  return tool<typeof BrowserGetPageStateSchema>(
    async ({ reason }) => {
      const payload = {
        type: 'browser_action',
        action: { type: 'get_page_state', reason },
        requiresBrowserExecution: true,
        // Asks the extension to inject fresh context into the conversation.
        requiresContextRefresh: true,
        message: waitNotice,
      };
      return JSON.stringify(payload);
    },
    {
      name: EBrowserTools.GET_PAGE_STATE,
      description,
      schema: BrowserGetPageStateSchema,
    }
  );
}
508
-
509
- // ============================================
510
- // Tool Collection
511
- // ============================================
512
-
513
/**
 * Per-tool switches accepted by `createBrowserTools`.
 *
 * Every switch is optional; a switch that is left out is treated as enabled.
 */
export type BrowserToolsConfig = {
  /** Include the click tool. */
  enableClick?: boolean;
  /** Include the type tool. */
  enableType?: boolean;
  /** Include the navigate tool. */
  enableNavigate?: boolean;
  /** Include the scroll tool. */
  enableScroll?: boolean;
  /** Include the extract tool. */
  enableExtract?: boolean;
  /** Include the hover tool. */
  enableHover?: boolean;
  /** Include the wait tool. */
  enableWait?: boolean;
  /** Include the go-back tool. */
  enableBack?: boolean;
  /** Include the screenshot tool. */
  enableScreenshot?: boolean;
  /** Include the get-page-state tool. */
  enableGetPageState?: boolean;
};
535
-
536
/**
 * Creates the browser automation tools selected by `config`.
 *
 * Register these tools only when the caller can actually execute them, i.e.
 * the request comes from a browser extension or the client has declared
 * browser capability. Ordinary web UI users cannot execute browser actions
 * and must not be given these tools.
 *
 * Ways to detect capability in the Ranger API:
 * - the `X-Ranger-Browser-Extension: true` header
 * - `browserCapable: true` in the request body
 * - an extension identifier in the user agent
 *
 * @param config Per-tool switches; any switch left undefined counts as enabled.
 * @returns The enabled tools in a fixed order: click, type, navigate, scroll,
 *   extract, hover, wait, back, screenshot, get page state.
 *
 * @example
 * // In Ranger API endpoint:
 * const hasBrowserExtension = req.headers['x-ranger-browser-extension'] === 'true';
 * const tools = hasBrowserExtension
 *   ? [...normalTools, ...createBrowserTools()]
 *   : normalTools;
 */
export function createBrowserTools(config: BrowserToolsConfig = {}): DynamicStructuredTool[] {
  // Only an undefined switch falls back to "enabled".
  const isOn = (flag: boolean | undefined): boolean => flag === undefined || flag;

  // Table order determines the order of the returned tools.
  const registry: ReadonlyArray<readonly [boolean | undefined, () => DynamicStructuredTool]> = [
    [config.enableClick, () => createBrowserClickTool()],
    [config.enableType, () => createBrowserTypeTool()],
    [config.enableNavigate, () => createBrowserNavigateTool()],
    [config.enableScroll, () => createBrowserScrollTool()],
    [config.enableExtract, () => createBrowserExtractTool()],
    [config.enableHover, () => createBrowserHoverTool()],
    [config.enableWait, () => createBrowserWaitTool()],
    [config.enableBack, () => createBrowserGoBackTool()],
    [config.enableScreenshot, () => createBrowserScreenshotTool()],
    [config.enableGetPageState, () => createBrowserGetPageStateTool()],
  ];

  const tools: DynamicStructuredTool[] = [];
  for (const [flag, build] of registry) {
    if (isOn(flag)) {
      tools.push(build());
    }
  }
  return tools;
}
587
-
588
/**
 * Canonical names of the browser tools.
 * Reference these rather than repeating the string literals.
 */
export const EBrowserTools = {
  CLICK: 'browser_click',
  TYPE: 'browser_type',
  NAVIGATE: 'browser_navigate',
  SCROLL: 'browser_scroll',
  EXTRACT: 'browser_extract',
  HOVER: 'browser_hover',
  WAIT: 'browser_wait',
  BACK: 'browser_back',
  SCREENSHOT: 'browser_screenshot',
  GET_PAGE_STATE: 'browser_get_page_state',
} as const;

/**
 * Every browser tool name as a readonly tuple, for filtering and identification.
 */
export const BROWSER_TOOL_NAMES = [
  EBrowserTools.CLICK,
  EBrowserTools.TYPE,
  EBrowserTools.NAVIGATE,
  EBrowserTools.SCROLL,
  EBrowserTools.EXTRACT,
  EBrowserTools.HOVER,
  EBrowserTools.WAIT,
  EBrowserTools.BACK,
  EBrowserTools.SCREENSHOT,
  EBrowserTools.GET_PAGE_STATE,
] as const;

/** Union of the string literals listed in `BROWSER_TOOL_NAMES`. */
export type BrowserToolName = (typeof BROWSER_TOOL_NAMES)[number];
622
-
623
/**
 * Type guard reporting whether `toolName` is one of the browser tool names.
 *
 * @param toolName Name of the tool being called.
 * @returns `true` when the name appears in `BROWSER_TOOL_NAMES`.
 */
export function isBrowserToolCall(toolName: string): toolName is BrowserToolName {
  // Widen the tuple to plain strings so the lookup needs no cast on the input.
  const knownNames: readonly string[] = BROWSER_TOOL_NAMES;
  return knownNames.includes(toolName);
}
629
-
630
/**
 * Reports whether the request headers declare browser-extension capability.
 * Use this to decide whether the browser tools should be registered.
 *
 * Two headers are consulted, `x-ranger-browser-extension` and
 * `x-ranger-browser-capable`; either one carrying the value `true` is enough.
 * A header value may be a single string or an array of strings. Each string is
 * also split on commas, because Node.js joins repeated non-special headers
 * into one comma-separated value, and tokens are compared after trimming and
 * lower-casing so `True` or ` true ` are accepted as well.
 *
 * @param headers Header map keyed by lower-cased header name.
 * @returns `true` when either capability header contains a `true` token.
 *
 * @example
 * // In Express middleware or endpoint:
 * if (hasBrowserCapability(req.headers)) {
 *   tools.push(...createBrowserTools());
 * }
 */
export function hasBrowserCapability(headers: Record<string, string | string[] | undefined>): boolean {
  const declaresTrue = (value: string | string[] | undefined): boolean => {
    if (value === undefined) {
      return false;
    }
    const entries = Array.isArray(value) ? value : [value];
    return entries.some((entry) =>
      entry.split(',').some((token) => token.trim().toLowerCase() === 'true')
    );
  };

  return (
    declaresTrue(headers['x-ranger-browser-extension']) ||
    declaresTrue(headers['x-ranger-browser-capable'])
  );
}