illuma-agents 1.0.45 → 1.0.47

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,574 +0,0 @@
1
- import { z } from 'zod';
2
- import { tool, DynamicStructuredTool } from '@langchain/core/tools';
3
-
/**
 * Canonical wire names of every desktop tool.
 *
 * This table must stay in sync with the Ranger Desktop Electron app: the
 * tools listed here run locally inside that app, NOT on the server.
 * Declaration order is significant because `Object.values` preserves it.
 */
export const EDesktopTools = {
  // Screen, pointer and keyboard primitives
  SCREENSHOT: 'computer_screenshot',
  CLICK: 'computer_click',
  DOUBLE_CLICK: 'computer_double_click',
  RIGHT_CLICK: 'computer_right_click',
  TYPE: 'computer_type',
  KEY: 'computer_key',
  KEY_COMBO: 'computer_key_combo',
  SCROLL: 'computer_scroll',
  DRAG: 'computer_drag',
  GET_ACTIVE_WINDOW: 'computer_get_active_window',
  GET_MOUSE_POSITION: 'computer_get_mouse_position',

  // Clipboard access
  CLIPBOARD_READ: 'clipboard_read',
  CLIPBOARD_WRITE: 'clipboard_write',
  CLIPBOARD_PASTE: 'clipboard_paste',

  // Timing
  WAIT: 'computer_wait',

  // Native UI Automation tools (Windows) - faster and more reliable than screenshot-based
  UI_FIND_ELEMENT: 'ui_find_element',
  UI_CLICK_ELEMENT: 'ui_click_element',
  UI_GET_WINDOW_TREE: 'ui_get_window_tree',
  UI_FIND_BUTTONS: 'ui_find_buttons',
  UI_FIND_INPUTS: 'ui_find_inputs',
} as const;

/** Shape of the name table, used only to derive the union below. */
type DesktopToolTable = typeof EDesktopTools;

/** Union of every tool-name string literal declared in `EDesktopTools`. */
export type DesktopToolName = DesktopToolTable[keyof DesktopToolTable];
34
-
/**
 * Hook that resolves with the outcome of one desktop action.
 *
 * The server (Ranger) may supply an implementation that holds the tool call
 * open until the Electron app has POSTed its result back, so the LLM receives
 * the real outcome rather than a placeholder.
 *
 * @param action - Name of the desktop action (click, type, screenshot, etc.)
 * @param args - Arguments accompanying the action
 * @param toolCallId - Unique ID of this tool call (taken from config.toolCall.id)
 * @returns Promise resolving to the result reported for the action
 */
export type DesktopToolCallback = (
  action: string,
  args: Record<string, unknown>,
  toolCallId: string
) => Promise<DesktopActionResult>;
50
-
/**
 * Outcome of executing a single desktop action.
 *
 * Only `success` is mandatory; every other field is present only when the
 * action produced that kind of information.
 */
export interface DesktopActionResult {
  /** Whether the action succeeded. */
  success: boolean;
  /** Failure description, when one was reported. */
  error?: string;

  /** Captured screen image together with its pixel dimensions. */
  screenshot?: {
    base64: string;
    width: number;
    height: number;
  };

  /** Description of the currently active window. */
  activeWindow?: {
    title: string;
    app: string;
    bounds?: { x: number; y: number; width: number; height: number };
  };

  /** Mouse cursor coordinates. */
  mousePosition?: { x: number; y: number };

  /** Clipboard text. */
  clipboard?: string;

  // ---- UI Automation results ----

  /** A single matched UI element. */
  uiElement?: {
    name: string;
    automationId: string;
    controlType: string;
    boundingRectangle?: { x: number; y: number; width: number; height: number };
    isEnabled: boolean;
  };

  /** A list of matched UI elements (no enabled flag on these entries). */
  uiElements?: {
    name: string;
    automationId: string;
    controlType: string;
    boundingRectangle?: { x: number; y: number; width: number; height: number };
  }[];

  /** UI element tree; its structure is not constrained by this type. */
  uiTree?: unknown;

  /** Generic data for extended results. */
  data?: unknown;
}
87
-
/**
 * Header names (lower-cased) that signal a connected Ranger Desktop app.
 */
const DESKTOP_CAPABILITY_HEADERS: ReadonlySet<string> = new Set([
  'x-ranger-desktop',
  'x-ranger-desktop-capable',
]);

/**
 * Check whether desktop capability is advertised by the request headers.
 *
 * The Ranger Desktop Electron app sets these headers when connected:
 * - X-Ranger-Desktop: true
 * - X-Ranger-Desktop-Capable: true
 *
 * Header names are matched case-insensitively, so both a lower-cased header
 * map and a plain object keyed `X-Ranger-Desktop` work. A header whose value
 * is an array (allowed by the parameter type) matches when any entry is
 * exactly `'true'`.
 *
 * @param req - Object that may carry a `headers` map; may be omitted
 * @returns `true` when either capability header has the value `'true'`,
 *          otherwise `false` (including when `req` or `headers` is missing)
 */
export function hasDesktopCapability(req?: {
  headers?: Record<string, string | string[] | undefined>;
}): boolean {
  const headers = req?.headers;
  if (!headers) {
    return false;
  }

  return Object.entries(headers).some(([name, value]) => {
    if (!DESKTOP_CAPABILITY_HEADERS.has(name.toLowerCase())) {
      return false;
    }
    // A repeated header may arrive as an array; any 'true' entry counts.
    return Array.isArray(value) ? value.includes('true') : value === 'true';
  });
}
106
-
// ============ Tool argument schemas ============

/** Build a fresh schema for a tool that takes no arguments. */
const noArguments = () => z.object({});

/** Build an `{ x, y }` schema whose field descriptions name the pointer action. */
const pointTo = (purpose: string) =>
  z.object({
    x: z.number().describe(`X coordinate to ${purpose}`),
    y: z.number().describe(`Y coordinate to ${purpose}`),
  });

/** Build an optional string field carrying the given description. */
const optionalText = (hint: string) => z.string().optional().describe(hint);

const ScreenshotSchema = noArguments();

const ClickSchema = pointTo('click');

const DoubleClickSchema = pointTo('double-click');

const RightClickSchema = pointTo('right-click');

const TypeSchema = z.object({
  text: z.string().describe('Text to type'),
});

const KeySchema = z.object({
  key: z
    .string()
    .describe(
      'Key to press (e.g., "Enter", "Tab", "Escape", "Backspace", "Delete", "ArrowUp", "ArrowDown", "ArrowLeft", "ArrowRight", "Home", "End", "PageUp", "PageDown", "F1"-"F12")'
    ),
});

const KeyComboSchema = z.object({
  keys: z
    .array(z.string())
    .describe(
      'Array of keys to press together (e.g., ["Control", "c"] for copy, ["Alt", "Tab"] for window switch)'
    ),
});

const ScrollSchema = z.object({
  x: z.number().describe('X coordinate to scroll at'),
  y: z.number().describe('Y coordinate to scroll at'),
  deltaX: z.number().optional().describe('Horizontal scroll amount (pixels)'),
  deltaY: z
    .number()
    .describe('Vertical scroll amount (pixels, negative = up, positive = down)'),
});

const DragSchema = z.object({
  startX: z.number().describe('Starting X coordinate'),
  startY: z.number().describe('Starting Y coordinate'),
  endX: z.number().describe('Ending X coordinate'),
  endY: z.number().describe('Ending Y coordinate'),
});

const GetActiveWindowSchema = noArguments();

const GetMousePositionSchema = noArguments();

const ClipboardReadSchema = noArguments();

const ClipboardWriteSchema = z.object({
  text: z.string().describe('Text to write to clipboard'),
});

const ClipboardPasteSchema = noArguments();

const WaitSchema = z.object({
  ms: z.number().describe('Milliseconds to wait'),
});

// ============ Native UI Automation Schemas (Windows) ============

const UIFindElementSchema = z.object({
  name: optionalText('Element name/label to find (e.g., "Submit", "OK", "File")'),
  automationId: optionalText('Automation ID of element (unique identifier)'),
  controlType: optionalText(
    'Type of control: Button, Edit, Text, ComboBox, List, Menu, MenuItem, Window, etc.'
  ),
});

const UIClickElementSchema = z.object({
  name: optionalText('Element name/label to click'),
  automationId: optionalText('Automation ID of element to click'),
  controlType: optionalText('Type of control: Button, Edit, etc.'),
  clickType: z
    .enum(['left', 'right', 'double'])
    .optional()
    .describe('Click type (default: left)'),
});

const UIGetWindowTreeSchema = z.object({
  maxDepth: z
    .number()
    .optional()
    .describe('Maximum depth to traverse (default: 3)'),
});

const UIFindButtonsSchema = noArguments();

const UIFindInputsSchema = noArguments();
197
-
/**
 * Marker describing a desktop action that still has to be executed.
 *
 * `createDesktopTools` serializes an object of this shape and returns it as
 * the tool output when no result callback is configured, so the Electron app
 * can pick the action up and run it.
 */
export interface DesktopToolResponse {
  /** Always `true`; identifies the payload as a desktop-execution marker. */
  requiresDesktopExecution: true;
  /** Name of the desktop action to perform. */
  action: string;
  /** Arguments for the action, passed through as received by the tool. */
  args: Record<string, unknown>;
  /** Identifier of the tool call this marker belongs to. */
  toolCallId?: string;
}
208
-
/**
 * Settings accepted by `createDesktopTools`.
 */
export interface CreateDesktopToolsOptions {
  /**
   * Optional hook that waits for the outcome of a desktop action.
   *
   * - Provided: each tool awaits it and reports the actual result from the
   *   Electron app.
   * - Omitted: each tool returns an execution marker immediately (for
   *   non-server contexts).
   */
  waitForResult?: DesktopToolCallback;
}
220
-
/**
 * Format a desktop action result as text for LLM consumption.
 *
 * A result with `success: false` always yields a failure message, with a
 * placeholder reason when no error text was supplied. Otherwise one section
 * is emitted per populated field, and the sections are joined with real
 * newline characters. List and JSON output is capped (20 elements, 3000
 * characters for the UI tree, 2000 for generic data) to bound the size of the
 * text handed to the model.
 *
 * @param result - Result reported for the action
 * @param action - Name of the action, used in the failure and fallback messages
 * @returns Multi-line description of the result
 */
function formatResultForLLM(
  result: DesktopActionResult,
  action: string
): string {
  if (!result.success) {
    // `||` rather than `??` so an empty error string also gets the placeholder.
    return `Desktop action "${action}" failed: ${result.error || 'unknown error'}`;
  }

  const maxListedElements = 20;
  const maxTreeChars = 3000;
  const maxDataChars = 2000;
  const parts: string[] = [];

  if (result.screenshot) {
    // Only the dimensions are described here; the base64 image is handled
    // separately by the message formatter.
    parts.push(
      `Screenshot captured (${result.screenshot.width}x${result.screenshot.height})`
    );
  }

  if (result.activeWindow) {
    const win = result.activeWindow;
    parts.push(`**Active Window:**`);
    parts.push(` - Title: ${win.title}`);
    parts.push(` - App: ${win.app}`);
    if (win.bounds) {
      parts.push(` - Position: (${win.bounds.x}, ${win.bounds.y})`);
      parts.push(` - Size: ${win.bounds.width}x${win.bounds.height}`);
    }
  }

  if (result.mousePosition) {
    parts.push(
      `**Mouse Position:** (${result.mousePosition.x}, ${result.mousePosition.y})`
    );
  }

  // Compared against undefined so that an empty clipboard is still reported.
  if (result.clipboard !== undefined) {
    parts.push(`**Clipboard Content:** ${result.clipboard}`);
  }

  // UI Automation results
  if (result.uiElement) {
    const el = result.uiElement;
    parts.push(`**UI Element Found:**`);
    parts.push(` - Name: "${el.name}"`);
    parts.push(` - AutomationId: "${el.automationId}"`);
    parts.push(` - Type: ${el.controlType}`);
    if (el.boundingRectangle) {
      const b = el.boundingRectangle;
      parts.push(` - Bounds: (${b.x}, ${b.y}) ${b.width}x${b.height}`);
      // The centre point is what a coordinate-based click would target.
      parts.push(
        ` - Center: (${Math.round(b.x + b.width / 2)}, ${Math.round(b.y + b.height / 2)})`
      );
    }
    parts.push(` - Enabled: ${el.isEnabled}`);
  }

  if (result.uiElements && result.uiElements.length > 0) {
    const total = result.uiElements.length;
    parts.push(`**UI Elements Found (${total}):**`);
    for (const el of result.uiElements.slice(0, maxListedElements)) {
      const id = el.automationId ? ` (id: ${el.automationId})` : '';
      const bounds = el.boundingRectangle
        ? ` at (${el.boundingRectangle.x}, ${el.boundingRectangle.y})`
        : '';
      parts.push(` - [${el.controlType}] "${el.name}"${id}${bounds}`);
    }
    if (total > maxListedElements) {
      parts.push(` ... and ${total - maxListedElements} more`);
    }
  }

  if (result.uiTree) {
    parts.push(`**UI Tree:**`);
    parts.push('```json');
    parts.push(JSON.stringify(result.uiTree, null, 2).slice(0, maxTreeChars));
    parts.push('```');
  }

  // Generic data is shown only when no UI Automation field was populated.
  if (result.data && !result.uiElement && !result.uiElements && !result.uiTree) {
    parts.push(`**Result:**`);
    parts.push(JSON.stringify(result.data, null, 2).slice(0, maxDataChars));
  }

  if (parts.length === 0) {
    parts.push(`Desktop action "${action}" completed successfully.`);
  }

  return parts.join('\n');
}
308
-
/**
 * Build the full set of desktop automation tools for the agent.
 *
 * These tools let the AI control the user's desktop while Ranger Desktop is
 * running. Every tool shares one handler shape: without a `waitForResult`
 * callback it returns a JSON execution marker; with one it awaits the
 * callback and returns the formatted result.
 *
 * @param options - Optional settings; see `CreateDesktopToolsOptions`
 * @returns The tools, in a fixed order (screen/pointer/keyboard, clipboard,
 *          wait, then native UI Automation)
 */
export function createDesktopTools(
  options: CreateDesktopToolsOptions = {}
): DynamicStructuredTool[] {
  const awaitDesktopResult = options.waitForResult;

  /**
   * Produce the handler for one action.
   * The tool call ID comes from the RunnableConfig LangChain passes in; a
   * timestamp-plus-random fallback is generated when it is absent.
   */
  const handlerFor =
    (action: string) =>
    async (
      args: Record<string, unknown>,
      config?: { toolCall?: { id?: string } }
    ): Promise<string> => {
      const toolCallId =
        config?.toolCall?.id ??
        `desktop_${Date.now()}_${Math.random().toString(36).slice(2)}`;

      // No callback: hand back the marker right away (Electron handles it via
      // SSE interception).
      if (!awaitDesktopResult) {
        const marker: DesktopToolResponse = {
          requiresDesktopExecution: true,
          action,
          args,
          toolCallId,
        };
        return JSON.stringify(marker);
      }

      // Callback present: wait for the actual outcome from the Electron app.
      // Failures are turned into text so the LLM sees them as tool output.
      try {
        const outcome = await awaitDesktopResult(action, args, toolCallId);
        return formatResultForLLM(outcome, action);
      } catch (error) {
        const reason = error instanceof Error ? error.message : String(error);
        return `Desktop action "${action}" failed: ${reason}`;
      }
    };

  return [
    // computer_screenshot
    tool(handlerFor(EDesktopTools.SCREENSHOT), {
      name: EDesktopTools.SCREENSHOT,
      description:
        'Take a screenshot of the entire screen. Use this to see what is currently displayed on the desktop.',
      schema: ScreenshotSchema,
    }),

    // computer_click
    tool(handlerFor(EDesktopTools.CLICK), {
      name: EDesktopTools.CLICK,
      description:
        'Click the mouse at the specified screen coordinates. Use screenshot first to identify the target location.',
      schema: ClickSchema,
    }),

    // computer_double_click
    tool(handlerFor(EDesktopTools.DOUBLE_CLICK), {
      name: EDesktopTools.DOUBLE_CLICK,
      description:
        'Double-click the mouse at the specified screen coordinates.',
      schema: DoubleClickSchema,
    }),

    // computer_right_click
    tool(handlerFor(EDesktopTools.RIGHT_CLICK), {
      name: EDesktopTools.RIGHT_CLICK,
      description:
        'Right-click the mouse at the specified screen coordinates to open context menus.',
      schema: RightClickSchema,
    }),

    // computer_type
    tool(handlerFor(EDesktopTools.TYPE), {
      name: EDesktopTools.TYPE,
      description:
        'Type text using the keyboard. Make sure the target input field is focused first (use click).',
      schema: TypeSchema,
    }),

    // computer_key
    tool(handlerFor(EDesktopTools.KEY), {
      name: EDesktopTools.KEY,
      description:
        'Press a single key on the keyboard (Enter, Tab, Escape, arrow keys, function keys, etc.).',
      schema: KeySchema,
    }),

    // computer_key_combo
    tool(handlerFor(EDesktopTools.KEY_COMBO), {
      name: EDesktopTools.KEY_COMBO,
      description:
        'Press a key combination (e.g., Ctrl+C to copy, Ctrl+V to paste, Alt+Tab to switch windows).',
      schema: KeyComboSchema,
    }),

    // computer_scroll
    tool(handlerFor(EDesktopTools.SCROLL), {
      name: EDesktopTools.SCROLL,
      description:
        'Scroll at the specified screen coordinates. Use negative deltaY to scroll up, positive to scroll down.',
      schema: ScrollSchema,
    }),

    // computer_drag
    tool(handlerFor(EDesktopTools.DRAG), {
      name: EDesktopTools.DRAG,
      description:
        'Drag the mouse from one position to another (for moving windows, selecting text, etc.).',
      schema: DragSchema,
    }),

    // computer_get_active_window
    tool(handlerFor(EDesktopTools.GET_ACTIVE_WINDOW), {
      name: EDesktopTools.GET_ACTIVE_WINDOW,
      description:
        'Get information about the currently active window (title, application name, position, size).',
      schema: GetActiveWindowSchema,
    }),

    // computer_get_mouse_position
    tool(handlerFor(EDesktopTools.GET_MOUSE_POSITION), {
      name: EDesktopTools.GET_MOUSE_POSITION,
      description: 'Get the current mouse cursor position on screen.',
      schema: GetMousePositionSchema,
    }),

    // clipboard_read
    tool(handlerFor(EDesktopTools.CLIPBOARD_READ), {
      name: EDesktopTools.CLIPBOARD_READ,
      description: 'Read the current contents of the system clipboard.',
      schema: ClipboardReadSchema,
    }),

    // clipboard_write
    tool(handlerFor(EDesktopTools.CLIPBOARD_WRITE), {
      name: EDesktopTools.CLIPBOARD_WRITE,
      description: 'Write text to the system clipboard.',
      schema: ClipboardWriteSchema,
    }),

    // clipboard_paste
    tool(handlerFor(EDesktopTools.CLIPBOARD_PASTE), {
      name: EDesktopTools.CLIPBOARD_PASTE,
      description:
        'Paste the clipboard contents (equivalent to Ctrl+V). Use clipboard_write first to set the content.',
      schema: ClipboardPasteSchema,
    }),

    // computer_wait
    tool(handlerFor(EDesktopTools.WAIT), {
      name: EDesktopTools.WAIT,
      description:
        'Wait for the specified number of milliseconds. Use this to wait for UI animations or loading.',
      schema: WaitSchema,
    }),

    // ============ Native UI Automation Tools (Windows) ============
    // These are FASTER and MORE RELIABLE than screenshot-based automation.
    // They find elements by semantic properties (name, automationId, type)
    // instead of relying on pixel coordinates from screenshots.

    // ui_find_element
    tool(handlerFor(EDesktopTools.UI_FIND_ELEMENT), {
      name: EDesktopTools.UI_FIND_ELEMENT,
      description:
        '🚀 PREFERRED: Find a UI element by semantic properties (name, automationId, controlType). MUCH FASTER than screenshot analysis. Returns element bounds for clicking. Windows only.',
      schema: UIFindElementSchema,
    }),

    // ui_click_element
    tool(handlerFor(EDesktopTools.UI_CLICK_ELEMENT), {
      name: EDesktopTools.UI_CLICK_ELEMENT,
      description:
        '🚀 PREFERRED: Find and click a UI element by name/automationId. More reliable than coordinate-based clicking. Example: ui_click_element({name: "OK"}) or ui_click_element({controlType: "Button", name: "Submit"}). Windows only.',
      schema: UIClickElementSchema,
    }),

    // ui_get_window_tree
    tool(handlerFor(EDesktopTools.UI_GET_WINDOW_TREE), {
      name: EDesktopTools.UI_GET_WINDOW_TREE,
      description:
        'Get the UI element tree of the active window. Shows all buttons, inputs, menus, etc. with their names and automationIds. Use this to discover elements before clicking. Windows only.',
      schema: UIGetWindowTreeSchema,
    }),

    // ui_find_buttons
    tool(handlerFor(EDesktopTools.UI_FIND_BUTTONS), {
      name: EDesktopTools.UI_FIND_BUTTONS,
      description:
        'Find all clickable buttons in the active window. Returns list with names and positions. Useful for discovering available actions. Windows only.',
      schema: UIFindButtonsSchema,
    }),

    // ui_find_inputs
    tool(handlerFor(EDesktopTools.UI_FIND_INPUTS), {
      name: EDesktopTools.UI_FIND_INPUTS,
      description:
        'Find all text input fields in the active window. Returns list with names and positions. Useful for discovering form fields. Windows only.',
      schema: UIFindInputsSchema,
    }),
  ];
}
561
-
/**
 * List the name of every desktop tool.
 *
 * @returns A new array holding each value of `EDesktopTools`, in declaration order
 */
export function getDesktopToolNames(): DesktopToolName[] {
  const names: DesktopToolName[] = Object.values(EDesktopTools);
  return names;
}
568
-
/**
 * Type guard telling whether a tool name belongs to the desktop tool set.
 *
 * @param name - Tool name to test
 * @returns `true` (narrowing `name` to `DesktopToolName`) when `name` equals
 *          one of the values of `EDesktopTools`
 */
export function isDesktopTool(name: string): name is DesktopToolName {
  const knownNames: readonly string[] = Object.values(EDesktopTools);
  return knownNames.some((known) => known === name);
}