illuma-agents 1.0.43 → 1.0.45

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,11 +1,6 @@
1
1
  import { z } from 'zod';
2
2
  import { tool, DynamicStructuredTool } from '@langchain/core/tools';
3
3
 
4
- /**
5
- * Type for tool configuration passed by LangChain runtime
6
- */
7
- type ToolCallConfig = { toolCall?: { id?: string } };
8
-
9
4
  /**
10
5
  * Desktop tool names - keep in sync with Ranger Desktop Electron app
11
6
  * These tools execute locally in the Electron app, NOT on the server
@@ -26,6 +21,12 @@ export const EDesktopTools = {
26
21
  CLIPBOARD_WRITE: 'clipboard_write',
27
22
  CLIPBOARD_PASTE: 'clipboard_paste',
28
23
  WAIT: 'computer_wait',
24
+ // Native UI Automation tools (Windows) - faster and more reliable than screenshot-based
25
+ UI_FIND_ELEMENT: 'ui_find_element',
26
+ UI_CLICK_ELEMENT: 'ui_click_element',
27
+ UI_GET_WINDOW_TREE: 'ui_get_window_tree',
28
+ UI_FIND_BUTTONS: 'ui_find_buttons',
29
+ UI_FIND_INPUTS: 'ui_find_inputs',
29
30
  } as const;
30
31
 
31
32
  export type DesktopToolName =
@@ -65,6 +66,23 @@ export interface DesktopActionResult {
65
66
  };
66
67
  mousePosition?: { x: number; y: number };
67
68
  clipboard?: string;
69
+ // UI Automation results
70
+ uiElement?: {
71
+ name: string;
72
+ automationId: string;
73
+ controlType: string;
74
+ boundingRectangle?: { x: number; y: number; width: number; height: number };
75
+ isEnabled: boolean;
76
+ };
77
+ uiElements?: Array<{
78
+ name: string;
79
+ automationId: string;
80
+ controlType: string;
81
+ boundingRectangle?: { x: number; y: number; width: number; height: number };
82
+ }>;
83
+ uiTree?: unknown;
84
+ // Generic data for extended results
85
+ data?: unknown;
68
86
  }
69
87
 
70
88
  /**
@@ -154,6 +172,29 @@ const WaitSchema = z.object({
154
172
  ms: z.number().describe('Milliseconds to wait'),
155
173
  });
156
174
 
175
+ // ============ Native UI Automation Schemas (Windows) ============
176
+
177
+ const UIFindElementSchema = z.object({
178
+ name: z.string().optional().describe('Element name/label to find (e.g., "Submit", "OK", "File")'),
179
+ automationId: z.string().optional().describe('Automation ID of element (unique identifier)'),
180
+ controlType: z.string().optional().describe('Type of control: Button, Edit, Text, ComboBox, List, Menu, MenuItem, Window, etc.'),
181
+ });
182
+
183
+ const UIClickElementSchema = z.object({
184
+ name: z.string().optional().describe('Element name/label to click'),
185
+ automationId: z.string().optional().describe('Automation ID of element to click'),
186
+ controlType: z.string().optional().describe('Type of control: Button, Edit, etc.'),
187
+ clickType: z.enum(['left', 'right', 'double']).optional().describe('Click type (default: left)'),
188
+ });
189
+
190
+ const UIGetWindowTreeSchema = z.object({
191
+ maxDepth: z.number().optional().describe('Maximum depth to traverse (default: 3)'),
192
+ });
193
+
194
+ const UIFindButtonsSchema = z.object({});
195
+
196
+ const UIFindInputsSchema = z.object({});
197
+
157
198
  /**
158
199
  * Desktop tool response interface
159
200
  * This is what the Electron app returns after executing the action
@@ -218,44 +259,51 @@ function formatResultForLLM(
218
259
  parts.push(`**Clipboard Content:** ${result.clipboard}`);
219
260
  }
220
261
 
221
- if (parts.length === 0) {
222
- parts.push(`Desktop action "${action}" completed successfully.`);
262
+ // UI Automation results
263
+ if (result.uiElement) {
264
+ const el = result.uiElement;
265
+ parts.push(`**UI Element Found:**`);
266
+ parts.push(` - Name: "${el.name}"`);
267
+ parts.push(` - AutomationId: "${el.automationId}"`);
268
+ parts.push(` - Type: ${el.controlType}`);
269
+ if (el.boundingRectangle) {
270
+ const b = el.boundingRectangle;
271
+ parts.push(` - Bounds: (${b.x}, ${b.y}) ${b.width}x${b.height}`);
272
+ parts.push(` - Center: (${Math.round(b.x + b.width/2)}, ${Math.round(b.y + b.height/2)})`);
273
+ }
274
+ parts.push(` - Enabled: ${el.isEnabled}`);
223
275
  }
224
276
 
225
- return parts.join('\n');
226
- }
227
-
228
- /**
229
- * Create a tool result (either wait for callback or return marker)
230
- */
231
- async function createToolResult(
232
- action: string,
233
- args: Record<string, unknown>,
234
- config: ToolCallConfig | undefined,
235
- waitForResult?: DesktopToolCallback
236
- ): Promise<string> {
237
- const toolCallId = config?.toolCall?.id || `desktop-${Date.now()}`;
238
-
239
- if (waitForResult) {
240
- // Server context: wait for actual result from Electron app
241
- try {
242
- const result = await waitForResult(action, args, toolCallId);
243
- return formatResultForLLM(result, action);
244
- } catch (error) {
245
- const errorMessage =
246
- error instanceof Error ? error.message : String(error);
247
- return `Desktop action "${action}" failed: ${errorMessage}`;
277
+ if (result.uiElements && result.uiElements.length > 0) {
278
+ parts.push(`**UI Elements Found (${result.uiElements.length}):**`);
279
+ for (const el of result.uiElements.slice(0, 20)) { // Limit to 20
280
+ const bounds = el.boundingRectangle ?
281
+ ` at (${el.boundingRectangle.x}, ${el.boundingRectangle.y})` : '';
282
+ parts.push(` - [${el.controlType}] "${el.name}"${el.automationId ? ` (id: ${el.automationId})` : ''}${bounds}`);
283
+ }
284
+ if (result.uiElements.length > 20) {
285
+ parts.push(` ... and ${result.uiElements.length - 20} more`);
248
286
  }
249
287
  }
250
288
 
251
- // Non-server context: return marker for later processing
252
- const response: DesktopToolResponse = {
253
- requiresDesktopExecution: true,
254
- action,
255
- args,
256
- toolCallId,
257
- };
258
- return JSON.stringify(response);
289
+ if (result.uiTree) {
290
+ parts.push(`**UI Tree:**`);
291
+ parts.push('```json');
292
+ parts.push(JSON.stringify(result.uiTree, null, 2).slice(0, 3000)); // Limit size
293
+ parts.push('```');
294
+ }
295
+
296
+ if (result.data && !result.uiElement && !result.uiElements && !result.uiTree) {
297
+ // Generic data fallback
298
+ parts.push(`**Result:**`);
299
+ parts.push(JSON.stringify(result.data, null, 2).slice(0, 2000));
300
+ }
301
+
302
+ if (parts.length === 0) {
303
+ parts.push(`Desktop action "${action}" completed successfully.`);
304
+ }
305
+
306
+ return parts.join('\n'); // real newline separator; '\\n' would emit a literal backslash + "n" between sections
259
307
  }
260
308
 
261
309
  /**
@@ -266,275 +314,249 @@ export function createDesktopTools(
266
314
  options: CreateDesktopToolsOptions = {}
267
315
  ): DynamicStructuredTool[] {
268
316
  const { waitForResult } = options;
317
+ const tools: DynamicStructuredTool[] = [];
269
318
 
270
- return [
271
- // computer_screenshot
272
- tool(
273
- async (_args, config) => {
274
- return createToolResult(
275
- EDesktopTools.SCREENSHOT,
276
- {},
277
- config as ToolCallConfig,
278
- waitForResult
279
- );
280
- },
281
- {
282
- name: EDesktopTools.SCREENSHOT,
283
- description:
284
- 'Take a screenshot of the entire screen. Use this to see what is currently displayed on the desktop.',
285
- schema: ScreenshotSchema,
286
- }
287
- ),
288
-
289
- // computer_click
290
- tool(
291
- async (args, config) => {
292
- return createToolResult(
293
- EDesktopTools.CLICK,
294
- args,
295
- config as ToolCallConfig,
296
- waitForResult
297
- );
298
- },
299
- {
300
- name: EDesktopTools.CLICK,
301
- description:
302
- 'Click the mouse at the specified screen coordinates. Use screenshot first to identify the target location.',
303
- schema: ClickSchema,
304
- }
305
- ),
306
-
307
- // computer_double_click
308
- tool(
309
- async (args, config) => {
310
- return createToolResult(
311
- EDesktopTools.DOUBLE_CLICK,
312
- args,
313
- config as ToolCallConfig,
314
- waitForResult
315
- );
316
- },
317
- {
318
- name: EDesktopTools.DOUBLE_CLICK,
319
- description:
320
- 'Double-click the mouse at the specified screen coordinates.',
321
- schema: DoubleClickSchema,
322
- }
323
- ),
324
-
325
- // computer_right_click
326
- tool(
327
- async (args, config) => {
328
- return createToolResult(
329
- EDesktopTools.RIGHT_CLICK,
330
- args,
331
- config as ToolCallConfig,
332
- waitForResult
333
- );
334
- },
335
- {
336
- name: EDesktopTools.RIGHT_CLICK,
337
- description:
338
- 'Right-click the mouse at the specified screen coordinates to open context menus.',
339
- schema: RightClickSchema,
340
- }
341
- ),
342
-
343
- // computer_type
344
- tool(
345
- async (args, config) => {
346
- return createToolResult(
347
- EDesktopTools.TYPE,
348
- args,
349
- config as ToolCallConfig,
350
- waitForResult
351
- );
352
- },
353
- {
354
- name: EDesktopTools.TYPE,
355
- description:
356
- 'Type text using the keyboard. Make sure the target input field is focused first (use click).',
357
- schema: TypeSchema,
358
- }
359
- ),
360
-
361
- // computer_key
362
- tool(
363
- async (args, config) => {
364
- return createToolResult(
365
- EDesktopTools.KEY,
366
- args,
367
- config as ToolCallConfig,
368
- waitForResult
369
- );
370
- },
371
- {
372
- name: EDesktopTools.KEY,
373
- description:
374
- 'Press a single key on the keyboard (Enter, Tab, Escape, arrow keys, function keys, etc.).',
375
- schema: KeySchema,
376
- }
377
- ),
378
-
379
- // computer_key_combo
380
- tool(
381
- async (args, config) => {
382
- return createToolResult(
383
- EDesktopTools.KEY_COMBO,
384
- args,
385
- config as ToolCallConfig,
386
- waitForResult
387
- );
388
- },
389
- {
390
- name: EDesktopTools.KEY_COMBO,
391
- description:
392
- 'Press a key combination (e.g., Ctrl+C to copy, Ctrl+V to paste, Alt+Tab to switch windows).',
393
- schema: KeyComboSchema,
394
- }
395
- ),
396
-
397
- // computer_scroll
398
- tool(
399
- async (args, config) => {
400
- return createToolResult(
401
- EDesktopTools.SCROLL,
402
- args,
403
- config as ToolCallConfig,
404
- waitForResult
405
- );
406
- },
407
- {
408
- name: EDesktopTools.SCROLL,
409
- description:
410
- 'Scroll at the specified screen coordinates. Use negative deltaY to scroll up, positive to scroll down.',
411
- schema: ScrollSchema,
412
- }
413
- ),
414
-
415
- // computer_drag
416
- tool(
417
- async (args, config) => {
418
- return createToolResult(
419
- EDesktopTools.DRAG,
420
- args,
421
- config as ToolCallConfig,
422
- waitForResult
423
- );
424
- },
425
- {
426
- name: EDesktopTools.DRAG,
427
- description:
428
- 'Drag the mouse from one position to another (for moving windows, selecting text, etc.).',
429
- schema: DragSchema,
430
- }
431
- ),
432
-
433
- // computer_get_active_window
434
- tool(
435
- async (_args, config) => {
436
- return createToolResult(
437
- EDesktopTools.GET_ACTIVE_WINDOW,
438
- {},
439
- config as ToolCallConfig,
440
- waitForResult
441
- );
442
- },
443
- {
444
- name: EDesktopTools.GET_ACTIVE_WINDOW,
445
- description:
446
- 'Get information about the currently active window (title, application name, position, size).',
447
- schema: GetActiveWindowSchema,
448
- }
449
- ),
450
-
451
- // computer_get_mouse_position
452
- tool(
453
- async (_args, config) => {
454
- return createToolResult(
455
- EDesktopTools.GET_MOUSE_POSITION,
456
- {},
457
- config as ToolCallConfig,
458
- waitForResult
459
- );
460
- },
461
- {
462
- name: EDesktopTools.GET_MOUSE_POSITION,
463
- description: 'Get the current mouse cursor position on screen.',
464
- schema: GetMousePositionSchema,
465
- }
466
- ),
467
-
468
- // clipboard_read
469
- tool(
470
- async (_args, config) => {
471
- return createToolResult(
472
- EDesktopTools.CLIPBOARD_READ,
473
- {},
474
- config as ToolCallConfig,
475
- waitForResult
476
- );
477
- },
478
- {
479
- name: EDesktopTools.CLIPBOARD_READ,
480
- description: 'Read the current contents of the system clipboard.',
481
- schema: ClipboardReadSchema,
482
- }
483
- ),
484
-
485
- // clipboard_write
486
- tool(
487
- async (args, config) => {
488
- return createToolResult(
489
- EDesktopTools.CLIPBOARD_WRITE,
490
- args,
491
- config as ToolCallConfig,
492
- waitForResult
493
- );
494
- },
495
- {
496
- name: EDesktopTools.CLIPBOARD_WRITE,
497
- description: 'Write text to the system clipboard.',
498
- schema: ClipboardWriteSchema,
319
+ /**
320
+ * Helper to create tool function that optionally waits for results
321
+ * The toolCallId is extracted from the RunnableConfig passed by LangChain
322
+ */
323
+ const createToolFunction = (action: string) => {
324
+ return async (
325
+ args: Record<string, unknown>,
326
+ config?: { toolCall?: { id?: string } }
327
+ ): Promise<string> => {
328
+ const toolCallId =
329
+ config?.toolCall?.id ??
330
+ `desktop_${Date.now()}_${Math.random().toString(36).slice(2)}`;
331
+
332
+ // Create marker for Electron app
333
+ const marker: DesktopToolResponse = {
334
+ requiresDesktopExecution: true,
335
+ action,
336
+ args,
337
+ toolCallId,
338
+ };
339
+
340
+ // If no callback, return marker immediately (Electron handles via SSE interception)
341
+ if (!waitForResult) {
342
+ return JSON.stringify(marker);
499
343
  }
500
- ),
501
344
 
502
- // clipboard_paste
503
- tool(
504
- async (_args, config) => {
505
- return createToolResult(
506
- EDesktopTools.CLIPBOARD_PASTE,
507
- {},
508
- config as ToolCallConfig,
509
- waitForResult
510
- );
511
- },
512
- {
513
- name: EDesktopTools.CLIPBOARD_PASTE,
514
- description:
515
- 'Paste the clipboard contents (equivalent to Ctrl+V). Use clipboard_write first to set the content.',
516
- schema: ClipboardPasteSchema,
345
+ // With callback: wait for actual results from Electron app
346
+ try {
347
+ const result = await waitForResult(action, args, toolCallId);
348
+ return formatResultForLLM(result, action);
349
+ } catch (error) {
350
+ const errorMessage =
351
+ error instanceof Error ? error.message : String(error);
352
+ return `Desktop action "${action}" failed: ${errorMessage}`;
517
353
  }
518
- ),
354
+ };
355
+ };
519
356
 
520
- // computer_wait
521
- tool(
522
- async (args, config) => {
523
- return createToolResult(
524
- EDesktopTools.WAIT,
525
- args,
526
- config as ToolCallConfig,
527
- waitForResult
528
- );
529
- },
530
- {
531
- name: EDesktopTools.WAIT,
532
- description:
533
- 'Wait for the specified number of milliseconds. Use this to wait for UI animations or loading.',
534
- schema: WaitSchema,
535
- }
536
- ),
537
- ];
357
+ // computer_screenshot
358
+ tools.push(
359
+ tool(createToolFunction(EDesktopTools.SCREENSHOT), {
360
+ name: EDesktopTools.SCREENSHOT,
361
+ description:
362
+ 'Take a screenshot of the entire screen. Use this to see what is currently displayed on the desktop.',
363
+ schema: ScreenshotSchema,
364
+ })
365
+ );
366
+
367
+ // computer_click
368
+ tools.push(
369
+ tool(createToolFunction(EDesktopTools.CLICK), {
370
+ name: EDesktopTools.CLICK,
371
+ description:
372
+ 'Click the mouse at the specified screen coordinates. Use screenshot first to identify the target location.',
373
+ schema: ClickSchema,
374
+ })
375
+ );
376
+
377
+ // computer_double_click
378
+ tools.push(
379
+ tool(createToolFunction(EDesktopTools.DOUBLE_CLICK), {
380
+ name: EDesktopTools.DOUBLE_CLICK,
381
+ description:
382
+ 'Double-click the mouse at the specified screen coordinates.',
383
+ schema: DoubleClickSchema,
384
+ })
385
+ );
386
+
387
+ // computer_right_click
388
+ tools.push(
389
+ tool(createToolFunction(EDesktopTools.RIGHT_CLICK), {
390
+ name: EDesktopTools.RIGHT_CLICK,
391
+ description:
392
+ 'Right-click the mouse at the specified screen coordinates to open context menus.',
393
+ schema: RightClickSchema,
394
+ })
395
+ );
396
+
397
+ // computer_type
398
+ tools.push(
399
+ tool(createToolFunction(EDesktopTools.TYPE), {
400
+ name: EDesktopTools.TYPE,
401
+ description:
402
+ 'Type text using the keyboard. Make sure the target input field is focused first (use click).',
403
+ schema: TypeSchema,
404
+ })
405
+ );
406
+
407
+ // computer_key
408
+ tools.push(
409
+ tool(createToolFunction(EDesktopTools.KEY), {
410
+ name: EDesktopTools.KEY,
411
+ description:
412
+ 'Press a single key on the keyboard (Enter, Tab, Escape, arrow keys, function keys, etc.).',
413
+ schema: KeySchema,
414
+ })
415
+ );
416
+
417
+ // computer_key_combo
418
+ tools.push(
419
+ tool(createToolFunction(EDesktopTools.KEY_COMBO), {
420
+ name: EDesktopTools.KEY_COMBO,
421
+ description:
422
+ 'Press a key combination (e.g., Ctrl+C to copy, Ctrl+V to paste, Alt+Tab to switch windows).',
423
+ schema: KeyComboSchema,
424
+ })
425
+ );
426
+
427
+ // computer_scroll
428
+ tools.push(
429
+ tool(createToolFunction(EDesktopTools.SCROLL), {
430
+ name: EDesktopTools.SCROLL,
431
+ description:
432
+ 'Scroll at the specified screen coordinates. Use negative deltaY to scroll up, positive to scroll down.',
433
+ schema: ScrollSchema,
434
+ })
435
+ );
436
+
437
+ // computer_drag
438
+ tools.push(
439
+ tool(createToolFunction(EDesktopTools.DRAG), {
440
+ name: EDesktopTools.DRAG,
441
+ description:
442
+ 'Drag the mouse from one position to another (for moving windows, selecting text, etc.).',
443
+ schema: DragSchema,
444
+ })
445
+ );
446
+
447
+ // computer_get_active_window
448
+ tools.push(
449
+ tool(createToolFunction(EDesktopTools.GET_ACTIVE_WINDOW), {
450
+ name: EDesktopTools.GET_ACTIVE_WINDOW,
451
+ description:
452
+ 'Get information about the currently active window (title, application name, position, size).',
453
+ schema: GetActiveWindowSchema,
454
+ })
455
+ );
456
+
457
+ // computer_get_mouse_position
458
+ tools.push(
459
+ tool(createToolFunction(EDesktopTools.GET_MOUSE_POSITION), {
460
+ name: EDesktopTools.GET_MOUSE_POSITION,
461
+ description: 'Get the current mouse cursor position on screen.',
462
+ schema: GetMousePositionSchema,
463
+ })
464
+ );
465
+
466
+ // clipboard_read
467
+ tools.push(
468
+ tool(createToolFunction(EDesktopTools.CLIPBOARD_READ), {
469
+ name: EDesktopTools.CLIPBOARD_READ,
470
+ description: 'Read the current contents of the system clipboard.',
471
+ schema: ClipboardReadSchema,
472
+ })
473
+ );
474
+
475
+ // clipboard_write
476
+ tools.push(
477
+ tool(createToolFunction(EDesktopTools.CLIPBOARD_WRITE), {
478
+ name: EDesktopTools.CLIPBOARD_WRITE,
479
+ description: 'Write text to the system clipboard.',
480
+ schema: ClipboardWriteSchema,
481
+ })
482
+ );
483
+
484
+ // clipboard_paste
485
+ tools.push(
486
+ tool(createToolFunction(EDesktopTools.CLIPBOARD_PASTE), {
487
+ name: EDesktopTools.CLIPBOARD_PASTE,
488
+ description:
489
+ 'Paste the clipboard contents (equivalent to Ctrl+V). Use clipboard_write first to set the content.',
490
+ schema: ClipboardPasteSchema,
491
+ })
492
+ );
493
+
494
+ // computer_wait
495
+ tools.push(
496
+ tool(createToolFunction(EDesktopTools.WAIT), {
497
+ name: EDesktopTools.WAIT,
498
+ description:
499
+ 'Wait for the specified number of milliseconds. Use this to wait for UI animations or loading.',
500
+ schema: WaitSchema,
501
+ })
502
+ );
503
+
504
+ // ============ Native UI Automation Tools (Windows) ============
505
+ // These are FASTER and MORE RELIABLE than screenshot-based automation
506
+ // They find elements by semantic properties (name, automationId, type)
507
+ // instead of relying on pixel coordinates from screenshots
508
+
509
+ // ui_find_element
510
+ tools.push(
511
+ tool(createToolFunction(EDesktopTools.UI_FIND_ELEMENT), {
512
+ name: EDesktopTools.UI_FIND_ELEMENT,
513
+ description:
514
+ '🚀 PREFERRED: Find a UI element by semantic properties (name, automationId, controlType). MUCH FASTER than screenshot analysis. Returns element bounds for clicking. Windows only.',
515
+ schema: UIFindElementSchema,
516
+ })
517
+ );
518
+
519
+ // ui_click_element
520
+ tools.push(
521
+ tool(createToolFunction(EDesktopTools.UI_CLICK_ELEMENT), {
522
+ name: EDesktopTools.UI_CLICK_ELEMENT,
523
+ description:
524
+ '🚀 PREFERRED: Find and click a UI element by name/automationId. More reliable than coordinate-based clicking. Example: ui_click_element({name: "OK"}) or ui_click_element({controlType: "Button", name: "Submit"}). Windows only.',
525
+ schema: UIClickElementSchema,
526
+ })
527
+ );
528
+
529
+ // ui_get_window_tree
530
+ tools.push(
531
+ tool(createToolFunction(EDesktopTools.UI_GET_WINDOW_TREE), {
532
+ name: EDesktopTools.UI_GET_WINDOW_TREE,
533
+ description:
534
+ 'Get the UI element tree of the active window. Shows all buttons, inputs, menus, etc. with their names and automationIds. Use this to discover elements before clicking. Windows only.',
535
+ schema: UIGetWindowTreeSchema,
536
+ })
537
+ );
538
+
539
+ // ui_find_buttons
540
+ tools.push(
541
+ tool(createToolFunction(EDesktopTools.UI_FIND_BUTTONS), {
542
+ name: EDesktopTools.UI_FIND_BUTTONS,
543
+ description:
544
+ 'Find all clickable buttons in the active window. Returns list with names and positions. Useful for discovering available actions. Windows only.',
545
+ schema: UIFindButtonsSchema,
546
+ })
547
+ );
548
+
549
+ // ui_find_inputs
550
+ tools.push(
551
+ tool(createToolFunction(EDesktopTools.UI_FIND_INPUTS), {
552
+ name: EDesktopTools.UI_FIND_INPUTS,
553
+ description:
554
+ 'Find all text input fields in the active window. Returns list with names and positions. Useful for discovering form fields. Windows only.',
555
+ schema: UIFindInputsSchema,
556
+ })
557
+ );
558
+
559
+ return tools;
538
560
  }
539
561
 
540
562
  /**