askui 0.25.1 → 0.26.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25)
  1. package/dist/cjs/core/models/anthropic/askui-agent.js +20 -32
  2. package/dist/cjs/core/models/anthropic/claude-agent.d.ts +16 -4
  3. package/dist/cjs/core/models/anthropic/claude-agent.js +43 -5
  4. package/dist/cjs/core/models/anthropic/index.d.ts +1 -2
  5. package/dist/cjs/core/models/anthropic/tools/os-agent-tools.d.ts +59 -5
  6. package/dist/cjs/core/models/anthropic/tools/os-agent-tools.js +340 -75
  7. package/dist/cjs/core/ui-control-commands/input-event.d.ts +3 -1
  8. package/dist/cjs/core/ui-control-commands/input-event.js +2 -0
  9. package/dist/cjs/execution/execution-runtime.d.ts +4 -0
  10. package/dist/cjs/execution/inference-client.d.ts +4 -0
  11. package/dist/cjs/execution/ui-control-client.d.ts +84 -34
  12. package/dist/cjs/execution/ui-control-client.js +4 -48
  13. package/dist/esm/core/models/anthropic/askui-agent.js +21 -33
  14. package/dist/esm/core/models/anthropic/claude-agent.d.ts +16 -4
  15. package/dist/esm/core/models/anthropic/claude-agent.js +43 -5
  16. package/dist/esm/core/models/anthropic/index.d.ts +1 -2
  17. package/dist/esm/core/models/anthropic/tools/os-agent-tools.d.ts +59 -5
  18. package/dist/esm/core/models/anthropic/tools/os-agent-tools.js +332 -72
  19. package/dist/esm/core/ui-control-commands/input-event.d.ts +3 -1
  20. package/dist/esm/core/ui-control-commands/input-event.js +2 -0
  21. package/dist/esm/execution/execution-runtime.d.ts +4 -0
  22. package/dist/esm/execution/inference-client.d.ts +4 -0
  23. package/dist/esm/execution/ui-control-client.d.ts +84 -34
  24. package/dist/esm/execution/ui-control-client.js +4 -48
  25. package/package.json +2 -2
@@ -5,7 +5,7 @@ import { AnnotationRequest } from '../core/model/annotation-result/annotation-in
5
5
  import { DetectedElement } from '../core/model/annotation-result/detected-element';
6
6
  import { ClientArgs } from './ui-controller-client-interface';
7
7
  import { ModelCompositionBranch } from './model-composition-branch';
8
- import { AskUIAgent, AgentHistory } from '../core/models/anthropic';
8
+ import { AskUIAgent, AgentHistory, ActOptions } from '../core/models/anthropic';
9
9
  export type RelationsForConvenienceMethods = 'nearestTo' | 'leftOf' | 'above' | 'rightOf' | 'below' | 'contains';
10
10
  export type TextMatchingOption = 'similar' | 'exact' | 'regex';
11
11
  export type ElementExistsQueryType = 'otherElement' | 'switch' | 'element' | 'container' | 'checkbox' | 'element' | 'button' | 'table' | 'text' | 'icon' | 'image' | 'textfield';
@@ -476,51 +476,101 @@ export declare class UiControlClient extends ApiCommands {
476
476
  */
477
477
  expectAllExist(query: ElementExistsQuery[]): Promise<ExpectAllExistResult>;
478
478
  /**
479
- * Instructs the agent to achieve a specified goal through autonomous actions.
479
+ * Instructs the agent to autonomously achieve a specified goal through UI interactions.
480
480
  *
481
- * The agent will analyze the screen, determine necessary steps, and perform actions
482
- * to accomplish the goal. This may include clicking, typing, scrolling, and other
483
- * interface interactions.
481
+ * This method enables AI-powered automation by allowing the agent to:
482
+ * - Analyze the current screen state and/or provided images
483
+ * - Plan and execute a sequence of UI interactions
484
+ * - Handle complex tasks through natural language instructions
485
+ * - Maintain context across multiple actions
484
486
  *
485
- * The `options` parameter allows the caller to maintain contextual continuity across
486
- * multiple `act` calls, either from the same or different agent interfaces.
487
+ * The agent can perform various UI interactions including:
488
+ * - Clicking buttons, links, and other interactive elements
489
+ * - Typing text into input fields
490
+ * - Scrolling and navigating through interfaces
487
491
  *
488
- * **Examples:**
492
+ * ### Method Signatures
493
+ * ```typescript
494
+ * act(goal: string, options?: ActOptions): Promise<AgentHistory>
495
+ * act(goal: string, imagePathOrBase64: string, options?: ActOptions): Promise<AgentHistory>
496
+ * ```
489
497
  *
490
- * ```ts
491
- * // Use chatId to maintain context across consecutive steps
492
- * await aui.act("Search online for the current gold price", {
493
- * chatId: "session-gold-price"
498
+ * ### Parameters
499
+ * @param goal - A natural language instruction describing the task to accomplish.
500
+ * Be specific and clear about the desired outcome.
501
+ * @param imagePathOrBase64 - (Optional) Path to an image file or base64-encoded image string.
502
+ * Used to provide additional visual context for the task.
503
+ * @param options - (Optional) Configuration options for the agent's behavior.
504
+ * @param options.chatId - A unique identifier to maintain context between related actions.
505
+ * Useful for multi-step tasks that require state preservation.
506
+ * @param options.agentHistory
507
+ * - (Optional) Previous interaction history to share between
508
+ * different agent instances. Enables cross-platform task coordination.
509
+ *
510
+ * ### Returns
511
+ * @returns Promise<AgentHistory> - A promise that resolves to the updated interaction history,
512
+ * containing details about the actions taken and their outcomes.
513
+ *
514
+ * ### Throws
515
+ * - If the agent is not properly connected
516
+ * - If the provided goal cannot be understood or executed
517
+ * - If required UI elements are not found or accessible
518
+ * - If the image path is invalid or the base64 string is malformed
519
+ *
520
+ * ### Examples
521
+ *
522
+ * #### Basic Usage
523
+ * ```typescript
524
+ * // Simple task execution
525
+ * await aui.act("Open Chrome and navigate to google.com");
526
+ * ```
527
+ *
528
+ * #### Maintaining Context
529
+ * ```typescript
530
+ * // Multi-step task with context preservation
531
+ * await aui.act("Search for current gold prices", {
532
+ * chatId: "gold-price-task"
494
533
  * });
495
- * await aui.act("Create a new text file and type the gold price result into it", {
496
- * chatId: "session-gold-price"
534
+ *
535
+ * await aui.act("Create a new text file and save the price", {
536
+ * chatId: "gold-price-task"
497
537
  * });
538
+ * ```
498
539
  *
499
- * // Share history explicitly between separate agents (e.g., desktop and Android)
500
- * // By default, the agent operates as a computer agent.
501
- * // To control an Android device, you must configure it explicitly:
540
+ * #### Cross-Platform Coordination
541
+ * ```typescript
542
+ * // Share context between desktop and mobile agents
502
543
  * await auiAndroid.agent.configureAsAndroidAgent();
544
+ *
503
545
  * const history = await auiDesktop.act("Copy username from desktop app");
504
- * await auiAndroid.act("Paste username into the mobile login screen", {
546
+ * await auiAndroid.act("Paste username into mobile login", {
505
547
  * agentHistory: history
506
548
  * });
507
549
  * ```
508
550
  *
509
- * @param {string} goal - A description of what the agent should achieve.
510
- * @param {Object} [options] - Optional parameters to maintain or share context.
511
- * @param {string} [options.chatId] - A session identifier used to persist memory between
512
- * consecutive `act` calls. When multiple actions share the
513
- * same `chatId`, the agent retains knowledge of prior steps,
514
- * such as extracted data or navigation history.
515
- * @param {AgentHistory} [options.agentHistory] - A shared interaction history object that can be
516
- * passed between different agent clients (e.g., between
517
- * `auiDesktop` and `auiAndroid`) to ensure continuity
518
- * of understanding and task flow.
519
- * @returns {Promise<AgentHistory>} - Updated action history after executing the goal.
520
- * @throws {Error} If the agent is not connected when the method is called.
551
+ * #### Using Images for Context
552
+ * ```typescript
553
+ * // Using image file
554
+ * await aui.act(
555
+ * "Click the 'Submit' button in the provided image",
556
+ * 'path/to/screenshot.png'
557
+ * );
558
+ *
559
+ * // Using base64 image
560
+ * const base64Image = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...";
561
+ * await aui.act(
562
+ * "Click the 'Submit' button in the provided image",
563
+ * base64Image
564
+ * );
565
+ * ```
566
+ *
567
+ * ### Best Practices
568
+ * 1. Be specific in your goal descriptions
569
+ * 2. Use chatId for related tasks to maintain context
570
+ * 3. Provide clear visual context when needed
571
+ * 4. Handle errors appropriately in your implementation
572
+ * 5. Consider using agentHistory for complex cross-platform workflows
521
573
  */
522
- act(goal: string, options?: {
523
- chatId?: string;
524
- agentHistory?: AgentHistory;
525
- }): Promise<AgentHistory>;
574
+ act(goal: string, options?: ActOptions): Promise<AgentHistory>;
575
+ act(goal: string, imagePathOrBase64String: string, options?: ActOptions): Promise<AgentHistory>;
526
576
  }
@@ -735,56 +735,12 @@ export class UiControlClient extends ApiCommands {
735
735
  };
736
736
  });
737
737
  }
738
- /**
739
- * Instructs the agent to achieve a specified goal through autonomous actions.
740
- *
741
- * The agent will analyze the screen, determine necessary steps, and perform actions
742
- * to accomplish the goal. This may include clicking, typing, scrolling, and other
743
- * interface interactions.
744
- *
745
- * The `options` parameter allows the caller to maintain contextual continuity across
746
- * multiple `act` calls, either from the same or different agent interfaces.
747
- *
748
- * **Examples:**
749
- *
750
- * ```ts
751
- * // Use chatId to maintain context across consecutive steps
752
- * await aui.act("Search online for the current gold price", {
753
- * chatId: "session-gold-price"
754
- * });
755
- * await aui.act("Create a new text file and type the gold price result into it", {
756
- * chatId: "session-gold-price"
757
- * });
758
- *
759
- * // Share history explicitly between separate agents (e.g., desktop and Android)
760
- * // By default, the agent operates as a computer agent.
761
- * // To control an Android device, you must configure it explicitly:
762
- * await auiAndroid.agent.configureAsAndroidAgent();
763
- * const history = await auiDesktop.act("Copy username from desktop app");
764
- * await auiAndroid.act("Paste username into the mobile login screen", {
765
- * agentHistory: history
766
- * });
767
- * ```
768
- *
769
- * @param {string} goal - A description of what the agent should achieve.
770
- * @param {Object} [options] - Optional parameters to maintain or share context.
771
- * @param {string} [options.chatId] - A session identifier used to persist memory between
772
- * consecutive `act` calls. When multiple actions share the
773
- * same `chatId`, the agent retains knowledge of prior steps,
774
- * such as extracted data or navigation history.
775
- * @param {AgentHistory} [options.agentHistory] - A shared interaction history object that can be
776
- * passed between different agent clients (e.g., between
777
- * `auiDesktop` and `auiAndroid`) to ensure continuity
778
- * of understanding and task flow.
779
- * @returns {Promise<AgentHistory>} - Updated action history after executing the goal.
780
- * @throws {Error} If the agent is not connected when the method is called.
781
- */
782
- act(goal, options) {
738
+ act(goal, imageOrOptions, options) {
783
739
  return __awaiter(this, void 0, void 0, function* () {
784
- if (!this.agent.isConnected()) {
785
- throw new Error('Agent is not connected, Please call connect() first');
740
+ if (typeof imageOrOptions === 'string') {
741
+ return this.agent.act(goal, imageOrOptions, options);
786
742
  }
787
- return this.agent.act(goal, options);
743
+ return this.agent.act(goal, undefined, imageOrOptions);
788
744
  });
789
745
  }
790
746
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "askui",
3
- "version": "0.25.1",
3
+ "version": "0.26.0",
4
4
  "license": "MIT",
5
5
  "author": "askui GmbH <info@askui.com> (http://www.askui.com/)",
6
6
  "description": "Reliable, automated end-to-end-testing that depends on what is shown on your screen instead of the technology you are running on",
@@ -51,7 +51,7 @@
51
51
  "dist/example_projects_templates/"
52
52
  ],
53
53
  "dependencies": {
54
- "@anthropic-ai/sdk": "0.52.0",
54
+ "@anthropic-ai/sdk": "0.53.0",
55
55
  "chalk": "4.1.1",
56
56
  "commander": "12.1.0",
57
57
  "fkill": "7.2.1",