askui 0.25.0 → 0.26.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cjs/core/models/anthropic/askui-agent.js +20 -32
- package/dist/cjs/core/models/anthropic/claude-agent.d.ts +16 -4
- package/dist/cjs/core/models/anthropic/claude-agent.js +43 -5
- package/dist/cjs/core/models/anthropic/index.d.ts +1 -2
- package/dist/cjs/core/models/anthropic/tools/os-agent-tools.d.ts +59 -5
- package/dist/cjs/core/models/anthropic/tools/os-agent-tools.js +340 -75
- package/dist/cjs/core/ui-control-commands/input-event.d.ts +3 -1
- package/dist/cjs/core/ui-control-commands/input-event.js +2 -0
- package/dist/cjs/execution/execution-runtime.d.ts +4 -0
- package/dist/cjs/execution/inference-client.d.ts +4 -0
- package/dist/cjs/execution/ui-control-client.d.ts +84 -34
- package/dist/cjs/execution/ui-control-client.js +4 -48
- package/dist/cjs/lib/interactive_cli/create-example-project.js +1 -4
- package/dist/esm/core/models/anthropic/askui-agent.js +21 -33
- package/dist/esm/core/models/anthropic/claude-agent.d.ts +16 -4
- package/dist/esm/core/models/anthropic/claude-agent.js +43 -5
- package/dist/esm/core/models/anthropic/index.d.ts +1 -2
- package/dist/esm/core/models/anthropic/tools/os-agent-tools.d.ts +59 -5
- package/dist/esm/core/models/anthropic/tools/os-agent-tools.js +332 -72
- package/dist/esm/core/ui-control-commands/input-event.d.ts +3 -1
- package/dist/esm/core/ui-control-commands/input-event.js +2 -0
- package/dist/esm/execution/execution-runtime.d.ts +4 -0
- package/dist/esm/execution/inference-client.d.ts +4 -0
- package/dist/esm/execution/ui-control-client.d.ts +84 -34
- package/dist/esm/execution/ui-control-client.js +4 -48
- package/dist/esm/lib/interactive_cli/create-example-project.js +1 -4
- package/dist/example_projects_templates/typescript/askui_example/my-first-askui-test-suite.test.ts +7 -17
- package/package.json +2 -2
|
@@ -5,7 +5,7 @@ import { AnnotationRequest } from '../core/model/annotation-result/annotation-in
|
|
|
5
5
|
import { DetectedElement } from '../core/model/annotation-result/detected-element';
|
|
6
6
|
import { ClientArgs } from './ui-controller-client-interface';
|
|
7
7
|
import { ModelCompositionBranch } from './model-composition-branch';
|
|
8
|
-
import { AskUIAgent, AgentHistory } from '../core/models/anthropic';
|
|
8
|
+
import { AskUIAgent, AgentHistory, ActOptions } from '../core/models/anthropic';
|
|
9
9
|
export type RelationsForConvenienceMethods = 'nearestTo' | 'leftOf' | 'above' | 'rightOf' | 'below' | 'contains';
|
|
10
10
|
export type TextMatchingOption = 'similar' | 'exact' | 'regex';
|
|
11
11
|
export type ElementExistsQueryType = 'otherElement' | 'switch' | 'element' | 'container' | 'checkbox' | 'element' | 'button' | 'table' | 'text' | 'icon' | 'image' | 'textfield';
|
|
@@ -476,51 +476,101 @@ export declare class UiControlClient extends ApiCommands {
|
|
|
476
476
|
*/
|
|
477
477
|
expectAllExist(query: ElementExistsQuery[]): Promise<ExpectAllExistResult>;
|
|
478
478
|
/**
|
|
479
|
-
* Instructs the agent to achieve a specified goal through autonomous actions.
|
|
479
|
+
* Instructs the agent to autonomously achieve a specified goal through UI interactions.
|
|
480
480
|
*
|
|
481
|
-
*
|
|
482
|
-
*
|
|
483
|
-
*
|
|
481
|
+
* This method enables AI-powered automation by allowing the agent to:
|
|
482
|
+
* - Analyze the current screen state and/or provided images
|
|
483
|
+
* - Plan and execute a sequence of UI interactions
|
|
484
|
+
* - Handle complex tasks through natural language instructions
|
|
485
|
+
* - Maintain context across multiple actions
|
|
484
486
|
*
|
|
485
|
-
* The `options` parameter allows the caller to maintain contextual continuity across
|
|
486
|
-
*
|
|
487
|
+
* The agent can perform various UI interactions including:
|
|
488
|
+
* - Clicking buttons, links, and other interactive elements
|
|
489
|
+
* - Typing text into input fields
|
|
490
|
+
* - Scrolling and navigating through interfaces
|
|
487
491
|
*
|
|
488
|
-
*
|
|
492
|
+
* ### Method Signatures
|
|
493
|
+
* ```typescript
|
|
494
|
+
* act(goal: string, options?: ActOptions): Promise<AgentHistory>
|
|
495
|
+
* act(goal: string, imagePathOrBase64String: string, options?: ActOptions): Promise<AgentHistory>
|
|
496
|
+
* ```
|
|
489
497
|
*
|
|
490
|
-
*
|
|
491
|
-
*
|
|
492
|
-
*
|
|
493
|
-
*
|
|
498
|
+
* ### Parameters
|
|
499
|
+
* @param goal - A natural language instruction describing the task to accomplish.
|
|
500
|
+
* Be specific and clear about the desired outcome.
|
|
501
|
+
* @param imagePathOrBase64String - (Optional) Path to an image file or base64-encoded image string.
|
|
502
|
+
* Used to provide additional visual context for the task.
|
|
503
|
+
* @param options - (Optional) Configuration options for the agent's behavior.
|
|
504
|
+
* @param options.chatId - A unique identifier to maintain context between related actions.
|
|
505
|
+
* Useful for multi-step tasks that require state preservation.
|
|
506
|
+
* @param options.agentHistory
|
|
507
|
+
* - (Optional) Previous interaction history to share between
|
|
508
|
+
* different agent instances. Enables cross-platform task coordination.
|
|
509
|
+
*
|
|
510
|
+
* ### Returns
|
|
511
|
+
* @returns Promise<AgentHistory> - A promise that resolves to the updated interaction history,
|
|
512
|
+
* containing details about the actions taken and their outcomes.
|
|
513
|
+
*
|
|
514
|
+
* ### Throws
|
|
515
|
+
* - If the agent is not properly connected
|
|
516
|
+
* - If the provided goal cannot be understood or executed
|
|
517
|
+
* - If required UI elements are not found or accessible
|
|
518
|
+
* - If the image path is invalid or the base64 string is malformed
|
|
519
|
+
*
|
|
520
|
+
* ### Examples
|
|
521
|
+
*
|
|
522
|
+
* #### Basic Usage
|
|
523
|
+
* ```typescript
|
|
524
|
+
* // Simple task execution
|
|
525
|
+
* await aui.act("Open Chrome and navigate to google.com");
|
|
526
|
+
* ```
|
|
527
|
+
*
|
|
528
|
+
* #### Maintaining Context
|
|
529
|
+
* ```typescript
|
|
530
|
+
* // Multi-step task with context preservation
|
|
531
|
+
* await aui.act("Search for current gold prices", {
|
|
532
|
+
* chatId: "gold-price-task"
|
|
494
533
|
* });
|
|
495
|
-
*
|
|
496
|
-
*
|
|
534
|
+
*
|
|
535
|
+
* await aui.act("Create a new text file and save the price", {
|
|
536
|
+
* chatId: "gold-price-task"
|
|
497
537
|
* });
|
|
538
|
+
* ```
|
|
498
539
|
*
|
|
499
|
-
*
|
|
500
|
-
*
|
|
501
|
-
* // To control an Android device, you must configure it explicitly:
|
|
540
|
+
* #### Cross-Platform Coordination
|
|
541
|
+
* ```typescript
|
|
542
|
+
* // Share context between desktop and mobile agents
|
|
502
543
|
* await auiAndroid.agent.configureAsAndroidAgent();
|
|
544
|
+
*
|
|
503
545
|
* const history = await auiDesktop.act("Copy username from desktop app");
|
|
504
|
-
* await auiAndroid.act("Paste username into the mobile login screen", {
|
|
546
|
+
* await auiAndroid.act("Paste username into mobile login", {
|
|
505
547
|
* agentHistory: history
|
|
506
548
|
* });
|
|
507
549
|
* ```
|
|
508
550
|
*
|
|
509
|
-
*
|
|
510
|
-
*
|
|
511
|
-
*
|
|
512
|
-
*
|
|
513
|
-
*
|
|
514
|
-
*
|
|
515
|
-
*
|
|
516
|
-
*
|
|
517
|
-
*
|
|
518
|
-
*
|
|
519
|
-
*
|
|
520
|
-
*
|
|
551
|
+
* #### Using Images for Context
|
|
552
|
+
* ```typescript
|
|
553
|
+
* // Using image file
|
|
554
|
+
* await aui.act(
|
|
555
|
+
* "Click the 'Submit' button in the provided image",
|
|
556
|
+
* 'path/to/screenshot.png'
|
|
557
|
+
* );
|
|
558
|
+
*
|
|
559
|
+
* // Using base64 image
|
|
560
|
+
* const base64Image = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...";
|
|
561
|
+
* await aui.act(
|
|
562
|
+
* "Click the 'Submit' button in the provided image",
|
|
563
|
+
* base64Image
|
|
564
|
+
* );
|
|
565
|
+
* ```
|
|
566
|
+
*
|
|
567
|
+
* ### Best Practices
|
|
568
|
+
* 1. Be specific in your goal descriptions
|
|
569
|
+
* 2. Use chatId for related tasks to maintain context
|
|
570
|
+
* 3. Provide clear visual context when needed
|
|
571
|
+
* 4. Handle errors appropriately in your implementation
|
|
572
|
+
* 5. Consider using agentHistory for complex cross-platform workflows
|
|
521
573
|
*/
|
|
522
|
-
act(goal: string, options?:
|
|
523
|
-
|
|
524
|
-
agentHistory?: AgentHistory;
|
|
525
|
-
}): Promise<AgentHistory>;
|
|
574
|
+
act(goal: string, options?: ActOptions): Promise<AgentHistory>;
|
|
575
|
+
act(goal: string, imagePathOrBase64String: string, options?: ActOptions): Promise<AgentHistory>;
|
|
526
576
|
}
|
|
@@ -738,56 +738,12 @@ class UiControlClient extends dsl_1.ApiCommands {
|
|
|
738
738
|
};
|
|
739
739
|
});
|
|
740
740
|
}
|
|
741
|
-
|
|
742
|
-
* Instructs the agent to achieve a specified goal through autonomous actions.
|
|
743
|
-
*
|
|
744
|
-
* The agent will analyze the screen, determine necessary steps, and perform actions
|
|
745
|
-
* to accomplish the goal. This may include clicking, typing, scrolling, and other
|
|
746
|
-
* interface interactions.
|
|
747
|
-
*
|
|
748
|
-
* The `options` parameter allows the caller to maintain contextual continuity across
|
|
749
|
-
* multiple `act` calls, either from the same or different agent interfaces.
|
|
750
|
-
*
|
|
751
|
-
* **Examples:**
|
|
752
|
-
*
|
|
753
|
-
* ```ts
|
|
754
|
-
* // Use chatId to maintain context across consecutive steps
|
|
755
|
-
* await aui.act("Search online for the current gold price", {
|
|
756
|
-
* chatId: "session-gold-price"
|
|
757
|
-
* });
|
|
758
|
-
* await aui.act("Create a new text file and type the gold price result into it", {
|
|
759
|
-
* chatId: "session-gold-price"
|
|
760
|
-
* });
|
|
761
|
-
*
|
|
762
|
-
* // Share history explicitly between separate agents (e.g., desktop and Android)
|
|
763
|
-
* // By default, the agent operates as a computer agent.
|
|
764
|
-
* // To control an Android device, you must configure it explicitly:
|
|
765
|
-
* await auiAndroid.agent.configureAsAndroidAgent();
|
|
766
|
-
* const history = await auiDesktop.act("Copy username from desktop app");
|
|
767
|
-
* await auiAndroid.act("Paste username into the mobile login screen", {
|
|
768
|
-
* agentHistory: history
|
|
769
|
-
* });
|
|
770
|
-
* ```
|
|
771
|
-
*
|
|
772
|
-
* @param {string} goal - A description of what the agent should achieve.
|
|
773
|
-
* @param {Object} [options] - Optional parameters to maintain or share context.
|
|
774
|
-
* @param {string} [options.chatId] - A session identifier used to persist memory between
|
|
775
|
-
* consecutive `act` calls. When multiple actions share the
|
|
776
|
-
* same `chatId`, the agent retains knowledge of prior steps,
|
|
777
|
-
* such as extracted data or navigation history.
|
|
778
|
-
* @param {AgentHistory} [options.agentHistory] - A shared interaction history object that can be
|
|
779
|
-
* passed between different agent clients (e.g., between
|
|
780
|
-
* `auiDesktop` and `auiAndroid`) to ensure continuity
|
|
781
|
-
* of understanding and task flow.
|
|
782
|
-
* @returns {Promise<AgentHistory>} - Updated action history after executing the goal.
|
|
783
|
-
* @throws {Error} If the agent is not connected when the method is called.
|
|
784
|
-
*/
|
|
785
|
-
act(goal, options) {
|
|
741
|
+
act(goal, imageOrOptions, options) {
|
|
786
742
|
return __awaiter(this, void 0, void 0, function* () {
|
|
787
|
-
if (
|
|
788
|
-
|
|
743
|
+
if (typeof imageOrOptions === 'string') {
|
|
744
|
+
return this.agent.act(goal, imageOrOptions, options);
|
|
789
745
|
}
|
|
790
|
-
return this.agent.act(goal,
|
|
746
|
+
return this.agent.act(goal, undefined, imageOrOptions);
|
|
791
747
|
});
|
|
792
748
|
}
|
|
793
749
|
}
|
|
@@ -176,10 +176,6 @@ class CreateExampleProject {
|
|
|
176
176
|
title: 'Add eslint run command',
|
|
177
177
|
task: () => __awaiter(this, void 0, void 0, function* () { return this.addESLintRunCommand(); }),
|
|
178
178
|
},
|
|
179
|
-
{
|
|
180
|
-
title: 'Add vscode settings',
|
|
181
|
-
task: () => __awaiter(this, void 0, void 0, function* () { return this.addVSCodeSettings(); }),
|
|
182
|
-
},
|
|
183
179
|
]);
|
|
184
180
|
}),
|
|
185
181
|
}];
|
|
@@ -274,6 +270,7 @@ class CreateExampleProject {
|
|
|
274
270
|
const tasks = new listr_1.default();
|
|
275
271
|
tasks.add([
|
|
276
272
|
...(yield this.copyTemplateProject()),
|
|
273
|
+
...(yield this.addVSCodeSettings()),
|
|
277
274
|
...(yield this.setupTestFrameWork()),
|
|
278
275
|
...(yield this.copyESLintConfigFiles()),
|
|
279
276
|
...(yield this.copyGitignore()),
|
|
@@ -7,7 +7,7 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
|
|
|
7
7
|
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
8
8
|
});
|
|
9
9
|
};
|
|
10
|
-
import {
|
|
10
|
+
import { DesktopPressAndReleaseKeysTool, MouseClickTool, MouseMoveTool, MouseScrollTool, OsAgentHandler, ScreenShotTool, TypeTool, AgentErrorTool, AndroidSequenceKeyPressTool, AndroidSingleKeyPressTool, ExecuteShellCommandTool, DesktopKeyHoldDownTool, DesktopKeyReleaseTool, MouseReleaseLeftButtonTool, MouseHoldLeftButtonDownTool, MouseDragAndDropTool, WaitTool, } from './tools/os-agent-tools';
|
|
11
11
|
import { ClaudeAgent } from './claude-agent';
|
|
12
12
|
export class AskUIAgent extends ClaudeAgent {
|
|
13
13
|
constructor(executionRuntime) {
|
|
@@ -34,9 +34,14 @@ export class AskUIAgent extends ClaudeAgent {
|
|
|
34
34
|
new MouseMoveTool(this.osAgentHandler),
|
|
35
35
|
new MouseClickTool(this.osAgentHandler),
|
|
36
36
|
new MouseScrollTool(this.osAgentHandler),
|
|
37
|
-
new DesktopKeyPressSequenceTool(this.osAgentHandler),
|
|
38
|
-
new DesktopSingleKeyPressTool(this.osAgentHandler),
|
|
39
37
|
new TypeTool(this.osAgentHandler),
|
|
38
|
+
new DesktopPressAndReleaseKeysTool(this.osAgentHandler),
|
|
39
|
+
new DesktopKeyHoldDownTool(this.osAgentHandler),
|
|
40
|
+
new DesktopKeyReleaseTool(this.osAgentHandler),
|
|
41
|
+
new MouseHoldLeftButtonDownTool(this.osAgentHandler),
|
|
42
|
+
new MouseReleaseLeftButtonTool(this.osAgentHandler),
|
|
43
|
+
new MouseDragAndDropTool(this.osAgentHandler),
|
|
44
|
+
new WaitTool(),
|
|
40
45
|
];
|
|
41
46
|
this.setTools(tools);
|
|
42
47
|
this.setSystemPrompt(AskUIAgent.DesktopSystemPrompt);
|
|
@@ -57,6 +62,7 @@ export class AskUIAgent extends ClaudeAgent {
|
|
|
57
62
|
new AndroidSequenceKeyPressTool(this.osAgentHandler),
|
|
58
63
|
new TypeTool(this.osAgentHandler),
|
|
59
64
|
new ExecuteShellCommandTool(this.osAgentHandler),
|
|
65
|
+
new WaitTool(),
|
|
60
66
|
];
|
|
61
67
|
this.setTools(tools);
|
|
62
68
|
this.setSystemPrompt(AskUIAgent.AndroidSystemPrompt);
|
|
@@ -66,22 +72,13 @@ export class AskUIAgent extends ClaudeAgent {
|
|
|
66
72
|
AskUIAgent.DesktopSystemPrompt = `
|
|
67
73
|
<SYSTEM_CAPABILITY>
|
|
68
74
|
You are an autonomous AI assistant operating on a ${process.platform} machine with ${process.arch} architecture. You have full access to the system and internet connectivity.
|
|
69
|
-
Your main goal is to mimic a human user interacting with a desktop computer.
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
*
|
|
73
|
-
*
|
|
74
|
-
*
|
|
75
|
-
*
|
|
76
|
-
* PDF document handling and text extraction
|
|
77
|
-
* Error handling and recovery mechanisms
|
|
78
|
-
|
|
79
|
-
Available Tools:
|
|
80
|
-
* Mouse control (move, click, scroll)
|
|
81
|
-
* Keyboard input (single keys, key combinations, typing)
|
|
82
|
-
* Screen capture and analysis
|
|
83
|
-
* Error reporting and recovery
|
|
84
|
-
|
|
75
|
+
Your main goal is to mimic a human user interacting with a desktop computer.
|
|
76
|
+
Use a mouse and keyboard to interact with a computer, and take screenshots.
|
|
77
|
+
* This is an interface to a desktop GUI. You do not have access to a terminal or applications menu. You must click on desktop icons to start applications.
|
|
78
|
+
* Some applications may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions. E.g. if you click on Firefox and a window doesn't open, try taking another screenshot.
|
|
79
|
+
* Whenever you intend to move the cursor to click on an element like an icon, you should consult a screenshot to determine the coordinates of the element before moving the cursor.
|
|
80
|
+
* If you tried clicking on a program or link but it failed to load, even after waiting, try adjusting your cursor position so that the tip of the cursor visually falls on the element that you want to click.
|
|
81
|
+
* Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. Don't click boxes on their edges unless asked.
|
|
85
82
|
Current Date: ${new Date().toUTCString()} UTC
|
|
86
83
|
</SYSTEM_CAPABILITY>
|
|
87
84
|
|
|
@@ -130,20 +127,11 @@ AskUIAgent.AndroidSystemPrompt = `
|
|
|
130
127
|
<SYSTEM_CAPABILITY>
|
|
131
128
|
You are an autonomous AI assistant operating on an Android device via ADB. The host machine is ${process.platform} with ${process.arch} architecture and internet connectivity.
|
|
132
129
|
Your main goal is to mimic a human user interacting with an Android device. So you should try to use the tools in a way that a human would use a touch screen to interact with an Android device.
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
*
|
|
136
|
-
*
|
|
137
|
-
*
|
|
138
|
-
* Android-specific key events
|
|
139
|
-
* Error handling and recovery mechanisms
|
|
140
|
-
|
|
141
|
-
Available Tools:
|
|
142
|
-
* Touch control (click, swipe, scroll)
|
|
143
|
-
* Android key events (single and sequence)
|
|
144
|
-
* Screen capture and analysis
|
|
145
|
-
* Error reporting and recovery
|
|
146
|
-
|
|
130
|
+
Use a gestures and adb commands to interact with the android device, and take screenshots.
|
|
131
|
+
* Some applications may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions. E.g. if you click on Firefox and a window doesn't open, try taking another screenshot.
|
|
132
|
+
* Whenever you intend to move the cursor to click on an element like an icon, you should consult a screenshot to determine the coordinates of the element before moving the cursor.
|
|
133
|
+
* If you tried clicking on a program or link but it failed to load, even after waiting, try adjusting your cursor position so that the tip of the cursor visually falls on the element that you want to click.
|
|
134
|
+
* Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. Don't click boxes on their edges unless asked.
|
|
147
135
|
Current Date: ${new Date().toUTCString()} UTC
|
|
148
136
|
</SYSTEM_CAPABILITY>
|
|
149
137
|
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import { Beta } from '@anthropic-ai/sdk/resources';
|
|
2
2
|
import { BaseAgentTool } from './tools/base';
|
|
3
|
+
import { BetaMessageParam } from '@anthropic-ai/sdk/resources/beta/messages';
|
|
3
4
|
type PredictActResponseFunction = (params: {
|
|
4
5
|
max_tokens: number;
|
|
5
6
|
messages: Beta.BetaMessageParam[];
|
|
@@ -7,7 +8,16 @@ type PredictActResponseFunction = (params: {
|
|
|
7
8
|
system?: string;
|
|
8
9
|
tools?: any[];
|
|
9
10
|
betas?: string[];
|
|
11
|
+
tool_choice?: {
|
|
12
|
+
type: 'tool' | 'any' | 'auto';
|
|
13
|
+
name?: string;
|
|
14
|
+
};
|
|
10
15
|
}) => Promise<Beta.BetaMessage>;
|
|
16
|
+
export type AgentHistory = BetaMessageParam[];
|
|
17
|
+
export interface ActOptions {
|
|
18
|
+
chatId?: string;
|
|
19
|
+
agentHistory?: Beta.BetaMessageParam[];
|
|
20
|
+
}
|
|
11
21
|
export declare class ClaudeAgent {
|
|
12
22
|
private predictActResponseFunction;
|
|
13
23
|
private maxTokens;
|
|
@@ -19,7 +29,12 @@ export declare class ClaudeAgent {
|
|
|
19
29
|
private _toolCollection;
|
|
20
30
|
private tools;
|
|
21
31
|
private history;
|
|
32
|
+
private toolChoice;
|
|
22
33
|
constructor(predictActResponseFunction: PredictActResponseFunction);
|
|
34
|
+
setToolChoice(toolChoice: {
|
|
35
|
+
type: 'tool' | 'any' | 'auto';
|
|
36
|
+
name?: string;
|
|
37
|
+
}): void;
|
|
23
38
|
setTools(tools: BaseAgentTool[]): void;
|
|
24
39
|
addTool(tool: BaseAgentTool): void;
|
|
25
40
|
listToolNames(): string[];
|
|
@@ -29,10 +44,7 @@ export declare class ClaudeAgent {
|
|
|
29
44
|
private get toolCollection();
|
|
30
45
|
private setHistory;
|
|
31
46
|
private getHistory;
|
|
32
|
-
act(goal: string, options?:
|
|
33
|
-
chatId?: string;
|
|
34
|
-
agentHistory?: Beta.BetaMessageParam[];
|
|
35
|
-
}): Promise<Beta.BetaMessageParam[]>;
|
|
47
|
+
act(goal: string, imagePathOrBase64String?: string, options?: ActOptions): Promise<Beta.BetaMessageParam[]>;
|
|
36
48
|
private makeApiToolResult;
|
|
37
49
|
private maybePrependSystemToolResult;
|
|
38
50
|
private static filterNMostRecentImages;
|
|
@@ -9,6 +9,7 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
|
|
|
9
9
|
};
|
|
10
10
|
import { ToolCollection } from './tools/base';
|
|
11
11
|
import { logger } from '../../../lib/logger';
|
|
12
|
+
import { Base64Image } from '../../../utils/base_64_image/base-64-image';
|
|
12
13
|
export class ClaudeAgent {
|
|
13
14
|
constructor(predictActResponseFunction) {
|
|
14
15
|
this.predictActResponseFunction = predictActResponseFunction;
|
|
@@ -16,11 +17,17 @@ export class ClaudeAgent {
|
|
|
16
17
|
this.onlyNMostRecentImages = 3;
|
|
17
18
|
this.imageTruncationThreshold = 10;
|
|
18
19
|
this.systemPrompt = '';
|
|
19
|
-
this.model = 'claude-
|
|
20
|
-
this.betas = ['computer-use-
|
|
20
|
+
this.model = 'claude-sonnet-4-20250514';
|
|
21
|
+
this.betas = ['computer-use-2025-01-24'];
|
|
21
22
|
this._toolCollection = undefined;
|
|
22
23
|
this.tools = [];
|
|
23
24
|
this.history = {};
|
|
25
|
+
this.toolChoice = {
|
|
26
|
+
type: 'any',
|
|
27
|
+
};
|
|
28
|
+
}
|
|
29
|
+
setToolChoice(toolChoice) {
|
|
30
|
+
this.toolChoice = toolChoice;
|
|
24
31
|
}
|
|
25
32
|
setTools(tools) {
|
|
26
33
|
this._toolCollection = undefined;
|
|
@@ -37,7 +44,14 @@ export class ClaudeAgent {
|
|
|
37
44
|
this.tools = this.tools.filter((tool) => tool.ToolName !== toolName);
|
|
38
45
|
}
|
|
39
46
|
setSystemPrompt(systemPrompt) {
|
|
40
|
-
|
|
47
|
+
const enhancedPrompt = `${systemPrompt}
|
|
48
|
+
If you cannot complete a request due to safety concerns, please:
|
|
49
|
+
1. Explain what specific aspect is problematic
|
|
50
|
+
2. Suggest alternative approaches that would be acceptable
|
|
51
|
+
3. Provide partial assistance where possible within guidelines.
|
|
52
|
+
Raise an exception After you have provided the above information. include the error message in the exception.
|
|
53
|
+
`;
|
|
54
|
+
this.systemPrompt = enhancedPrompt;
|
|
41
55
|
}
|
|
42
56
|
IsConfigured() {
|
|
43
57
|
return this.tools.length > 0 && this.systemPrompt !== '';
|
|
@@ -54,7 +68,7 @@ export class ClaudeAgent {
|
|
|
54
68
|
getHistory(key) {
|
|
55
69
|
return this.history[key] || [];
|
|
56
70
|
}
|
|
57
|
-
act(goal, options) {
|
|
71
|
+
act(goal, imagePathOrBase64String, options) {
|
|
58
72
|
return __awaiter(this, void 0, void 0, function* () {
|
|
59
73
|
if (!goal.trim()) {
|
|
60
74
|
throw new Error('Goal cannot be empty');
|
|
@@ -70,8 +84,24 @@ export class ClaudeAgent {
|
|
|
70
84
|
messages.push(...this.getHistory(options.chatId));
|
|
71
85
|
}
|
|
72
86
|
// Add the new goal as a user message
|
|
87
|
+
const userContent = [{
|
|
88
|
+
type: 'text',
|
|
89
|
+
text: goal,
|
|
90
|
+
}];
|
|
91
|
+
if (imagePathOrBase64String !== undefined) {
|
|
92
|
+
const image = yield Base64Image.fromPathOrString(imagePathOrBase64String);
|
|
93
|
+
const imageString = image.toString(false);
|
|
94
|
+
userContent.push({
|
|
95
|
+
type: 'image',
|
|
96
|
+
source: {
|
|
97
|
+
type: 'base64',
|
|
98
|
+
media_type: 'image/png',
|
|
99
|
+
data: imageString,
|
|
100
|
+
},
|
|
101
|
+
});
|
|
102
|
+
}
|
|
73
103
|
messages.push({
|
|
74
|
-
content:
|
|
104
|
+
content: userContent,
|
|
75
105
|
role: 'user',
|
|
76
106
|
});
|
|
77
107
|
if (this.onlyNMostRecentImages) {
|
|
@@ -85,7 +115,15 @@ export class ClaudeAgent {
|
|
|
85
115
|
system: this.systemPrompt,
|
|
86
116
|
tools: (new ToolCollection(this.tools).toParams()),
|
|
87
117
|
betas: this.betas,
|
|
118
|
+
tool_choice: this.toolChoice,
|
|
88
119
|
});
|
|
120
|
+
if (response.stop_reason === 'refusal') {
|
|
121
|
+
const refusalMessage = response.content
|
|
122
|
+
.filter(block => block.type === 'text')
|
|
123
|
+
.map(block => block.text)
|
|
124
|
+
.join(' ');
|
|
125
|
+
throw new Error(`Agent refused to answer: ${refusalMessage || 'The request violates agent\'s usage policies'}`);
|
|
126
|
+
}
|
|
89
127
|
messages.push({
|
|
90
128
|
content: response.content,
|
|
91
129
|
role: 'assistant',
|
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
import { BetaMessageParam } from '@anthropic-ai/sdk/resources/beta/messages';
|
|
2
|
-
export type AgentHistory = BetaMessageParam[];
|
|
3
1
|
export { AskUIAgent } from './askui-agent';
|
|
4
2
|
export { ToolFailure, ToolError, BaseAgentTool } from './tools/base';
|
|
3
|
+
export { AgentHistory, ActOptions } from './claude-agent';
|
|
@@ -5,21 +5,36 @@ import { ExecutionRuntime } from '../../../../execution/execution-runtime';
|
|
|
5
5
|
import { ControlCommand } from '../../../ui-control-commands';
|
|
6
6
|
export declare class OsAgentHandler {
|
|
7
7
|
private AgentOsClient;
|
|
8
|
-
private
|
|
8
|
+
private targetResolution;
|
|
9
9
|
private screenDimensions;
|
|
10
|
+
private paddingInfo;
|
|
10
11
|
constructor(AgentOsClient: ExecutionRuntime, screenDimensions: {
|
|
11
12
|
width: number;
|
|
12
13
|
height: number;
|
|
13
14
|
});
|
|
15
|
+
private updatePaddingInfo;
|
|
14
16
|
static createInstance(AgentOsClient: ExecutionRuntime): Promise<OsAgentHandler>;
|
|
15
17
|
getTargetResolution(): {
|
|
16
18
|
width: number;
|
|
17
19
|
height: number;
|
|
18
20
|
};
|
|
21
|
+
getScreenDimensions(): {
|
|
22
|
+
width: number;
|
|
23
|
+
height: number;
|
|
24
|
+
};
|
|
19
25
|
setTargetResolution(width: number, height: number): void;
|
|
20
26
|
takeScreenshot(): Promise<string>;
|
|
21
27
|
private scaleCoordinates;
|
|
22
28
|
requestControl(controlCommand: ControlCommand): Promise<void>;
|
|
29
|
+
mouseMove(x: number, y: number): Promise<void>;
|
|
30
|
+
mouseClick(button: "left" | "right" | "middle", doubleClick: boolean): Promise<void>;
|
|
31
|
+
mouseScroll(dx: number, dy: number): Promise<void>;
|
|
32
|
+
mouseHoldLeftButtonDown(): Promise<void>;
|
|
33
|
+
mouseReleaseLeftButton(): Promise<void>;
|
|
34
|
+
desktopKeyPressAndRelease(key: PC_AND_MODIFIER_KEY, modifiers?: MODIFIER_KEY[]): Promise<void>;
|
|
35
|
+
desktopKeyHoldDown(key: PC_AND_MODIFIER_KEY, modifiers?: MODIFIER_KEY[]): Promise<void>;
|
|
36
|
+
desktopKeyRelease(key: PC_AND_MODIFIER_KEY, modifiers?: MODIFIER_KEY[]): Promise<void>;
|
|
37
|
+
typeText(text: string): Promise<void>;
|
|
23
38
|
}
|
|
24
39
|
export declare class ScreenShotTool extends BaseAgentTool {
|
|
25
40
|
private osAgentHandler;
|
|
@@ -54,21 +69,53 @@ export declare class MouseScrollTool extends BaseAgentTool {
|
|
|
54
69
|
}): Promise<ToolResult>;
|
|
55
70
|
toParams(): BetaTool;
|
|
56
71
|
}
|
|
57
|
-
export declare class
|
|
72
|
+
export declare class MouseDragAndDropTool extends BaseAgentTool {
|
|
73
|
+
private osAgentHandler;
|
|
74
|
+
constructor(osAgentHandler: OsAgentHandler);
|
|
75
|
+
execute(command: {
|
|
76
|
+
startX: number;
|
|
77
|
+
startY: number;
|
|
78
|
+
endX: number;
|
|
79
|
+
endY: number;
|
|
80
|
+
}): Promise<ToolResult>;
|
|
81
|
+
toParams(): BetaTool;
|
|
82
|
+
}
|
|
83
|
+
export declare class MouseHoldLeftButtonDownTool extends BaseAgentTool {
|
|
84
|
+
private osAgentHandler;
|
|
85
|
+
constructor(osAgentHandler: OsAgentHandler);
|
|
86
|
+
execute(): Promise<ToolResult>;
|
|
87
|
+
toParams(): BetaTool;
|
|
88
|
+
}
|
|
89
|
+
export declare class MouseReleaseLeftButtonTool extends BaseAgentTool {
|
|
90
|
+
private osAgentHandler;
|
|
91
|
+
constructor(osAgentHandler: OsAgentHandler);
|
|
92
|
+
execute(): Promise<ToolResult>;
|
|
93
|
+
toParams(): BetaTool;
|
|
94
|
+
}
|
|
95
|
+
export declare class DesktopPressAndReleaseKeysTool extends BaseAgentTool {
|
|
58
96
|
private osAgentHandler;
|
|
59
97
|
constructor(osAgentHandler: OsAgentHandler);
|
|
60
98
|
execute(command: {
|
|
61
99
|
key: PC_KEY;
|
|
62
|
-
|
|
63
|
-
secondModifier?: MODIFIER_KEY;
|
|
100
|
+
modifiers?: MODIFIER_KEY[];
|
|
64
101
|
}): Promise<ToolResult>;
|
|
65
102
|
toParams(): BetaTool;
|
|
66
103
|
}
|
|
67
|
-
export declare class
|
|
104
|
+
export declare class DesktopKeyHoldDownTool extends BaseAgentTool {
|
|
68
105
|
private osAgentHandler;
|
|
69
106
|
constructor(osAgentHandler: OsAgentHandler);
|
|
70
107
|
execute(command: {
|
|
71
108
|
key: PC_AND_MODIFIER_KEY;
|
|
109
|
+
modifiers?: MODIFIER_KEY[];
|
|
110
|
+
}): Promise<ToolResult>;
|
|
111
|
+
toParams(): BetaTool;
|
|
112
|
+
}
|
|
113
|
+
export declare class DesktopKeyReleaseTool extends BaseAgentTool {
|
|
114
|
+
private osAgentHandler;
|
|
115
|
+
constructor(osAgentHandler: OsAgentHandler);
|
|
116
|
+
execute(command: {
|
|
117
|
+
key: PC_AND_MODIFIER_KEY;
|
|
118
|
+
modifiers?: MODIFIER_KEY[];
|
|
72
119
|
}): Promise<ToolResult>;
|
|
73
120
|
toParams(): BetaTool;
|
|
74
121
|
}
|
|
@@ -111,3 +158,10 @@ export declare class ExecuteShellCommandTool extends BaseAgentTool {
|
|
|
111
158
|
}): Promise<ToolResult>;
|
|
112
159
|
toParams(): BetaTool;
|
|
113
160
|
}
|
|
161
|
+
export declare class WaitTool extends BaseAgentTool {
|
|
162
|
+
constructor();
|
|
163
|
+
execute(command: {
|
|
164
|
+
milliseconds: number;
|
|
165
|
+
}): Promise<ToolResult>;
|
|
166
|
+
toParams(): BetaTool;
|
|
167
|
+
}
|