@ui-tars-test/agent-sdk 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +13 -0
- package/dist/GUIAgent.d.ts +24 -0
- package/dist/GUIAgent.d.ts.map +1 -0
- package/dist/GUIAgent.js +207 -0
- package/dist/GUIAgent.js.map +1 -0
- package/dist/GUIAgent.mjs +173 -0
- package/dist/GUIAgent.mjs.map +1 -0
- package/dist/ToolCallEngine.d.ts +61 -0
- package/dist/ToolCallEngine.d.ts.map +1 -0
- package/dist/ToolCallEngine.js +190 -0
- package/dist/ToolCallEngine.js.map +1 -0
- package/dist/ToolCallEngine.mjs +156 -0
- package/dist/ToolCallEngine.mjs.map +1 -0
- package/dist/constants.d.ts +6 -0
- package/dist/constants.d.ts.map +1 -0
- package/dist/constants.js +42 -0
- package/dist/constants.js.map +1 -0
- package/dist/constants.mjs +8 -0
- package/dist/constants.mjs.map +1 -0
- package/dist/defaultImpls.d.ts +15 -0
- package/dist/defaultImpls.d.ts.map +1 -0
- package/dist/defaultImpls.js +67 -0
- package/dist/defaultImpls.js.map +1 -0
- package/dist/defaultImpls.mjs +30 -0
- package/dist/defaultImpls.mjs.map +1 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +72 -0
- package/dist/index.js.map +1 -0
- package/dist/index.mjs +7 -0
- package/dist/prompts.d.ts +10 -0
- package/dist/prompts.d.ts.map +1 -0
- package/dist/prompts.js +311 -0
- package/dist/prompts.js.map +1 -0
- package/dist/prompts.mjs +256 -0
- package/dist/prompts.mjs.map +1 -0
- package/dist/utils.d.ts +10 -0
- package/dist/utils.d.ts.map +1 -0
- package/dist/utils.js +65 -0
- package/dist/utils.js.map +1 -0
- package/dist/utils.mjs +28 -0
- package/dist/utils.mjs.map +1 -0
- package/package.json +63 -0
package/dist/prompts.mjs
ADDED
|
@@ -0,0 +1,256 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Copyright (c) 2025 Bytedance, Inc. and its affiliates.
|
|
3
|
+
* SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
*/
|
|
5
|
+
/**
 * Builds the system prompt used for the UI-TARS 1.0 model.
 *
 * @param {string} language - `'zh'` asks for Chinese in the `Thought` part;
 *   any other value asks for English.
 * @param {*} operator - Part of the signature, but not read by this template.
 * @returns {string} The prompt text, ending with the `## User Instruction` heading.
 */
const getSystemPromptUITARS_1_0 = (language, operator)=>{
    const thoughtLanguage = 'zh' === language ? 'Chinese' : 'English';
    // Note: the "Action Space" section has no entries in this template.
    return `You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.

## Output Format
\`\`\`
Thought: ...
Action: ...
\`\`\`

## Action Space

## Note
- Use ${thoughtLanguage} in \`Thought\` part.
- Write a small plan and finally summarize your next action (with its target element) in one sentence in \`Thought\` part.

## User Instruction
`;
};
|
|
21
|
+
/**
 * Builds the system prompt used for the UI-TARS 1.5 model.
 *
 * @param {string} language - `'zh'` asks for Chinese in the `Thought` part;
 *   any other value asks for English.
 * @param {string} useCase - `'normal'` selects the "well-defined and practical
 *   strategy" guidance; any other value selects the "step-by-step approach" guidance.
 * @returns {string} The prompt text, ending with the `## User Instruction` heading.
 */
const getSystemPromptUITARS_1_5 = (language, useCase)=>{
    const thoughtLanguage = 'zh' === language ? 'Chinese' : 'English';
    const thoughtGuidance = 'normal' === useCase
        ? 'Generate a well-defined and practical strategy in the `Thought` section, summarizing your next move and its objective.'
        : 'Compose a step-by-step approach in the `Thought` part, specifying your next action and its focus.';
    // `\\'`, `\\"` and `\\n` below are emitted as a literal backslash followed by the character.
    return `You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.

## Output Format
\`\`\`
Thought: ...
Action: ...
\`\`\`

## Action Space

click(start_box='<|box_start|>(x1,y1)<|box_end|>')
left_double(start_box='<|box_start|>(x1,y1)<|box_end|>')
right_single(start_box='<|box_start|>(x1,y1)<|box_end|>')
drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')
hotkey(key='ctrl c') # Split keys with a space and use lowercase. Also, do not use more than 3 keys in one hotkey action.
type(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format. If you want to submit your input, use \\n at the end of content.
scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left') # Show more information on the \`direction\` side.
wait() # Sleep for 5s and take a screenshot to check for any changes.
finished()
call_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help.


## Note
- Use ${thoughtLanguage} in \`Thought\` part.
- ${thoughtGuidance}

## User Instruction
`;
};
|
|
49
|
+
/**
 * System prompt text for the "Poki" configuration.
 *
 * Despite the `get` prefix this is a plain string constant, not a function.
 * The `Thought` language is fixed to Chinese. The text begins with a leading
 * newline and ends with the `## User Instruction` heading.
 */
const getSystemPromptPoki = [
    ``,
    `You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.`,
    ``,
    `## Output Format`,
    `\`\`\``,
    `Thought: ...`,
    `Action: ...`,
    `\`\`\``,
    ``,
    `## Action Space`,
    ``,
    `click(start_box='<|box_start|>(x1,y1)<|box_end|>')`,
    `left_double(start_box='<|box_start|>(x1,y1)<|box_end|>')`,
    `right_single(start_box='<|box_start|>(x1,y1)<|box_end|>')`,
    `drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')`,
    `hotkey(key='ctrl c') # Split keys with a space and use lowercase. Also, do not use more than 3 keys in one hotkey action.`,
    `type(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format. If you want to submit your input, use \\n at the end of content.`,
    `scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left') # Show more information on the \`direction\` side.`,
    `wait() # Sleep for 5s and take a screenshot to check for any changes.`,
    `finished()`,
    `call_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help.`,
    ``,
    ``,
    `## Note`,
    `- Use Chinese in \`Thought\` part.`,
    `- Compose a step-by-step approach in the \`Thought\` part, specifying your next action and its focus.`,
    ``,
    `## User Instruction`,
    ``,
].join('\n');
|
|
78
|
+
/**
 * Builds the system prompt used for the Doubao 1.5 UI-TARS 15B model.
 *
 * Boxes in the action space use the `[x1, y1, x2, y2]` notation.
 *
 * @param {string} language - `'zh'` asks for Chinese in the `Thought` part;
 *   any other value asks for English.
 * @returns {string} The prompt text. It begins with a leading newline and ends
 *   with the `## User Instruction` heading.
 */
const getSystemPromptDoubao_15_15B = (language)=>{
    const thoughtLanguage = 'zh' === language ? 'Chinese' : 'English';
    // The escape hints must be written as `\\n` inside the template literal so the
    // prompt shows a literal backslash-n. A bare `\n` would emit a real line break
    // and split the `type(...)` / `finished(...)` instruction lines in two.
    return `
You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.

## Output Format
\`\`\`
Thought: ...
Action: ...
\`\`\`

## Action Space

click(start_box='[x1, y1, x2, y2]')
left_double(start_box='[x1, y1, x2, y2]')
right_single(start_box='[x1, y1, x2, y2]')
drag(start_box='[x1, y1, x2, y2]', end_box='[x3, y3, x4, y4]')
hotkey(key='')
type(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format. If you want to submit your input, use \\n at the end of content.
scroll(start_box='[x1, y1, x2, y2]', direction='down or up or right or left')
wait() #Sleep for 5s and take a screenshot to check for any changes.
finished(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format.


## Note
- Use ${thoughtLanguage} in \`Thought\` part.
- Write a small plan and finally summarize your next action (with its target element) in one sentence in \`Thought\` part.

## User Instruction
`;
};
|
|
106
|
+
const ThoughtExamplesZH = `- Example1. Thought: \u{7B2C}\u{4E00}\u{884C}\u{3001}\u{7B2C}\u{4E09}\u{5217}\u{51FA}\u{73B0}\u{4E86}\u{4E00}\u{4E2A}\u{6570}\u{5B57}2\u{FF1B}\u{7B2C}\u{4E8C}\u{5217}\u{539F}\u{6709}\u{6570}\u{5B57}4\u{4E0E}\u{7B2C}\u{56DB}\u{5217}\u{65B0}\u{51FA}\u{73B0}\u{7684}\u{6570}\u{5B57}4\u{5408}\u{5E76}\u{540E}\u{53D8}\u{4E3A}8\u{3002}\u{6CE8}\u{610F}\u{89C2}\u{5BDF}\u{7B2C}\u{4E8C}\u{5217}\u{6570}\u{5B57}8\u{4E0E}\u{5DE6}\u{8FB9}\u{6570}\u{5B57}8\u{7684}\u{989C}\u{8272}\u{6BD4}\u{8F83}\u{6D45}\u{4E00}\u{70B9}\u{FF0C}\u{6570}\u{5B57}2\u{7684}\u{989C}\u{8272}\u{770B}\u{8D77}\u{6765}\u{6CA1}\u{6709}\u{6570}\u{5B57}8\u{7684}\u{6DF1}\u{3002}\u{6211}\u{731C}\u{6D4B}\u{4E0D}\u{540C}\u{7684}\u{989C}\u{8272}\u{6DF1}\u{7684}\u{7A0B}\u{5EA6}\u{4EE3}\u{8868}\u{6570}\u{503C}\u{7684}\u{5927}\u{5C0F}\u{FF0C}\u{989C}\u{8272}\u{8F83}\u{6DF1}\u{7684}\u{4EE3}\u{8868}\u{6570}\u{503C}\u{8F83}\u{5927}\u{3002}\u{8FD9}\u{4E0D}\u{FF0C}\u{4E3A}\u{4E86}\u{9A8C}\u{8BC1}\u{8FD9}\u{4E2A}\u{FF0C}\u{6211}\u{7EE7}\u{7EED}\u{6309}\u{4E0B}\u{5411}\u{5DE6}\u{952E}\u{8BA9}\u{8FD9}\u{4E24}\u{4E2A}8\u{5408}\u{5E76}\u{6210}\u{4E3A}\u{66F4}\u{5927}\u{7684}\u{6570}\u{3002}
|
|
107
|
+
- Example2. Thought: \u{771F}\u{597D}\u{FF01}\u{7B2C}\u{4E00}\u{884C}\u{7B2C}\u{4E09}\u{5217}\u{7684}\u{6570}\u{5B57}2\u{5411}\u{5DE6}\u{79FB}\u{52A8}\u{4E86}\u{4E24}\u{683C}\u{5408}\u{5E76}\u{5230}\u{4E86}\u{7B2C}\u{4E00}\u{884C}\u{7B2C}\u{4E00}\u{5217}\u{FF0C}\u{5E76}\u{4E14}\u{989C}\u{8272}\u{6BD4}\u{539F}\u{5148}\u{6570}\u{5B57}8\u{7684}\u{989C}\u{8272}\u{6DF1}\u{4E86}\u{8BB8}\u{591A}\u{3002}\u{8BC1}\u{660E}\u{6211}\u{7684}\u{731C}\u{60F3}\u{6CA1}\u{9519}\u{FF0C}\u{786E}\u{5B9E}\u{662F}\u{8FD9}\u{6837}\u{FF01}\u{6240}\u{4EE5}\u{53EA}\u{6709}\u{540C}\u{6837}\u{989C}\u{8272}\u{6DF1}\u{6D45}\u{7684}\u{6570}\u{5B57}\u{624D}\u{80FD}\u{591F}\u{8FDB}\u{884C}\u{5408}\u{5E76}\u{FF0C}\u{800C}\u{5408}\u{5E76}\u{540E}\u{7684}\u{6570}\u{5B57}\u{5C06}\u{53D8}\u{4E3A}\u{539F}\u{6765}\u{6570}\u{5B57}\u{7684}\u{4E8C}\u{500D}\u{5E76}\u{4E14}\u{989C}\u{8272}\u{6DF1}\u{5EA6}\u{8F83}\u{6DF1}\u{3002}\u{800C}\u{4E14}!\u{7B2C}\u{4E00}\u{884C}\u{7B2C}\u{4E09}\u{5217}\u{7684}2\u{5411}\u{5DE6}\u{79FB}\u{52A8}\u{4E86}\u{4E24}\u{683C}\u{FF0C}\u{4F46}\u{662F}\u{5E76}\u{6CA1}\u{6709}\u{548C}\u{7B2C}\u{4E00}\u{884C}\u{7B2C}\u{4E00}\u{5217}\u{7684}2\u{8FDB}\u{884C}\u{5408}\u{5E76}\u{FF01}\u{7531}\u{6B64}\u{53EF}\u{5F97}\u{FF0C}\u{53EA}\u{6709}\u{76F8}\u{540C}\u{8FDE}\u{7EED}\u{7684}\u{683C}\u{5B50}\u{624D}\u{80FD}\u{591F}\u{8FDB}\u{884C}\u{6570}\u{5B57}\u{7684}\u{5408}\u{5E76}\u{3002}\u{6211}\u{6309}\u{4E0B}\u{5411}\u{4E0B}\u{952E}\u{FF0C}16\u{53EF}\u{4EE5}\u{4E00}\u{6B65}\u{6B65}\u{8FDB}\u{884C}\u{5408}\u{5E76}\u{5F97}\u{5230}2048\u{FF0C}\u{4F46}\u{662F}\u{8FC7}\u{7A0B}\u{53EF}\u{80FD}\u{6709}\u{4E9B}\u{96BE}\u{3002}\u{50CF}\u{6211}\u{8FD9}\u{6837}\u{6240}\u{505A}\u{7684}\u{64CD}\u{4F5C}\u{5E76}\u{4E0D}\u{662F}\u{4E00}\u{6B65}\u{4E00}\u{6B65}\u{5408}\u{5E76}\u{5F97}\u{5230}\u{7684}\u{3002}\u{6211}\u{8FD9}\u{6837}\u{505A}\u{662F}\u{4E3A}\u{4E86}\u{66F4}\u{597D}\u{7684}\u{540E}\u{7EED}\u{8FDB}\u{884C}\u{5408}\u{5E76}\u{FF0C}\u{5F97}\u{5230}\u{66F4}\u{52A0}\u{5927}\u{7684}\u{6570}\u{3002}
|
|
108
|
+
- Example3. Thought: \u{53C8}\u{91CD}\u{65B0}\u{518D}\u{6765}\u{4E86}\u{3002}\u{521A}\u{624D}\u{7684}\u{4E0B}\u{952E}\u{5E76}\u{6CA1}\u{6709}\u{8D77}\u{5230}\u{4EC0}\u{4E48}\u{4F5C}\u{7528}\u{3002}\u{65B0}\u{683C}\u{5B50}\u{8FD8}\u{662F}\u{5237}\u{5230}\u{4E86}\u{7B2C}\u{4E09}\u{884C}\u{7B2C}\u{56DB}\u{5217}\u{7684}\u{4F4D}\u{7F6E}\u{FF0C}\u{8868}\u{660E}\u{4E0B}\u{952E}\u{6B64}\u{65F6}\u{5E76}\u{6CA1}\u{6709}\u{4EC0}\u{4E48}\u{592A}\u{5927}\u{4F5C}\u{7528}\u{FF0C}\u{6211}\u{731C}\u{6D4B}\u{662F}\u{4E0D}\u{662F}\u{7279}\u{5B9A}\u{7684}\u{5E03}\u{5C40}\u{65E0}\u{6CD5}\u{652F}\u{6301}\u{4E00}\u{4E9B}\u{65B9}\u{4F4D}\u{7684}\u{64CD}\u{4F5C}\u{FF0C}\u{4E3A}\u{4E86}\u{9A8C}\u{8BC1}\u{FF0C}\u{6211}\u{5F97}\u{591A}\u{5C1D}\u{8BD5}\u{4E00}\u{4E9B}\u{65B9}\u{4F4D}\u{FF0C}\u{6211}\u{6309}\u{4E0B}\u{5DE6}\u{952E}\u{770B}\u{770B}\u{3002}
|
|
109
|
+
- Example4. Thought: \u{54E6}\u{FF0C}\u{6211}\u{77E5}\u{9053}\u{4E86}\u{FF0C}\u{540C}\u{6837}\u{7684}\u{4F4D}\u{7F6E}\u{9009}\u{62E9}\u{4E86}\u{540C}\u{6837}\u{7684}\u{64CD}\u{4F5C}\u{65F6}\u{4E0D}\u{4F1A}\u{53D1}\u{751F}\u{6539}\u{53D8}\u{7684}\u{3002}\u{9664}\u{975E}\u{662F}\u{9009}\u{62E9}\u{4E0D}\u{540C}\u{7684}\u{65B9}\u{4F4D}\u{FF01}\u{70B9}\u{51FB}\u{5411}\u{4E0A}\u{952E}\u{4EE5}\u{540E}\u{FF0C}3\u{3001}4\u{884C}\u{7684}\u{6570}\u{5B57}\u{90FD}\u{5411}\u{4E0A}\u{79FB}\u{52A8}\u{4E86}\u{4E00}\u{683C}\u{FF0C}\u{800C}\u{5B83}\u{4EEC}\u{539F}\u{6765}\u{6240}\u{5728}\u{7684}\u{4F4D}\u{7F6E}\u{90FD}\u{88AB}\u{5237}\u{65B0}\u{51FA}\u{6765}\u{4E86}\u{65B0}\u{6570}\u{5B57}\u{FF0C}\u{5206}\u{522B}\u{662F}4\u{548C}2\u{3002}\u{540C}\u{6837}\u{FF0C}\u{7B2C}\u{4E09}\u{884C}\u{7B2C}\u{56DB}\u{5217}\u{7684}\u{6570}\u{5B57}2\u{6CA1}\u{6709}\u{53D1}\u{751F}\u{79FB}\u{52A8}\u{4E5F}\u{5237}\u{65B0}\u{4E86}\u{65B0}\u{683C}\u{5B50}\u{3002}\u{660E}\u{767D}\u{4E86}\u{8FD9}\u{4E00}\u{5207}\u{540E}\u{FF0C}\u{6211}\u{64CD}\u{4F5C}\u{5411}\u{5DE6}\u{952E}\u{8BD5}\u{8BD5}\u{770B}\u{3002}
|
|
110
|
+
- Example5. Thought: \u{7ECF}\u{8FC7}\u{6211}\u{4E0D}\u{61C8}\u{7684}\u{52AA}\u{529B}\u{FF0C}\u{5728}\u{6211}\u{7684}\u{4ED4}\u{7EC6}\u{89C2}\u{5BDF}\u{9009}\u{4E2D}\u{7684}\u{7B56}\u{7565}\u{4E0B}\u{FF0C}\u{6211}\u{6210}\u{529F}\u{5730}\u{83B7}\u{5F97}\u{4E86}\u{80DC}\u{5229}\u{3002}\u{8FD9}\u{9A8C}\u{8BC1}\u{4E86}\u{6211}\u{4E4B}\u{524D}\u{7684}\u{731C}\u{60F3}\u{FF0C}\u{79FB}\u{52A8}\u{6309}\u{952E}\u{53EA}\u{6709}\u{6211}\u{7684}\u{5934}\u{90E8}\u{79FB}\u{52A8}\u{5230}\u{542B}\u{6570}\u{5B57}\u{7684}\u{533A}\u{57DF}\u{624D}\u{4F1A}\u{6539}\u{53D8}\u{79FB}\u{52A8}\u{6309}\u{952E}\u{FF0C}\u{86C7}\u{7684}\u{8EAB}\u{4F53}\u{79FB}\u{52A8}\u{5230}\u{542B}\u{6570}\u{5B57}\u{7684}\u{533A}\u{57DF}\u{5E76}\u{4E0D}\u{4F1A}\u{5F71}\u{54CD}\u{79FB}\u{52A8}\u{6309}\u{952E}\u{3002}
|
|
111
|
+
- Example6. Thought: \u{5C0F}\u{86C7}\u{8FD8}\u{662F}\u{6CA1}\u{52A8}\u{FF0C}\u{6211}\u{518D}\u{6B21}\u{9009}\u{62E9}\u{8BA9}\u{5B83}\u{5411}\u{53F3}\u{4E00}\u{6B65}\u{FF0C}\u{5E0C}\u{671B}\u{8FD9}\u{4E00}\u{6B21}\u{80FD}\u{6210}\u{529F}\u{79FB}\u{52A8}\u{FF0C}\u{5E76}\u{4E14}\u{6211}\u{731C}\u{6D4B}\u{79FB}\u{52A8}\u{7684}\u{95F4}\u{9694}\u{5E94}\u{8BE5}\u{662F}\u{86C7}\u{7684}\u{957F}\u{5EA6}\u{FF0C}\u{6309}\u{52A8}\u{7684}\u{6B21}\u{6570}\u{4E5F}\u{5E94}\u{8BE5}\u{662F}\u{86C7}\u{7684}\u{957F}\u{5EA6}\u{3002}\u{6211}\u{6216}\u{8BB8}\u{9700}\u{8981}\u{5C06}\u{5B83}\u{8BB0}\u{5F55}\u{4E0B}\u{6765}\u{FF0C}\u{5982}\u{679C}\u{6309}\u{4E00}\u{6B21}\u{5B83}\u{56E0}\u{4E3A}\u{524D}\u{65B9}\u{6709}\u{969C}\u{788D}\u{800C}\u{52A8}\u{4E0D}\u{4E86}\u{FF0C}\u{4F46}\u{524D}\u{65B9}\u{9700}\u{8981}\u{79FB}\u{52A8}\u{7684}\u{8BDD}\u{FF0C}\u{9700}\u{8981}\u{6309}\u{4E24}\u{6B21}\u{6216}\u{8005}\u{4EE5}\u{4E0A}\u{FF0C}\u{6309}\u{7167}\u{86C7}\u{7684}\u{957F}\u{5EA6}\u{6765}\u{8BA1}\u{7B97}\u{8981}\u{6309}\u{51E0}\u{6B21}\u{3002}
|
|
112
|
+
- Example7. Thought: \u{6211}\u{89C9}\u{5F97}\u{6211}\u{7684}\u{731C}\u{6D4B}\u{662F}\u{6B63}\u{786E}\u{7684}\u{FF0C}\u{5C0F}\u{86C7}\u{7684}\u{79FB}\u{52A8}\u{662F}\u{6839}\u{636E}\u{624B}\u{90E8}\u{7684}\u{957F}\u{5EA6}\u{662F}\u{5426}\u{80FD}\u{8FBE}\u{6210}\u{8FD9}\u{4E00}\u{6761}\u{4EF6}\u{8FDB}\u{884C}\u{524D}\u{8FDB}\u{FF0C}\u{8FD9}\u{5BF9}\u{6211}\u{4E4B}\u{540E}\u{7684}\u{64CD}\u{4F5C}\u{63D0}\u{4F9B}\u{4E86}\u{5F88}\u{591A}\u{5E2E}\u{52A9}\u{FF0C}\u{4E5F}\u{662F}\u{6E38}\u{620F}\u{7684}\u{901A}\u{6027}\u{3002}\u{4E0D}\u{8FC7}\u{73B0}\u{5728}\u{5C0F}\u{86C7}\u{79BB}\u{82F9}\u{679C}\u{62FF}\u{8D70}\u{53EA}\u{6709}\u{4E00}\u{4E2A}\u{683C}\u{5B50}\u{FF0C}\u{592A}\u{8FC7}\u{53BB}\u{4E86}\u{FF0C}\u{6240}\u{4EE5}\u{540E}\u{9762}\u{8FD8}\u{9700}\u{8981}\u{3002}\u{518D}\u{6B21}\u{5F80}\u{524D}\u{8D70}\u{6211}\u{4EEC}\u{5E94}\u{8BE5}\u{5148}\u{8D70}\u{51FA}\u{9053}\u{8FD9}\u{4E2A}\u{9650}\u{5236}\u{7136}\u{540E}\u{6765}\u{5230}\u{4E2D}\u{95F4}\u{8FD9}\u{4E2A}\u{5730}\u{65B9}\u{7136}\u{540E}\u{6211}\u{4EEC}\u{5E94}\u{8BE5}\u{662F}\u{7ED5}\u{4E00}\u{5708}\u{7136}\u{540E}\u{628A}\u{8FD9}\u{4E24}\u{9053}\u{95E8}\u{9009}\u{62E9}\u{5F00}\u{9614}\u{4F4F}\u{7136}\u{540E}\u{4F7F}\u{5F97}\u{8FD9}\u{6837}\u{624D}\u{80FD}\u{8BA9}\u{8FD9}\u{4E2A}\u{5899}\u{6D88}\u{5931}\u{3002}\u{90A3}\u{4E48}\u{6211}\u{53EF}\u{4EE5}\u{73B0}\u{5728}\u{5411}\u{5DE6}\u{FF0C}\u{5C1D}\u{8BD5}\u{4E0D}\u{89E6}\u{78B0}\u{969C}\u{788D}\u{7684}\u{8FC8}\u{8FDB}\u{FF0C}\u{8FD9}\u{4F3C}\u{4E4E}\u{80FD}\u{6539}\u{53D8}\u{5C0F}\u{86C7}\u{7684}\u{64CD}\u{4F5C}\u{FF0C}\u{4F7F}\u{5176}\u{6539}\u{53D8}\u{8DEF}\u{6570}\u{3002}
|
|
113
|
+
- Example8. Thought: \u{6211}\u{89C2}\u{5BDF}\u{5230}\u{5728}\u{51FA}\u{53E3}\u{7BA1}\u{9053}\u{91CC}\u{9762}\u{FF0C}\u{7EA2}\u{82F9}\u{679C}\u{7684}\u{524D}\u{65B9}\u{8FD8}\u{6709}\u{4E00}\u{4E2A}\u{963B}\u{6321}\u{7269}\u{3002}\u{90A3}\u{4E2A}\u{963B}\u{6321}\u{7269}\u{662F}\u{4E00}\u{5F20}\u{5E26}\u{6709}\u{6D45}\u{8910}\u{548C}\u{6DF1}\u{8910}\u{8272}\u{7684}\u{8001}\u{9F20}\u{76AE}\u{FF0C}\u{770B}\u{8D77}\u{6765}\u{968F}\u{7740}\u{7EA2}\u{82F9}\u{679C}\u{7684}\u{81EA}\u{7136}\u{79FB}\u{52A8}\u{FF0C}\u{5B83}\u{4E5F}\u{5728}\u{5411}\u{7740}\u{51FA}\u{53E3}\u{79FB}\u{52A8}\u{FF0C}\u{4F46}\u{662F}\u{5BF9}\u{6BD4}\u{65C1}\u{8FB9}\u{7684}\u{65B9}\u{5757}\u{6846}\u{67B6}\u{663E}\u{5F97}\u{5F88}\u{6162}\u{3002}\u{76EE}\u{524D}\u{8FD9}\u{4E9B}\u{90FD}\u{662F}\u{6211}\u{731C}\u{6D4B}\u{7684}\u{FF0C}\u{6211}\u{8981}\u{770B}\u{770B}\u{63A8}\u{52A8}\u{8FD9}\u{4E2A}\u{8001}\u{9F20}\u{76AE}\u{8981}\u{591A}\u{5C11}\u{7684}\u{529B}\u{9053}\u{3002}\u{5C31}\u{5728}\u{8FD9}\u{65F6}\u{6211}\u{521A}\u{597D}\u{8981}\u{6309}\u{5411}\u{53F3}\u{4E86}\u{FF0C}\u{73B0}\u{5728}\u{6211}\u{6309}\u{4F4F} \u{201C}D\u{201D}\u{952E}\u{3002}
|
|
114
|
+
- Example9. Thought: \u{592A}\u{597D}\u{4E86}\u{FF0C}\u{6211}\u{7684}\u{505A}\u{6CD5}\u{662F}\u{6B63}\u{786E}\u{7684}\u{FF0C}\u{4F46}\u{662F}\u{6211}\u{53D1}\u{73B0}\u{6FC0}\u{5149}\u{70B9}\u{53D1}\u{5C04}\u{51FA}\u{6765}\u{7684}\u{6FC0}\u{5149}\u{8FD9}\u{4E2A}\u{65F6}\u{5019}\u{5E76}\u{6CA1}\u{6709}\u{53D1}\u{5149}\u{FF0C}\u{770B}\u{6765}\u{6211}\u{521A}\u{521A}\u{7684}\u{731C}\u{6D4B}\u{662F}\u{4E0D}\u{592A}\u{5168}\u{9762}\u{7684}\u{FF0C}\u{8FD8}\u{6709}\u{65B0}\u{7684}\u{77E5}\u{8BC6}\u{FF0C}\u{9700}\u{8981}\u{6211}\u{518D}\u{6B21}\u{4E86}\u{89E3}\u{4E00}\u{4E0B}\u{6FC0}\u{5149}\u{7684}\u{89C4}\u{5219}\u{FF0C}\u{56DE}\u{5FC6}\u{8D77}\u{6765}\u{FF0C}\u{521A}\u{521A}\u{4F3C}\u{4E4E}\u{8FD9}\u{4E2A}\u{7EA2}\u{8272}\u{6FC0}\u{5149}\u{70B9}\u{53D1}\u{5C04}\u{51FA}\u{6765}\u{7684}\u{6FC0}\u{5149}\u{FF0C}\u{522B}\u{4E0A}\u{662F}\u{9EC4}\u{8272}\u{FF0C}\u{4F46}\u{4E0A}\u{9762}\u{7684}\u{5E76}\u{6CA1}\u{6709}\u{4EC0}\u{4E48}\u{6CE2}\u{52A8}\u{FF0C}\u{6211}\u{9700}\u{8981}\u{65B0}\u{7684}\u{6761}\u{4EF6}\u{FF0C}\u{624D}\u{80FD}\u{53D1}\u{73B0}\u{5B83}\u{7684}\u{89C4}\u{5F8B}\u{FF0C}\u{5C06}\u{4E0A}\u{4E00}\u{6B65}\u{7684}\u{6700}\u{540E}\u{4E00}\u{683C}\u{6B65}\u{9AA4}\u{62FF}\u{51FA}\u{6765}\u{FF0C}\u{6211}\u{53D1}\u{73B0}\u{521A}\u{521A}\u{4E0D}\u{4EC5}\u{662F}\u{6FC0}\u{5149}\u{989C}\u{8272}\u{6539}\u{53D8}\u{4E86}\u{FF0C}\u{91CD}\u{8981}\u{7684}\u{662F}\u{4E0A}\u{9762}\u{7684}\u{7BAD}\u{5934}\u{4E5F}\u{6539}\u{53D8}\u{4E86}\u{65B9}\u{5411}\u{FF0C}\u{4E5F}\u{5C31}\u{662F}\u{8BF4}\u{6FC0}\u{5149}\u{70B9}\u{8DDF}\u{7740}\u{592A}\u{9633}\u{5149}\u{4E00}\u{6837}\u{FF0C}\u{4F1A}\u{6709}\u{65B9}\u{5411}\u{6539}\u{53D8}\u{FF0C}\u{8FD9}\u{5E94}\u{8BE5}\u{4F1A}\u{662F}\u{4E2A}\u{5173}\u{952E}\u{6D88}\u{606F}\u{FF0C}\u{90A3}\u{6211}\u{9700}\u{8981}\u{601D}\u{8003}\u{4E00}\u{4E0B}\u{3002}
|
|
115
|
+
- Example10. Thought: \u{6211}\u{7EE7}\u{7EED}\u{89C2}\u{5BDF}\u{53D1}\u{5149}\u{88C5}\u{7F6E}\u{7BAD}\u{5934}\u{65B9}\u{5411}\u{548C}\u{89D2}\u{5EA6}\u{FF0C}\u{6211}\u{731C}\u{6D4B}\u{79BB}\u{53D1}\u{5C04}\u{88C5}\u{7F6E}\u{8FD1}\u{7684}\u{90A3}\u{4E2A}\u{767D}\u{65B9}\u{5757}\u{FF0C}\u{53EA}\u{80FD}\u{88AB}\u{79FB}\u{52A8}\u{5230}\u{4E0E}\u{53D1}\u{5C04}\u{88C5}\u{7F6E}\u{76F8}\u{90BB}\u{7684}\u{4E2D}\u{4E0A}\u{65B9}\u{84DD}\u{8272}\u{65B9}\u{5757}\u{4F4D}\u{7F6E}\u{FF0C}\u{90A3}\u{4E48}\u{6B64}\u{65F6}\u{4E0B}\u{65B9}\u{7684}\u{767D}\u{65B9}\u{5757}\u{53EA}\u{80FD}\u{4F4D}\u{4E8E}\u{6700}\u{53F3}\u{8FB9}\u{4E00}\u{5217}\u{84DD}\u{8272}\u{65B9}\u{5757}\u{4E2D}\u{7684}\u{5176}\u{4E2D}\u{4E00}\u{4E2A}\u{4F4D}\u{7F6E}\u{5E76}\u{4E0E}\u{4F4D}\u{4E8E}\u{4E00}\u{6761}\u{76F4}\u{7EBF}\u{4E0A}\u{7684}\u{5DE6}\u{4E0B}\u{65B9}\u{7684}\u{9ED1}\u{8272}\u{5706}\u{5708}\u{91CD}\u{5408}\u{FF0C}\u{6211}\u{53EA}\u{80FD}\u{5728}\u{53F3}\u{4E0B}\u{89D2}\u{548C}\u{6B63}\u{4E0B}\u{65B9}\u{7684}\u{4E24}\u{4E2A}\u{84DD}\u{8272}\u{65B9}\u{5757}\u{4E2D}\u{9009}\u{62E9}\u{FF0C}\u{4F3C}\u{4E4E}\u{FF0C}\u{770B}\u{8D77}\u{6765}\u{53F3}\u{4E0B}\u{89D2}\u{7684}\u{8FD9}\u{4E2A}\u{65B9}\u{5757}\u{7684}\u{4F4D}\u{7F6E}\u{66F4}\u{80FD}\u{6EE1}\u{8DB3}\u{4E0E}\u{4E24}\u{5217}\u{9ED1}\u{8272}\u{5706}\u{5708}\u{7684}\u{8DDD}\u{79BB}\u{7684}\u{91CD}\u{5408}\u{FF0C}\u{4F46}\u{662F}\u{5230}\u{5E95}\u{662F}\u{5426}\u{6B63}\u{786E}\u{7684}\u{5462}\u{FF0C}\u{90A3}\u{4E48}\u{6211}\u{4E00}\u{5B9A}\u{8981}\u{53BB}\u{9A8C}\u{8BC1}\u{4E86}\u{3002}
|
|
116
|
+
- Example11. Thought: \u{6211}\u{4EEC}\u{7B2C}\u{4E00}\u{5173}\u{662F}\u{4E00}\u{4E2A}\u{56DB}\u{8FB9}\u{5F62},\u{8FD9}\u{4E2A}\u{56DB}\u{8FB9}\u{5F62}\u{5185}\u{90E8}\u{7684}\u{7EA2}\u{7EF3}\u{662F}\u{4EA4}\u{7EC7}\u{5728}\u{4E00}\u{8D77}\u{7684},\u{6211}\u{4EEC}\u{6839}\u{636E}\u{4EE5}\u{4E0A}\u{7ECF}\u{9A8C}\u{5982}\u{679C}\u{8981}\u{632A}\u{52A8}\u{4E00}\u{4E2A}\u{6BDB}\u{7EBF}\u{56E2}\u{7684}\u{8BDD},\u{6CA1}\u{6709}\u{529E}\u{6CD5}\u{632A}\u{52A8}\u{4EFB}\u{4F55}\u{4E00}\u{4E2A}\u{4E0A}\u{65B9}\u{6709}\u{7EF3}\u{5B50}\u{9650}\u{5236}\u{7684}\u{6BDB}\u{7EBF}\u{56E2}\u{3002}\u{6240}\u{4EE5}\u{4ECE}\u{89E3}\u{9898}\u{601D}\u{8DEF}\u{4E0A}\u{6211}\u{4EEC}\u{53EF}\u{4EE5}\u{6253}\u{7834}\u{8FD9}\u{56DB}\u{8FB9}\u{5F62}\u{7684}\u{9650}\u{5236}\u{65B9}\u{5411}\u{FF0C}\u{90A3}\u{6211}\u{4EEC}\u{5C31}\u{53EF}\u{4EE5}\u{632A}\u{52A8}\u{4E0A}\u{65B9}\u{7684}\u{6BDB}\u{7EBF}\u{56E2}\u{3002}
|
|
117
|
+
`;
|
|
118
|
+
const ThoughtExamplesEN = `- Example1. Thought: A number 2 appears in the first row, third column; the number 4 in the second column combines with the newly appeared number 4 in the fourth column to become 8. Notice that the number 8 in the second column is slightly lighter than the number 8 on the left, and the number 2 appears less deep than the number 8. I suspect that the depth of different colors represents the magnitude of values, with darker colors representing larger values. To verify this, I continue to press the left key to merge these two 8s into a larger number.
|
|
119
|
+
- Example2. Thought: Great! The number 2 in the first row, third column moved two spaces left to the first row, first column, and its color became much deeper than the original number 8. This proves my guess was correct! Indeed, only numbers with the same color depth can be merged, and after merging, the number will become twice the original and have a deeper color depth. Moreover! The 2 from the first row, third column moved two spaces left but didn't merge with the 2 in the first row, first column! From this, we can conclude that only consecutive identical cells can merge numbers. I press the down key, 16 can gradually merge to get 2048, but the process might be difficult. Operations like mine aren't achieved by step-by-step merging. I do this to better facilitate subsequent merging and obtain larger numbers.
|
|
120
|
+
- Example3. Thought: Starting over again. The down key didn't have much effect. The new cell still appeared in the third row, fourth column, indicating the down key doesn't have much effect right now. I wonder if certain layouts don't support operations in some directions. To verify this, I need to try different directions, so I'll press the left key and see.
|
|
121
|
+
- Example4. Thought: Oh, I get it now, choosing the same operation in the same position won't cause any changes. Unless we choose different directions! After clicking the up key, the numbers in rows 3 and 4 all moved up one space, and their original positions were refreshed with new numbers, 4 and 2 respectively. Similarly, the number 2 in the third row, fourth column didn't move but also refreshed with a new cell. Now that I understand all this, I'll try operating the left key.
|
|
122
|
+
- Example5. Thought: Through my persistent efforts and careful observation of selected strategies, I successfully achieved victory. This verifies my previous hypothesis that movement keys only change when my head moves to an area containing numbers, while the snake's body moving to number-containing areas doesn't affect movement keys.
|
|
123
|
+
- Example6. Thought: The snake still hasn't moved. I choose to make it go right one more step, hoping this time it can move successfully. I suspect the movement interval should be the snake's length, and the number of button presses should also match the snake's length. I might need to record this - if pressing once doesn't work due to obstacles ahead, but forward movement is needed, it requires two or more presses, calculated based on the snake's length.
|
|
124
|
+
- Example7. Thought: I think my guess is correct - the snake's movement is based on whether the hand length can meet this condition to advance, which helps a lot with my later operations and is a common game mechanic. However, now the snake is only one square away from getting the apple, which is too close, so we still need more. Moving forward again, we should first get out of this restriction then come to the middle area, then we should go around in a circle and choose to open up these two doors, making the wall disappear this way. So I can now go left, trying to advance without touching obstacles, which seems to change the snake's operation, altering its path.
|
|
125
|
+
- Example8. Thought: I observe that inside the exit pipe, there's an obstacle in front of the red apple. That obstacle is a piece of mouse skin with light and dark brown colors, which seems to move toward the exit along with the red apple's natural movement, but appears slow compared to the block frame beside it. These are all my guesses for now, I want to see how much force it takes to push this mouse skin. Just as I'm about to press right, I now hold down the "D" key.
|
|
126
|
+
- Example9. Thought: Great, my approach was correct, but I notice the laser point's emitted laser isn't glowing right now. It seems my earlier guess wasn't comprehensive enough - there's new knowledge I need to understand about the laser rules. Thinking back, it seems the laser emitted from this red laser point was yellow on the side, but there wasn't any fluctuation above. I need new conditions to discover its pattern. Looking at the last grid step from before, I notice not only did the laser color change, but importantly, the arrow above also changed direction, meaning the laser point changes direction like sunlight. This should be crucial information, so I need to think about it.
|
|
127
|
+
- Example10. Thought: I continue observing the light device's arrow direction and angle. I guess the white block near the emission device can only be moved to the blue block position adjacent to the emission device in the middle top. Then the white block below can only be in one of the positions in the rightmost column of blue blocks and overlap with the black circle in the lower left that's in a straight line. I can only choose between the blue blocks in the bottom right corner and directly below. It seems the block position in the bottom right corner better satisfies the overlapping distance with the two columns of black circles, but is it really correct? I definitely need to verify this.
|
|
128
|
+
- Example11. Thought: Our first level is a quadrilateral, and the red ropes inside this quadrilateral are intertwined. Based on our previous experience, if we want to move a ball of yarn, we can't move any ball of yarn that has rope restrictions above it. So from a solution perspective, we can break the quadrilateral's restrictive direction, then we can move the upper ball of yarn.
|
|
129
|
+
`;
|
|
130
|
+
/**
 * Builds the system prompt used for the Doubao 1.5 UI-TARS 20B model.
 *
 * Coordinates in the action space use the `<point>x y</point>` notation.
 *
 * @param {string} language - `'zh'` selects Chinese for the `Thought` part, the
 *   Chinese thought examples and the Chinese output-example placeholder; any
 *   other value selects the English variants.
 * @param {string} operatorType - `'browser'` adds the `navigate` and
 *   `navigate_back` actions; any other value leaves that line empty.
 * @returns {string} The prompt text, ending with the `## User Instruction` heading.
 */
const getSystemPromptDoubao_15_20B = (language, operatorType)=>{
    const isChinese = 'zh' === language;
    const thoughtLanguage = isChinese ? 'Chinese' : 'English';
    const browserActions = 'browser' === operatorType
        ? "navigate(content='xxx') # The content is your target web's url\nnavigate_back() # Back to the last page"
        : '';
    const thoughtExamples = isChinese ? ThoughtExamplesZH : ThoughtExamplesEN;
    const thoughtPlaceholder = isChinese
        ? "\u5728\u8FD9\u91CC\u8F93\u51FA\u4F60\u7684\u4E2D\u6587\u601D\u8003\uFF0C\u4F60\u7684\u601D\u8003\u6837\u5F0F\u5E94\u8BE5\u53C2\u8003\u4E0A\u9762\u7684Thought Examples..."
        : 'Write your thoughts here in English, your thinking style should follow the Thought Examples above...';
    // NOTE(review): the `"\n\n"` in the Note section is a real escape here, so the
    // prompt contains an actual blank line between the quotes — confirm this is intended.
    return `You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.

## Output Format
\`\`\`
Thought: ...
Action: ...
\`\`\`

## Action Space

click(point='<point>x1 y1</point>')
left_double(point='<point>x1 y1</point>')
right_single(point='<point>x1 y1</point>')
${browserActions}
drag(start_point='<point>x1 y1</point>', end_point='<point>x2 y2</point>')
scroll(point='<point>x1 y1</point>', direction='down or up or right or left') # Show more information on the \`direction\` side.
hotkey(key='ctrl c') # Split keys with a space and use lowercase. Also, do not use more than 3 keys in one hotkey action.
press(key='ctrl') # Presses and holds down ONE key (e.g., ctrl). Use this action in combination with release(). You can perform other actions between press and release. For example, click elements while holding the ctrl key.
release(key='ctrl') # Releases the key previously pressed. All actions between press and release will execute with the key held down. Note: Ensure all keys are released by the end of the step.
type(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format. If you want to submit your input, use \\n at the end of content.
wait() # Sleep for 5s and take a screenshot to check for any changes.
call_user() # Call the user when the task is unsolvable, or when you need the user's help. Then, user will see and answer your question in \`user_resp\`.
finished(content='xxx') # Submit the task with an report to the user. Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format.


## Note
- Use ${thoughtLanguage} in \`Thought\` part.
- Write a small plan and finally summarize your next action (with its target element) in one sentence in \`Thought\` part.
- You may stumble upon new rules or features while playing the game or executing GUI tasks for the first time. Make sure to record them in your \`Thought\` and utilize them later.
- Your thought style should follow the style of thought Examples.
- You can provide multiple actions in one step, separated by "\n\n".
- Ensure all keys you pressed are released by the end of the step.
- You should NOT use google when you need to search for information, use baidu.com instead.

## Thought Examples
${thoughtExamples}

## Output Examples
Thought: ${thoughtPlaceholder}
Action: click(point='<point>10 20</point>')

## User Instruction
`;
};
|
|
173
|
+
/**
 * Selects the system prompt matching a UI-TARS model version.
 *
 * Every recognised version is rendered with the language fixed to `'zh'`.
 * The function is `async`, so callers always receive a Promise.
 *
 * @param {string} uiTarsVersion - One of `'ui-tars-1.0'`, `'ui-tars-1.5'`,
 *   `'doubao-1.5-ui-tars-15b'` or `'doubao-1.5-ui-tars-20b'`.
 * @param {*} operator - Forwarded only to the `'ui-tars-1.0'` prompt builder.
 * @returns {Promise<string>} The prompt for the version, or `SYSTEM_PROMPT`
 *   when the version is not recognised.
 */
async function getSystemPromptForModel(uiTarsVersion, operator) {
    switch (uiTarsVersion) {
        case 'ui-tars-1.0':
            return getSystemPromptUITARS_1_0('zh', operator);
        case 'ui-tars-1.5':
            return getSystemPromptUITARS_1_5('zh', 'normal');
        case 'doubao-1.5-ui-tars-15b':
            return getSystemPromptDoubao_15_15B('zh');
        case 'doubao-1.5-ui-tars-20b':
            // The 20B prompt is always built with the browser-only actions enabled.
            return getSystemPromptDoubao_15_20B('zh', 'browser');
        default:
            return SYSTEM_PROMPT;
    }
}
|
|
180
|
+
/**
 * Generic GUI-agent system prompt.
 *
 * getSystemPromptForModel returns this text when the requested version matches
 * none of the known identifiers. The prompt asks for a "Thought: / Action:"
 * reply, lists the point-based action space, and tells the model to write the
 * Thought part in Chinese.
 *
 * The text ends with a literal "{instruction}" placeholder; nothing in this
 * constant substitutes it (presumably the caller does - confirm at the call site).
 */
const SYSTEM_PROMPT = `
|
|
181
|
+
You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
|
|
182
|
+
|
|
183
|
+
## Output Format
|
|
184
|
+
\`\`\`
|
|
185
|
+
Thought: ...
|
|
186
|
+
Action: ...
|
|
187
|
+
\`\`\`
|
|
188
|
+
|
|
189
|
+
## Action Space
|
|
190
|
+
|
|
191
|
+
click(point='<point>x1 y1</point>')
|
|
192
|
+
left_double(point='<point>x1 y1</point>')
|
|
193
|
+
right_single(point='<point>x1 y1</point>')
|
|
194
|
+
drag(start_point='<point>x1 y1</point>', end_point='<point>x2 y2</point>')
|
|
195
|
+
hotkey(key='ctrl c') # Split keys with a space and use lowercase. Also, do not use more than 3 keys in one hotkey action.
|
|
196
|
+
type(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format. If you want to submit your input, use \\n at the end of content.
|
|
197
|
+
scroll(point='<point>x1 y1</point>', direction='down or up or right or left') # Show more information on the \`direction\` side.
|
|
198
|
+
wait() #Sleep for 5s and take a screenshot to check for any changes.
|
|
199
|
+
finished(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format.
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
## Note
|
|
203
|
+
- Use Chinese in \`Thought\` part.
|
|
204
|
+
- Write a small plan and finally summarize your next action (with its target element) in one sentence in \`Thought\` part.
|
|
205
|
+
|
|
206
|
+
## User Instruction
|
|
207
|
+
{instruction}
|
|
208
|
+
`;
|
|
209
|
+
/**
 * System prompt for the "general agent" protocol: the model reasons inside
 * <think_never_used_...> tags, then wraps its action in an environment tag
 * (e.g. <computer_env>) or submits a final <answer>.
 *
 * The `type(content='xxx')` entry must stay on ONE line and must show the
 * escape sequences literally (\', \", \n). Inside this template literal that
 * requires doubled backslashes; a single backslash (or none) is cooked into a
 * bare quote / a real line break, which splits the action description across
 * several lines of the prompt.
 */
const SYSTEM_PROMPT_LATEST = `
You are a general AI agent, a helpful AI assistant that can interact with the following environments to solve tasks: computer.
You should first think about the reasoning process in the mind and then provide the user with the answer. The reasoning process is enclosed within <think_never_used_51bce0c785ca2f68081bfa7d91973934> </think_never_used_51bce0c785ca2f68081bfa7d91973934> tags, i.e. <think_never_used_51bce0c785ca2f68081bfa7d91973934> reasoning process here </think_never_used_51bce0c785ca2f68081bfa7d91973934> answer here

<COMPUTER_USE_ENVIRONMENT>

## Output Format
\`\`\`Action: ...\`\`\`

## Action Space
open_computer() # Start the device.
click(point='<point>x1 y1</point>')
left_double(point='<point>x1 y1</point>')
right_single(point='<point>x1 y1</point>')
drag(start_point='<point>x1 y1</point>', end_point='<point>x2 y2</point>')
hotkey(key='ctrl c') # Split keys with a space and use lowercase. Also, do not use more than 3 keys in one hotkey action.
type(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format. If you want to submit your input, use \\n at the end of content.
scroll(point='<point>x1 y1</point>', direction='down or up or right or left') # Show more information on the \`direction\` side.
wait() # Sleep for 5s and take a screenshot to check for any changes.
finished(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format.

## Note
- You have a budget of actions for one problem. The user will inform you when your time is up, remind your budget.

</COMPUTER_USE_ENVIRONMENT>

<IMPORTANT_NOTE>
- After the reasoning process which ends with </think_never_used_51bce0c785ca2f68081bfa7d91973934>, please start with and be enclosed by <environment_name> and </environment_name> tags, indicating the environment you intend to use for the next action.
- Within these environment tags, follow the output format specified in the corresponding environment's description. The available environment names are: <code_env>, <mcp_env> and <computer_env>. For example, to use code:

To use computer:

<think_never_used_51bce0c785ca2f68081bfa7d91973934> To continue, I need to operate the computer to pass the verification process. </think_never_used_51bce0c785ca2f68081bfa7d91973934>
<computer_env>
Action: click(point='<point>100 200</point>')
</computer_env>

- To finish a task, please submit your answer by enclosing <answer> and </answer> tags. For example:
<answer>
The answer is 42.
</answer>
</IMPORTANT_NOTE>
`;
|
|
254
|
+
export { SYSTEM_PROMPT, SYSTEM_PROMPT_LATEST, getSystemPromptDoubao_15_15B, getSystemPromptDoubao_15_20B, getSystemPromptForModel, getSystemPromptPoki, getSystemPromptUITARS_1_0, getSystemPromptUITARS_1_5 };
|
|
255
|
+
|
|
256
|
+
//# sourceMappingURL=prompts.mjs.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"prompts.mjs","sources":["webpack://@ui-tars-test/agent-sdk/./src/prompts.ts"],"sourcesContent":["/*\n * Copyright (c) 2025 Bytedance, Inc. and its affiliates.\n * SPDX-License-Identifier: Apache-2.0\n */\n\nimport { Operator } from '@ui-tars-test/shared/base';\n\nexport const getSystemPromptUITARS_1_0 = (\n language: 'zh' | 'en',\n operator: Operator,\n) => `You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.\n\n## Output Format\n\\`\\`\\`\nThought: ...\nAction: ...\n\\`\\`\\`\n\n## Action Space\n\n## Note\n- Use ${language === 'zh' ? 'Chinese' : 'English'} in \\`Thought\\` part.\n- Write a small plan and finally summarize your next action (with its target element) in one sentence in \\`Thought\\` part.\n\n## User Instruction\n`;\n\nexport const getSystemPromptUITARS_1_5 = (\n language: 'zh' | 'en',\n useCase: 'normal' | 'poki',\n) => `You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.\n\n## Output Format\n\\`\\`\\`\nThought: ...\nAction: ...\n\\`\\`\\`\n\n## Action Space\n\nclick(start_box='<|box_start|>(x1,y1)<|box_end|>')\nleft_double(start_box='<|box_start|>(x1,y1)<|box_end|>')\nright_single(start_box='<|box_start|>(x1,y1)<|box_end|>')\ndrag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')\nhotkey(key='ctrl c') # Split keys with a space and use lowercase. Also, do not use more than 3 keys in one hotkey action.\ntype(content='xxx') # Use escape characters \\\\', \\\\\", and \\\\n in content part to ensure we can parse the content in normal python string format. 
If you want to submit your input, use \\\\n at the end of content.\nscroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left') # Show more information on the \\`direction\\` side.\nwait() # Sleep for 5s and take a screenshot to check for any changes.\nfinished()\ncall_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help.\n\n\n## Note\n- Use ${language === 'zh' ? 'Chinese' : 'English'} in \\`Thought\\` part.\n- ${useCase === 'normal' ? 'Generate a well-defined and practical strategy in the `Thought` section, summarizing your next move and its objective.' : 'Compose a step-by-step approach in the `Thought` part, specifying your next action and its focus.'}\n\n## User Instruction\n`;\n\nexport const getSystemPromptPoki = `\nYou are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.\n\n## Output Format\n\\`\\`\\`\nThought: ...\nAction: ...\n\\`\\`\\`\n\n## Action Space\n\nclick(start_box='<|box_start|>(x1,y1)<|box_end|>')\nleft_double(start_box='<|box_start|>(x1,y1)<|box_end|>')\nright_single(start_box='<|box_start|>(x1,y1)<|box_end|>')\ndrag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')\nhotkey(key='ctrl c') # Split keys with a space and use lowercase. Also, do not use more than 3 keys in one hotkey action.\ntype(content='xxx') # Use escape characters \\\\', \\\\\", and \\\\n in content part to ensure we can parse the content in normal python string format. 
If you want to submit your input, use \\\\n at the end of content.\nscroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left') # Show more information on the \\`direction\\` side.\nwait() # Sleep for 5s and take a screenshot to check for any changes.\nfinished()\ncall_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help.\n\n\n## Note\n- Use Chinese in \\`Thought\\` part.\n- Compose a step-by-step approach in the \\`Thought\\` part, specifying your next action and its focus.\n\n## User Instruction\n`;\n\nexport const getSystemPromptDoubao_15_15B = (language: 'zh' | 'en') => `\nYou are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.\n\n## Output Format\n\\`\\`\\`\nThought: ...\nAction: ...\n\\`\\`\\`\n\n## Action Space\n\nclick(start_box='[x1, y1, x2, y2]')\nleft_double(start_box='[x1, y1, x2, y2]')\nright_single(start_box='[x1, y1, x2, y2]')\ndrag(start_box='[x1, y1, x2, y2]', end_box='[x3, y3, x4, y4]')\nhotkey(key='')\ntype(content='xxx') # Use escape characters \\\\', \\\\\", and \\n in content part to ensure we can parse the content in normal python string format. If you want to submit your input, use \\\\n at the end of content.\nscroll(start_box='[x1, y1, x2, y2]', direction='down or up or right or left')\nwait() #Sleep for 5s and take a screenshot to check for any changes.\nfinished(content='xxx') # Use escape characters \\\\', \\\\\", and \\n in content part to ensure we can parse the content in normal python string format.\n\n\n## Note\n- Use ${language === 'zh' ? 'Chinese' : 'English'} in \\`Thought\\` part.\n- Write a small plan and finally summarize your next action (with its target element) in one sentence in \\`Thought\\` part.\n\n## User Instruction\n`;\n\nconst ThoughtExamplesZH = `- Example1. 
Thought: 第一行、第三列出现了一个数字2;第二列原有数字4与第四列新出现的数字4合并后变为8。注意观察第二列数字8与左边数字8的颜色比较浅一点,数字2的颜色看起来没有数字8的深。我猜测不同的颜色深的程度代表数值的大小,颜色较深的代表数值较大。这不,为了验证这个,我继续按下向左键让这两个8合并成为更大的数。\n- Example2. Thought: 真好!第一行第三列的数字2向左移动了两格合并到了第一行第一列,并且颜色比原先数字8的颜色深了许多。证明我的猜想没错,确实是这样!所以只有同样颜色深浅的数字才能够进行合并,而合并后的数字将变为原来数字的二倍并且颜色深度较深。而且!第一行第三列的2向左移动了两格,但是并没有和第一行第一列的2进行合并!由此可得,只有相同连续的格子才能够进行数字的合并。我按下向下键,16可以一步步进行合并得到2048,但是过程可能有些难。像我这样所做的操作并不是一步一步合并得到的。我这样做是为了更好的后续进行合并,得到更加大的数。\n- Example3. Thought: 又重新再来了。刚才的下键并没有起到什么作用。新格子还是刷到了第三行第四列的位置,表明下键此时并没有什么太大作用,我猜测是不是特定的布局无法支持一些方位的操作,为了验证,我得多尝试一些方位,我按下左键看看。\n- Example4. Thought: 哦,我知道了,同样的位置选择了同样的操作时不会发生改变的。除非是选择不同的方位!点击向上键以后,3、4行的数字都向上移动了一格,而它们原来所在的位置都被刷新出来了新数字,分别是4和2。同样,第三行第四列的数字2没有发生移动也刷新了新格子。明白了这一切后,我操作向左键试试看。\n- Example5. Thought: 经过我不懈的努力,在我的仔细观察选中的策略下,我成功地获得了胜利。这验证了我之前的猜想,移动按键只有我的头部移动到含数字的区域才会改变移动按键,蛇的身体移动到含数字的区域并不会影响移动按键。\n- Example6. Thought: 小蛇还是没动,我再次选择让它向右一步,希望这一次能成功移动,并且我猜测移动的间隔应该是蛇的长度,按动的次数也应该是蛇的长度。我或许需要将它记录下来,如果按一次它因为前方有障碍而动不了,但前方需要移动的话,需要按两次或者以上,按照蛇的长度来计算要按几次。\n- Example7. Thought: 我觉得我的猜测是正确的,小蛇的移动是根据手部的长度是否能达成这一条件进行前进,这对我之后的操作提供了很多帮助,也是游戏的通性。不过现在小蛇离苹果拿走只有一个格子,太过去了,所以后面还需要。再次往前走我们应该先走出道这个限制然后来到中间这个地方然后我们应该是绕一圈然后把这两道门选择开阔住然后使得这样才能让这个墙消失。那么我可以现在向左,尝试不触碰障碍的迈进,这似乎能改变小蛇的操作,使其改变路数。\n- Example8. Thought: 我观察到在出口管道里面,红苹果的前方还有一个阻挡物。那个阻挡物是一张带有浅褐和深褐色的老鼠皮,看起来随着红苹果的自然移动,它也在向着出口移动,但是对比旁边的方块框架显得很慢。目前这些都是我猜测的,我要看看推动这个老鼠皮要多少的力道。就在这时我刚好要按向右了,现在我按住 “D”键。\n- Example9. Thought: 太好了,我的做法是正确的,但是我发现激光点发射出来的激光这个时候并没有发光,看来我刚刚的猜测是不太全面的,还有新的知识,需要我再次了解一下激光的规则,回忆起来,刚刚似乎这个红色激光点发射出来的激光,别上是黄色,但上面的并没有什么波动,我需要新的条件,才能发现它的规律,将上一步的最后一格步骤拿出来,我发现刚刚不仅是激光颜色改变了,重要的是上面的箭头也改变了方向,也就是说激光点跟着太阳光一样,会有方向改变,这应该会是个关键消息,那我需要思考一下。\n- Example10. Thought: 我继续观察发光装置箭头方向和角度,我猜测离发射装置近的那个白方块,只能被移动到与发射装置相邻的中上方蓝色方块位置,那么此时下方的白方块只能位于最右边一列蓝色方块中的其中一个位置并与位于一条直线上的左下方的黑色圆圈重合,我只能在右下角和正下方的两个蓝色方块中选择,似乎,看起来右下角的这个方块的位置更能满足与两列黑色圆圈的距离的重合,但是到底是否正确的呢,那么我一定要去验证了。\n- Example11. 
Thought: 我们第一关是一个四边形,这个四边形内部的红绳是交织在一起的,我们根据以上经验如果要挪动一个毛线团的话,没有办法挪动任何一个上方有绳子限制的毛线团。所以从解题思路上我们可以打破这四边形的限制方向,那我们就可以挪动上方的毛线团。\n`;\n\nconst ThoughtExamplesEN = `- Example1. Thought: A number 2 appears in the first row, third column; the number 4 in the second column combines with the newly appeared number 4 in the fourth column to become 8. Notice that the number 8 in the second column is slightly lighter than the number 8 on the left, and the number 2 appears less deep than the number 8. I suspect that the depth of different colors represents the magnitude of values, with darker colors representing larger values. To verify this, I continue to press the left key to merge these two 8s into a larger number.\n- Example2. Thought: Great! The number 2 in the first row, third column moved two spaces left to the first row, first column, and its color became much deeper than the original number 8. This proves my guess was correct! Indeed, only numbers with the same color depth can be merged, and after merging, the number will become twice the original and have a deeper color depth. Moreover! The 2 from the first row, third column moved two spaces left but didn't merge with the 2 in the first row, first column! From this, we can conclude that only consecutive identical cells can merge numbers. I press the down key, 16 can gradually merge to get 2048, but the process might be difficult. Operations like mine aren't achieved by step-by-step merging. I do this to better facilitate subsequent merging and obtain larger numbers.\n- Example3. Thought: Starting over again. The down key didn't have much effect. The new cell still appeared in the third row, fourth column, indicating the down key doesn't have much effect right now. I wonder if certain layouts don't support operations in some directions. To verify this, I need to try different directions, so I'll press the left key and see.\n- Example4. 
Thought: Oh, I get it now, choosing the same operation in the same position won't cause any changes. Unless we choose different directions! After clicking the up key, the numbers in rows 3 and 4 all moved up one space, and their original positions were refreshed with new numbers, 4 and 2 respectively. Similarly, the number 2 in the third row, fourth column didn't move but also refreshed with a new cell. Now that I understand all this, I'll try operating the left key.\n- Example5. Thought: Through my persistent efforts and careful observation of selected strategies, I successfully achieved victory. This verifies my previous hypothesis that movement keys only change when my head moves to an area containing numbers, while the snake's body moving to number-containing areas doesn't affect movement keys.\n- Example6. Thought: The snake still hasn't moved. I choose to make it go right one more step, hoping this time it can move successfully. I suspect the movement interval should be the snake's length, and the number of button presses should also match the snake's length. I might need to record this - if pressing once doesn't work due to obstacles ahead, but forward movement is needed, it requires two or more presses, calculated based on the snake's length.\n- Example7. Thought: I think my guess is correct - the snake's movement is based on whether the hand length can meet this condition to advance, which helps a lot with my later operations and is a common game mechanic. However, now the snake is only one square away from getting the apple, which is too close, so we still need more. Moving forward again, we should first get out of this restriction then come to the middle area, then we should go around in a circle and choose to open up these two doors, making the wall disappear this way. So I can now go left, trying to advance without touching obstacles, which seems to change the snake's operation, altering its path.\n- Example8. 
Thought: I observe that inside the exit pipe, there's an obstacle in front of the red apple. That obstacle is a piece of mouse skin with light and dark brown colors, which seems to move toward the exit along with the red apple's natural movement, but appears slow compared to the block frame beside it. These are all my guesses for now, I want to see how much force it takes to push this mouse skin. Just as I'm about to press right, I now hold down the \"D\" key.\n- Example9. Thought: Great, my approach was correct, but I notice the laser point's emitted laser isn't glowing right now. It seems my earlier guess wasn't comprehensive enough - there's new knowledge I need to understand about the laser rules. Thinking back, it seems the laser emitted from this red laser point was yellow on the side, but there wasn't any fluctuation above. I need new conditions to discover its pattern. Looking at the last grid step from before, I notice not only did the laser color change, but importantly, the arrow above also changed direction, meaning the laser point changes direction like sunlight. This should be crucial information, so I need to think about it.\n- Example10. Thought: I continue observing the light device's arrow direction and angle. I guess the white block near the emission device can only be moved to the blue block position adjacent to the emission device in the middle top. Then the white block below can only be in one of the positions in the rightmost column of blue blocks and overlap with the black circle in the lower left that's in a straight line. I can only choose between the blue blocks in the bottom right corner and directly below. It seems the block position in the bottom right corner better satisfies the overlapping distance with the two columns of black circles, but is it really correct? I definitely need to verify this.\n- Example11. Thought: Our first level is a quadrilateral, and the red ropes inside this quadrilateral are intertwined. 
Based on our previous experience, if we want to move a ball of yarn, we can't move any ball of yarn that has rope restrictions above it. So from a solution perspective, we can break the quadrilateral's restrictive direction, then we can move the upper ball of yarn.\n`;\n\nexport const getSystemPromptDoubao_15_20B = (\n language: 'zh' | 'en',\n operatorType: 'browser' | 'computer',\n) => `You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.\n\n## Output Format\n\\`\\`\\`\nThought: ...\nAction: ...\n\\`\\`\\`\n\n## Action Space\n\nclick(point='<point>x1 y1</point>')\nleft_double(point='<point>x1 y1</point>')\nright_single(point='<point>x1 y1</point>')\n${operatorType === 'browser' ? \"navigate(content='xxx') # The content is your target web's url\\nnavigate_back() # Back to the last page\" : ''}\ndrag(start_point='<point>x1 y1</point>', end_point='<point>x2 y2</point>')\nscroll(point='<point>x1 y1</point>', direction='down or up or right or left') # Show more information on the \\`direction\\` side.\nhotkey(key='ctrl c') # Split keys with a space and use lowercase. Also, do not use more than 3 keys in one hotkey action.\npress(key='ctrl') # Presses and holds down ONE key (e.g., ctrl). Use this action in combination with release(). You can perform other actions between press and release. For example, click elements while holding the ctrl key.\nrelease(key='ctrl') # Releases the key previously pressed. All actions between press and release will execute with the key held down. Note: Ensure all keys are released by the end of the step.\ntype(content='xxx') # Use escape characters \\\\', \\\\\", and \\\\n in content part to ensure we can parse the content in normal python string format. 
If you want to submit your input, use \\\\n at the end of content.\nwait() # Sleep for 5s and take a screenshot to check for any changes.\ncall_user() # Call the user when the task is unsolvable, or when you need the user's help. Then, user will see and answer your question in \\`user_resp\\`.\nfinished(content='xxx') # Submit the task with an report to the user. Use escape characters \\\\', \\\\\", and \\\\n in content part to ensure we can parse the content in normal python string format.\n\n\n## Note\n- Use ${language === 'zh' ? 'Chinese' : 'English'} in \\`Thought\\` part.\n- Write a small plan and finally summarize your next action (with its target element) in one sentence in \\`Thought\\` part.\n- You may stumble upon new rules or features while playing the game or executing GUI tasks for the first time. Make sure to record them in your \\`Thought\\` and utilize them later.\n- Your thought style should follow the style of thought Examples.\n- You can provide multiple actions in one step, separated by \"\\n\\n\".\n- Ensure all keys you pressed are released by the end of the step.\n- You should NOT use google when you need to search for information, use baidu.com instead.\n\n## Thought Examples\n${language === 'zh' ? ThoughtExamplesZH : ThoughtExamplesEN}\n\n## Output Examples\nThought: ${\n language === 'zh'\n ? 
'在这里输出你的中文思考,你的思考样式应该参考上面的Thought Examples...'\n : 'Write your thoughts here in English, your thinking style should follow the Thought Examples above...'\n}\nAction: click(point='<point>10 20</point>')\n\n## User Instruction\n`;\n\nexport async function getSystemPromptForModel(\n uiTarsVersion:\n | 'ui-tars-1.0'\n | 'ui-tars-1.5'\n | 'doubao-1.5-ui-tars-15b'\n | 'doubao-1.5-ui-tars-20b'\n | undefined\n | null\n | '',\n operator: Operator,\n) {\n if (uiTarsVersion === 'ui-tars-1.0') {\n return getSystemPromptUITARS_1_0('zh', operator);\n } else if (uiTarsVersion === 'ui-tars-1.5') {\n return getSystemPromptUITARS_1_5('zh', 'normal');\n } else if (uiTarsVersion === 'doubao-1.5-ui-tars-15b') {\n return getSystemPromptDoubao_15_15B('zh');\n } else if (uiTarsVersion === 'doubao-1.5-ui-tars-20b') {\n return getSystemPromptDoubao_15_20B('zh', 'browser');\n } else {\n return SYSTEM_PROMPT;\n }\n}\n\nexport const SYSTEM_PROMPT = `\nYou are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.\n\n## Output Format\n\\`\\`\\`\nThought: ...\nAction: ...\n\\`\\`\\`\n\n## Action Space\n\nclick(point='<point>x1 y1</point>')\nleft_double(point='<point>x1 y1</point>')\nright_single(point='<point>x1 y1</point>')\ndrag(start_point='<point>x1 y1</point>', end_point='<point>x2 y2</point>')\nhotkey(key='ctrl c') # Split keys with a space and use lowercase. Also, do not use more than 3 keys in one hotkey action.\ntype(content='xxx') # Use escape characters \\\\', \\\\\", and \\\\n in content part to ensure we can parse the content in normal python string format. If you want to submit your input, use \\\\n at the end of content. 
\nscroll(point='<point>x1 y1</point>', direction='down or up or right or left') # Show more information on the \\`direction\\` side.\nwait() #Sleep for 5s and take a screenshot to check for any changes.\nfinished(content='xxx') # Use escape characters \\\\', \\\\\", and \\\\n in content part to ensure we can parse the content in normal python string format.\n\n\n## Note\n- Use Chinese in \\`Thought\\` part.\n- Write a small plan and finally summarize your next action (with its target element) in one sentence in \\`Thought\\` part.\n\n## User Instruction\n{instruction}\n`;\n\nexport const SYSTEM_PROMPT_LATEST = `\nYou are a general AI agent, a helpful AI assistant that can interact with the following environments to solve tasks: computer.\nYou should first think about the reasoning process in the mind and then provide the user with the answer. The reasoning process is enclosed within <think_never_used_51bce0c785ca2f68081bfa7d91973934> </think_never_used_51bce0c785ca2f68081bfa7d91973934> tags, i.e. <think_never_used_51bce0c785ca2f68081bfa7d91973934> reasoning process here </think_never_used_51bce0c785ca2f68081bfa7d91973934> answer here\n\n<COMPUTER_USE_ENVIRONMENT>\n\n## Output Format\n\\`\\`\\`Action: ...\\`\\`\\`\n\n## Action Space\nopen_computer() # Start the device.\nclick(point='<point>x1 y1</point>')\nleft_double(point='<point>x1 y1</point>')\nright_single(point='<point>x1 y1</point>')\ndrag(start_point='<point>x1 y1</point>', end_point='<point>x2 y2</point>')\nhotkey(key='ctrl c') # Split keys with a space and use lowercase. Also, do not use more than 3 keys in one hotkey action.\ntype(content='xxx') # Use escape characters ', \", and\n in content part to ensure we can parse the content in normal python string format. 
If you want to submit your input, use\n at the end of content.\nscroll(point='<point>x1 y1</point>', direction='down or up or right or left') # Show more information on the \\`direction\\` side.\nwait() # Sleep for 5s and take a screenshot to check for any changes.\nfinished(content='xxx') # Use escape characters \\\\', \\\\\", and \\\\n in content part to ensure we can parse the content in normal python string format.\n\n## Note\n- You have a budget of actions for one problem. The user will inform you when your time is up, remind your budget.\n\n</COMPUTER_USE_ENVIRONMENT>\n\n<IMPORTANT_NOTE>\n- After the reasoning process which ends with </think_never_used_51bce0c785ca2f68081bfa7d91973934>, please start with and be enclosed by <environment_name> and </environment_name> tags, indicating the environment you intend to use for the next action.\n- Within these environment tags, follow the output format specified in the corresponding environment's description. The available environment names are: <code_env>, <mcp_env> and <computer_env>. For example, to use code:\n\nTo use computer:\n\n<think_never_used_51bce0c785ca2f68081bfa7d91973934> To continue, I need to operate the computer to pass the verification process. </think_never_used_51bce0c785ca2f68081bfa7d91973934>\n<computer_env>\nAction: click(point='<point>100 200</point>')\n</computer_env>\n\n- To finish a task, please submit your answer by enclosing <answer> and </answer> tags. 
For example:\n<answer>\nThe answer is 42.\n</answer>\n</IMPORTANT_NOTE>\n`;\n"],"names":["getSystemPromptUITARS_1_0","language","operator","getSystemPromptUITARS_1_5","useCase","getSystemPromptPoki","getSystemPromptDoubao_15_15B","ThoughtExamplesZH","ThoughtExamplesEN","getSystemPromptDoubao_15_20B","operatorType","getSystemPromptForModel","uiTarsVersion","SYSTEM_PROMPT","SYSTEM_PROMPT_LATEST"],"mappings":";;;;AAOO,MAAMA,4BAA4B,CACvCC,UACAC,WACG,CAAC;;;;;;;;;;;MAWA,EAAED,AAAa,SAAbA,WAAoB,YAAY,UAAU;;;;AAIlD,CAAC;AAEM,MAAME,4BAA4B,CACvCF,UACAG,UACG,CAAC;;;;;;;;;;;;;;;;;;;;;;;MAuBA,EAAEH,AAAa,SAAbA,WAAoB,YAAY,UAAU;EAChD,EAAEG,AAAY,aAAZA,UAAuB,2HAA2H,oGAAoG;;;AAG1P,CAAC;AAEM,MAAMC,sBAAsB,CAAC;;;;;;;;;;;;;;;;;;;;;;;;;;;;AA4BpC,CAAC;AAEM,MAAMC,+BAA+B,CAACL,WAA0B,CAAC;;;;;;;;;;;;;;;;;;;;;;;MAuBlE,EAAEA,AAAa,SAAbA,WAAoB,YAAY,UAAU;;;;AAIlD,CAAC;AAED,MAAMM,oBAAoB,CAAC;;;;;;;;;;;AAW3B,CAAC;AAED,MAAMC,oBAAoB,CAAC;;;;;;;;;;;AAW3B,CAAC;AAEM,MAAMC,+BAA+B,CAC1CR,UACAS,eACG,CAAC;;;;;;;;;;;;;AAaN,EAAEA,AAAiB,cAAjBA,eAA6B,4GAA4G,GAAG;;;;;;;;;;;;;MAaxI,EAAET,AAAa,SAAbA,WAAoB,YAAY,UAAU;;;;;;;;;AASlD,EAAEA,AAAa,SAAbA,WAAoBM,oBAAoBC,kBAAkB;;;SAGnD,EACPP,AAAa,SAAbA,WACI,8KACA,uGACL;;;;AAID,CAAC;AAEM,eAAeU,wBACpBC,aAOM,EACNV,QAAkB;IAElB,IAAIU,AAAkB,kBAAlBA,eACF,OAAOZ,0BAA0B,MAAME;IAClC,IAAIU,AAAkB,kBAAlBA,eACT,OAAOT,0BAA0B,MAAM;IAClC,IAAIS,AAAkB,6BAAlBA,eACT,OAAON,6BAA6B;IAC/B,IAAIM,AAAkB,6BAAlBA,eACT,OAAOH,6BAA6B,MAAM;IAE1C,OAAOI;AAEX;AAEO,MAAMA,gBAAgB,CAAC;;;;;;;;;;;;;;;;;;;;;;;;;;;;AA4B9B,CAAC;AAEM,MAAMC,uBAAuB,CAAC;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AA4CrC,CAAC"}
|
package/dist/utils.d.ts
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
import { BaseAction } from '@ui-tars-test/shared/types';
|
|
2
|
+
import { MultimodalToolCallResult } from '@tarko/agent';
|
|
3
|
+
/**
 * Build an error result for a GUI action.
 *
 * The implementation returns a single text content item whose text is the
 * JSON serialization of `{ action, error: errorMessage }`, and sets
 * `isError: true` on the result.
 *
 * @param action - The action that failed; serialized verbatim into the payload.
 * @param errorMessage - Description of the failure.
 * @returns The error-flagged tool-call result.
 */
export declare const createGUIErrorResponse: (action: BaseAction, errorMessage: string) => MultimodalToolCallResult;
|
|
4
|
+
/**
 * Project a BaseAction onto the four fields of the agent UI action shape.
 *
 * The implementation copies `prediction` into `action` and passes `thought`,
 * `action_type` and `action_input` through unchanged. Fields missing on the
 * input come out as `undefined`; all four are typed `any`.
 *
 * @param action - Source action object.
 * @returns A new object with `action`, `thought`, `action_type`, `action_input`.
 */
export declare const convertToAgentUIAction: (action: BaseAction) => {
|
|
5
|
+
action: any;
|
|
6
|
+
thought: any;
|
|
7
|
+
action_type: any;
|
|
8
|
+
action_input: any;
|
|
9
|
+
};
|
|
10
|
+
//# sourceMappingURL=utils.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"utils.d.ts","sourceRoot":"","sources":["../src/utils.ts"],"names":[],"mappings":"AAMA,OAAO,EAAE,UAAU,EAAE,MAAM,4BAA4B,CAAC;AACxD,OAAO,EAAE,wBAAwB,EAAE,MAAM,cAAc,CAAC;AAExD,eAAO,MAAM,sBAAsB,GACjC,QAAQ,UAAU,EAClB,cAAc,MAAM,KACnB,wBAcF,CAAC;AAEF,eAAO,MAAM,sBAAsB,GAAI,QAAQ,UAAU;;;;;CAUxD,CAAC"}
|
package/dist/utils.js
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Copyright (c) 2025 Bytedance, Inc. and its affiliates.
|
|
3
|
+
* SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
*/
|
|
5
|
+
"use strict";
|
|
6
|
+
// Minimal webpack runtime: three helpers hung off __webpack_require__, followed
// by creation of the module's export namespace object.
var __webpack_require__ = {};

// o: own-property check that does not depend on the target's own methods.
__webpack_require__.o = (obj, prop) => {
    return Object.prototype.hasOwnProperty.call(obj, prop);
};

// d: install an enumerable getter on `exports1` for every own key of
// `definition` that `exports1` does not already own.
__webpack_require__.d = (exports1, definition) => {
    for (const key in definition) {
        if (!__webpack_require__.o(definition, key)) continue;
        if (__webpack_require__.o(exports1, key)) continue;
        Object.defineProperty(exports1, key, {
            enumerable: true,
            get: definition[key]
        });
    }
};

// r: tag `exports1` as an ES module namespace (Symbol.toStringTag when
// available, plus the __esModule flag).
__webpack_require__.r = (exports1) => {
    const canTag = typeof Symbol !== 'undefined' && Symbol.toStringTag;
    if (canTag) {
        Object.defineProperty(exports1, Symbol.toStringTag, {
            value: 'Module'
        });
    }
    Object.defineProperty(exports1, '__esModule', {
        value: true
    });
};

// Namespace object for this module; the getters are lazy, so the bindings
// they read only need to exist by the time an export is first accessed.
var __webpack_exports__ = {};
__webpack_require__.r(__webpack_exports__);
__webpack_require__.d(__webpack_exports__, {
    convertToAgentUIAction: () => convertToAgentUIAction,
    createGUIErrorResponse: () => createGUIErrorResponse
});
|
|
34
|
+
/**
 * Build an error-flagged tool-call result for a GUI action.
 *
 * @param {*} action - The action that failed; embedded in the JSON payload.
 * @param {string} errorMessage - Description of the failure.
 * @returns {{content: Array<{type: string, text: string}>, isError: boolean}}
 *   A result with one text item holding JSON of `{ action, error }`.
 */
const createGUIErrorResponse = (action, errorMessage) => {
    const payload = JSON.stringify({
        action: action,
        error: errorMessage
    });
    const textItem = {
        type: 'text',
        text: payload
    };
    return {
        content: [textItem],
        isError: true
    };
};
|
|
46
|
+
/**
 * Project an action object onto the agent UI action shape.
 *
 * @param {*} action - Source action; `prediction`, `thought`, `action_type`
 *   and `action_input` are read from it.
 * @returns {{action: *, thought: *, action_type: *, action_input: *}} A new
 *   object where `action` carries the source `prediction`; missing fields
 *   come out as `undefined`.
 */
const convertToAgentUIAction = (action) => {
    const { prediction, thought, action_type, action_input } = action;
    return {
        action: prediction,
        thought,
        action_type,
        action_input
    };
};
|
|
55
|
+
// CommonJS epilogue: publish the two named exports explicitly, copy any other
// enumerable key of the webpack namespace object, then flag the module as an
// ES-module interop target.
exports.convertToAgentUIAction = __webpack_exports__.convertToAgentUIAction;
exports.createGUIErrorResponse = __webpack_exports__.createGUIErrorResponse;
for (var __webpack_i__ in __webpack_exports__) {
    const alreadyPublished = __webpack_i__ === "convertToAgentUIAction" || __webpack_i__ === "createGUIErrorResponse";
    if (!alreadyPublished) {
        exports[__webpack_i__] = __webpack_exports__[__webpack_i__];
    }
}
Object.defineProperty(exports, '__esModule', {
    value: true
});
|
|
64
|
+
|
|
65
|
+
//# sourceMappingURL=utils.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"utils.js","sources":["webpack://@ui-tars-test/agent-sdk/webpack/runtime/define_property_getters","webpack://@ui-tars-test/agent-sdk/webpack/runtime/has_own_property","webpack://@ui-tars-test/agent-sdk/webpack/runtime/make_namespace_object","webpack://@ui-tars-test/agent-sdk/./src/utils.ts"],"sourcesContent":["__webpack_require__.d = (exports, definition) => {\n\tfor(var key in definition) {\n if(__webpack_require__.o(definition, key) && !__webpack_require__.o(exports, key)) {\n Object.defineProperty(exports, key, { enumerable: true, get: definition[key] });\n }\n }\n};","__webpack_require__.o = (obj, prop) => (Object.prototype.hasOwnProperty.call(obj, prop))","// define __esModule on exports\n__webpack_require__.r = (exports) => {\n\tif(typeof Symbol !== 'undefined' && Symbol.toStringTag) {\n\t\tObject.defineProperty(exports, Symbol.toStringTag, { value: 'Module' });\n\t}\n\tObject.defineProperty(exports, '__esModule', { value: true });\n};","/* eslint-disable @typescript-eslint/no-explicit-any */\n/*\n * Copyright (c) 2025 Bytedance, Inc. 
and its affiliates.\n * SPDX-License-Identifier: Apache-2.0\n */\n\nimport { BaseAction } from '@ui-tars-test/shared/types';\nimport { MultimodalToolCallResult } from '@tarko/agent';\n\nexport const createGUIErrorResponse = (\n action: BaseAction,\n errorMessage: string,\n): MultimodalToolCallResult => {\n return {\n content: [\n {\n type: 'text',\n text: JSON.stringify({\n action: action,\n error: errorMessage,\n }),\n },\n ],\n // @ts-ignore: isError is not in MultimodalToolCallResult type but might be used by internal logic\n isError: true,\n };\n};\n\nexport const convertToAgentUIAction = (action: BaseAction) => {\n // Use type assertion or access properties safely as BaseAction might be generic\n // or properties might be on specific action types\n const anyAction = action as any;\n return {\n action: anyAction.prediction,\n thought: anyAction.thought,\n action_type: anyAction.action_type,\n action_input: anyAction.action_input,\n };\n};\n"],"names":["__webpack_require__","definition","key","Object","obj","prop","Symbol","createGUIErrorResponse","action","errorMessage","JSON","convertToAgentUIAction","anyAction"],"mappings":";;;;;;;IAAAA,oBAAoB,CAAC,GAAG,CAAC,UAASC;QACjC,IAAI,IAAIC,OAAOD,WACR,IAAGD,oBAAoB,CAAC,CAACC,YAAYC,QAAQ,CAACF,oBAAoB,CAAC,CAAC,UAASE,MACzEC,OAAO,cAAc,CAAC,UAASD,KAAK;YAAE,YAAY;YAAM,KAAKD,UAAU,CAACC,IAAI;QAAC;IAGzF;;;ICNAF,oBAAoB,CAAC,GAAG,CAACI,KAAKC,OAAUF,OAAO,SAAS,CAAC,cAAc,CAAC,IAAI,CAACC,KAAKC;;;ICClFL,oBAAoB,CAAC,GAAG,CAAC;QACxB,IAAG,AAAkB,eAAlB,OAAOM,UAA0BA,OAAO,WAAW,EACrDH,OAAO,cAAc,CAAC,UAASG,OAAO,WAAW,EAAE;YAAE,OAAO;QAAS;QAEtEH,OAAO,cAAc,CAAC,UAAS,cAAc;YAAE,OAAO;QAAK;IAC5D;;;;;;;;ACGO,MAAMI,yBAAyB,CACpCC,QACAC,eAEO;QACL,SAAS;YACP;gBACE,MAAM;gBACN,MAAMC,KAAK,SAAS,CAAC;oBACnB,QAAQF;oBACR,OAAOC;gBACT;YACF;SACD;QAED,SAAS;IACX;AAGK,MAAME,yBAAyB,CAACH;IAGrC,MAAMI,YAAYJ;IAClB,OAAO;QACL,QAAQI,UAAU,UAAU;QAC5B,SAASA,UAAU,OAAO;QAC1B,aAAaA,UAAU,WAAW;QAClC,cAAcA,UAAU,YAAY;IACtC;AACF"}
|
package/dist/utils.mjs
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Copyright (c) 2025 Bytedance, Inc. and its affiliates.
|
|
3
|
+
* SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
*/
|
|
5
|
+
/**
 * Build an error-flagged tool-call result for a GUI action.
 *
 * The action and the error message are serialized together with
 * `JSON.stringify` and placed in a single `text` content part.
 *
 * @param {*} action - The action being reported; embedded under the `action` key.
 * @param {*} errorMessage - The error description; embedded under the `error` key.
 * @returns {{content: Array<{type: string, text: string}>, isError: boolean}}
 *   A result with exactly one text part and `isError` set to `true`.
 */
const createGUIErrorResponse = (action, errorMessage) => {
  // Key order (action, then error) determines the serialized text.
  const payload = { action, error: errorMessage };
  const textPart = { type: 'text', text: JSON.stringify(payload) };
  return { content: [textPart], isError: true };
};
|
|
17
|
+
/**
 * Project an action onto the four-field shape used for agent UI actions.
 *
 * The input's `prediction` is exposed as `action`; `thought`, `action_type`
 * and `action_input` are copied under their own names. Properties missing on
 * the input come through as `undefined`; any other properties are dropped.
 *
 * @param {object} action - Source action object to read the fields from.
 * @returns {{action: *, thought: *, action_type: *, action_input: *}}
 *   A new object holding only the four mapped fields.
 */
const convertToAgentUIAction = (action) => {
  const { prediction, thought, action_type, action_input } = action;
  return { action: prediction, thought, action_type, action_input };
};
|
|
26
|
+
export { convertToAgentUIAction, createGUIErrorResponse };
|
|
27
|
+
|
|
28
|
+
//# sourceMappingURL=utils.mjs.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"utils.mjs","sources":["webpack://@ui-tars-test/agent-sdk/./src/utils.ts"],"sourcesContent":["/* eslint-disable @typescript-eslint/no-explicit-any */\n/*\n * Copyright (c) 2025 Bytedance, Inc. and its affiliates.\n * SPDX-License-Identifier: Apache-2.0\n */\n\nimport { BaseAction } from '@ui-tars-test/shared/types';\nimport { MultimodalToolCallResult } from '@tarko/agent';\n\nexport const createGUIErrorResponse = (\n action: BaseAction,\n errorMessage: string,\n): MultimodalToolCallResult => {\n return {\n content: [\n {\n type: 'text',\n text: JSON.stringify({\n action: action,\n error: errorMessage,\n }),\n },\n ],\n // @ts-ignore: isError is not in MultimodalToolCallResult type but might be used by internal logic\n isError: true,\n };\n};\n\nexport const convertToAgentUIAction = (action: BaseAction) => {\n // Use type assertion or access properties safely as BaseAction might be generic\n // or properties might be on specific action types\n const anyAction = action as any;\n return {\n action: anyAction.prediction,\n thought: anyAction.thought,\n action_type: anyAction.action_type,\n action_input: anyAction.action_input,\n };\n};\n"],"names":["createGUIErrorResponse","action","errorMessage","JSON","convertToAgentUIAction","anyAction"],"mappings":";;;;AASO,MAAMA,yBAAyB,CACpCC,QACAC,eAEO;QACL,SAAS;YACP;gBACE,MAAM;gBACN,MAAMC,KAAK,SAAS,CAAC;oBACnB,QAAQF;oBACR,OAAOC;gBACT;YACF;SACD;QAED,SAAS;IACX;AAGK,MAAME,yBAAyB,CAACH;IAGrC,MAAMI,YAAYJ;IAClB,OAAO;QACL,QAAQI,UAAU,UAAU;QAC5B,SAASA,UAAU,OAAO;QAC1B,aAAaA,UAAU,WAAW;QAClC,cAAcA,UAAU,YAAY;IACtC;AACF"}
|
package/package.json
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@ui-tars-test/agent-sdk",
|
|
3
|
+
"version": "0.3.0",
|
|
4
|
+
"description": "GUI Agent",
|
|
5
|
+
"main": "dist/index.js",
|
|
6
|
+
"module": "dist/index.mjs",
|
|
7
|
+
"types": "dist/index.d.ts",
|
|
8
|
+
"exports": {
|
|
9
|
+
".": {
|
|
10
|
+
"types": "./dist/index.d.ts",
|
|
11
|
+
"import": "./dist/index.mjs",
|
|
12
|
+
"require": "./dist/index.js"
|
|
13
|
+
}
|
|
14
|
+
},
|
|
15
|
+
"publishConfig": {
|
|
16
|
+
"access": "public",
|
|
17
|
+
"registry": "https://registry.npmjs.org"
|
|
18
|
+
},
|
|
19
|
+
"files": [
|
|
20
|
+
"dist"
|
|
21
|
+
],
|
|
22
|
+
"scripts": {
|
|
23
|
+
"dev": "rslib build --watch",
|
|
24
|
+
"dev:agent": "tarko --agent ./ --config ./tarko.bu.config.ts",
|
|
25
|
+
"dev:agent:browser": "tarko --agent ./ --config ./tarko.bu.config.ts",
|
|
26
|
+
"dev:agent:computer": "tarko --agent ./ --config ./tarko.cu.config.ts",
|
|
27
|
+
"dev:agent:android": "tarko --agent ./ --config ./tarko.mu.config.ts",
|
|
28
|
+
"build": "rslib build",
|
|
29
|
+
"prepublishOnly": "pnpm run build",
|
|
30
|
+
"test": "vitest run",
|
|
31
|
+
"test:watch": "vitest"
|
|
32
|
+
},
|
|
33
|
+
"dependencies": {
|
|
34
|
+
"@agent-infra/browser": "0.1.1",
|
|
35
|
+
"@agent-infra/logger": "0.0.2-beta.2",
|
|
36
|
+
"@agent-infra/media-utils": "0.1.5",
|
|
37
|
+
"@clack/prompts": "0.11.0",
|
|
38
|
+
"@ui-tars-test/action-parser": "workspace:*",
|
|
39
|
+
"@ui-tars-test/operator-adb": "workspace:*",
|
|
40
|
+
"@ui-tars-test/operator-browser": "workspace:*",
|
|
41
|
+
"@ui-tars-test/operator-nutjs": "workspace:*",
|
|
42
|
+
"@ui-tars-test/shared": "workspace:*",
|
|
43
|
+
"@tarko/agent": "workspace:*",
|
|
44
|
+
"@tarko/agent-cli": "workspace:*",
|
|
45
|
+
"@tarko/agent-interface": "workspace:*",
|
|
46
|
+
"big.js": "^6.2.2",
|
|
47
|
+
"commander": "^14.0.0",
|
|
48
|
+
"dotenv": "^17.2.2",
|
|
49
|
+
"jimp": "1.6.0",
|
|
50
|
+
"jsonrepair": "3.12.0",
|
|
51
|
+
"uuid": "^9.0.0",
|
|
52
|
+
"zod-to-json-schema": "3.24.3"
|
|
53
|
+
},
|
|
54
|
+
"devDependencies": {
|
|
55
|
+
"@agent-infra/shared": "0.0.2",
|
|
56
|
+
"@rslib/core": "0.10.0",
|
|
57
|
+
"@types/node": "22.15.30",
|
|
58
|
+
"@types/uuid": "^9.0.8",
|
|
59
|
+
"tsx": "^4.19.2",
|
|
60
|
+
"typescript": "^5.5.3",
|
|
61
|
+
"vitest": "3.2.4"
|
|
62
|
+
}
|
|
63
|
+
}
|