@ui-tars-test/shared 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/base/agent.d.ts +9 -0
- package/dist/base/agent.d.ts.map +1 -0
- package/dist/base/agent.js +54 -0
- package/dist/base/agent.js.map +1 -0
- package/dist/base/agent.mjs +10 -0
- package/dist/base/agent.mjs.map +1 -0
- package/dist/base/index.d.ts +4 -0
- package/dist/base/index.d.ts.map +1 -0
- package/dist/base/index.js +84 -0
- package/dist/base/index.js.map +1 -0
- package/dist/base/index.mjs +7 -0
- package/dist/base/operator.d.ts +140 -0
- package/dist/base/operator.d.ts.map +1 -0
- package/dist/base/operator.js +112 -0
- package/dist/base/operator.js.map +1 -0
- package/dist/base/operator.mjs +75 -0
- package/dist/base/operator.mjs.map +1 -0
- package/dist/base/parser.d.ts +11 -0
- package/dist/base/parser.d.ts.map +1 -0
- package/dist/base/parser.js +43 -0
- package/dist/base/parser.js.map +1 -0
- package/dist/base/parser.mjs +9 -0
- package/dist/base/parser.mjs.map +1 -0
- package/dist/types/actions.d.ts +224 -0
- package/dist/types/actions.d.ts.map +1 -0
- package/dist/types/actions.js +155 -0
- package/dist/types/actions.js.map +1 -0
- package/dist/types/actions.mjs +115 -0
- package/dist/types/actions.mjs.map +1 -0
- package/dist/types/agents.d.ts +108 -0
- package/dist/types/agents.d.ts.map +1 -0
- package/dist/types/agents.js +42 -0
- package/dist/types/agents.js.map +1 -0
- package/dist/types/agents.mjs +8 -0
- package/dist/types/agents.mjs.map +1 -0
- package/dist/types/archived.d.ts +44 -0
- package/dist/types/archived.d.ts.map +1 -0
- package/dist/types/archived.js +86 -0
- package/dist/types/archived.js.map +1 -0
- package/dist/types/archived.mjs +46 -0
- package/dist/types/archived.mjs.map +1 -0
- package/dist/types/index.d.ts +4 -0
- package/dist/types/index.d.ts.map +1 -0
- package/dist/types/index.js +84 -0
- package/dist/types/index.js.map +1 -0
- package/dist/types/index.mjs +7 -0
- package/dist/utils/actions.d.ts +15 -0
- package/dist/utils/actions.d.ts.map +1 -0
- package/dist/utils/actions.js +196 -0
- package/dist/utils/actions.js.map +1 -0
- package/dist/utils/actions.mjs +156 -0
- package/dist/utils/actions.mjs.map +1 -0
- package/dist/utils/coordinateNormalizer.d.ts +10 -0
- package/dist/utils/coordinateNormalizer.d.ts.map +1 -0
- package/dist/utils/coordinateNormalizer.js +59 -0
- package/dist/utils/coordinateNormalizer.js.map +1 -0
- package/dist/utils/coordinateNormalizer.mjs +25 -0
- package/dist/utils/coordinateNormalizer.mjs.map +1 -0
- package/dist/utils/index.d.ts +5 -0
- package/dist/utils/index.d.ts.map +1 -0
- package/dist/utils/index.js +93 -0
- package/dist/utils/index.js.map +1 -0
- package/dist/utils/index.mjs +8 -0
- package/dist/utils/sleep.d.ts +14 -0
- package/dist/utils/sleep.d.ts.map +1 -0
- package/dist/utils/sleep.js +45 -0
- package/dist/utils/sleep.js.map +1 -0
- package/dist/utils/sleep.mjs +11 -0
- package/dist/utils/sleep.mjs.map +1 -0
- package/dist/utils/systemPromptProcessor.d.ts +16 -0
- package/dist/utils/systemPromptProcessor.d.ts.map +1 -0
- package/dist/utils/systemPromptProcessor.js +61 -0
- package/dist/utils/systemPromptProcessor.js.map +1 -0
- package/dist/utils/systemPromptProcessor.mjs +24 -0
- package/dist/utils/systemPromptProcessor.mjs.map +1 -0
- package/package.json +66 -0
- package/src/base/agent.ts +13 -0
- package/src/base/index.ts +7 -0
- package/src/base/operator.ts +221 -0
- package/src/base/parser.ts +16 -0
- package/src/types/actions.ts +382 -0
- package/src/types/agents.ts +128 -0
- package/src/types/archived.ts +55 -0
- package/src/types/index.ts +8 -0
- package/src/utils/actions.ts +244 -0
- package/src/utils/coordinateNormalizer.ts +49 -0
- package/src/utils/index.ts +9 -0
- package/src/utils/sleep.ts +21 -0
- package/src/utils/systemPromptProcessor.ts +48 -0
|
@@ -0,0 +1,382 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright (c) 2025 Bytedance, Inc. and its affiliates.
|
|
3
|
+
* SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
/* eslint-disable @typescript-eslint/no-explicit-any */
|
|
7
|
+
|
|
8
|
+
/**
 * Pair of numeric factors.
 * NOTE(review): presumably per-axis scale factors used when normalizing
 * coordinates (it is the type of the `factors` argument of
 * `NormalizeCoordinates`) — confirm which element maps to which axis.
 */
export type Factors = [number, number];
|
|
9
|
+
|
|
10
|
+
/**
 * Coordinate data structure.
 *
 * A position may be expressed as raw pixels, as normalized values, or both;
 * every field is optional.
 */
export interface Coordinates {
  /** Raw pixels */
  raw?: { x: number; y: number };

  /** Normalized coordinates (0–1) */
  normalized?: { x: number; y: number };

  /** Reference box given by two corner points (x1, y1) and (x2, y2) */
  referenceBox?: { x1: number; y1: number; x2: number; y2: number };

  /**
   * Coordinate reference system.
   * The named literals document the well-known values; because `string` is
   * part of the union, any string is accepted by the type checker.
   */
  referenceSystem?: 'screen' | 'window' | 'browserPage' | string;
}
|
|
21
|
+
|
|
22
|
+
/**
 * Standard structure for GUI Actions.
 *
 * @typeParam T - String literal identifying the action type.
 * @typeParam I - Shape of the `inputs` payload for this action.
 */
export interface BaseAction<
  T extends string = string,
  I extends Record<string, any> = Record<string, any>,
> {
  /** Action type (e.g., "click", "key", "swipe") */
  type: T;

  /** Parameters required for the action */
  inputs: I;

  /** Optional, non-functional annotations attached to the action */
  meta?: {
    /** Suggested execution tool (xdotool / adb / pyautogui etc.) */
    toolHint?: string;
    /** Notes / Debug information */
    comment?: string;
  };
}
|
|
36
|
+
|
|
37
|
+
// ---------- ScreenShot Action ----------
|
|
38
|
+
|
|
39
|
+
/**
 * ScreenShot action.
 *
 * Both inputs are optional.
 * NOTE(review): `start`/`end` presumably delimit the region to capture — confirm with the operator implementation.
 */
export type ScreenShotAction = BaseAction<'screenshot', { start?: Coordinates; end?: Coordinates }>;
|
|
49
|
+
|
|
50
|
+
// ---------- Mouse Actions ----------
|
|
51
|
+
|
|
52
|
+
/**
 * Click action with coordinates
 */
export type ClickAction = BaseAction<'click', { point: Coordinates }>;

/**
 * Right click action with coordinates
 */
export type RightClickAction = BaseAction<'right_click', { point: Coordinates }>;

/**
 * Double click action with coordinates
 */
export type DoubleClickAction = BaseAction<'double_click', { point: Coordinates }>;

/**
 * Middle click action with coordinates
 */
export type MiddleClickAction = BaseAction<'middle_click', { point: Coordinates }>;

/**
 * Mouse down action
 */
export type MouseDownAction = BaseAction<
  'mouse_down',
  {
    /** Mouse down position. If not specified, default to execute on the current mouse position. */
    point?: Coordinates;
    /** Down button. Default to left. */
    button?: 'left' | 'right';
  }
>;

/**
 * Mouse up action
 */
export type MouseUpAction = BaseAction<
  'mouse_up',
  {
    /** Mouse up position. If not specified, default to execute on the current mouse position. */
    point?: Coordinates;
    /** Up button. Default to left. */
    button?: 'left' | 'right';
  }
>;

/**
 * Mouse move action.
 *
 * Only 'mouse_move' is part of the type; the names 'move' | 'move_to' | 'hover'
 * appear in the original source only as a comment.
 */
export type MouseMoveAction = BaseAction<
  'mouse_move',
  {
    /** Target coordinates */
    point: Coordinates;
  }
>;

/**
 * Drag action with start and end coordinates.
 *
 * Only 'drag' is part of the type; the names 'left_click_drag' | 'select'
 * appear in the original source only as a comment.
 */
export type DragAction = BaseAction<
  'drag',
  {
    start: Coordinates;
    end: Coordinates;
    direction?: 'up' | 'down' | 'left' | 'right';
  }
>;

/**
 * Scroll action with coordinates and direction
 */
export type ScrollAction = BaseAction<
  'scroll',
  {
    point?: Coordinates;
    direction: 'up' | 'down' | 'left' | 'right';
  }
>;
|
|
146
|
+
|
|
147
|
+
// ---------- Keyboard Actions ----------
|
|
148
|
+
|
|
149
|
+
/**
 * Type action with text content
 */
export type TypeAction = BaseAction<'type', { content: string }>;

/**
 * Hotkey action with key combination
 */
export type HotkeyAction = BaseAction<'hotkey', { key: string }>;

/**
 * Press key action
 */
export type PressAction = BaseAction<
  'press',
  {
    /** Key you want to press. Only one key can be pressed at one time. */
    key: string;
  }
>;

/**
 * Release key action
 */
export type ReleaseAction = BaseAction<
  'release',
  {
    /** Key you want to release. Only one key can be released at one time. */
    key: string;
  }
>;
|
|
188
|
+
|
|
189
|
+
// ---------- Browser Actions ----------
|
|
190
|
+
|
|
191
|
+
/**
 * Navigate action with URL
 */
export type NavigateAction = BaseAction<'navigate', { url: string }>;

/**
 * Navigate back action. Takes no inputs (`inputs` must be an empty object).
 */
export type NavigateBackAction = BaseAction<'navigate_back', Record<string, never>>;
|
|
205
|
+
|
|
206
|
+
// ---------- App Actions ----------
|
|
207
|
+
|
|
208
|
+
/**
 * Long press action with coordinates
 */
export type LongPressAction = BaseAction<'long_press', { point: Coordinates }>;

/**
 * Swipe action with start and end coordinates and a required direction.
 *
 * Only 'swipe' is part of the type; the name 'drag' appears in the original
 * source only as a comment.
 */
export type SwipeAction = BaseAction<
  'swipe',
  {
    start: Coordinates;
    end: Coordinates;
    direction: 'up' | 'down' | 'left' | 'right';
  }
>;

/**
 * Home action. Accepts either the 'home' or the 'press_home' type name; takes no inputs.
 */
export type HomeAction = BaseAction<'home' | 'press_home', Record<string, never>>;

/**
 * Back action. Accepts either the 'back' or the 'press_back' type name; takes no inputs.
 */
export type BackAction = BaseAction<'back' | 'press_back', Record<string, never>>;

/**
 * Open app action
 */
export type OpenAppAction = BaseAction<'open_app', { name: string }>;
|
|
246
|
+
|
|
247
|
+
// ---------- Wait Actions ----------
|
|
248
|
+
|
|
249
|
+
/**
 * Wait action. The duration is optional.
 */
export type WaitAction = BaseAction<
  'wait',
  {
    /** in seconds (optional) */
    time?: number;
  }
>;

/**
 * Finished - Complete the current operation.
 */
export type FinishAction = BaseAction<'finished', { content?: string }>;

/**
 * CallUser - Request user interaction.
 */
export type CallUserAction = BaseAction<'call_user', { content?: string }>;
|
|
278
|
+
|
|
279
|
+
/**
 * Operational action types (excluding screenshot which has special handling).
 *
 * The comments below only group the members for readability; the union is flat.
 */
export type OperationalGUIAction =
  // Mouse
  | ClickAction
  | DoubleClickAction
  | RightClickAction
  | MiddleClickAction
  | MouseDownAction
  | MouseUpAction
  | MouseMoveAction
  | DragAction
  | ScrollAction
  // Keyboard
  | TypeAction
  | HotkeyAction
  | PressAction
  | ReleaseAction
  // Browser
  | NavigateAction
  | NavigateBackAction
  // App
  | LongPressAction
  | SwipeAction
  | HomeAction
  | BackAction
  | OpenAppAction
  // Wait / lifecycle
  | WaitAction
  | FinishAction
  | CallUserAction;
|
|
306
|
+
|
|
307
|
+
/**
 * Complete GUI action types including screenshot
 */
export type GUIAction = ScreenShotAction | OperationalGUIAction;

/**
 * Extract action type from action interface.
 *
 * Resolves to the `type` literal(s) of a `BaseAction`, or `never` when `T` is
 * not a `BaseAction`. Distributes over unions.
 */
export type ExtractActionType<T> = T extends BaseAction<infer TName, any> ? TName : never;

/**
 * Supported operational action types (excluding screenshot)
 */
export type SupportedActionType = ExtractActionType<OperationalGUIAction>;

/**
 * All action types including screenshot
 */
export type AllActionType = ExtractActionType<GUIAction>;
|
|
326
|
+
|
|
327
|
+
/**
 * Action metadata for documentation and serialization
 */
export interface ActionMetadata {
  /** Human-readable description of the action */
  description: string;

  /** Category the action is filed under */
  category: 'mouse' | 'keyboard' | 'navigation' | 'mobile' | 'system' | 'wait';
}
|
|
334
|
+
|
|
335
|
+
/**
 * Comprehensive action metadata registry.
 *
 * Contains exactly one entry per `SupportedActionType` (screenshot is not
 * included). The explicit `Record` annotation makes the compiler reject a
 * missing or unknown key.
 */
export const ACTION_METADATA: Record<SupportedActionType, ActionMetadata> = {
  // Mouse
  click: { category: 'mouse', description: 'Click on an element' },
  right_click: { category: 'mouse', description: 'Right click on an element' },
  double_click: { category: 'mouse', description: 'Double click on an element' },
  middle_click: { category: 'mouse', description: 'Middle click on an element' },
  mouse_down: { category: 'mouse', description: 'Press mouse button down' },
  mouse_up: { category: 'mouse', description: 'Release mouse button' },
  mouse_move: { category: 'mouse', description: 'Move mouse to position' },
  drag: { category: 'mouse', description: 'Drag from one position to another' },
  scroll: { category: 'mouse', description: 'Scroll in a direction' },

  // Keyboard
  type: { category: 'keyboard', description: 'Type text' },
  hotkey: { category: 'keyboard', description: 'Press hotkey combination' },
  press: { category: 'keyboard', description: 'Press a key' },
  release: { category: 'keyboard', description: 'Release a key' },

  // Navigation
  navigate: { category: 'navigation', description: 'Navigate to URL' },
  navigate_back: { category: 'navigation', description: 'Navigate back' },

  // Mobile
  long_press: { category: 'mobile', description: 'Long press on element' },
  swipe: { category: 'mobile', description: 'Swipe gesture' },
  home: { category: 'mobile', description: 'Go to home' },
  press_home: { category: 'mobile', description: 'Press home button' },
  back: { category: 'mobile', description: 'Go back' },
  press_back: { category: 'mobile', description: 'Press back button' },
  open_app: { category: 'mobile', description: 'Open application' },

  // Wait
  wait: { category: 'wait', description: 'Wait for specified time' },

  // System
  finished: { category: 'system', description: 'Mark task as finished' },
  call_user: { category: 'system', description: 'Request user interaction' },
};
|
|
365
|
+
|
|
366
|
+
/**
 * Type guard function to check if a string is a valid operational action type.
 *
 * Only keys defined directly on `ACTION_METADATA` count. An own-property check
 * is used instead of the `in` operator because `in` also matches names
 * inherited from `Object.prototype` (e.g. 'toString', 'constructor',
 * '__proto__'), which would make the guard accept strings that are not
 * action types.
 *
 * @param type - The string to check
 * @returns Whether the string is a valid SupportedActionType
 */
export function isSupportedActionType(type: string): type is SupportedActionType {
  return Object.prototype.hasOwnProperty.call(ACTION_METADATA, type);
}
|
|
374
|
+
|
|
375
|
+
/**
 * Type guard function to check if a string is any valid action type (including screenshot).
 *
 * 'screenshot' is accepted directly; every other string is delegated to
 * `isSupportedActionType`.
 *
 * @param type - The string to check
 * @returns Whether the string is a valid AllActionType
 */
export function isValidActionType(type: string): type is AllActionType {
  if (type === 'screenshot') {
    return true;
  }
  return isSupportedActionType(type);
}
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright (c) 2025 Bytedance, Inc. and its affiliates.
|
|
3
|
+
* SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
/* eslint-disable @typescript-eslint/no-explicit-any */
|
|
7
|
+
|
|
8
|
+
import { AgentOptions } from '@tarko/agent-interface';
|
|
9
|
+
import { Factors, BaseAction, Coordinates, SupportedActionType, ActionMetadata } from './actions';
|
|
10
|
+
|
|
11
|
+
/**
 * Type definition for parsed GUI response structure.
 * Represents the components extracted from a model's output string.
 * Aligned with tarko's web UI design.
 */
export interface ParsedGUIResponse {
  /** Raw prediction string */
  rawContent: string;

  /** Parsed from Thought: `<thought>` */
  reasoningContent?: string;

  /** Unparsed action strings, taken from Action: action(params=`action`) */
  rawActionStrings?: string[];

  /** Structured actions, parsed from Action: action(params=`action`) */
  actions: BaseAction[];

  /** Error message to feedback to LLM */
  errorMessage?: string;
}
|
|
28
|
+
|
|
29
|
+
/**
 * Type definition for function to normalize raw coordinates.
 * Converts raw pixel coordinates to normalized coordinates (0-1).
 *
 * @param rawCoords - Coordinates object to normalize
 * @param factors - Optional pair of numeric factors passed to the normalizer
 * @returns An object whose `normalized` property holds the resulting `Coordinates`
 */
export type NormalizeCoordinates = (
  rawCoords: Coordinates,
  factors?: Factors,
) => { normalized: Coordinates };
|
|
40
|
+
|
|
41
|
+
/**
 * Type definition for handler function to parse model output into ParsedGUIResponse object.
 *
 * @param prediction - The raw output from the model to be parsed
 * @returns ParsedGUIResponse object if parsing is successful, null otherwise
 */
export type CustomActionParser = (
  prediction: string,
) => ParsedGUIResponse | null;
|
|
47
|
+
|
|
48
|
+
/**
 * Function type for serializing supported actions to string format.
 *
 * @param actions - Array of supported action types
 * @returns String representation of the actions for agent processing
 */
export type SerializeSupportedActions = (actions: SupportedActionType[]) => string;
|
|
54
|
+
|
|
55
|
+
/**
 * Parameters handed to an execute call.
 *
 * `actions` is required; every other `ParsedGUIResponse` field is optional,
 * and arbitrary additional keys are permitted.
 */
export type ExecuteParams = {
  /** Required actions to execute */
  actions: BaseAction[];
} & Partial<Omit<ParsedGUIResponse, 'actions'>> &
  Record<string, any>;
|
|
60
|
+
|
|
61
|
+
/**
 * Result of an execute call. Arbitrary additional keys are permitted.
 */
export type ExecuteOutput = {
  /** Outcome of the execution */
  status: 'success' | 'failed';
  /** Error description; optional */
  errorMessage?: string;
  /** url of the page */
  url?: string;
} & Record<string, any>;
|
|
66
|
+
|
|
67
|
+
/**
 * Function type for calculating detail level based on image dimensions.
 *
 * @param width - Image width
 * @param height - Image height
 * @returns One of 'low', 'high' or 'auto'
 */
export type ImageDetailCalculator = (
  width: number,
  height: number,
) => 'low' | 'high' | 'auto';
|
|
71
|
+
|
|
72
|
+
/**
 * Execute output that additionally carries the captured screenshot.
 */
export interface ScreenshotOutput extends ExecuteOutput {
  /**
   * Screenshot base64, `keep screenshot size as physical pixels`
   */
  base64: string;
}
|
|
76
|
+
|
|
77
|
+
/**
 * Reserved placeholder name for action space in system prompt template.
 * The value is the bare name, without any surrounding template delimiters.
 */
export const ACTION_SPACE_PLACEHOLDER = 'action_space';
|
|
81
|
+
|
|
82
|
+
/**
 * Interface for system prompt template configuration
 */
export interface SystemPromptTemplate {
  /**
   * Template string with placeholders. Must include the action space
   * placeholder, written as `{{` + ACTION_SPACE_PLACEHOLDER + `}}`
   * (i.e. `{{action_space}}`), which will be replaced with the string
   * representation of available actions.
   */
  template: string;

  /**
   * Function to convert the supported action types to the string
   * representation for the action space. This will be used to fill the action
   * space placeholder in the template.
   */
  actionsToString?: SerializeSupportedActions;

  /**
   * Optional map of additional placeholder values to be replaced in the template.
   * Keys are placeholder names, values are the replacement strings.
   *
   * Note: the ACTION_SPACE_PLACEHOLDER name ('action_space') is reserved and
   * should NOT be included here, as it will be automatically filled using the
   * actionsToString function. The type does not enforce this restriction.
   */
  placeholders?: Record<string, string>;
}
|
|
107
|
+
|
|
108
|
+
/**
 * Configuration for a GUI agent, extending the generic `AgentOptions`.
 *
 * @typeParam TOperator - Type of the operator supplied in `operator`.
 */
export interface GUIAgentConfig<TOperator> extends AgentOptions {
  /** Operator instance; the only required field declared here. */
  operator: TOperator;

  // ===== Optional =====

  /**
   * System prompt configuration. Can be either:
   * - A simple string (legacy mode)
   * - An array of strings or chat messages (will be concatenated)
   * - A SystemPromptTemplate object with template and actionsToString function
   */
  systemPrompt?: string | Array<string | { role: string; content: string }> | SystemPromptTemplate;

  /**
   * The handler function to parse model output into a ParsedGUIResponse object
   * (or null, per the CustomActionParser signature).
   *
   * NOTE(review): the property name is misspelled ("custome"). It is kept
   * unchanged because renaming it would break existing callers.
   */
  customeActionParser?: CustomActionParser;

  /** The function to normalize raw coordinates */
  normalizeCoordinates?: NormalizeCoordinates;

  /** The function to calculate detail level based on image dimensions */
  detailCalculator?: ImageDetailCalculator;

  /** Maximum number of turns for Agent to execute, @default 1000 */
  maxLoopCount?: number;

  /** Time interval between two loop iterations (in milliseconds), @default 0 */
  loopIntervalInMs?: number;
}
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright (c) 2025 Bytedance, Inc. and its affiliates.
|
|
3
|
+
* SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
*/
|
|
5
|
+
/**
 * Numeric status codes carried by `GUIAgentError`.
 *
 * Every code is a negative integer; the comment on each member states the
 * exact value, sign included.
 */
export enum ErrorStatusEnum {
  /** -100000 */
  SCREENSHOT_RETRY_ERROR = -100000,
  /** -100001 */
  INVOKE_RETRY_ERROR = -100001,
  /** -100002 */
  EXECUTE_RETRY_ERROR = -100002,
  /** -100003 */
  MODEL_SERVICE_ERROR = -100003,
  /** -100004 */
  REACH_MAXLOOP_ERROR = -100004,
  /** -100005 */
  ENVIRONMENT_ERROR = -100005,
  /** -100099 */
  UNKNOWN_ERROR = -100099,
}
|
|
21
|
+
|
|
22
|
+
/**
 * Error type that carries an `ErrorStatusEnum` status code alongside the
 * standard `Error` fields.
 *
 * `message` and `stack` are inherited from `Error` and are therefore not
 * redeclared on this class.
 */
export class GUIAgentError extends Error {
  /** Status code describing the failure */
  status: ErrorStatusEnum;

  /**
   * @param status - Status code describing the failure
   * @param message - Human-readable error message
   * @param stack - Optional stack trace to use instead of the one captured at
   *   construction time (e.g. the stack of an underlying error)
   */
  constructor(status: ErrorStatusEnum, message: string, stack?: string) {
    super(message);
    this.status = status;
    // Only override the stack when one is supplied; assigning `undefined`
    // would discard the stack trace captured by the Error constructor.
    if (stack !== undefined) {
      this.stack = stack;
    }
  }
}
|
|
34
|
+
|
|
35
|
+
/**
 * String-literal union of every `StatusEnum` value, derived with a template
 * literal type so it stays in sync with the enum.
 */
export type Status = `${StatusEnum}`;
|
|
36
|
+
/**
 * Status values as a string enum; each member's value is the lowercase,
 * snake_case form of its name.
 */
export enum StatusEnum {
  INIT = 'init',
  RUNNING = 'running',
  PAUSE = 'pause',
  END = 'end',
  CALL_USER = 'call_user',

  /**
   * @deprecated kept for backward compatibility
   */
  MAX_LOOP = 'max_loop',

  USER_STOPPED = 'user_stopped',
  ERROR = 'error',
}
|
|
49
|
+
/**
 * Response shape holding a list of generation results.
 * NOTE(review): presumably the payload returned by the VLM service — confirm against the caller.
 */
export interface VlmResponse {
  /** One entry per generation result */
  generate_resp: Array<{
    input: string;
    prediction: string;
    uid: string;
  }>;
}
|