@midscene/core 0.26.5-beta-20250814095614.0 → 0.26.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/es/ai-model/action-executor.mjs +139 -0
- package/dist/es/ai-model/action-executor.mjs.map +1 -0
- package/dist/es/ai-model/common.mjs +219 -0
- package/dist/es/ai-model/common.mjs.map +1 -0
- package/dist/es/ai-model/index.mjs +10 -0
- package/dist/es/ai-model/inspect.mjs +317 -0
- package/dist/es/ai-model/inspect.mjs.map +1 -0
- package/dist/es/ai-model/llm-planning.mjs +85 -0
- package/dist/es/ai-model/llm-planning.mjs.map +1 -0
- package/dist/es/ai-model/prompt/assertion.mjs +55 -0
- package/dist/es/ai-model/prompt/assertion.mjs.map +1 -0
- package/dist/es/ai-model/prompt/common.mjs +7 -0
- package/dist/es/ai-model/prompt/common.mjs.map +1 -0
- package/dist/es/ai-model/prompt/describe.mjs +44 -0
- package/dist/es/ai-model/prompt/describe.mjs.map +1 -0
- package/dist/es/ai-model/prompt/extraction.mjs +137 -0
- package/dist/es/ai-model/prompt/extraction.mjs.map +1 -0
- package/dist/es/ai-model/prompt/llm-locator.mjs +275 -0
- package/dist/es/ai-model/prompt/llm-locator.mjs.map +1 -0
- package/dist/es/ai-model/prompt/llm-planning.mjs +359 -0
- package/dist/es/ai-model/prompt/llm-planning.mjs.map +1 -0
- package/dist/es/ai-model/prompt/llm-section-locator.mjs +47 -0
- package/dist/es/ai-model/prompt/llm-section-locator.mjs.map +1 -0
- package/dist/es/ai-model/prompt/playwright-generator.mjs +117 -0
- package/dist/es/ai-model/prompt/playwright-generator.mjs.map +1 -0
- package/dist/es/ai-model/prompt/ui-tars-locator.mjs +34 -0
- package/dist/es/ai-model/prompt/ui-tars-locator.mjs.map +1 -0
- package/dist/es/ai-model/prompt/ui-tars-planning.mjs +36 -0
- package/dist/es/ai-model/prompt/ui-tars-planning.mjs.map +1 -0
- package/dist/es/ai-model/prompt/util.mjs +123 -0
- package/dist/es/ai-model/prompt/util.mjs.map +1 -0
- package/dist/es/ai-model/prompt/yaml-generator.mjs +219 -0
- package/dist/es/ai-model/prompt/yaml-generator.mjs.map +1 -0
- package/dist/es/ai-model/service-caller/index.mjs +413 -0
- package/dist/es/ai-model/service-caller/index.mjs.map +1 -0
- package/dist/es/ai-model/ui-tars-planning.mjs +235 -0
- package/dist/es/ai-model/ui-tars-planning.mjs.map +1 -0
- package/dist/es/image/index.mjs +2 -0
- package/dist/es/index.mjs +7 -2360
- package/dist/es/index.mjs.map +1 -1
- package/dist/es/insight/index.mjs +261 -0
- package/dist/es/insight/index.mjs.map +1 -0
- package/dist/es/insight/utils.mjs +19 -0
- package/dist/es/insight/utils.mjs.map +1 -0
- package/dist/es/types.mjs +11 -0
- package/dist/es/types.mjs.map +1 -0
- package/dist/es/utils.mjs +2 -2
- package/dist/es/yaml.mjs +0 -0
- package/dist/lib/ai-model/action-executor.js +173 -0
- package/dist/lib/ai-model/action-executor.js.map +1 -0
- package/dist/lib/ai-model/common.js +289 -0
- package/dist/lib/ai-model/common.js.map +1 -0
- package/dist/lib/ai-model/index.js +103 -0
- package/dist/lib/ai-model/index.js.map +1 -0
- package/dist/lib/ai-model/inspect.js +360 -0
- package/dist/lib/ai-model/inspect.js.map +1 -0
- package/dist/lib/ai-model/llm-planning.js +119 -0
- package/dist/lib/ai-model/llm-planning.js.map +1 -0
- package/dist/lib/ai-model/prompt/assertion.js +92 -0
- package/dist/lib/ai-model/prompt/assertion.js.map +1 -0
- package/dist/lib/ai-model/prompt/common.js +41 -0
- package/dist/lib/ai-model/prompt/common.js.map +1 -0
- package/dist/lib/ai-model/prompt/describe.js +78 -0
- package/dist/lib/ai-model/prompt/describe.js.map +1 -0
- package/dist/lib/ai-model/prompt/extraction.js +177 -0
- package/dist/lib/ai-model/prompt/extraction.js.map +1 -0
- package/dist/lib/ai-model/prompt/llm-locator.js +315 -0
- package/dist/lib/ai-model/prompt/llm-locator.js.map +1 -0
- package/dist/lib/ai-model/prompt/llm-planning.js +415 -0
- package/dist/lib/ai-model/prompt/llm-planning.js.map +1 -0
- package/dist/lib/ai-model/prompt/llm-section-locator.js +84 -0
- package/dist/lib/ai-model/prompt/llm-section-locator.js.map +1 -0
- package/dist/lib/ai-model/prompt/playwright-generator.js +178 -0
- package/dist/lib/ai-model/prompt/playwright-generator.js.map +1 -0
- package/dist/lib/ai-model/prompt/ui-tars-locator.js +68 -0
- package/dist/lib/ai-model/prompt/ui-tars-locator.js.map +1 -0
- package/dist/lib/ai-model/prompt/ui-tars-planning.js +73 -0
- package/dist/lib/ai-model/prompt/ui-tars-planning.js.map +1 -0
- package/dist/lib/ai-model/prompt/util.js +175 -0
- package/dist/lib/ai-model/prompt/util.js.map +1 -0
- package/dist/lib/ai-model/prompt/yaml-generator.js +280 -0
- package/dist/lib/ai-model/prompt/yaml-generator.js.map +1 -0
- package/dist/lib/ai-model/service-caller/index.js +496 -0
- package/dist/lib/ai-model/service-caller/index.js.map +1 -0
- package/dist/lib/ai-model/ui-tars-planning.js +272 -0
- package/dist/lib/ai-model/ui-tars-planning.js.map +1 -0
- package/dist/lib/image/index.js +56 -0
- package/dist/lib/image/index.js.map +1 -0
- package/dist/lib/index.js +21 -2393
- package/dist/lib/index.js.map +1 -1
- package/dist/lib/insight/index.js +295 -0
- package/dist/lib/insight/index.js.map +1 -0
- package/dist/lib/insight/utils.js +53 -0
- package/dist/lib/insight/utils.js.map +1 -0
- package/dist/lib/types.js +82 -0
- package/dist/lib/types.js.map +1 -0
- package/dist/lib/utils.js +2 -2
- package/dist/lib/yaml.js +20 -0
- package/dist/lib/yaml.js.map +1 -0
- package/dist/types/ai-model/action-executor.d.ts +19 -0
- package/dist/types/ai-model/common.d.ts +34 -0
- package/dist/types/ai-model/index.d.ts +11 -0
- package/dist/types/ai-model/inspect.d.ts +49 -0
- package/dist/types/ai-model/llm-planning.d.ts +10 -0
- package/dist/types/ai-model/prompt/assertion.d.ts +5 -0
- package/dist/types/ai-model/prompt/common.d.ts +2 -0
- package/dist/types/ai-model/prompt/describe.d.ts +1 -0
- package/dist/types/ai-model/prompt/extraction.d.ts +4 -0
- package/dist/types/ai-model/prompt/llm-locator.d.ts +9 -0
- package/dist/types/ai-model/prompt/llm-planning.d.ts +15 -0
- package/dist/types/ai-model/prompt/llm-section-locator.d.ts +6 -0
- package/dist/types/ai-model/prompt/playwright-generator.d.ts +25 -0
- package/dist/types/ai-model/prompt/ui-tars-locator.d.ts +1 -0
- package/dist/types/ai-model/prompt/ui-tars-planning.d.ts +2 -0
- package/dist/types/ai-model/prompt/util.d.ts +45 -0
- package/dist/types/ai-model/prompt/yaml-generator.d.ts +99 -0
- package/dist/types/ai-model/service-caller/index.d.ts +26 -0
- package/dist/types/ai-model/ui-tars-planning.d.ts +76 -0
- package/dist/types/image/index.d.ts +1 -0
- package/dist/types/index.d.ts +9 -1289
- package/dist/types/insight/index.d.ts +26 -0
- package/dist/types/insight/utils.d.ts +2 -0
- package/dist/types/tree.d.ts +1 -11
- package/dist/types/types.d.ts +399 -0
- package/dist/types/utils.d.ts +27 -47
- package/dist/types/yaml.d.ts +172 -0
- package/package.json +6 -6
- package/dist/es/ai-model.mjs +0 -2502
- package/dist/es/ai-model.mjs.map +0 -1
- package/dist/lib/ai-model.js +0 -2622
- package/dist/lib/ai-model.js.map +0 -1
- package/dist/types/ai-model.d.ts +0 -596
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
import { UITarsModelVersion, uiTarsModelVersion, vlLocateMode } from "@midscene/shared/env";
|
|
2
|
+
import { resizeImgBase64 } from "@midscene/shared/img";
|
|
3
|
+
import { getDebug } from "@midscene/shared/logger";
|
|
4
|
+
import { transformHotkeyInput } from "@midscene/shared/us-keyboard-layout";
|
|
5
|
+
import { assert } from "@midscene/shared/utils";
|
|
6
|
+
import { actionParser } from "@ui-tars/action-parser";
|
|
7
|
+
import { AIActionType } from "./common.mjs";
|
|
8
|
+
import { getSummary, getUiTarsPlanningPrompt } from "./prompt/ui-tars-planning.mjs";
|
|
9
|
+
import { call } from "./service-caller/index.mjs";
|
|
10
|
+
// Module-level debug logger created with the 'ui-tars-planning' label; used below to trace
// the parsed model output and image-resize decisions.
const debug = getDebug('ui-tars-planning');
|
|
11
|
+
// Edge length of the square box built around a point (same units as the width/height bounds).
const bboxSize = 10;

/**
 * Build a small square bounding box centred on a point, kept inside the given bounds.
 *
 * Every edge is clamped into [0, width] / [0, height], so a point lying outside the
 * bounds collapses to a degenerate box on the nearest border instead of producing an
 * inverted box (left > right or top > bottom).
 *
 * @param {{x: number, y: number}} point - centre of the box
 * @param {number} width - upper bound for x coordinates
 * @param {number} height - upper bound for y coordinates
 * @returns {[number, number, number, number]} rounded [left, top, right, bottom]
 */
const pointToBbox = (point, width, height)=>{
    const half = bboxSize / 2;
    // Clamp first, round afterwards: same order as before for in-range points.
    const clampRound = (value, upper)=>Math.round(Math.min(Math.max(value, 0), upper));
    return [
        clampRound(point.x - half, width),
        clampRound(point.y - half, height),
        clampRound(point.x + half, width),
        clampRound(point.y + half, height)
    ];
};
|
|
18
|
+
/**
 * Ask the UI-TARS model for the next step(s) and translate its reply into planning actions.
 *
 * The planning prompt concatenated with the user instruction is sent as the first user
 * message, followed by the supplied conversation history. The reply text is normalised
 * (bbox tags -> points), parsed with `actionParser`, and each parsed action is mapped to
 * one or more planning actions.
 *
 * @param {object} options
 * @param {string} options.userInstruction - instruction appended to the planning prompt
 * @param {Array} options.conversationHistory - prior chat messages, spread after the prompt message
 * @param {{width: number, height: number}} options.size - screen size used to scale model coordinates
 * @returns {Promise<{actions: Array, actionsFromModel: Array, action_summary: string, usage: *, rawResponse: string}>}
 * @throws {Error} when no planning action could be derived from the reply; the error's
 *   `cause` carries the raw prediction and the parsed actions. The `assert` calls on
 *   required inputs presumably throw as well when the input is missing.
 */
async function vlmPlanning(options) {
    const { conversationHistory, userInstruction, size } = options;
    const systemPrompt = getUiTarsPlanningPrompt() + userInstruction;
    const res = await call([
        {
            role: 'user',
            content: systemPrompt
        },
        ...conversationHistory
    ], AIActionType.INSPECT_ELEMENT);
    const convertedText = convertBboxToCoordinates(res.content);
    const modelVer = uiTarsModelVersion();
    const { parsed } = actionParser({
        prediction: convertedText,
        factor: [
            1000,
            1000
        ],
        screenContext: {
            width: size.width,
            height: size.height
        },
        modelVer: modelVer || void 0
    });
    debug('modelVer', modelVer, 'parsed', JSON.stringify(parsed));
    const transformActions = [];
    for (const action of parsed) {
        const thought = action.thought || '';
        const inputs = action.action_inputs;
        switch (action.action_type) {
            case 'click': {
                assert(inputs.start_box, 'start_box is required');
                const point = getPoint(inputs.start_box, size);
                // Build a fresh locate object (and bbox array) per action so the two
                // pushed actions never share mutable state.
                const buildLocate = ()=>({
                        prompt: thought,
                        bbox: pointToBbox({
                            x: point[0],
                            y: point[1]
                        }, size.width, size.height)
                    });
                // A click is emitted as a Locate step followed by the Tap itself.
                transformActions.push({
                    type: 'Locate',
                    param: {},
                    locate: buildLocate()
                });
                transformActions.push({
                    type: 'Tap',
                    locate: buildLocate(),
                    param: thought
                });
                break;
            }
            case 'drag': {
                assert(inputs.start_box, 'start_box is required');
                assert(inputs.end_box, 'end_box is required');
                const startPoint = getPoint(inputs.start_box, size);
                const endPoint = getPoint(inputs.end_box, size);
                transformActions.push({
                    type: 'Drag',
                    param: {
                        start_box: {
                            x: startPoint[0],
                            y: startPoint[1]
                        },
                        end_box: {
                            x: endPoint[0],
                            y: endPoint[1]
                        }
                    },
                    locate: null,
                    thought
                });
                break;
            }
            case 'type':
                transformActions.push({
                    type: 'Input',
                    param: {
                        value: inputs.content
                    },
                    locate: null,
                    thought
                });
                break;
            case 'scroll':
                transformActions.push({
                    type: 'Scroll',
                    param: {
                        direction: inputs.direction
                    },
                    locate: null,
                    thought
                });
                break;
            case 'finished':
                transformActions.push({
                    type: 'Finished',
                    param: {},
                    locate: null,
                    thought
                });
                break;
            case 'hotkey':
                if (inputs.key) {
                    transformActions.push({
                        type: 'KeyboardPress',
                        param: {
                            value: transformHotkeyInput(inputs.key)
                        },
                        locate: null,
                        thought
                    });
                } else {
                    console.warn('No key found in action: hotkey. Will not perform action.');
                }
                break;
            case 'wait':
                // The model's wait request is always mapped to a fixed 1000 ms sleep.
                transformActions.push({
                    type: 'Sleep',
                    param: {
                        timeMs: 1000
                    },
                    locate: null,
                    thought
                });
                break;
            case 'androidBackButton':
                transformActions.push({
                    type: 'AndroidBackButton',
                    param: {},
                    locate: null,
                    thought
                });
                break;
            case 'androidHomeButton':
                transformActions.push({
                    type: 'AndroidHomeButton',
                    param: {},
                    locate: null,
                    thought
                });
                break;
            case 'androidRecentAppsButton':
                // Carries `locate: null` and `thought` like the other Android button actions.
                transformActions.push({
                    type: 'AndroidRecentAppsButton',
                    param: {},
                    locate: null,
                    thought
                });
                break;
            case 'androidLongPress': {
                assert(inputs.start_coords, 'start_coords is required for androidLongPress');
                const point = inputs.start_coords;
                transformActions.push({
                    type: 'AndroidLongPress',
                    param: {
                        x: point[0],
                        y: point[1],
                        duration: 1000
                    },
                    locate: null,
                    thought
                });
                break;
            }
            case 'androidPull': {
                const pullDirection = inputs.direction || 'down';
                const startPoint = inputs.start_coords ? {
                    x: inputs.start_coords[0],
                    y: inputs.start_coords[1]
                } : void 0;
                transformActions.push({
                    type: 'AndroidPull',
                    param: {
                        direction: pullDirection,
                        startPoint,
                        distance: inputs.distance,
                        duration: inputs.duration || 500
                    },
                    locate: null,
                    thought
                });
                break;
            }
            default:
                // Unknown action types are still skipped, but leave a trace for debugging.
                debug('unsupported action type, ignored', action.action_type);
        }
    }
    if (0 === transformActions.length) throw new Error(`No actions found, response: ${res.content}`, {
        cause: {
            prediction: res.content,
            parsed
        }
    });
    return {
        actions: transformActions,
        actionsFromModel: parsed,
        action_summary: getSummary(res.content),
        usage: res.usage,
        rawResponse: JSON.stringify(res.content, void 0, 2)
    };
}
|
|
193
|
+
/**
 * Rewrite `<bbox>x1 y1 x2 y2</bbox>` tags as the box's centre point `(x,y)`.
 *
 * Every literal `[EOS]` marker is removed first, and the final text is trimmed.
 *
 * @param {string} text - text that may contain bbox tags
 * @returns {string} text with each bbox tag replaced by its floored centre point
 */
function convertBboxToCoordinates(text) {
    // Floored midpoint of two integer strings.
    const midpoint = (first, second)=>Math.floor((Number.parseInt(first, 10) + Number.parseInt(second, 10)) / 2);
    return text
        .replaceAll('[EOS]', '')
        .replace(/<bbox>(\d+)\s+(\d+)\s+(\d+)\s+(\d+)<\/bbox>/g, (_tag, left, top, right, bottom)=>`(${midpoint(left, right)},${midpoint(top, bottom)})`)
        .trim();
}
|
|
207
|
+
/**
 * Scale a box string from the action parser to the given size.
 *
 * Only the first two entries of the JSON array are used; each is multiplied by the
 * matching dimension. Any further entries are ignored.
 *
 * @param {string} startBox - JSON array text such as "[0.5, 0.25]"
 * @param {{width: number, height: number}} size - dimensions to scale by
 * @returns {[number, number]} [x * size.width, y * size.height]
 * @throws {Error} when `startBox` is not valid JSON, is not an array, or its first two
 *   entries are not finite numbers (previously these cases produced NaN coordinates or
 *   an opaque destructuring error)
 */
function getPoint(startBox, size) {
    let box;
    try {
        box = JSON.parse(startBox);
    } catch (error) {
        throw new Error(`Invalid box, expected a JSON array string but got: ${startBox}`, {
            cause: error
        });
    }
    if (!Array.isArray(box)) throw new Error(`Invalid box, expected a JSON array but got: ${startBox}`);
    // Number() applies the same coercion the multiplication did, so every input that
    // used to yield finite coordinates still yields exactly the same ones.
    const x = Number(box[0]);
    const y = Number(box[1]);
    if (!Number.isFinite(x) || !Number.isFinite(y)) throw new Error(`Invalid box, expected numeric x and y but got: ${startBox}`);
    return [
        x * size.width,
        y * size.height
    ];
}
|
|
214
|
+
/**
 * Shrink a screenshot when it exceeds the pixel budget applied for UI-TARS v1.5.
 *
 * The image is only touched when the locate mode is 'vlm-ui-tars' and the model version
 * is V1_5; in every other case, or when the image already fits, the input is returned
 * unchanged.
 *
 * @param {string} imageBase64 - image payload handed to `resizeImgBase64`
 * @param {{width: number, height: number}} size - current image dimensions
 * @returns {Promise<string>} the original payload, or the result of `resizeImgBase64`
 */
async function resizeImageForUiTars(imageBase64, size) {
    const targetsUiTarsV15 = 'vlm-ui-tars' === vlLocateMode() && uiTarsModelVersion() === UITarsModelVersion.V1_5;
    if (!targetsUiTarsV15) return imageBase64;
    debug('ui-tars-v1.5, will check image size', size);
    // 16384 * 28 * 28 = 12845056 pixels.
    const pixelBudget = 16384 * 28 * 28;
    const pixelCount = size.width * size.height;
    if (!(pixelCount > pixelBudget)) return imageBase64;
    // Scale both sides by the same factor so the area lands at or below the budget.
    const scale = Math.sqrt(pixelBudget / pixelCount);
    const target = {
        width: Math.floor(size.width * scale),
        height: Math.floor(size.height * scale)
    };
    debug('resize image for ui-tars, new width: %s, new height: %s', target.width, target.height);
    return await resizeImgBase64(imageBase64, target);
}
|
|
233
|
+
export { resizeImageForUiTars, vlmPlanning };
|
|
234
|
+
|
|
235
|
+
//# sourceMappingURL=ui-tars-planning.mjs.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ai-model/ui-tars-planning.mjs","sources":["webpack://@midscene/core/./src/ai-model/ui-tars-planning.ts"],"sourcesContent":["import type {\n AIUsageInfo,\n MidsceneYamlFlowItem,\n PlanningAction,\n Size,\n} from '@/types';\nimport {\n UITarsModelVersion,\n uiTarsModelVersion,\n vlLocateMode,\n} from '@midscene/shared/env';\nimport { resizeImgBase64 } from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { transformHotkeyInput } from '@midscene/shared/us-keyboard-layout';\nimport { assert } from '@midscene/shared/utils';\nimport { actionParser } from '@ui-tars/action-parser';\nimport type { ChatCompletionMessageParam } from 'openai/resources/index';\nimport { AIActionType } from './common';\nimport { getSummary, getUiTarsPlanningPrompt } from './prompt/ui-tars-planning';\nimport { call } from './service-caller/index';\ntype ActionType =\n | 'click'\n | 'drag'\n | 'type'\n | 'hotkey'\n | 'finished'\n | 'scroll'\n | 'wait'\n | 'androidBackButton'\n | 'androidHomeButton'\n | 'androidRecentAppsButton'\n | 'androidLongPress'\n | 'androidPull';\n\nconst debug = getDebug('ui-tars-planning');\nconst bboxSize = 10;\nconst pointToBbox = (\n point: { x: number; y: number },\n width: number,\n height: number,\n): [number, number, number, number] => {\n return [\n Math.round(Math.max(point.x - bboxSize / 2, 0)),\n Math.round(Math.max(point.y - bboxSize / 2, 0)),\n Math.round(Math.min(point.x + bboxSize / 2, width)),\n Math.round(Math.min(point.y + bboxSize / 2, height)),\n ];\n};\n\nexport async function vlmPlanning(options: {\n userInstruction: string;\n conversationHistory: ChatCompletionMessageParam[];\n size: { width: number; height: number };\n}): Promise<{\n actions: PlanningAction<any>[];\n actionsFromModel: ReturnType<typeof actionParser>['parsed'];\n action_summary: string;\n yamlFlow?: MidsceneYamlFlowItem[];\n usage?: AIUsageInfo;\n rawResponse?: string;\n}> {\n const { conversationHistory, userInstruction, size 
} = options;\n const systemPrompt = getUiTarsPlanningPrompt() + userInstruction;\n\n const res = await call(\n [\n {\n role: 'user',\n content: systemPrompt,\n },\n ...conversationHistory,\n ],\n AIActionType.INSPECT_ELEMENT,\n );\n const convertedText = convertBboxToCoordinates(res.content);\n\n const modelVer = uiTarsModelVersion();\n\n const { parsed } = actionParser({\n prediction: convertedText,\n factor: [1000, 1000],\n screenContext: {\n width: size.width,\n height: size.height,\n },\n modelVer: modelVer || undefined,\n });\n\n debug('modelVer', modelVer, 'parsed', JSON.stringify(parsed));\n\n const transformActions: PlanningAction[] = [];\n parsed.forEach((action) => {\n if (action.action_type === 'click') {\n assert(action.action_inputs.start_box, 'start_box is required');\n const point = getPoint(action.action_inputs.start_box, size);\n transformActions.push({\n type: 'Locate',\n param: {},\n locate: {\n prompt: action.thought || '',\n bbox: pointToBbox(\n { x: point[0], y: point[1] },\n size.width,\n size.height,\n ),\n },\n });\n transformActions.push({\n type: 'Tap',\n locate: {\n prompt: action.thought || '',\n bbox: pointToBbox(\n { x: point[0], y: point[1] },\n size.width,\n size.height,\n ),\n },\n param: action.thought || '',\n });\n } else if (action.action_type === 'drag') {\n assert(action.action_inputs.start_box, 'start_box is required');\n assert(action.action_inputs.end_box, 'end_box is required');\n const startPoint = getPoint(action.action_inputs.start_box, size);\n const endPoint = getPoint(action.action_inputs.end_box, size);\n transformActions.push({\n type: 'Drag',\n param: {\n start_box: { x: startPoint[0], y: startPoint[1] },\n end_box: { x: endPoint[0], y: endPoint[1] },\n },\n locate: null,\n thought: action.thought || '',\n });\n } else if (action.action_type === 'type') {\n transformActions.push({\n type: 'Input',\n param: {\n value: action.action_inputs.content,\n },\n locate: null,\n thought: action.thought || '',\n });\n } 
else if (action.action_type === 'scroll') {\n transformActions.push({\n type: 'Scroll',\n param: {\n direction: action.action_inputs.direction,\n },\n locate: null,\n thought: action.thought || '',\n });\n } else if (action.action_type === 'finished') {\n transformActions.push({\n type: 'Finished',\n param: {},\n locate: null,\n thought: action.thought || '',\n });\n } else if (action.action_type === 'hotkey') {\n if (!action.action_inputs.key) {\n console.warn(\n 'No key found in action: hotkey. Will not perform action.',\n );\n } else {\n const keys = transformHotkeyInput(action.action_inputs.key);\n\n transformActions.push({\n type: 'KeyboardPress',\n param: {\n value: keys,\n },\n locate: null,\n thought: action.thought || '',\n });\n }\n } else if (action.action_type === 'wait') {\n transformActions.push({\n type: 'Sleep',\n param: {\n timeMs: 1000,\n },\n locate: null,\n thought: action.thought || '',\n });\n } else if (action.action_type === 'androidBackButton') {\n transformActions.push({\n type: 'AndroidBackButton',\n param: {},\n locate: null,\n thought: action.thought || '',\n });\n } else if (action.action_type === 'androidHomeButton') {\n transformActions.push({\n type: 'AndroidHomeButton',\n param: {},\n locate: null,\n thought: action.thought || '',\n });\n } else if (action.action_type === 'androidRecentAppsButton') {\n transformActions.push({\n type: 'AndroidRecentAppsButton',\n param: {},\n });\n } else if (action.action_type === 'androidLongPress') {\n assert(\n action.action_inputs.start_coords,\n 'start_coords is required for androidLongPress',\n );\n const point = action.action_inputs.start_coords;\n transformActions.push({\n type: 'AndroidLongPress',\n param: {\n x: point[0],\n y: point[1],\n duration: 1000,\n },\n locate: null,\n thought: action.thought || '',\n });\n } else if (action.action_type === 'androidPull') {\n const pullDirection = action.action_inputs.direction || 'down';\n const startPoint = action.action_inputs.start_coords\n ? 
{\n x: action.action_inputs.start_coords[0],\n y: action.action_inputs.start_coords[1],\n }\n : undefined;\n\n transformActions.push({\n type: 'AndroidPull',\n param: {\n direction: pullDirection as 'up' | 'down',\n startPoint,\n distance: (action.action_inputs as any).distance,\n duration: (action.action_inputs as any).duration || 500,\n },\n locate: null,\n thought: action.thought || '',\n });\n }\n });\n\n if (transformActions.length === 0) {\n throw new Error(`No actions found, response: ${res.content}`, {\n cause: {\n prediction: res.content,\n parsed,\n },\n });\n }\n\n return {\n actions: transformActions,\n actionsFromModel: parsed,\n action_summary: getSummary(res.content),\n usage: res.usage,\n rawResponse: JSON.stringify(res.content, undefined, 2),\n };\n}\n\n/**\n * Converts bounding box notation to coordinate points\n * @param text - The text containing bbox tags to be converted\n * @returns The text with bbox tags replaced by coordinate points\n */\nfunction convertBboxToCoordinates(text: string): string {\n // Match the four numbers after <bbox>\n const pattern = /<bbox>(\\d+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)<\\/bbox>/g;\n\n function replaceMatch(\n match: string,\n x1: string,\n y1: string,\n x2: string,\n y2: string,\n ): string {\n // Convert strings to numbers and calculate center point\n const x1Num = Number.parseInt(x1, 10);\n const y1Num = Number.parseInt(y1, 10);\n const x2Num = Number.parseInt(x2, 10);\n const y2Num = Number.parseInt(y2, 10);\n\n // Use Math.floor to truncate and calculate center point\n const x = Math.floor((x1Num + x2Num) / 2);\n const y = Math.floor((y1Num + y2Num) / 2);\n\n // Return formatted coordinate string\n return `(${x},${y})`;\n }\n\n // Remove [EOS] and replace <bbox> coordinates\n const cleanedText = text.replace(/\\[EOS\\]/g, '');\n return cleanedText.replace(pattern, replaceMatch).trim();\n}\n\nfunction getPoint(startBox: string, size: { width: number; height: number }) {\n const [x, y] = JSON.parse(startBox);\n 
return [x * size.width, y * size.height];\n}\n\ninterface BaseAction {\n action_type: ActionType;\n action_inputs: Record<string, any>;\n reflection: string | null;\n thought: string | null;\n}\n\ninterface ClickAction extends BaseAction {\n action_type: 'click';\n action_inputs: {\n start_box: string; // JSON string of [x, y] coordinates\n };\n}\n\ninterface DragAction extends BaseAction {\n action_type: 'drag';\n action_inputs: {\n start_box: string; // JSON string of [x, y] coordinates\n end_box: string; // JSON string of [x, y] coordinates\n };\n}\n\ninterface WaitAction extends BaseAction {\n action_type: 'wait';\n action_inputs: {\n time: string; // JSON string of [x, y] coordinates\n };\n}\n\ninterface TypeAction extends BaseAction {\n action_type: 'type';\n action_inputs: {\n content: string;\n };\n}\n\ninterface HotkeyAction extends BaseAction {\n action_type: 'hotkey';\n action_inputs: {\n key: string;\n };\n}\n\ninterface ScrollAction extends BaseAction {\n action_type: 'scroll';\n action_inputs: {\n direction: 'up' | 'down';\n };\n}\n\ninterface FinishedAction extends BaseAction {\n action_type: 'finished';\n action_inputs: Record<string, never>;\n}\n\ninterface AndroidLongPressAction extends BaseAction {\n action_type: 'androidLongPress';\n action_inputs: {\n start_coords: [number, number]; // Coordinates for long press\n duration?: number; // Duration in milliseconds\n };\n}\n\nexport type Action =\n | ClickAction\n | DragAction\n | TypeAction\n | HotkeyAction\n | ScrollAction\n | FinishedAction\n | WaitAction\n | AndroidLongPressAction;\n\nexport async function resizeImageForUiTars(imageBase64: string, size: Size) {\n if (\n vlLocateMode() === 'vlm-ui-tars' &&\n uiTarsModelVersion() === UITarsModelVersion.V1_5\n ) {\n debug('ui-tars-v1.5, will check image size', size);\n const currentPixels = size.width * size.height;\n const maxPixels = 16384 * 28 * 28; //\n if (currentPixels > maxPixels) {\n const resizeFactor = Math.sqrt(maxPixels / 
currentPixels);\n const newWidth = Math.floor(size.width * resizeFactor);\n const newHeight = Math.floor(size.height * resizeFactor);\n debug(\n 'resize image for ui-tars, new width: %s, new height: %s',\n newWidth,\n newHeight,\n );\n const resizedImage = await resizeImgBase64(imageBase64, {\n width: newWidth,\n height: newHeight,\n });\n return resizedImage;\n }\n }\n return imageBase64;\n}\n"],"names":["debug","getDebug","bboxSize","pointToBbox","point","width","height","Math","vlmPlanning","options","conversationHistory","userInstruction","size","systemPrompt","getUiTarsPlanningPrompt","res","call","AIActionType","convertedText","convertBboxToCoordinates","modelVer","uiTarsModelVersion","parsed","actionParser","undefined","JSON","transformActions","action","assert","getPoint","startPoint","endPoint","keys","transformHotkeyInput","console","pullDirection","Error","getSummary","text","pattern","replaceMatch","match","x1","y1","x2","y2","x1Num","Number","y1Num","x2Num","y2Num","x","y","cleanedText","startBox","resizeImageForUiTars","imageBase64","vlLocateMode","UITarsModelVersion","currentPixels","maxPixels","resizeFactor","newWidth","newHeight","resizedImage","resizeImgBase64"],"mappings":";;;;;;;;;AAkCA,MAAMA,QAAQC,SAAS;AACvB,MAAMC,WAAW;AACjB,MAAMC,cAAc,CAClBC,OACAC,OACAC,SAEO;QACLC,KAAK,KAAK,CAACA,KAAK,GAAG,CAACH,MAAM,CAAC,GAAGF,WAAW,GAAG;QAC5CK,KAAK,KAAK,CAACA,KAAK,GAAG,CAACH,MAAM,CAAC,GAAGF,WAAW,GAAG;QAC5CK,KAAK,KAAK,CAACA,KAAK,GAAG,CAACH,MAAM,CAAC,GAAGF,WAAW,GAAGG;QAC5CE,KAAK,KAAK,CAACA,KAAK,GAAG,CAACH,MAAM,CAAC,GAAGF,WAAW,GAAGI;KAC7C;AAGI,eAAeE,YAAYC,OAIjC;IAQC,MAAM,EAAEC,mBAAmB,EAAEC,eAAe,EAAEC,IAAI,EAAE,GAAGH;IACvD,MAAMI,eAAeC,4BAA4BH;IAEjD,MAAMI,MAAM,MAAMC,KAChB;QACE;YACE,MAAM;YACN,SAASH;QACX;WACGH;KACJ,EACDO,aAAa,eAAe;IAE9B,MAAMC,gBAAgBC,yBAAyBJ,IAAI,OAAO;IAE1D,MAAMK,WAAWC;IAEjB,MAAM,EAAEC,MAAM,EAAE,GAAGC,aAAa;QAC9B,YAAYL;QACZ,QAAQ;YAAC;YAAM;SAAK;QACpB,eAAe;YACb,OAAON,KAAK,KAAK;YACjB,QAAQA,KAAK,MAAM;QACrB;QACA,UAAUQ,YAAYI;IACxB;IAEAxB,MAAM,YAAYoB,UAAU,UA
AUK,KAAK,SAAS,CAACH;IAErD,MAAMI,mBAAqC,EAAE;IAC7CJ,OAAO,OAAO,CAAC,CAACK;QACd,IAAIA,AAAuB,YAAvBA,OAAO,WAAW,EAAc;YAClCC,OAAOD,OAAO,aAAa,CAAC,SAAS,EAAE;YACvC,MAAMvB,QAAQyB,SAASF,OAAO,aAAa,CAAC,SAAS,EAAEf;YACvDc,iBAAiB,IAAI,CAAC;gBACpB,MAAM;gBACN,OAAO,CAAC;gBACR,QAAQ;oBACN,QAAQC,OAAO,OAAO,IAAI;oBAC1B,MAAMxB,YACJ;wBAAE,GAAGC,KAAK,CAAC,EAAE;wBAAE,GAAGA,KAAK,CAAC,EAAE;oBAAC,GAC3BQ,KAAK,KAAK,EACVA,KAAK,MAAM;gBAEf;YACF;YACAc,iBAAiB,IAAI,CAAC;gBACpB,MAAM;gBACN,QAAQ;oBACN,QAAQC,OAAO,OAAO,IAAI;oBAC1B,MAAMxB,YACJ;wBAAE,GAAGC,KAAK,CAAC,EAAE;wBAAE,GAAGA,KAAK,CAAC,EAAE;oBAAC,GAC3BQ,KAAK,KAAK,EACVA,KAAK,MAAM;gBAEf;gBACA,OAAOe,OAAO,OAAO,IAAI;YAC3B;QACF,OAAO,IAAIA,AAAuB,WAAvBA,OAAO,WAAW,EAAa;YACxCC,OAAOD,OAAO,aAAa,CAAC,SAAS,EAAE;YACvCC,OAAOD,OAAO,aAAa,CAAC,OAAO,EAAE;YACrC,MAAMG,aAAaD,SAASF,OAAO,aAAa,CAAC,SAAS,EAAEf;YAC5D,MAAMmB,WAAWF,SAASF,OAAO,aAAa,CAAC,OAAO,EAAEf;YACxDc,iBAAiB,IAAI,CAAC;gBACpB,MAAM;gBACN,OAAO;oBACL,WAAW;wBAAE,GAAGI,UAAU,CAAC,EAAE;wBAAE,GAAGA,UAAU,CAAC,EAAE;oBAAC;oBAChD,SAAS;wBAAE,GAAGC,QAAQ,CAAC,EAAE;wBAAE,GAAGA,QAAQ,CAAC,EAAE;oBAAC;gBAC5C;gBACA,QAAQ;gBACR,SAASJ,OAAO,OAAO,IAAI;YAC7B;QACF,OAAO,IAAIA,AAAuB,WAAvBA,OAAO,WAAW,EAC3BD,iBAAiB,IAAI,CAAC;YACpB,MAAM;YACN,OAAO;gBACL,OAAOC,OAAO,aAAa,CAAC,OAAO;YACrC;YACA,QAAQ;YACR,SAASA,OAAO,OAAO,IAAI;QAC7B;aACK,IAAIA,AAAuB,aAAvBA,OAAO,WAAW,EAC3BD,iBAAiB,IAAI,CAAC;YACpB,MAAM;YACN,OAAO;gBACL,WAAWC,OAAO,aAAa,CAAC,SAAS;YAC3C;YACA,QAAQ;YACR,SAASA,OAAO,OAAO,IAAI;QAC7B;aACK,IAAIA,AAAuB,eAAvBA,OAAO,WAAW,EAC3BD,iBAAiB,IAAI,CAAC;YACpB,MAAM;YACN,OAAO,CAAC;YACR,QAAQ;YACR,SAASC,OAAO,OAAO,IAAI;QAC7B;aACK,IAAIA,AAAuB,aAAvBA,OAAO,WAAW,EAC3B,IAAKA,OAAO,aAAa,CAAC,GAAG,EAItB;YACL,MAAMK,OAAOC,qBAAqBN,OAAO,aAAa,CAAC,GAAG;YAE1DD,iBAAiB,IAAI,CAAC;gBACpB,MAAM;gBACN,OAAO;oBACL,OAAOM;gBACT;gBACA,QAAQ;gBACR,SAASL,OAAO,OAAO,IAAI;YAC7B;QACF,OAdEO,QAAQ,IAAI,CACV;aAcC,IAAIP,AAAuB,WAAvBA,OAAO,WAAW,EAC3BD,iBAAiB,IAAI,CAAC;YACpB,MAAM;YACN,OAAO;gBACL,QAAQ;YACV;YACA,QAAQ;YACR,SAASC,OAAO,OAAO,IAAI;QAC7B;aACK,IAAIA,AAAuB,wBAAvBA,OAAO,WAAW,EAC3BD,iBAAiB,IAAI,CAAC;YAC
pB,MAAM;YACN,OAAO,CAAC;YACR,QAAQ;YACR,SAASC,OAAO,OAAO,IAAI;QAC7B;aACK,IAAIA,AAAuB,wBAAvBA,OAAO,WAAW,EAC3BD,iBAAiB,IAAI,CAAC;YACpB,MAAM;YACN,OAAO,CAAC;YACR,QAAQ;YACR,SAASC,OAAO,OAAO,IAAI;QAC7B;aACK,IAAIA,AAAuB,8BAAvBA,OAAO,WAAW,EAC3BD,iBAAiB,IAAI,CAAC;YACpB,MAAM;YACN,OAAO,CAAC;QACV;aACK,IAAIC,AAAuB,uBAAvBA,OAAO,WAAW,EAAyB;YACpDC,OACED,OAAO,aAAa,CAAC,YAAY,EACjC;YAEF,MAAMvB,QAAQuB,OAAO,aAAa,CAAC,YAAY;YAC/CD,iBAAiB,IAAI,CAAC;gBACpB,MAAM;gBACN,OAAO;oBACL,GAAGtB,KAAK,CAAC,EAAE;oBACX,GAAGA,KAAK,CAAC,EAAE;oBACX,UAAU;gBACZ;gBACA,QAAQ;gBACR,SAASuB,OAAO,OAAO,IAAI;YAC7B;QACF,OAAO,IAAIA,AAAuB,kBAAvBA,OAAO,WAAW,EAAoB;YAC/C,MAAMQ,gBAAgBR,OAAO,aAAa,CAAC,SAAS,IAAI;YACxD,MAAMG,aAAaH,OAAO,aAAa,CAAC,YAAY,GAChD;gBACE,GAAGA,OAAO,aAAa,CAAC,YAAY,CAAC,EAAE;gBACvC,GAAGA,OAAO,aAAa,CAAC,YAAY,CAAC,EAAE;YACzC,IACAH;YAEJE,iBAAiB,IAAI,CAAC;gBACpB,MAAM;gBACN,OAAO;oBACL,WAAWS;oBACXL;oBACA,UAAWH,OAAO,aAAa,CAAS,QAAQ;oBAChD,UAAWA,OAAO,aAAa,CAAS,QAAQ,IAAI;gBACtD;gBACA,QAAQ;gBACR,SAASA,OAAO,OAAO,IAAI;YAC7B;QACF;IACF;IAEA,IAAID,AAA4B,MAA5BA,iBAAiB,MAAM,EACzB,MAAM,IAAIU,MAAM,CAAC,4BAA4B,EAAErB,IAAI,OAAO,EAAE,EAAE;QAC5D,OAAO;YACL,YAAYA,IAAI,OAAO;YACvBO;QACF;IACF;IAGF,OAAO;QACL,SAASI;QACT,kBAAkBJ;QAClB,gBAAgBe,WAAWtB,IAAI,OAAO;QACtC,OAAOA,IAAI,KAAK;QAChB,aAAaU,KAAK,SAAS,CAACV,IAAI,OAAO,EAAES,QAAW;IACtD;AACF;AAOA,SAASL,yBAAyBmB,IAAY;IAE5C,MAAMC,UAAU;IAEhB,SAASC,aACPC,KAAa,EACbC,EAAU,EACVC,EAAU,EACVC,EAAU,EACVC,EAAU;QAGV,MAAMC,QAAQC,OAAO,QAAQ,CAACL,IAAI;QAClC,MAAMM,QAAQD,OAAO,QAAQ,CAACJ,IAAI;QAClC,MAAMM,QAAQF,OAAO,QAAQ,CAACH,IAAI;QAClC,MAAMM,QAAQH,OAAO,QAAQ,CAACF,IAAI;QAGlC,MAAMM,IAAI5C,KAAK,KAAK,CAAEuC,AAAAA,CAAAA,QAAQG,KAAI,IAAK;QACvC,MAAMG,IAAI7C,KAAK,KAAK,CAAEyC,AAAAA,CAAAA,QAAQE,KAAI,IAAK;QAGvC,OAAO,CAAC,CAAC,EAAEC,EAAE,CAAC,EAAEC,EAAE,CAAC,CAAC;IACtB;IAGA,MAAMC,cAAcf,KAAK,OAAO,CAAC,YAAY;IAC7C,OAAOe,YAAY,OAAO,CAACd,SAASC,cAAc,IAAI;AACxD;AAEA,SAASX,SAASyB,QAAgB,EAAE1C,IAAuC;IACzE,MAAM,CAACuC,GAAGC,EAAE,GAAG3B,KAAK,KAAK,CAAC6B;IAC1B,OAAO;QAACH,IAAIvC,KAAK,KAAK;QAAEwC,IAAIxC,KAAK,MAAM;KAAC;AAC1C;AA2EO,eAAe2C
,qBAAqBC,WAAmB,EAAE5C,IAAU;IACxE,IACE6C,AAAmB,kBAAnBA,kBACApC,yBAAyBqC,mBAAmB,IAAI,EAChD;QACA1D,MAAM,uCAAuCY;QAC7C,MAAM+C,gBAAgB/C,KAAK,KAAK,GAAGA,KAAK,MAAM;QAC9C,MAAMgD,YAAY;QAClB,IAAID,gBAAgBC,WAAW;YAC7B,MAAMC,eAAetD,KAAK,IAAI,CAACqD,YAAYD;YAC3C,MAAMG,WAAWvD,KAAK,KAAK,CAACK,KAAK,KAAK,GAAGiD;YACzC,MAAME,YAAYxD,KAAK,KAAK,CAACK,KAAK,MAAM,GAAGiD;YAC3C7D,MACE,2DACA8D,UACAC;YAEF,MAAMC,eAAe,MAAMC,gBAAgBT,aAAa;gBACtD,OAAOM;gBACP,QAAQC;YACV;YACA,OAAOC;QACT;IACF;IACA,OAAOR;AACT"}
|