@midscene/core 0.26.5-beta-20250814095614.0 → 0.26.5-beta-20250814125155.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/es/ai-model/action-executor.mjs +139 -0
- package/dist/es/ai-model/action-executor.mjs.map +1 -0
- package/dist/es/ai-model/common.mjs +219 -0
- package/dist/es/ai-model/common.mjs.map +1 -0
- package/dist/es/ai-model/index.mjs +10 -0
- package/dist/es/ai-model/inspect.mjs +317 -0
- package/dist/es/ai-model/inspect.mjs.map +1 -0
- package/dist/es/ai-model/llm-planning.mjs +85 -0
- package/dist/es/ai-model/llm-planning.mjs.map +1 -0
- package/dist/es/ai-model/prompt/assertion.mjs +55 -0
- package/dist/es/ai-model/prompt/assertion.mjs.map +1 -0
- package/dist/es/ai-model/prompt/common.mjs +7 -0
- package/dist/es/ai-model/prompt/common.mjs.map +1 -0
- package/dist/es/ai-model/prompt/describe.mjs +44 -0
- package/dist/es/ai-model/prompt/describe.mjs.map +1 -0
- package/dist/es/ai-model/prompt/extraction.mjs +137 -0
- package/dist/es/ai-model/prompt/extraction.mjs.map +1 -0
- package/dist/es/ai-model/prompt/llm-locator.mjs +275 -0
- package/dist/es/ai-model/prompt/llm-locator.mjs.map +1 -0
- package/dist/es/ai-model/prompt/llm-planning.mjs +359 -0
- package/dist/es/ai-model/prompt/llm-planning.mjs.map +1 -0
- package/dist/es/ai-model/prompt/llm-section-locator.mjs +47 -0
- package/dist/es/ai-model/prompt/llm-section-locator.mjs.map +1 -0
- package/dist/es/ai-model/prompt/playwright-generator.mjs +117 -0
- package/dist/es/ai-model/prompt/playwright-generator.mjs.map +1 -0
- package/dist/es/ai-model/prompt/ui-tars-locator.mjs +34 -0
- package/dist/es/ai-model/prompt/ui-tars-locator.mjs.map +1 -0
- package/dist/es/ai-model/prompt/ui-tars-planning.mjs +36 -0
- package/dist/es/ai-model/prompt/ui-tars-planning.mjs.map +1 -0
- package/dist/es/ai-model/prompt/util.mjs +123 -0
- package/dist/es/ai-model/prompt/util.mjs.map +1 -0
- package/dist/es/ai-model/prompt/yaml-generator.mjs +219 -0
- package/dist/es/ai-model/prompt/yaml-generator.mjs.map +1 -0
- package/dist/es/ai-model/service-caller/index.mjs +413 -0
- package/dist/es/ai-model/service-caller/index.mjs.map +1 -0
- package/dist/es/ai-model/ui-tars-planning.mjs +235 -0
- package/dist/es/ai-model/ui-tars-planning.mjs.map +1 -0
- package/dist/es/image/index.mjs +2 -0
- package/dist/es/index.mjs +7 -2360
- package/dist/es/index.mjs.map +1 -1
- package/dist/es/insight/index.mjs +261 -0
- package/dist/es/insight/index.mjs.map +1 -0
- package/dist/es/insight/utils.mjs +19 -0
- package/dist/es/insight/utils.mjs.map +1 -0
- package/dist/es/types.mjs +11 -0
- package/dist/es/types.mjs.map +1 -0
- package/dist/es/utils.mjs +2 -2
- package/dist/es/yaml.mjs +0 -0
- package/dist/lib/ai-model/action-executor.js +173 -0
- package/dist/lib/ai-model/action-executor.js.map +1 -0
- package/dist/lib/ai-model/common.js +289 -0
- package/dist/lib/ai-model/common.js.map +1 -0
- package/dist/lib/ai-model/index.js +103 -0
- package/dist/lib/ai-model/index.js.map +1 -0
- package/dist/lib/ai-model/inspect.js +360 -0
- package/dist/lib/ai-model/inspect.js.map +1 -0
- package/dist/lib/ai-model/llm-planning.js +119 -0
- package/dist/lib/ai-model/llm-planning.js.map +1 -0
- package/dist/lib/ai-model/prompt/assertion.js +92 -0
- package/dist/lib/ai-model/prompt/assertion.js.map +1 -0
- package/dist/lib/ai-model/prompt/common.js +41 -0
- package/dist/lib/ai-model/prompt/common.js.map +1 -0
- package/dist/lib/ai-model/prompt/describe.js +78 -0
- package/dist/lib/ai-model/prompt/describe.js.map +1 -0
- package/dist/lib/ai-model/prompt/extraction.js +177 -0
- package/dist/lib/ai-model/prompt/extraction.js.map +1 -0
- package/dist/lib/ai-model/prompt/llm-locator.js +315 -0
- package/dist/lib/ai-model/prompt/llm-locator.js.map +1 -0
- package/dist/lib/ai-model/prompt/llm-planning.js +415 -0
- package/dist/lib/ai-model/prompt/llm-planning.js.map +1 -0
- package/dist/lib/ai-model/prompt/llm-section-locator.js +84 -0
- package/dist/lib/ai-model/prompt/llm-section-locator.js.map +1 -0
- package/dist/lib/ai-model/prompt/playwright-generator.js +178 -0
- package/dist/lib/ai-model/prompt/playwright-generator.js.map +1 -0
- package/dist/lib/ai-model/prompt/ui-tars-locator.js +68 -0
- package/dist/lib/ai-model/prompt/ui-tars-locator.js.map +1 -0
- package/dist/lib/ai-model/prompt/ui-tars-planning.js +73 -0
- package/dist/lib/ai-model/prompt/ui-tars-planning.js.map +1 -0
- package/dist/lib/ai-model/prompt/util.js +175 -0
- package/dist/lib/ai-model/prompt/util.js.map +1 -0
- package/dist/lib/ai-model/prompt/yaml-generator.js +280 -0
- package/dist/lib/ai-model/prompt/yaml-generator.js.map +1 -0
- package/dist/lib/ai-model/service-caller/index.js +496 -0
- package/dist/lib/ai-model/service-caller/index.js.map +1 -0
- package/dist/lib/ai-model/ui-tars-planning.js +272 -0
- package/dist/lib/ai-model/ui-tars-planning.js.map +1 -0
- package/dist/lib/image/index.js +56 -0
- package/dist/lib/image/index.js.map +1 -0
- package/dist/lib/index.js +21 -2393
- package/dist/lib/index.js.map +1 -1
- package/dist/lib/insight/index.js +295 -0
- package/dist/lib/insight/index.js.map +1 -0
- package/dist/lib/insight/utils.js +53 -0
- package/dist/lib/insight/utils.js.map +1 -0
- package/dist/lib/types.js +82 -0
- package/dist/lib/types.js.map +1 -0
- package/dist/lib/utils.js +2 -2
- package/dist/lib/yaml.js +20 -0
- package/dist/lib/yaml.js.map +1 -0
- package/dist/types/ai-model/action-executor.d.ts +19 -0
- package/dist/types/ai-model/common.d.ts +34 -0
- package/dist/types/ai-model/index.d.ts +11 -0
- package/dist/types/ai-model/inspect.d.ts +49 -0
- package/dist/types/ai-model/llm-planning.d.ts +10 -0
- package/dist/types/ai-model/prompt/assertion.d.ts +5 -0
- package/dist/types/ai-model/prompt/common.d.ts +2 -0
- package/dist/types/ai-model/prompt/describe.d.ts +1 -0
- package/dist/types/ai-model/prompt/extraction.d.ts +4 -0
- package/dist/types/ai-model/prompt/llm-locator.d.ts +9 -0
- package/dist/types/ai-model/prompt/llm-planning.d.ts +15 -0
- package/dist/types/ai-model/prompt/llm-section-locator.d.ts +6 -0
- package/dist/types/ai-model/prompt/playwright-generator.d.ts +25 -0
- package/dist/types/ai-model/prompt/ui-tars-locator.d.ts +1 -0
- package/dist/types/ai-model/prompt/ui-tars-planning.d.ts +2 -0
- package/dist/types/ai-model/prompt/util.d.ts +45 -0
- package/dist/types/ai-model/prompt/yaml-generator.d.ts +99 -0
- package/dist/types/ai-model/service-caller/index.d.ts +26 -0
- package/dist/types/ai-model/ui-tars-planning.d.ts +76 -0
- package/dist/types/image/index.d.ts +1 -0
- package/dist/types/index.d.ts +9 -1289
- package/dist/types/insight/index.d.ts +26 -0
- package/dist/types/insight/utils.d.ts +2 -0
- package/dist/types/tree.d.ts +1 -11
- package/dist/types/types.d.ts +399 -0
- package/dist/types/utils.d.ts +27 -47
- package/dist/types/yaml.d.ts +172 -0
- package/package.json +6 -6
- package/dist/es/ai-model.mjs +0 -2502
- package/dist/es/ai-model.mjs.map +0 -1
- package/dist/lib/ai-model.js +0 -2622
- package/dist/lib/ai-model.js.map +0 -1
- package/dist/types/ai-model.d.ts +0 -596
package/dist/es/ai-model/inspect.mjs
@@ -0,0 +1,317 @@
+import { MIDSCENE_USE_QWEN_VL, MIDSCENE_USE_VLM_UI_TARS, getAIConfigInBoolean, vlLocateMode } from "@midscene/shared/env";
+import { cropByRect, paddingToMatchBlockByBase64, preProcessImageUrl } from "@midscene/shared/img";
+import { getDebug } from "@midscene/shared/logger";
+import { assert } from "@midscene/shared/utils";
+import { AIActionType, adaptBboxToRect, callAiFn, expandSearchArea, markupImageForLLM, mergeRects } from "./common.mjs";
+import { systemPromptToAssert } from "./prompt/assertion.mjs";
+import { extractDataQueryPrompt, systemPromptToExtract } from "./prompt/extraction.mjs";
+import { findElementPrompt, systemPromptToLocateElement } from "./prompt/llm-locator.mjs";
+import { sectionLocatorInstruction, systemPromptToLocateSection } from "./prompt/llm-section-locator.mjs";
+import { describeUserPage, distance, distanceThreshold, elementByPositionWithElementInfo } from "./prompt/util.mjs";
+import { callToGetJSONObject } from "./service-caller/index.mjs";
+const debugInspect = getDebug('ai:inspect');
+const debugSection = getDebug('ai:section');
+const extraTextFromUserPrompt = (prompt)=>{
+    if ('string' == typeof prompt) return prompt;
+    return prompt.prompt;
+};
+const promptsToChatParam = async (multimodalPrompt)=>{
+    var _multimodalPrompt_images;
+    const msgs = [];
+    if (null == multimodalPrompt ? void 0 : null == (_multimodalPrompt_images = multimodalPrompt.images) ? void 0 : _multimodalPrompt_images.length) {
+        msgs.push({
+            role: 'user',
+            content: [
+                {
+                    type: 'text',
+                    text: 'Next, I will provide all the reference images.'
+                }
+            ]
+        });
+        for (const item of multimodalPrompt.images){
+            const base64 = await preProcessImageUrl(item.url, !!multimodalPrompt.convertHttpImage2Base64);
+            msgs.push({
+                role: 'user',
+                content: [
+                    {
+                        type: 'text',
+                        text: `reference image ${item.name}:`
+                    }
+                ]
+            });
+            msgs.push({
+                role: 'user',
+                content: [
+                    {
+                        type: 'image_url',
+                        image_url: {
+                            url: base64,
+                            detail: 'high'
+                        }
+                    }
+                ]
+            });
+        }
+    }
+    return msgs;
+};
+async function AiLocateElement(options) {
+    const { context, targetElementDescription, callAI } = options;
+    const { screenshotBase64 } = context;
+    const { description, elementById, insertElementByPosition } = await describeUserPage(context);
+    assert(targetElementDescription, "cannot find the target element description");
+    const userInstructionPrompt = await findElementPrompt.format({
+        pageDescription: description,
+        targetElementDescription: extraTextFromUserPrompt(targetElementDescription)
+    });
+    const systemPrompt = systemPromptToLocateElement(vlLocateMode());
+    let imagePayload = screenshotBase64;
+    if (options.searchConfig) {
+        assert(options.searchConfig.rect, 'searchArea is provided but its rect cannot be found. Failed to locate element');
+        assert(options.searchConfig.imageBase64, 'searchArea is provided but its imageBase64 cannot be found. Failed to locate element');
+        imagePayload = options.searchConfig.imageBase64;
+    } else if ('qwen-vl' === vlLocateMode()) imagePayload = await paddingToMatchBlockByBase64(imagePayload);
+    else if (!vlLocateMode()) imagePayload = await markupImageForLLM(screenshotBase64, context.tree, context.size);
+    const msgs = [
+        {
+            role: 'system',
+            content: systemPrompt
+        },
+        {
+            role: 'user',
+            content: [
+                {
+                    type: 'image_url',
+                    image_url: {
+                        url: imagePayload,
+                        detail: 'high'
+                    }
+                },
+                {
+                    type: 'text',
+                    text: userInstructionPrompt
+                }
+            ]
+        }
+    ];
+    if ('string' != typeof targetElementDescription) {
+        const addOns = await promptsToChatParam({
+            images: targetElementDescription.images,
+            convertHttpImage2Base64: targetElementDescription.convertHttpImage2Base64
+        });
+        msgs.push(...addOns);
+    }
+    const callAIFn = callAI || callToGetJSONObject;
+    const res = await callAIFn(msgs, AIActionType.INSPECT_ELEMENT);
+    const rawResponse = JSON.stringify(res.content);
+    let resRect;
+    let matchedElements = 'elements' in res.content ? res.content.elements : [];
+    let errors = 'errors' in res.content ? res.content.errors : [];
+    try {
+        if ('bbox' in res.content && Array.isArray(res.content.bbox)) {
+            var _options_searchConfig_rect, _options_searchConfig, _options_searchConfig_rect1, _options_searchConfig1, _options_searchConfig_rect2, _options_searchConfig2, _options_searchConfig_rect3, _options_searchConfig3;
+            resRect = adaptBboxToRect(res.content.bbox, (null == (_options_searchConfig = options.searchConfig) ? void 0 : null == (_options_searchConfig_rect = _options_searchConfig.rect) ? void 0 : _options_searchConfig_rect.width) || context.size.width, (null == (_options_searchConfig1 = options.searchConfig) ? void 0 : null == (_options_searchConfig_rect1 = _options_searchConfig1.rect) ? void 0 : _options_searchConfig_rect1.height) || context.size.height, null == (_options_searchConfig2 = options.searchConfig) ? void 0 : null == (_options_searchConfig_rect2 = _options_searchConfig2.rect) ? void 0 : _options_searchConfig_rect2.left, null == (_options_searchConfig3 = options.searchConfig) ? void 0 : null == (_options_searchConfig_rect3 = _options_searchConfig3.rect) ? void 0 : _options_searchConfig_rect3.top);
+            debugInspect('resRect', resRect);
+            const rectCenter = {
+                x: resRect.left + resRect.width / 2,
+                y: resRect.top + resRect.height / 2
+            };
+            let element = elementByPositionWithElementInfo(context.tree, rectCenter);
+            const distanceToCenter = element ? distance({
+                x: element.center[0],
+                y: element.center[1]
+            }, rectCenter) : 0;
+            if (!element || distanceToCenter > distanceThreshold) element = insertElementByPosition(rectCenter);
+            if (element) {
+                matchedElements = [
+                    element
+                ];
+                errors = [];
+            }
+        }
+    } catch (e) {
+        const msg = e instanceof Error ? `Failed to parse bbox: ${e.message}` : 'unknown error in locate';
+        if (errors && (null == errors ? void 0 : errors.length) !== 0) errors.push(`(${msg})`);
+        else errors = [
+            msg
+        ];
+    }
+    return {
+        rect: resRect,
+        parseResult: {
+            elements: matchedElements,
+            errors
+        },
+        rawResponse,
+        elementById,
+        usage: res.usage,
+        isOrderSensitive: 'object' == typeof res.content && null !== res.content && 'isOrderSensitive' in res.content ? res.content.isOrderSensitive : void 0
+    };
+}
+async function AiLocateSection(options) {
+    const { context, sectionDescription } = options;
+    const { screenshotBase64 } = context;
+    const systemPrompt = systemPromptToLocateSection(vlLocateMode());
+    const sectionLocatorInstructionText = await sectionLocatorInstruction.format({
+        sectionDescription: extraTextFromUserPrompt(sectionDescription)
+    });
+    const msgs = [
+        {
+            role: 'system',
+            content: systemPrompt
+        },
+        {
+            role: 'user',
+            content: [
+                {
+                    type: 'image_url',
+                    image_url: {
+                        url: screenshotBase64,
+                        detail: 'high'
+                    }
+                },
+                {
+                    type: 'text',
+                    text: sectionLocatorInstructionText
+                }
+            ]
+        }
+    ];
+    if ('string' != typeof sectionDescription) {
+        const addOns = await promptsToChatParam({
+            images: sectionDescription.images,
+            convertHttpImage2Base64: sectionDescription.convertHttpImage2Base64
+        });
+        msgs.push(...addOns);
+    }
+    const result = await callAiFn(msgs, AIActionType.EXTRACT_DATA);
+    let sectionRect;
+    const sectionBbox = result.content.bbox;
+    if (sectionBbox) {
+        const targetRect = adaptBboxToRect(sectionBbox, context.size.width, context.size.height);
+        debugSection('original targetRect %j', targetRect);
+        const referenceBboxList = result.content.references_bbox || [];
+        debugSection('referenceBboxList %j', referenceBboxList);
+        const referenceRects = referenceBboxList.filter((bbox)=>Array.isArray(bbox)).map((bbox)=>adaptBboxToRect(bbox, context.size.width, context.size.height));
+        debugSection('referenceRects %j', referenceRects);
+        const mergedRect = mergeRects([
+            targetRect,
+            ...referenceRects
+        ]);
+        debugSection('mergedRect %j', mergedRect);
+        sectionRect = expandSearchArea(mergedRect, context.size);
+        debugSection('expanded sectionRect %j', sectionRect);
+    }
+    let imageBase64 = screenshotBase64;
+    if (sectionRect) imageBase64 = await cropByRect(screenshotBase64, sectionRect, getAIConfigInBoolean(MIDSCENE_USE_QWEN_VL));
+    return {
+        rect: sectionRect,
+        imageBase64,
+        error: result.content.error,
+        rawResponse: JSON.stringify(result.content),
+        usage: result.usage
+    };
+}
+async function AiExtractElementInfo(options) {
+    var _options_extractOption;
+    const { dataQuery, context, extractOption, multimodalPrompt } = options;
+    const systemPrompt = systemPromptToExtract();
+    const { screenshotBase64 } = context;
+    const { description, elementById } = await describeUserPage(context, {
+        truncateTextLength: 200,
+        filterNonTextContent: false,
+        visibleOnly: false,
+        domIncluded: null == extractOption ? void 0 : extractOption.domIncluded
+    });
+    const extractDataPromptText = await extractDataQueryPrompt(description, dataQuery);
+    const userContent = [];
+    if ((null == extractOption ? void 0 : extractOption.screenshotIncluded) !== false) userContent.push({
+        type: 'image_url',
+        image_url: {
+            url: screenshotBase64,
+            detail: 'high'
+        }
+    });
+    userContent.push({
+        type: 'text',
+        text: extractDataPromptText
+    });
+    const msgs = [
+        {
+            role: 'system',
+            content: systemPrompt
+        },
+        {
+            role: 'user',
+            content: userContent
+        }
+    ];
+    if (null == (_options_extractOption = options.extractOption) ? void 0 : _options_extractOption.returnThought) msgs.push({
+        role: 'user',
+        content: 'Please provide reasons.'
+    });
+    if (multimodalPrompt) {
+        const addOns = await promptsToChatParam({
+            images: multimodalPrompt.images,
+            convertHttpImage2Base64: multimodalPrompt.convertHttpImage2Base64
+        });
+        msgs.push(...addOns);
+    }
+    const result = await callAiFn(msgs, AIActionType.EXTRACT_DATA);
+    return {
+        parseResult: result.content,
+        elementById,
+        usage: result.usage
+    };
+}
+async function AiAssert(options) {
+    const { assertion, context } = options;
+    assert(assertion, 'assertion should not be empty');
+    const { screenshotBase64 } = context;
+    const systemPrompt = systemPromptToAssert({
+        isUITars: getAIConfigInBoolean(MIDSCENE_USE_VLM_UI_TARS)
+    });
+    const assertionText = extraTextFromUserPrompt(assertion);
+    const msgs = [
+        {
+            role: 'system',
+            content: systemPrompt
+        },
+        {
+            role: 'user',
+            content: [
+                {
+                    type: 'image_url',
+                    image_url: {
+                        url: screenshotBase64,
+                        detail: 'high'
+                    }
+                },
+                {
+                    type: 'text',
+                    text: `
+Here is the assertion. Please tell whether it is truthy according to the screenshot.
+=====================================
+${assertionText}
+=====================================
+  `
+                }
+            ]
+        }
+    ];
+    if ('string' != typeof assertion) {
+        const addOns = await promptsToChatParam({
+            images: assertion.images,
+            convertHttpImage2Base64: assertion.convertHttpImage2Base64
+        });
+        msgs.push(...addOns);
+    }
+    const { content: assertResult, usage } = await callAiFn(msgs, AIActionType.ASSERT);
+    return {
+        content: assertResult,
+        usage
+    };
+}
+export { AiAssert, AiExtractElementInfo, AiLocateElement, AiLocateSection };
+
+//# sourceMappingURL=inspect.mjs.map
package/dist/es/ai-model/inspect.mjs.map
@@ -0,0 +1 @@
+{"version":3,"file":"ai-model/inspect.mjs","sources":["webpack://@midscene/core/./src/ai-model/inspect.ts"],"sourcesContent":["import type {\n AIAssertionResponse,\n AIDataExtractionResponse,\n AIElementLocatorResponse,\n AIElementResponse,\n AISectionLocatorResponse,\n AIUsageInfo,\n BaseElement,\n ElementById,\n InsightExtractOption,\n Rect,\n ReferenceImage,\n TMultimodalPrompt,\n TUserPrompt,\n UIContext,\n} from '@/types';\nimport {\n MIDSCENE_USE_QWEN_VL,\n MIDSCENE_USE_VLM_UI_TARS,\n getAIConfigInBoolean,\n vlLocateMode,\n} from '@midscene/shared/env';\nimport {\n cropByRect,\n paddingToMatchBlockByBase64,\n preProcessImageUrl,\n} from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { assert } from '@midscene/shared/utils';\nimport type {\n ChatCompletionSystemMessageParam,\n ChatCompletionUserMessageParam,\n} from 'openai/resources/index';\nimport {\n AIActionType,\n adaptBboxToRect,\n callAiFn,\n expandSearchArea,\n markupImageForLLM,\n mergeRects,\n} from './common';\nimport { systemPromptToAssert } from './prompt/assertion';\nimport {\n extractDataQueryPrompt,\n systemPromptToExtract,\n} from './prompt/extraction';\nimport {\n findElementPrompt,\n systemPromptToLocateElement,\n} from './prompt/llm-locator';\nimport {\n sectionLocatorInstruction,\n systemPromptToLocateSection,\n} from './prompt/llm-section-locator';\nimport {\n describeUserPage,\n distance,\n distanceThreshold,\n elementByPositionWithElementInfo,\n} from './prompt/util';\nimport { callToGetJSONObject } from './service-caller/index';\n\nexport type AIArgs = [\n ChatCompletionSystemMessageParam,\n ...ChatCompletionUserMessageParam[],\n];\n\nconst debugInspect = getDebug('ai:inspect');\nconst debugSection = getDebug('ai:section');\n\nconst extraTextFromUserPrompt = (prompt: TUserPrompt): string => {\n if (typeof prompt === 'string') {\n return prompt;\n } else {\n return prompt.prompt;\n }\n};\n\nconst promptsToChatParam = async (\n multimodalPrompt: TMultimodalPrompt,\n): Promise<ChatCompletionUserMessageParam[]> => {\n const msgs: ChatCompletionUserMessageParam[] = [];\n if (multimodalPrompt?.images?.length) {\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'text',\n text: 'Next, I will provide all the reference images.',\n },\n ],\n });\n\n for (const item of multimodalPrompt.images) {\n const base64 = await preProcessImageUrl(\n item.url,\n !!multimodalPrompt.convertHttpImage2Base64,\n );\n\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'text',\n text: `reference image ${item.name}:`,\n },\n ],\n });\n\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: base64,\n detail: 'high',\n },\n },\n ],\n });\n }\n }\n return msgs;\n};\n\nexport async function AiLocateElement<\n ElementType extends BaseElement = BaseElement,\n>(options: {\n context: UIContext<ElementType>;\n targetElementDescription: TUserPrompt;\n referenceImage?: ReferenceImage;\n callAI?: typeof callAiFn<AIElementResponse | [number, number]>;\n searchConfig?: Awaited<ReturnType<typeof AiLocateSection>>;\n}): Promise<{\n parseResult: AIElementLocatorResponse;\n rect?: Rect;\n rawResponse: string;\n elementById: ElementById;\n usage?: AIUsageInfo;\n isOrderSensitive?: boolean;\n}> {\n const { context, targetElementDescription, callAI } = options;\n const { screenshotBase64 } = context;\n const { description, elementById, insertElementByPosition } =\n await describeUserPage(context);\n\n assert(\n targetElementDescription,\n 'cannot find the target element description',\n );\n\n const userInstructionPrompt = await findElementPrompt.format({\n pageDescription: description,\n targetElementDescription: extraTextFromUserPrompt(targetElementDescription),\n });\n const systemPrompt = systemPromptToLocateElement(vlLocateMode());\n\n let imagePayload = screenshotBase64;\n\n if (options.searchConfig) {\n assert(\n options.searchConfig.rect,\n 'searchArea is provided but its rect cannot be found. Failed to locate element',\n );\n assert(\n options.searchConfig.imageBase64,\n 'searchArea is provided but its imageBase64 cannot be found. Failed to locate element',\n );\n\n imagePayload = options.searchConfig.imageBase64;\n } else if (vlLocateMode() === 'qwen-vl') {\n imagePayload = await paddingToMatchBlockByBase64(imagePayload);\n } else if (!vlLocateMode()) {\n imagePayload = await markupImageForLLM(\n screenshotBase64,\n context.tree,\n context.size,\n );\n }\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: userInstructionPrompt,\n },\n ],\n },\n ];\n\n if (typeof targetElementDescription !== 'string') {\n const addOns = await promptsToChatParam({\n images: targetElementDescription.images,\n convertHttpImage2Base64: targetElementDescription.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n const callAIFn =\n callAI || callToGetJSONObject<AIElementResponse | [number, number]>;\n\n const res = await callAIFn(msgs, AIActionType.INSPECT_ELEMENT);\n\n const rawResponse = JSON.stringify(res.content);\n\n let resRect: Rect | undefined;\n let matchedElements: AIElementLocatorResponse['elements'] =\n 'elements' in res.content ? res.content.elements : [];\n let errors: AIElementLocatorResponse['errors'] | undefined =\n 'errors' in res.content ? res.content.errors : [];\n try {\n if ('bbox' in res.content && Array.isArray(res.content.bbox)) {\n resRect = adaptBboxToRect(\n res.content.bbox,\n options.searchConfig?.rect?.width || context.size.width,\n options.searchConfig?.rect?.height || context.size.height,\n options.searchConfig?.rect?.left,\n options.searchConfig?.rect?.top,\n );\n debugInspect('resRect', resRect);\n\n const rectCenter = {\n x: resRect.left + resRect.width / 2,\n y: resRect.top + resRect.height / 2,\n };\n let element = elementByPositionWithElementInfo(context.tree, rectCenter);\n\n const distanceToCenter = element\n ? distance({ x: element.center[0], y: element.center[1] }, rectCenter)\n : 0;\n\n if (!element || distanceToCenter > distanceThreshold) {\n element = insertElementByPosition(rectCenter);\n }\n\n if (element) {\n matchedElements = [element];\n errors = [];\n }\n }\n } catch (e) {\n const msg =\n e instanceof Error\n ? `Failed to parse bbox: ${e.message}`\n : 'unknown error in locate';\n if (!errors || errors?.length === 0) {\n errors = [msg];\n } else {\n errors.push(`(${msg})`);\n }\n }\n\n return {\n rect: resRect,\n parseResult: {\n elements: matchedElements,\n errors,\n },\n rawResponse,\n elementById,\n usage: res.usage,\n isOrderSensitive:\n typeof res.content === 'object' &&\n res.content !== null &&\n 'isOrderSensitive' in res.content\n ? (res.content as any).isOrderSensitive\n : undefined,\n };\n}\n\nexport async function AiLocateSection(options: {\n context: UIContext<BaseElement>;\n sectionDescription: TUserPrompt;\n callAI?: typeof callAiFn<AISectionLocatorResponse>;\n}): Promise<{\n rect?: Rect;\n imageBase64?: string;\n error?: string;\n rawResponse: string;\n usage?: AIUsageInfo;\n}> {\n const { context, sectionDescription } = options;\n const { screenshotBase64 } = context;\n\n const systemPrompt = systemPromptToLocateSection(vlLocateMode());\n const sectionLocatorInstructionText = await sectionLocatorInstruction.format({\n sectionDescription: extraTextFromUserPrompt(sectionDescription),\n });\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: screenshotBase64,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: sectionLocatorInstructionText,\n },\n ],\n },\n ];\n\n if (typeof sectionDescription !== 'string') {\n const addOns = await promptsToChatParam({\n images: sectionDescription.images,\n convertHttpImage2Base64: sectionDescription.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n const result = await callAiFn<AISectionLocatorResponse>(\n msgs,\n AIActionType.EXTRACT_DATA,\n );\n\n let sectionRect: Rect | undefined;\n const sectionBbox = result.content.bbox;\n if (sectionBbox) {\n const targetRect = adaptBboxToRect(\n sectionBbox,\n context.size.width,\n context.size.height,\n );\n debugSection('original targetRect %j', targetRect);\n\n const referenceBboxList = result.content.references_bbox || [];\n debugSection('referenceBboxList %j', referenceBboxList);\n\n const referenceRects = referenceBboxList\n .filter((bbox) => Array.isArray(bbox))\n .map((bbox) => {\n return adaptBboxToRect(bbox, context.size.width, context.size.height);\n });\n debugSection('referenceRects %j', referenceRects);\n\n // merge the sectionRect and referenceRects\n const mergedRect = mergeRects([targetRect, ...referenceRects]);\n debugSection('mergedRect %j', mergedRect);\n\n // expand search area to at least 200 x 200\n sectionRect = expandSearchArea(mergedRect, context.size);\n debugSection('expanded sectionRect %j', sectionRect);\n }\n\n let imageBase64 = screenshotBase64;\n if (sectionRect) {\n imageBase64 = await cropByRect(\n screenshotBase64,\n sectionRect,\n getAIConfigInBoolean(MIDSCENE_USE_QWEN_VL),\n );\n }\n\n return {\n rect: sectionRect,\n imageBase64,\n error: result.content.error,\n rawResponse: JSON.stringify(result.content),\n usage: result.usage,\n };\n}\n\nexport async function AiExtractElementInfo<\n T,\n ElementType extends BaseElement = BaseElement,\n>(options: {\n dataQuery: string | Record<string, string>;\n multimodalPrompt?: TMultimodalPrompt;\n context: UIContext<ElementType>;\n extractOption?: InsightExtractOption;\n}) {\n const { dataQuery, context, extractOption, multimodalPrompt } = options;\n const systemPrompt = systemPromptToExtract();\n\n const { screenshotBase64 } = context;\n const { description, elementById } = await describeUserPage(context, {\n truncateTextLength: 200,\n filterNonTextContent: false,\n visibleOnly: false,\n domIncluded: extractOption?.domIncluded,\n });\n\n const extractDataPromptText = await extractDataQueryPrompt(\n description,\n dataQuery,\n );\n\n const userContent: ChatCompletionUserMessageParam['content'] = [];\n\n if (extractOption?.screenshotIncluded !== false) {\n userContent.push({\n type: 'image_url',\n image_url: {\n url: screenshotBase64,\n detail: 'high',\n },\n });\n }\n\n userContent.push({\n type: 'text',\n text: extractDataPromptText,\n });\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: userContent,\n },\n ];\n\n if (options.extractOption?.returnThought) {\n msgs.push({\n role: 'user',\n content: 'Please provide reasons.',\n });\n }\n\n if (multimodalPrompt) {\n const addOns = await promptsToChatParam({\n images: multimodalPrompt.images,\n convertHttpImage2Base64: multimodalPrompt.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n const result = await callAiFn<AIDataExtractionResponse<T>>(\n msgs,\n AIActionType.EXTRACT_DATA,\n );\n return {\n parseResult: result.content,\n elementById,\n usage: result.usage,\n };\n}\n\nexport async function AiAssert<\n ElementType extends BaseElement = BaseElement,\n>(options: { assertion: TUserPrompt; context: UIContext<ElementType> }) {\n const { assertion, context } = options;\n\n assert(assertion, 'assertion should not be empty');\n\n const { screenshotBase64 } = context;\n\n const systemPrompt = systemPromptToAssert({\n isUITars: getAIConfigInBoolean(MIDSCENE_USE_VLM_UI_TARS),\n });\n\n const assertionText = extraTextFromUserPrompt(assertion);\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: screenshotBase64,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: `\nHere is the assertion. Please tell whether it is truthy according to the screenshot.\n=====================================\n${assertionText}\n=====================================\n `,\n },\n ],\n },\n ];\n\n if (typeof assertion !== 'string') {\n const addOns = await promptsToChatParam({\n images: assertion.images,\n convertHttpImage2Base64: assertion.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n const { content: assertResult, usage } = await callAiFn<AIAssertionResponse>(\n msgs,\n AIActionType.ASSERT,\n );\n return {\n content: assertResult,\n usage,\n };\n}\n"],"names":["debugInspect","getDebug","debugSection","extraTextFromUserPrompt","prompt","promptsToChatParam","multimodalPrompt","_multimodalPrompt_images","msgs","item","base64","preProcessImageUrl","AiLocateElement","options","context","targetElementDescription","callAI","screenshotBase64","description","elementById","insertElementByPosition","describeUserPage","assert","userInstructionPrompt","findElementPrompt","systemPrompt","systemPromptToLocateElement","vlLocateMode","imagePayload","paddingToMatchBlockByBase64","markupImageForLLM","addOns","callAIFn","callToGetJSONObject","res","AIActionType","rawResponse","JSON","resRect","matchedElements","errors","Array","_options_searchConfig_rect","_options_searchConfig_rect1","_options_searchConfig_rect2","_options_searchConfig_rect3","adaptBboxToRect","rectCenter","element","elementByPositionWithElementInfo","distanceToCenter","distance","distanceThreshold","e","msg","Error","undefined","AiLocateSection","sectionDescription","systemPromptToLocateSection","sectionLocatorInstructionText","sectionLocatorInstruction","result","callAiFn","sectionRect","sectionBbox","targetRect","referenceBboxList","referenceRects","bbox","mergedRect","mergeRects","expandSearchArea","imageBase64","cropByRect","getAIConfigInBoolean","MIDSCENE_USE_QWEN_VL","AiExtractElementInfo","_options_extractOption","dataQuery","extractOption","systemPromptToExtract","extractDataPromptText","extractDataQueryPrompt","userContent","AiAssert","assertion","systemPromptToAssert","MIDSCENE_USE_VLM_UI_TARS","assertionText","assertResult","usage"],"mappings":";;;;;;;;;;;AAmEA,MAAMA,eAAeC,SAAS;AAC9B,MAAMC,eAAeD,SAAS;AAE9B,MAAME,0BAA0B,CAACC;IAC/B,IAAI,AAAkB,YAAlB,OAAOA,QACT,OAAOA;IAEP,OAAOA,OAAO,MAAM;AAExB;AAEA,MAAMC,qBAAqB,OACzBC;QAGIC;IADJ,MAAMC,OAAyC,EAAE;IACjD,IAAID,QAAAA,mBAAAA,KAAAA,IAAAA,QAAAA,CAAAA,2BAAAA,iBAAkB,MAAM,AAAD,IAAvBA,KAAAA,IAAAA,yBAA0B,MAAM,EAAE;QACpCC,KAAK,IAAI,CAAC;YACR,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM;gBACR;aACD;QACH;QAEA,KAAK,MAAMC,QAAQH,iBAAiB,MAAM,CAAE;YAC1C,MAAMI,SAAS,MAAMC,mBACnBF,KAAK,GAAG,EACR,CAAC,CAACH,iBAAiB,uBAAuB;YAG5CE,KAAK,IAAI,CAAC;gBACR,MAAM;gBACN,SAAS;oBACP;wBACE,MAAM;wBACN,MAAM,CAAC,gBAAgB,EAAEC,KAAK,IAAI,CAAC,CAAC,CAAC;oBACvC;iBACD;YACH;YAEAD,KAAK,IAAI,CAAC;gBACR,MAAM;gBACN,SAAS;oBACP;wBACE,MAAM;wBACN,WAAW;4BACT,KAAKE;4BACL,QAAQ;wBACV;oBACF;iBACD;YACH;QACF;IACF;IACA,OAAOF;AACT;AAEO,eAAeI,gBAEpBC,OAMD;IAQC,MAAM,EAAEC,OAAO,EAAEC,wBAAwB,EAAEC,MAAM,EAAE,GAAGH;IACtD,MAAM,EAAEI,gBAAgB,EAAE,GAAGH;IAC7B,MAAM,EAAEI,WAAW,EAAEC,WAAW,EAAEC,uBAAuB,EAAE,GACzD,MAAMC,iBAAiBP;IAEzBQ,OACEP,0BACA;IAGF,MAAMQ,wBAAwB,MAAMC,kBAAkB,MAAM,CAAC;QAC3D,iBAAiBN;QACjB,0BAA0Bf,wBAAwBY;IACpD;IACA,MAAMU,eAAeC,4BAA4BC;IAEjD,IAAIC,eAAeX;IAEnB,IAAIJ,QAAQ,YAAY,EAAE;QACxBS,OACET,QAAQ,YAAY,CAAC,IAAI,EACzB;QAEFS,OACET,QAAQ,YAAY,CAAC,WAAW,EAChC;QAGFe,eAAef,QAAQ,YAAY,CAAC,WAAW;IACjD,OAAO,IAAIc,AAAmB,cAAnBA,gBACTC,eAAe,MAAMC,4BAA4BD;SAC5C,IAAI,CAACD,gBACVC,eAAe,MAAME,kBACnBb,kBACAH,QAAQ,IAAI,EACZA,QAAQ,IAAI;IAIhB,MAAMN,OAAe;QACnB;YAAE,MAAM;YAAU,SAASiB;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKG;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAML;gBACR;aACD;QACH;KACD;IAED,IAAI,AAAoC,YAApC,OAAOR,0BAAuC;QAChD,MAAMgB,SAAS,MAAM1B,mBAAmB;YACtC,QAAQU,yBAAyB,MAAM;YACvC,yBAAyBA,yBAAyB,uBAAuB;QAC3E;QACAP,KAAK,IAAI,IAAIuB;IACf;IAEA,MAAMC,WACJhB,UAAUiB;IAEZ,MAAMC,MAAM,MAAMF,SAASxB,MAAM2B,aAAa,eAAe;IAE7D,MAAMC,cAAcC,KAAK,SAAS,CAACH,IAAI,OAAO;IAE9C,IAAII;IACJ,IAAIC,kBACF,cAAcL,IAAI,OAAO,GAAGA,IAAI,OAAO,CAAC,QAAQ,GAAG,EAAE;IACvD,IAAIM,SACF,YAAYN,IAAI,OAAO,GAAGA,IAAI,OAAO,CAAC,MAAM,GAAG,EAAE;IACnD,IAAI;QACF,IAAI,UAAUA,IAAI,OAAO,IAAIO,MAAM,OAAO,CAACP,IAAI,OAAO,CAAC,IAAI,GAAG;gBAG1DQ,4BAAAA,uBACAC,6BAAAA,wBACAC,6BAAAA,wBACAC,6BAAAA;YALFP,UAAUQ,gBACRZ,IAAI,OAAO,CAAC,IAAI,EAChBQ,AAAAA,SAAAA,CAAAA,wBAAAA,QAAQ,YAAY,AAAD,IAAnBA,KAAAA,IAAAA,QAAAA,CAAAA,6BAAAA,sBAAsB,IAAI,AAAD,IAAzBA,KAAAA,IAAAA,2BAA4B,KAAK,AAAD,KAAK5B,QAAQ,IAAI,CAAC,KAAK,EACvD6B,AAAAA,SAAAA,CAAAA,yBAAAA,QAAQ,YAAY,AAAD,IAAnBA,KAAAA,IAAAA,QAAAA,CAAAA,8BAAAA,uBAAsB,IAAI,AAAD,IAAzBA,KAAAA,IAAAA,4BAA4B,MAAM,AAAD,KAAK7B,QAAQ,IAAI,CAAC,MAAM,UACzD8B,CAAAA,yBAAAA,QAAQ,YAAY,AAAD,IAAnBA,KAAAA,IAAAA,QAAAA,CAAAA,8BAAAA,uBAAsB,IAAI,AAAD,IAAzBA,KAAAA,IAAAA,4BAA4B,IAAI,UAChCC,CAAAA,yBAAAA,QAAQ,YAAY,AAAD,IAAnBA,KAAAA,IAAAA,QAAAA,CAAAA,8BAAAA,uBAAsB,IAAI,AAAD,IAAzBA,KAAAA,IAAAA,4BAA4B,GAAG;YAEjC7C,aAAa,WAAWsC;YAExB,MAAMS,aAAa;gBACjB,GAAGT,QAAQ,IAAI,GAAGA,QAAQ,KAAK,GAAG;gBAClC,GAAGA,QAAQ,GAAG,GAAGA,QAAQ,MAAM,GAAG;YACpC;YACA,IAAIU,UAAUC,iCAAiCnC,QAAQ,IAAI,EAAEiC;YAE7D,MAAMG,mBAAmBF,UACrBG,SAAS;gBAAE,GAAGH,QAAQ,MAAM,CAAC,EAAE;gBAAE,GAAGA,QAAQ,MAAM,CAAC,EAAE;YAAC,GAAGD,cACzD;YAEJ,IAAI,CAACC,WAAWE,mBAAmBE,mBACjCJ,UAAU5B,wBAAwB2B;YAGpC,IAAIC,SAAS;gBACXT,kBAAkB;oBAACS;iBAAQ;gBAC3BR,SAAS,EAAE;YACb;QACF;IACF,EAAE,OAAOa,GAAG;QACV,MAAMC,MACJD,aAAaE,QACT,CAAC,sBAAsB,EAAEF,EAAE,OAAO,EAAE,GACpC;QACN,IAAI,AAACb,UAAUA,AAAAA,CAAAA,QAAAA,SAAAA,KAAAA,IAAAA,OAAQ,MAAM,AAAD,MAAM,GAGhCA,OAAO,IAAI,CAAC,CAAC,CAAC,EAAEc,IAAI,CAAC,CAAC;aAFtBd,SAAS;YAACc;SAAI;IAIlB;IAEA,OAAO;QACL,MAAMhB;QACN,aAAa;YACX,UAAUC;YACVC;QACF;QACAJ;QACAjB;QACA,OAAOe,IAAI,KAAK;QAChB,kBACE,AAAuB,YAAvB,OAAOA,IAAI,OAAO,IAClBA,AAAgB,SAAhBA,IAAI,OAAO,IACX,sBAAsBA,IAAI,OAAO,GAC5BA,IAAI,OAAO,CAAS,gBAAgB,GACrCsB;IACR;AACF;AAEO,eAAeC,gBAAgB5C,OAIrC;IAOC,MAAM,EAAEC,OAAO,EAAE4C,kBAAkB,EAAE,GAAG7C;IACxC,MAAM,EAAEI,gBAAgB,EAAE,GAAGH;IAE7B,MAAMW,eAAekC,4BAA4BhC;IACjD,MAAMiC,gCAAgC,MAAMC,0BAA0B,MAAM,CAAC;QAC3E,oBAAoB1D,wBAAwBuD;IAC9C;IACA,MAAMlD,OAAe;QACnB;YAAE,MAAM;YAAU,SAASiB;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKR;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAM2C;gBACR;aACD;QACH;KACD;IAED,IAAI,AAA8B,YAA9B,OAAOF,oBAAiC;QAC1C,MAAM3B,SAAS,MAAM1B,mBAAmB;YACtC,QAAQqD,mBAAmB,MAAM;YACjC,yBAAyBA,mBAAmB,uBAAuB;QACrE;QACAlD,KAAK,IAAI,IAAIuB;IACf;IAEA,MAAM+B,SAAS,MAAMC,SACnBvD,MACA2B,aAAa,YAAY;IAG3B,IAAI6B;IACJ,MAAMC,cAAcH,OAAO,OAAO,CAAC,IAAI;IACvC,IAAIG,aAAa;QACf,MAAMC,aAAapB,gBACjBmB,aACAnD,QAAQ,IAAI,CAAC,KAAK,EAClBA,QAAQ,IAAI,CAAC,MAAM;QAErBZ,aAAa,0BAA0BgE;QAEvC,MAAMC,oBAAoBL,OAAO,OAAO,CAAC,eAAe,IAAI,EAAE;QAC9D5D,aAAa,wBAAwBiE;QAErC,MAAMC,iBAAiBD,kBACpB,MAAM,CAAC,CAACE,OAAS5B,MAAM,OAAO,CAAC4B,OAC/B,GAAG,CAAC,CAACA,OACGvB,gBAAgBuB,MAAMvD,QAAQ,IAAI,CAAC,KAAK,EAAEA,QAAQ,IAAI,CAAC,MAAM;QAExEZ,aAAa,qBAAqBkE;QAGlC,MAAME,aAAaC,WAAW;YAACL;eAAeE;SAAe;QAC7DlE,aAAa,iBAAiBoE;QAG9BN,cAAcQ,iBAAiBF,YAAYxD,QAAQ,IAAI;QACvDZ,aAAa,2BAA2B8D;IAC1C;IAEA,IAAIS,cAAcxD;IAClB,IAAI+C,aACFS,cAAc,MAAMC,WAClBzD,kBACA+C,aACAW,qBAAqBC;IAIzB,OAAO;QACL,MAAMZ;QACNS;QACA,OAAOX,OAAO,OAAO,CAAC,KAAK;QAC3B,aAAazB,KAAK,SAAS,CAACyB,OAAO,OAAO;QAC1C,OAAOA,OAAO,KAAK;IACrB;AACF;AAEO,eAAee,qBAGpBhE,OAKD;QA0CKiE;IAzCJ,MAAM,EAAEC,SAAS,EAAEjE,OAAO,EAAEkE,aAAa,EAAE1E,gBAAgB,EAAE,GAAGO;IAChE,MAAMY,eAAewD;IAErB,MAAM,EAAEhE,gBAAgB,EAAE,GAAGH;IAC7B,MAAM,EAAEI,WAAW,EAAEC,WAAW,EAAE,GAAG,MAAME,iBAAiBP,SAAS;QACnE,oBAAoB;QACpB,sBAAsB;QACtB,aAAa;QACb,aAAakE,QAAAA,gBAAAA,KAAAA,IAAAA,cAAe,WAAW;IACzC;IAEA,MAAME,wBAAwB,MAAMC,uBAClCjE,aACA6D;IAGF,MAAMK,cAAyD,EAAE;IAEjE,IAAIJ,AAAAA,CAAAA,QAAAA,gBAAAA,KAAAA,IAAAA,cAAe,kBAAkB,AAAD,MAAM,OACxCI,YAAY,IAAI,CAAC;QACf,MAAM;QACN,WAAW;YACT,KAAKnE;YACL,QAAQ;QACV;IACF;IAGFmE,YAAY,IAAI,CAAC;QACf,MAAM;QACN,MAAMF;IACR;IAEA,MAAM1E,OAAe;QACnB;YAAE,MAAM;YAAU,SAASiB;QAAa;QACxC;YACE,MAAM;YACN,SAAS2D;QACX;KACD;IAED,IAAI,QAAAN,CAAAA,yBAAAA,QAAQ,aAAa,AAAD,IAApBA,KAAAA,IAAAA,uBAAuB,aAAa,EACtCtE,KAAK,IAAI,CAAC;QACR,MAAM;QACN,SAAS;IACX;IAGF,IAAIF,kBAAkB;QACpB,MAAMyB,SAAS,MAAM1B,mBAAmB;YACtC,QAAQC,iBAAiB,MAAM;YAC/B,yBAAyBA,iBAAiB,uBAAuB;QACnE;QACAE,KAAK,IAAI,IAAIuB;IACf;IAEA,MAAM+B,SAAS,MAAMC,SACnBvD,MACA2B,aAAa,YAAY;IAE3B,OAAO;QACL,aAAa2B,OAAO,OAAO;QAC3B3C;QACA,OAAO2C,OAAO,KAAK;IACrB;AACF;AAEO,eAAeuB,SAEpBxE,OAAoE;IACpE,MAAM,EAAEyE,SAAS,EAAExE,OAAO,EAAE,GAAGD;IAE/BS,OAAOgE,WAAW;IAElB,MAAM,EAAErE,gBAAgB,EAAE,GAAGH;IAE7B,MAAMW,eAAe8D,qBAAqB;QACxC,UAAUZ,qBAAqBa;IACjC;IAEA,MAAMC,gBAAgBtF,wBAAwBmF;IAE9C,MAAM9E,OAAe;QACnB;YAAE,MAAM;YAAU,SAASiB;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKR;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAM,CAAC;;;AAGjB,EAAEwE,cAAc;;EAEd,CAAC;gBACK;aACD;QACH;KACD;IAED,IAAI,AAAqB,YAArB,OAAOH,WAAwB;QACjC,MAAMvD,SAAS,MAAM1B,mBAAmB;YACtC,QAAQiF,UAAU,MAAM;YACxB,yBAAyBA,UAAU,uBAAuB;QAC5D;QACA9E,KAAK,IAAI,IAAIuB;IACf;IAEA,MAAM,EAAE,SAAS2D,YAAY,EAAEC,KAAK,EAAE,GAAG,MAAM5B,SAC7CvD,MACA2B,aAAa,MAAM;IAErB,OAAO;QACL,SAASuD;QACTC;IACF;AACF"}
package/dist/es/ai-model/llm-planning.mjs
@@ -0,0 +1,85 @@
+import { vlLocateMode } from "@midscene/shared/env";
+import { paddingToMatchBlockByBase64 } from "@midscene/shared/img";
+import { assert } from "@midscene/shared/utils";
+import { AIActionType, buildYamlFlowFromPlans, callAiFn, fillBboxParam, markupImageForLLM, warnGPT4oSizeLimit } from "./common.mjs";
+import { automationUserPrompt, generateTaskBackgroundContext, systemPromptToTaskPlanning } from "./prompt/llm-planning.mjs";
+import { describeUserPage } from "./prompt/util.mjs";
+async function plan(userInstruction, opts) {
+    var _planFromAI_action;
+    const { callAI, context } = opts || {};
+    const { screenshotBase64, size } = context;
+    const { description: pageDescription, elementById } = await describeUserPage(context);
+    const systemPrompt = await systemPromptToTaskPlanning({
+        actionSpace: opts.actionSpace,
+        vlMode: vlLocateMode()
+    });
+    const taskBackgroundContextText = generateTaskBackgroundContext(userInstruction, opts.log, opts.actionContext);
+    const userInstructionPrompt = await automationUserPrompt(vlLocateMode()).format({
+        pageDescription,
+        taskBackgroundContext: taskBackgroundContextText
+    });
+    let imagePayload = screenshotBase64;
+    if ('qwen-vl' === vlLocateMode()) imagePayload = await paddingToMatchBlockByBase64(imagePayload);
+    else if (!vlLocateMode()) imagePayload = await markupImageForLLM(screenshotBase64, context.tree, context.size);
+    warnGPT4oSizeLimit(size);
+    const msgs = [
+        {
+            role: 'system',
+            content: systemPrompt
+        },
+        {
+            role: 'user',
+            content: [
+                {
+                    type: 'image_url',
+                    image_url: {
+                        url: imagePayload,
+                        detail: 'high'
+                    }
+                },
+                {
+                    type: 'text',
+                    text: userInstructionPrompt
+                }
+            ]
+        }
+    ];
+    const call = callAI || callAiFn;
+    const { content, usage } = await call(msgs, AIActionType.PLAN);
+    const rawResponse = JSON.stringify(content, void 0, 2);
+    const planFromAI = content;
+    const actions = ((null == (_planFromAI_action = planFromAI.action) ? void 0 : _planFromAI_action.type) ? [
+        planFromAI.action
+    ] : planFromAI.actions) || [];
+    const returnValue = {
+        ...planFromAI,
+        actions,
+        rawResponse,
+        usage,
+        yamlFlow: buildYamlFlowFromPlans(actions, planFromAI.sleep)
+    };
+    assert(planFromAI, "can't get plans from AI");
+    if (vlLocateMode()) {
+        actions.forEach((action)=>{
+            if (action.locate) try {
+                action.locate = fillBboxParam(action.locate, size.width, size.height);
+            } catch (e) {
+                throw new Error(`Failed to fill locate param: ${planFromAI.error} (${e instanceof Error ? e.message : 'unknown error'})`, {
+                    cause: e
+                });
+            }
+        });
+        assert(!planFromAI.error, `Failed to plan actions: ${planFromAI.error}`);
+    } else actions.forEach((action)=>{
+        var _action_locate;
+        if (null == (_action_locate = action.locate) ? void 0 : _action_locate.id) {
+            const element = elementById(action.locate.id);
+            if (element) action.locate.id = element.id;
+        }
+    });
+    if (0 === actions.length && returnValue.more_actions_needed_by_instruction && !returnValue.sleep) console.warn('No actions planned for the prompt, but model said more actions are needed:', userInstruction);
+    return returnValue;
+}
+export { plan };
+
+//# sourceMappingURL=llm-planning.mjs.map
package/dist/es/ai-model/llm-planning.mjs.map
@@ -0,0 +1 @@
+{"version":3,"file":"ai-model/llm-planning.mjs","sources":["webpack://@midscene/core/./src/ai-model/llm-planning.ts"],"sourcesContent":["import type {\n DeviceAction,\n PageType,\n PlanningAIResponse,\n UIContext,\n} from '@/types';\nimport { vlLocateMode } from '@midscene/shared/env';\nimport { paddingToMatchBlockByBase64 } from '@midscene/shared/img';\nimport { assert } from '@midscene/shared/utils';\nimport {\n AIActionType,\n type AIArgs,\n buildYamlFlowFromPlans,\n callAiFn,\n fillBboxParam,\n markupImageForLLM,\n warnGPT4oSizeLimit,\n} from './common';\nimport {\n automationUserPrompt,\n generateTaskBackgroundContext,\n systemPromptToTaskPlanning,\n} from './prompt/llm-planning';\nimport { describeUserPage } from './prompt/util';\n\nexport async function plan(\n userInstruction: string,\n opts: {\n context: UIContext;\n pageType: PageType;\n actionSpace: DeviceAction[];\n callAI?: typeof callAiFn<PlanningAIResponse>;\n log?: string;\n actionContext?: string;\n },\n): Promise<PlanningAIResponse> {\n const { callAI, context } = opts || {};\n const { screenshotBase64, size } = context;\n const { description: pageDescription, elementById } =\n await describeUserPage(context);\n\n const systemPrompt = await systemPromptToTaskPlanning({\n actionSpace: opts.actionSpace,\n vlMode: vlLocateMode(),\n });\n const taskBackgroundContextText = generateTaskBackgroundContext(\n userInstruction,\n opts.log,\n opts.actionContext,\n );\n const userInstructionPrompt = await automationUserPrompt(\n vlLocateMode(),\n ).format({\n pageDescription,\n taskBackgroundContext: taskBackgroundContextText,\n });\n\n let imagePayload = screenshotBase64;\n if (vlLocateMode() === 'qwen-vl') {\n imagePayload = await paddingToMatchBlockByBase64(imagePayload);\n } else if (!vlLocateMode()) {\n imagePayload = await markupImageForLLM(\n screenshotBase64,\n context.tree,\n context.size,\n );\n }\n\n warnGPT4oSizeLimit(size);\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: userInstructionPrompt,\n },\n ],\n },\n ];\n\n const call = callAI || callAiFn;\n const { content, usage } = await call(msgs, AIActionType.PLAN);\n const rawResponse = JSON.stringify(content, undefined, 2);\n const planFromAI = content;\n\n const actions =\n (planFromAI.action?.type ? [planFromAI.action] : planFromAI.actions) || [];\n const returnValue: PlanningAIResponse = {\n ...planFromAI,\n actions,\n rawResponse,\n usage,\n yamlFlow: buildYamlFlowFromPlans(actions, planFromAI.sleep),\n };\n\n assert(planFromAI, \"can't get plans from AI\");\n\n if (vlLocateMode()) {\n actions.forEach((action) => {\n if (action.locate) {\n try {\n action.locate = fillBboxParam(action.locate, size.width, size.height);\n } catch (e) {\n throw new Error(\n `Failed to fill locate param: ${planFromAI.error} (${\n e instanceof Error ? e.message : 'unknown error'\n })`,\n {\n cause: e,\n },\n );\n }\n }\n });\n // in Qwen-VL, error means error. In GPT-4o, error may mean more actions are needed.\n assert(!planFromAI.error, `Failed to plan actions: ${planFromAI.error}`);\n } else {\n actions.forEach((action) => {\n if (action.locate?.id) {\n // The model may return indexId, need to perform a query correction to avoid exceptions\n const element = elementById(action.locate.id);\n if (element) {\n action.locate.id = element.id;\n }\n }\n });\n }\n\n if (\n actions.length === 0 &&\n returnValue.more_actions_needed_by_instruction &&\n !returnValue.sleep\n ) {\n console.warn(\n 'No actions planned for the prompt, but model said more actions are needed:',\n userInstruction,\n );\n }\n\n return returnValue;\n}\n"],"names":["plan","userInstruction","opts","_planFromAI_action","callAI","context","screenshotBase64","size","pageDescription","elementById","describeUserPage","systemPrompt","systemPromptToTaskPlanning","vlLocateMode","taskBackgroundContextText","generateTaskBackgroundContext","userInstructionPrompt","automationUserPrompt","imagePayload","paddingToMatchBlockByBase64","markupImageForLLM","warnGPT4oSizeLimit","msgs","call","callAiFn","content","usage","AIActionType","rawResponse","JSON","undefined","planFromAI","actions","returnValue","buildYamlFlowFromPlans","assert","action","fillBboxParam","e","Error","_action_locate","element","console"],"mappings":";;;;;;AAyBO,eAAeA,KACpBC,eAAuB,EACvBC,IAOC;QA8DEC;IA5DH,MAAM,EAAEC,MAAM,EAAEC,OAAO,EAAE,GAAGH,QAAQ,CAAC;IACrC,MAAM,EAAEI,gBAAgB,EAAEC,IAAI,EAAE,GAAGF;IACnC,MAAM,EAAE,aAAaG,eAAe,EAAEC,WAAW,EAAE,GACjD,MAAMC,iBAAiBL;IAEzB,MAAMM,eAAe,MAAMC,2BAA2B;QACpD,aAAaV,KAAK,WAAW;QAC7B,QAAQW;IACV;IACA,MAAMC,4BAA4BC,8BAChCd,iBACAC,KAAK,GAAG,EACRA,KAAK,aAAa;IAEpB,MAAMc,wBAAwB,MAAMC,qBAClCJ,gBACA,MAAM,CAAC;QACPL;QACA,uBAAuBM;IACzB;IAEA,IAAII,eAAeZ;IACnB,IAAIO,AAAmB,cAAnBA,gBACFK,eAAe,MAAMC,4BAA4BD;SAC5C,IAAI,CAACL,gBACVK,eAAe,MAAME,kBACnBd,kBACAD,QAAQ,IAAI,EACZA,QAAQ,IAAI;IAIhBgB,mBAAmBd;IAEnB,MAAMe,OAAe;QACnB;YAAE,MAAM;YAAU,SAASX;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKO;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAMF;gBACR;aACD;QACH;KACD;IAED,MAAMO,OAAOnB,UAAUoB;IACvB,MAAM,EAAEC,OAAO,EAAEC,KAAK,EAAE,GAAG,MAAMH,KAAKD,MAAMK,aAAa,IAAI;IAC7D,MAAMC,cAAcC,KAAK,SAAS,CAACJ,SAASK,QAAW;IACvD,MAAMC,aAAaN;IAEnB,MAAMO,UACH7B,AAAAA,CAAAA,SAAAA,CAAAA,qBAAAA,WAAW,MAAM,AAAD,IAAhBA,KAAAA,IAAAA,mBAAmB,IAAI,AAAD,IAAI;QAAC4B,WAAW,MAAM;KAAC,GAAGA,WAAW,OAAM,KAAM,EAAE;IAC5E,MAAME,cAAkC;QACtC,GAAGF,UAAU;QACbC;QACAJ;QACAF;QACA,UAAUQ,uBAAuBF,SAASD,WAAW,KAAK;IAC5D;IAEAI,OAAOJ,YAAY;IAEnB,IAAIlB,gBAAgB;QAClBmB,QAAQ,OAAO,CAAC,CAACI;YACf,IAAIA,OAAO,MAAM,EACf,IAAI;gBACFA,OAAO,MAAM,GAAGC,cAAcD,OAAO,MAAM,EAAE7B,KAAK,KAAK,EAAEA,KAAK,MAAM;YACtE,EAAE,OAAO+B,GAAG;gBACV,MAAM,IAAIC,MACR,CAAC,6BAA6B,EAAER,WAAW,KAAK,CAAC,EAAE,EACjDO,aAAaC,QAAQD,EAAE,OAAO,GAAG,gBAClC,CAAC,CAAC,EACH;oBACE,OAAOA;gBACT;YAEJ;QAEJ;QAEAH,OAAO,CAACJ,WAAW,KAAK,EAAE,CAAC,wBAAwB,EAAEA,WAAW,KAAK,EAAE;IACzE,OACEC,QAAQ,OAAO,CAAC,CAACI;YACXI;QAAJ,IAAI,QAAAA,CAAAA,iBAAAA,OAAO,MAAM,AAAD,IAAZA,KAAAA,IAAAA,eAAe,EAAE,EAAE;YAErB,MAAMC,UAAUhC,YAAY2B,OAAO,MAAM,CAAC,EAAE;YAC5C,IAAIK,SACFL,OAAO,MAAM,CAAC,EAAE,GAAGK,QAAQ,EAAE;QAEjC;IACF;IAGF,IACET,AAAmB,MAAnBA,QAAQ,MAAM,IACdC,YAAY,kCAAkC,IAC9C,CAACA,YAAY,KAAK,EAElBS,QAAQ,IAAI,CACV,8EACAzC;IAIJ,OAAOgC;AACT"}
package/dist/es/ai-model/prompt/assertion.mjs
@@ -0,0 +1,55 @@
+import { getPreferredLanguage } from "@midscene/shared/env";
+const defaultAssertionPrompt = 'You are a senior testing engineer. User will give an assertion and a screenshot of a page. By carefully viewing the screenshot, please tell whether the assertion is truthy.';
+const defaultAssertionResponseJsonFormat = `Return in the following JSON format:
+{
+  pass: boolean, // whether the assertion is truthy
+  thought: string | null, // string, if the result is falsy, give the reason why it is falsy. Otherwise, put null.
+}`;
+const getUiTarsAssertionResponseJsonFormat = ()=>`## Output Json String Format
+\`\`\`
+"{
+  "pass": <<is a boolean value from the enum [true, false], true means the assertion is truthy>>,
+  "thought": "<<is a string, give the reason why the assertion is falsy or truthy. Otherwise.>>"
+}"
+\`\`\`
+
+## Rules **MUST** follow
+- Make sure to return **only** the JSON, with **no additional** text or explanations.
+- Use ${getPreferredLanguage()} in \`thought\` part.
+- You **MUST** strictly follow up the **Output Json String Format**.`;
+function systemPromptToAssert(model) {
+    return `${defaultAssertionPrompt}
+
+${model.isUITars ? getUiTarsAssertionResponseJsonFormat() : defaultAssertionResponseJsonFormat}`;
+}
+const assertSchema = {
+    type: 'json_schema',
+    json_schema: {
+        name: 'assert',
+        strict: true,
+        schema: {
+            type: 'object',
+            properties: {
+                pass: {
+                    type: 'boolean',
+                    description: 'Whether the assertion passed or failed'
+                },
+                thought: {
+                    type: [
+                        'string',
+                        'null'
+                    ],
+                    description: 'The thought process behind the assertion'
+                }
+            },
+            required: [
+                'pass',
+                'thought'
+            ],
+            additionalProperties: false
+        }
+    }
+};
+export { assertSchema, systemPromptToAssert };
+
+//# sourceMappingURL=assertion.mjs.map
package/dist/es/ai-model/prompt/assertion.mjs.map
@@ -0,0 +1 @@
+{"version":3,"file":"ai-model/prompt/assertion.mjs","sources":["webpack://@midscene/core/./src/ai-model/prompt/assertion.ts"],"sourcesContent":["import { getPreferredLanguage } from '@midscene/shared/env';\nimport type { ResponseFormatJSONSchema } from 'openai/resources/index';\n\nconst defaultAssertionPrompt =\n 'You are a senior testing engineer. User will give an assertion and a screenshot of a page. By carefully viewing the screenshot, please tell whether the assertion is truthy.';\n\nconst defaultAssertionResponseJsonFormat = `Return in the following JSON format:\n{\n pass: boolean, // whether the assertion is truthy\n thought: string | null, // string, if the result is falsy, give the reason why it is falsy. Otherwise, put null.\n}`;\n\nconst getUiTarsAssertionResponseJsonFormat = () => `## Output Json String Format\n\\`\\`\\`\n\"{\n \"pass\": <<is a boolean value from the enum [true, false], true means the assertion is truthy>>, \n \"thought\": \"<<is a string, give the reason why the assertion is falsy or truthy. Otherwise.>>\"\n}\"\n\\`\\`\\`\n\n## Rules **MUST** follow\n- Make sure to return **only** the JSON, with **no additional** text or explanations.\n- Use ${getPreferredLanguage()} in \\`thought\\` part.\n- You **MUST** strictly follow up the **Output Json String Format**.`;\n\nexport function systemPromptToAssert(model: { isUITars: boolean }) {\n return `${defaultAssertionPrompt}\n\n${model.isUITars ? getUiTarsAssertionResponseJsonFormat() : defaultAssertionResponseJsonFormat}`;\n}\n\nexport const assertSchema: ResponseFormatJSONSchema = {\n type: 'json_schema',\n json_schema: {\n name: 'assert',\n strict: true,\n schema: {\n type: 'object',\n properties: {\n pass: {\n type: 'boolean',\n description: 'Whether the assertion passed or failed',\n },\n thought: {\n type: ['string', 'null'],\n description: 'The thought process behind the assertion',\n },\n },\n required: ['pass', 'thought'],\n additionalProperties: false,\n },\n },\n};\n"],"names":["defaultAssertionPrompt","defaultAssertionResponseJsonFormat","getUiTarsAssertionResponseJsonFormat","getPreferredLanguage","systemPromptToAssert","model","assertSchema"],"mappings":";AAGA,MAAMA,yBACJ;AAEF,MAAMC,qCAAqC,CAAC;;;;CAI3C,CAAC;AAEF,MAAMC,uCAAuC,IAAM,CAAC;;;;;;;;;;MAU9C,EAAEC,uBAAuB;oEACqC,CAAC;AAE9D,SAASC,qBAAqBC,KAA4B;IAC/D,OAAO,GAAGL,uBAAuB;;AAEnC,EAAEK,MAAM,QAAQ,GAAGH,yCAAyCD,oCAAoC;AAChG;AAEO,MAAMK,eAAyC;IACpD,MAAM;IACN,aAAa;QACX,MAAM;QACN,QAAQ;QACR,QAAQ;YACN,MAAM;YACN,YAAY;gBACV,MAAM;oBACJ,MAAM;oBACN,aAAa;gBACf;gBACA,SAAS;oBACP,MAAM;wBAAC;wBAAU;qBAAO;oBACxB,aAAa;gBACf;YACF;YACA,UAAU;gBAAC;gBAAQ;aAAU;YAC7B,sBAAsB;QACxB;IACF;AACF"}
package/dist/es/ai-model/prompt/common.mjs.map
@@ -0,0 +1 @@
+{"version":3,"file":"ai-model/prompt/common.mjs","sources":["webpack://@midscene/core/./src/ai-model/prompt/common.ts"],"sourcesContent":["import type { vlLocateMode } from '@midscene/shared/env';\nexport function bboxDescription(vlMode: ReturnType<typeof vlLocateMode>) {\n if (vlMode === 'gemini') {\n return '2d bounding box as [ymin, xmin, ymax, xmax]';\n }\n return '2d bounding box as [xmin, ymin, xmax, ymax]';\n}\n"],"names":["bboxDescription","vlMode"],"mappings":"AACO,SAASA,gBAAgBC,MAAuC;IACrE,IAAIA,AAAW,aAAXA,QACF,OAAO;IAET,OAAO;AACT"}
package/dist/es/ai-model/prompt/describe.mjs
@@ -0,0 +1,44 @@
+import { getPreferredLanguage } from "@midscene/shared/env";
+const elementDescriberInstruction = ()=>`
+Describe the element in the red rectangle for precise identification. Use ${getPreferredLanguage()}.
+
+CRITICAL REQUIREMENTS:
+1. UNIQUENESS: The description must uniquely identify this element on the current page
+2. UNIVERSALITY: Use generic, reusable selectors that work across different contexts
+3. PRECISION: Be specific enough to distinguish from similar elements
+
+DESCRIPTION STRUCTURE:
+1. Element type (button, input, link, div, etc.)
+2. Primary identifier (in order of preference):
+   - Unique text content: "with text 'Login'"
+   - Unique attribute: "with aria-label 'Search'"
+   - Unique class/ID: "with class 'primary-button'"
+   - Unique position: "in header navigation"
+3. Secondary identifiers (if needed for uniqueness):
+   - Visual features: "blue background", "with icon"
+   - Relative position: "below search bar", "in sidebar"
+   - Parent context: "in login form", "in main menu"
+
+GUIDELINES:
+- Keep description under 25 words
+- Prioritize semantic identifiers over visual ones
+- Use consistent terminology across similar elements
+- Avoid page-specific or temporary content
+- Don't mention the red rectangle or selection box
+- Focus on stable, reusable characteristics
+
+EXAMPLES:
+- "Login button with text 'Sign In'"
+- "Search input with placeholder 'Enter keywords'"
+- "Navigation link with text 'Home' in header"
+- "Submit button in contact form"
+- "Menu icon with aria-label 'Open menu'"
+
+Return JSON:
+{
+  "description": "unique element identifier",
+  "error"?: "error message if any"
+}`;
+export { elementDescriberInstruction };
+
+//# sourceMappingURL=describe.mjs.map
package/dist/es/ai-model/prompt/describe.mjs.map
@@ -0,0 +1 @@
+{"version":3,"file":"ai-model/prompt/describe.mjs","sources":["webpack://@midscene/core/./src/ai-model/prompt/describe.ts"],"sourcesContent":["import { getPreferredLanguage } from '@midscene/shared/env';\n\nexport const elementDescriberInstruction = () => {\n return `\nDescribe the element in the red rectangle for precise identification. Use ${getPreferredLanguage()}.\n\nCRITICAL REQUIREMENTS:\n1. UNIQUENESS: The description must uniquely identify this element on the current page\n2. UNIVERSALITY: Use generic, reusable selectors that work across different contexts\n3. PRECISION: Be specific enough to distinguish from similar elements\n\nDESCRIPTION STRUCTURE:\n1. Element type (button, input, link, div, etc.)\n2. Primary identifier (in order of preference):\n - Unique text content: \"with text 'Login'\"\n - Unique attribute: \"with aria-label 'Search'\"\n - Unique class/ID: \"with class 'primary-button'\"\n - Unique position: \"in header navigation\"\n3. Secondary identifiers (if needed for uniqueness):\n - Visual features: \"blue background\", \"with icon\"\n - Relative position: \"below search bar\", \"in sidebar\"\n - Parent context: \"in login form\", \"in main menu\"\n\nGUIDELINES:\n- Keep description under 25 words\n- Prioritize semantic identifiers over visual ones\n- Use consistent terminology across similar elements\n- Avoid page-specific or temporary content\n- Don't mention the red rectangle or selection box\n- Focus on stable, reusable characteristics\n\nEXAMPLES:\n- \"Login button with text 'Sign In'\"\n- \"Search input with placeholder 'Enter keywords'\"\n- \"Navigation link with text 'Home' in header\"\n- \"Submit button in contact form\"\n- \"Menu icon with aria-label 'Open menu'\"\n\nReturn JSON:\n{\n \"description\": \"unique element identifier\",\n \"error\"?: \"error message if any\"\n}`;\n};\n"],"names":["elementDescriberInstruction","getPreferredLanguage"],"mappings":";AAEO,MAAMA,8BAA8B,IAClC,CAAC;0EACgE,EAAEC,uBAAuB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;CAsClG,CAAC"}