@donggui/core 1.5.4-donggui.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +9 -0
- package/dist/es/agent/agent.mjs +709 -0
- package/dist/es/agent/agent.mjs.map +1 -0
- package/dist/es/agent/common.mjs +0 -0
- package/dist/es/agent/execution-session.mjs +41 -0
- package/dist/es/agent/execution-session.mjs.map +1 -0
- package/dist/es/agent/index.mjs +6 -0
- package/dist/es/agent/task-builder.mjs +330 -0
- package/dist/es/agent/task-builder.mjs.map +1 -0
- package/dist/es/agent/task-cache.mjs +186 -0
- package/dist/es/agent/task-cache.mjs.map +1 -0
- package/dist/es/agent/tasks.mjs +422 -0
- package/dist/es/agent/tasks.mjs.map +1 -0
- package/dist/es/agent/ui-utils.mjs +91 -0
- package/dist/es/agent/ui-utils.mjs.map +1 -0
- package/dist/es/agent/utils.mjs +198 -0
- package/dist/es/agent/utils.mjs.map +1 -0
- package/dist/es/ai-model/auto-glm/actions.mjs +224 -0
- package/dist/es/ai-model/auto-glm/actions.mjs.map +1 -0
- package/dist/es/ai-model/auto-glm/index.mjs +6 -0
- package/dist/es/ai-model/auto-glm/parser.mjs +239 -0
- package/dist/es/ai-model/auto-glm/parser.mjs.map +1 -0
- package/dist/es/ai-model/auto-glm/planning.mjs +71 -0
- package/dist/es/ai-model/auto-glm/planning.mjs.map +1 -0
- package/dist/es/ai-model/auto-glm/prompt.mjs +222 -0
- package/dist/es/ai-model/auto-glm/prompt.mjs.map +1 -0
- package/dist/es/ai-model/auto-glm/util.mjs +9 -0
- package/dist/es/ai-model/auto-glm/util.mjs.map +1 -0
- package/dist/es/ai-model/conversation-history.mjs +195 -0
- package/dist/es/ai-model/conversation-history.mjs.map +1 -0
- package/dist/es/ai-model/index.mjs +11 -0
- package/dist/es/ai-model/inspect.mjs +386 -0
- package/dist/es/ai-model/inspect.mjs.map +1 -0
- package/dist/es/ai-model/llm-planning.mjs +233 -0
- package/dist/es/ai-model/llm-planning.mjs.map +1 -0
- package/dist/es/ai-model/prompt/common.mjs +7 -0
- package/dist/es/ai-model/prompt/common.mjs.map +1 -0
- package/dist/es/ai-model/prompt/describe.mjs +66 -0
- package/dist/es/ai-model/prompt/describe.mjs.map +1 -0
- package/dist/es/ai-model/prompt/extraction.mjs +129 -0
- package/dist/es/ai-model/prompt/extraction.mjs.map +1 -0
- package/dist/es/ai-model/prompt/llm-locator.mjs +51 -0
- package/dist/es/ai-model/prompt/llm-locator.mjs.map +1 -0
- package/dist/es/ai-model/prompt/llm-planning.mjs +364 -0
- package/dist/es/ai-model/prompt/llm-planning.mjs.map +1 -0
- package/dist/es/ai-model/prompt/llm-section-locator.mjs +44 -0
- package/dist/es/ai-model/prompt/llm-section-locator.mjs.map +1 -0
- package/dist/es/ai-model/prompt/order-sensitive-judge.mjs +35 -0
- package/dist/es/ai-model/prompt/order-sensitive-judge.mjs.map +1 -0
- package/dist/es/ai-model/prompt/playwright-generator.mjs +117 -0
- package/dist/es/ai-model/prompt/playwright-generator.mjs.map +1 -0
- package/dist/es/ai-model/prompt/ui-tars-planning.mjs +36 -0
- package/dist/es/ai-model/prompt/ui-tars-planning.mjs.map +1 -0
- package/dist/es/ai-model/prompt/util.mjs +59 -0
- package/dist/es/ai-model/prompt/util.mjs.map +1 -0
- package/dist/es/ai-model/prompt/yaml-generator.mjs +219 -0
- package/dist/es/ai-model/prompt/yaml-generator.mjs.map +1 -0
- package/dist/es/ai-model/service-caller/index.mjs +466 -0
- package/dist/es/ai-model/service-caller/index.mjs.map +1 -0
- package/dist/es/ai-model/ui-tars-planning.mjs +249 -0
- package/dist/es/ai-model/ui-tars-planning.mjs.map +1 -0
- package/dist/es/common.mjs +371 -0
- package/dist/es/common.mjs.map +1 -0
- package/dist/es/device/device-options.mjs +0 -0
- package/dist/es/device/index.mjs +300 -0
- package/dist/es/device/index.mjs.map +1 -0
- package/dist/es/dump/html-utils.mjs +211 -0
- package/dist/es/dump/html-utils.mjs.map +1 -0
- package/dist/es/dump/image-restoration.mjs +43 -0
- package/dist/es/dump/image-restoration.mjs.map +1 -0
- package/dist/es/dump/index.mjs +3 -0
- package/dist/es/index.mjs +15 -0
- package/dist/es/index.mjs.map +1 -0
- package/dist/es/report-generator.mjs +134 -0
- package/dist/es/report-generator.mjs.map +1 -0
- package/dist/es/report.mjs +111 -0
- package/dist/es/report.mjs.map +1 -0
- package/dist/es/screenshot-item.mjs +105 -0
- package/dist/es/screenshot-item.mjs.map +1 -0
- package/dist/es/service/index.mjs +256 -0
- package/dist/es/service/index.mjs.map +1 -0
- package/dist/es/service/utils.mjs +15 -0
- package/dist/es/service/utils.mjs.map +1 -0
- package/dist/es/skill/index.mjs +38 -0
- package/dist/es/skill/index.mjs.map +1 -0
- package/dist/es/task-runner.mjs +258 -0
- package/dist/es/task-runner.mjs.map +1 -0
- package/dist/es/task-timing.mjs +12 -0
- package/dist/es/task-timing.mjs.map +1 -0
- package/dist/es/tree.mjs +13 -0
- package/dist/es/tree.mjs.map +1 -0
- package/dist/es/types.mjs +196 -0
- package/dist/es/types.mjs.map +1 -0
- package/dist/es/utils.mjs +218 -0
- package/dist/es/utils.mjs.map +1 -0
- package/dist/es/yaml/builder.mjs +13 -0
- package/dist/es/yaml/builder.mjs.map +1 -0
- package/dist/es/yaml/index.mjs +4 -0
- package/dist/es/yaml/player.mjs +418 -0
- package/dist/es/yaml/player.mjs.map +1 -0
- package/dist/es/yaml/utils.mjs +73 -0
- package/dist/es/yaml/utils.mjs.map +1 -0
- package/dist/es/yaml.mjs +0 -0
- package/dist/lib/agent/agent.js +757 -0
- package/dist/lib/agent/agent.js.map +1 -0
- package/dist/lib/agent/common.js +5 -0
- package/dist/lib/agent/execution-session.js +75 -0
- package/dist/lib/agent/execution-session.js.map +1 -0
- package/dist/lib/agent/index.js +81 -0
- package/dist/lib/agent/index.js.map +1 -0
- package/dist/lib/agent/task-builder.js +367 -0
- package/dist/lib/agent/task-builder.js.map +1 -0
- package/dist/lib/agent/task-cache.js +238 -0
- package/dist/lib/agent/task-cache.js.map +1 -0
- package/dist/lib/agent/tasks.js +465 -0
- package/dist/lib/agent/tasks.js.map +1 -0
- package/dist/lib/agent/ui-utils.js +143 -0
- package/dist/lib/agent/ui-utils.js.map +1 -0
- package/dist/lib/agent/utils.js +275 -0
- package/dist/lib/agent/utils.js.map +1 -0
- package/dist/lib/ai-model/auto-glm/actions.js +258 -0
- package/dist/lib/ai-model/auto-glm/actions.js.map +1 -0
- package/dist/lib/ai-model/auto-glm/index.js +66 -0
- package/dist/lib/ai-model/auto-glm/index.js.map +1 -0
- package/dist/lib/ai-model/auto-glm/parser.js +282 -0
- package/dist/lib/ai-model/auto-glm/parser.js.map +1 -0
- package/dist/lib/ai-model/auto-glm/planning.js +105 -0
- package/dist/lib/ai-model/auto-glm/planning.js.map +1 -0
- package/dist/lib/ai-model/auto-glm/prompt.js +259 -0
- package/dist/lib/ai-model/auto-glm/prompt.js.map +1 -0
- package/dist/lib/ai-model/auto-glm/util.js +46 -0
- package/dist/lib/ai-model/auto-glm/util.js.map +1 -0
- package/dist/lib/ai-model/conversation-history.js +229 -0
- package/dist/lib/ai-model/conversation-history.js.map +1 -0
- package/dist/lib/ai-model/index.js +125 -0
- package/dist/lib/ai-model/index.js.map +1 -0
- package/dist/lib/ai-model/inspect.js +429 -0
- package/dist/lib/ai-model/inspect.js.map +1 -0
- package/dist/lib/ai-model/llm-planning.js +270 -0
- package/dist/lib/ai-model/llm-planning.js.map +1 -0
- package/dist/lib/ai-model/prompt/common.js +41 -0
- package/dist/lib/ai-model/prompt/common.js.map +1 -0
- package/dist/lib/ai-model/prompt/describe.js +100 -0
- package/dist/lib/ai-model/prompt/describe.js.map +1 -0
- package/dist/lib/ai-model/prompt/extraction.js +169 -0
- package/dist/lib/ai-model/prompt/extraction.js.map +1 -0
- package/dist/lib/ai-model/prompt/llm-locator.js +88 -0
- package/dist/lib/ai-model/prompt/llm-locator.js.map +1 -0
- package/dist/lib/ai-model/prompt/llm-planning.js +401 -0
- package/dist/lib/ai-model/prompt/llm-planning.js.map +1 -0
- package/dist/lib/ai-model/prompt/llm-section-locator.js +81 -0
- package/dist/lib/ai-model/prompt/llm-section-locator.js.map +1 -0
- package/dist/lib/ai-model/prompt/order-sensitive-judge.js +72 -0
- package/dist/lib/ai-model/prompt/order-sensitive-judge.js.map +1 -0
- package/dist/lib/ai-model/prompt/playwright-generator.js +178 -0
- package/dist/lib/ai-model/prompt/playwright-generator.js.map +1 -0
- package/dist/lib/ai-model/prompt/ui-tars-planning.js +73 -0
- package/dist/lib/ai-model/prompt/ui-tars-planning.js.map +1 -0
- package/dist/lib/ai-model/prompt/util.js +105 -0
- package/dist/lib/ai-model/prompt/util.js.map +1 -0
- package/dist/lib/ai-model/prompt/yaml-generator.js +280 -0
- package/dist/lib/ai-model/prompt/yaml-generator.js.map +1 -0
- package/dist/lib/ai-model/service-caller/index.js +531 -0
- package/dist/lib/ai-model/service-caller/index.js.map +1 -0
- package/dist/lib/ai-model/ui-tars-planning.js +283 -0
- package/dist/lib/ai-model/ui-tars-planning.js.map +1 -0
- package/dist/lib/common.js +480 -0
- package/dist/lib/common.js.map +1 -0
- package/dist/lib/device/device-options.js +20 -0
- package/dist/lib/device/device-options.js.map +1 -0
- package/dist/lib/device/index.js +418 -0
- package/dist/lib/device/index.js.map +1 -0
- package/dist/lib/dump/html-utils.js +281 -0
- package/dist/lib/dump/html-utils.js.map +1 -0
- package/dist/lib/dump/image-restoration.js +77 -0
- package/dist/lib/dump/image-restoration.js.map +1 -0
- package/dist/lib/dump/index.js +60 -0
- package/dist/lib/dump/index.js.map +1 -0
- package/dist/lib/index.js +146 -0
- package/dist/lib/index.js.map +1 -0
- package/dist/lib/report-generator.js +172 -0
- package/dist/lib/report-generator.js.map +1 -0
- package/dist/lib/report.js +145 -0
- package/dist/lib/report.js.map +1 -0
- package/dist/lib/screenshot-item.js +139 -0
- package/dist/lib/screenshot-item.js.map +1 -0
- package/dist/lib/service/index.js +290 -0
- package/dist/lib/service/index.js.map +1 -0
- package/dist/lib/service/utils.js +49 -0
- package/dist/lib/service/utils.js.map +1 -0
- package/dist/lib/skill/index.js +72 -0
- package/dist/lib/skill/index.js.map +1 -0
- package/dist/lib/task-runner.js +295 -0
- package/dist/lib/task-runner.js.map +1 -0
- package/dist/lib/task-timing.js +46 -0
- package/dist/lib/task-timing.js.map +1 -0
- package/dist/lib/tree.js +53 -0
- package/dist/lib/tree.js.map +1 -0
- package/dist/lib/types.js +285 -0
- package/dist/lib/types.js.map +1 -0
- package/dist/lib/utils.js +297 -0
- package/dist/lib/utils.js.map +1 -0
- package/dist/lib/yaml/builder.js +57 -0
- package/dist/lib/yaml/builder.js.map +1 -0
- package/dist/lib/yaml/index.js +81 -0
- package/dist/lib/yaml/index.js.map +1 -0
- package/dist/lib/yaml/player.js +452 -0
- package/dist/lib/yaml/player.js.map +1 -0
- package/dist/lib/yaml/utils.js +126 -0
- package/dist/lib/yaml/utils.js.map +1 -0
- package/dist/lib/yaml.js +20 -0
- package/dist/lib/yaml.js.map +1 -0
- package/dist/types/agent/agent.d.ts +190 -0
- package/dist/types/agent/common.d.ts +0 -0
- package/dist/types/agent/execution-session.d.ts +36 -0
- package/dist/types/agent/index.d.ts +10 -0
- package/dist/types/agent/task-builder.d.ts +34 -0
- package/dist/types/agent/task-cache.d.ts +48 -0
- package/dist/types/agent/tasks.d.ts +70 -0
- package/dist/types/agent/ui-utils.d.ts +14 -0
- package/dist/types/agent/utils.d.ts +29 -0
- package/dist/types/ai-model/auto-glm/actions.d.ts +77 -0
- package/dist/types/ai-model/auto-glm/index.d.ts +6 -0
- package/dist/types/ai-model/auto-glm/parser.d.ts +18 -0
- package/dist/types/ai-model/auto-glm/planning.d.ts +10 -0
- package/dist/types/ai-model/auto-glm/prompt.d.ts +27 -0
- package/dist/types/ai-model/auto-glm/util.d.ts +13 -0
- package/dist/types/ai-model/conversation-history.d.ts +105 -0
- package/dist/types/ai-model/index.d.ts +14 -0
- package/dist/types/ai-model/inspect.d.ts +58 -0
- package/dist/types/ai-model/llm-planning.d.ts +19 -0
- package/dist/types/ai-model/prompt/common.d.ts +2 -0
- package/dist/types/ai-model/prompt/describe.d.ts +1 -0
- package/dist/types/ai-model/prompt/extraction.d.ts +7 -0
- package/dist/types/ai-model/prompt/llm-locator.d.ts +3 -0
- package/dist/types/ai-model/prompt/llm-planning.d.ts +10 -0
- package/dist/types/ai-model/prompt/llm-section-locator.d.ts +3 -0
- package/dist/types/ai-model/prompt/order-sensitive-judge.d.ts +2 -0
- package/dist/types/ai-model/prompt/playwright-generator.d.ts +26 -0
- package/dist/types/ai-model/prompt/ui-tars-planning.d.ts +2 -0
- package/dist/types/ai-model/prompt/util.d.ts +33 -0
- package/dist/types/ai-model/prompt/yaml-generator.d.ts +100 -0
- package/dist/types/ai-model/service-caller/index.d.ts +49 -0
- package/dist/types/ai-model/ui-tars-planning.d.ts +72 -0
- package/dist/types/common.d.ts +288 -0
- package/dist/types/device/device-options.d.ts +142 -0
- package/dist/types/device/index.d.ts +2315 -0
- package/dist/types/dump/html-utils.d.ts +52 -0
- package/dist/types/dump/image-restoration.d.ts +6 -0
- package/dist/types/dump/index.d.ts +5 -0
- package/dist/types/index.d.ts +17 -0
- package/dist/types/report-generator.d.ts +48 -0
- package/dist/types/report.d.ts +15 -0
- package/dist/types/screenshot-item.d.ts +66 -0
- package/dist/types/service/index.d.ts +23 -0
- package/dist/types/service/utils.d.ts +2 -0
- package/dist/types/skill/index.d.ts +25 -0
- package/dist/types/task-runner.d.ts +48 -0
- package/dist/types/task-timing.d.ts +8 -0
- package/dist/types/tree.d.ts +4 -0
- package/dist/types/types.d.ts +645 -0
- package/dist/types/utils.d.ts +40 -0
- package/dist/types/yaml/builder.d.ts +2 -0
- package/dist/types/yaml/index.d.ts +4 -0
- package/dist/types/yaml/player.d.ts +34 -0
- package/dist/types/yaml/utils.d.ts +9 -0
- package/dist/types/yaml.d.ts +203 -0
- package/package.json +111 -0
|
@@ -0,0 +1,386 @@
|
|
|
1
|
+
import { generateElementByPoint, generateElementByRect } from "@midscene/shared/extractor/dom-util";
|
|
2
|
+
import { cropByRect, paddingToMatchBlockByBase64, preProcessImageUrl, scaleImage } from "@midscene/shared/img";
|
|
3
|
+
import { getDebug } from "@midscene/shared/logger";
|
|
4
|
+
import { assert } from "@midscene/shared/utils";
|
|
5
|
+
import { adaptBboxToRect, expandSearchArea, mergeRects } from "../common.mjs";
|
|
6
|
+
import { parseAutoGLMLocateResponse } from "./auto-glm/parser.mjs";
|
|
7
|
+
import { getAutoGLMLocatePrompt } from "./auto-glm/prompt.mjs";
|
|
8
|
+
import { isAutoGLM } from "./auto-glm/util.mjs";
|
|
9
|
+
import { extractDataQueryPrompt, parseXMLExtractionResponse, systemPromptToExtract } from "./prompt/extraction.mjs";
|
|
10
|
+
import { findElementPrompt, systemPromptToLocateElement } from "./prompt/llm-locator.mjs";
|
|
11
|
+
import { sectionLocatorInstruction, systemPromptToLocateSection } from "./prompt/llm-section-locator.mjs";
|
|
12
|
+
import { orderSensitiveJudgePrompt, systemPromptToJudgeOrderSensitive } from "./prompt/order-sensitive-judge.mjs";
|
|
13
|
+
import { AIResponseParseError, callAI, callAIWithObjectResponse, callAIWithStringResponse } from "./service-caller/index.mjs";
|
|
14
|
+
const debugInspect = getDebug('ai:inspect');
|
|
15
|
+
const debugSection = getDebug('ai:section');
|
|
16
|
+
/**
 * Pull the plain-text instruction out of a user prompt.
 *
 * A prompt may be passed either as a bare string or as an object whose
 * `prompt` field carries the text (alongside optional multimodal fields
 * such as `images`).
 *
 * @param {string|{prompt: string}} prompt - user-supplied prompt.
 * @returns {string} the textual part of the prompt.
 */
const extraTextFromUserPrompt = (prompt) => {
    // Strict equality: `==` adds nothing for a typeof check and violates
    // the codebase-wide strict-comparison convention.
    if (typeof prompt === 'string') return prompt;
    return prompt.prompt;
};
|
|
20
|
+
/**
 * Convert an optional multimodal prompt into extra chat messages.
 *
 * For every reference image, three user messages are appended: one
 * announcement line (emitted once), one caption naming the image, and one
 * image_url payload carrying the base64 data.
 *
 * @param {object|undefined} multimodalPrompt - may hold `images` and
 *   `convertHttpImage2Base64`.
 * @returns {Promise<Array>} chat messages; empty when there are no images.
 */
const promptsToChatParam = async (multimodalPrompt) => {
    const messages = [];
    const images = multimodalPrompt?.images;
    if (!images?.length) return messages;
    // Announce the reference-image section once, up front.
    messages.push({
        role: 'user',
        content: [
            {
                type: 'text',
                text: 'Next, I will provide all the reference images.'
            }
        ]
    });
    for (const image of images) {
        // Normalize the image source to base64 (optionally fetching http URLs);
        // sequential awaits preserve the original ordering of messages.
        const encoded = await preProcessImageUrl(image.url, !!multimodalPrompt.convertHttpImage2Base64);
        messages.push(
            {
                role: 'user',
                content: [
                    {
                        type: 'text',
                        text: `this is the reference image named '${image.name}':`
                    }
                ]
            },
            {
                role: 'user',
                content: [
                    {
                        type: 'image_url',
                        image_url: {
                            url: encoded,
                            detail: 'high'
                        }
                    }
                ]
            }
        );
    }
    return messages;
};
|
|
59
|
+
/**
 * Locate a single UI element in the current screenshot via an AI model.
 *
 * Two response protocols are supported:
 *  - auto-GLM family models: a raw string response is parsed into a point
 *    in a 0-999 coordinate space, then scaled to pixels.
 *  - all other models: an object response carrying a `bbox` array, which is
 *    adapted into a rect.
 *
 * @param {object} options
 * @param {object} options.context - UI context; `context.screenshot.base64`
 *   and `context.shotSize` are read here.
 * @param {string|object} options.targetElementDescription - text prompt, or
 *   an object with `prompt` plus optional reference `images`.
 * @param {object} options.modelConfig - model configuration; `modelFamily`
 *   is used to pick prompts and coordinate handling.
 * @param {object} [options.searchConfig] - optional pre-cropped search area
 *   (`rect`, `imageBase64`, optional `scale`) used instead of the full shot.
 * @param {AbortSignal} [options.abortSignal] - forwarded to the AI call.
 * @returns {Promise<object>} `{ rect, parseResult: { elements, errors },
 *   rawResponse, usage, reasoning_content }` — never throws on AI-call or
 *   parse failure; errors are reported via `parseResult.errors`.
 */
async function AiLocateElement(options) {
    const { context, targetElementDescription, modelConfig } = options;
    const { modelFamily } = modelConfig;
    const screenshotBase64 = context.screenshot.base64;
    assert(targetElementDescription, "cannot find the target element description");
    const targetElementDescriptionText = extraTextFromUserPrompt(targetElementDescription);
    const userInstructionPrompt = findElementPrompt(targetElementDescriptionText);
    // System prompt differs per model family: auto-GLM gets its dedicated
    // locate prompt; everything else the generic locator prompt.
    const systemPrompt = isAutoGLM(modelFamily) ? getAutoGLMLocatePrompt(modelFamily) : systemPromptToLocateElement(modelFamily);
    // Defaults: full screenshot with the original shot dimensions.
    let imagePayload = screenshotBase64;
    let imageWidth = context.shotSize.width;
    let imageHeight = context.shotSize.height;
    let originalImageWidth = imageWidth;
    let originalImageHeight = imageHeight;
    if (options.searchConfig) {
        // Search within a pre-cropped area instead of the full screenshot.
        assert(options.searchConfig.rect, 'searchArea is provided but its rect cannot be found. Failed to locate element');
        assert(options.searchConfig.imageBase64, 'searchArea is provided but its imageBase64 cannot be found. Failed to locate element');
        imagePayload = options.searchConfig.imageBase64;
        imageWidth = options.searchConfig.rect?.width;
        imageHeight = options.searchConfig.rect?.height;
        originalImageWidth = imageWidth;
        originalImageHeight = imageHeight;
    } else if ('qwen2.5-vl' === modelFamily) {
        // qwen2.5-vl expects block-aligned image dimensions; pad the image
        // and track the padded size for coordinate adaptation.
        const paddedResult = await paddingToMatchBlockByBase64(imagePayload);
        imageWidth = paddedResult.width;
        imageHeight = paddedResult.height;
        imagePayload = paddedResult.imageBase64;
    }
    const msgs = [
        {
            role: 'system',
            content: systemPrompt
        },
        {
            role: 'user',
            content: [
                {
                    type: 'image_url',
                    image_url: {
                        url: imagePayload,
                        detail: 'high'
                    }
                },
                {
                    type: 'text',
                    // auto-GLM expects the action verb ("Tap:") prefixed.
                    text: isAutoGLM(modelFamily) ? `Tap: ${userInstructionPrompt}` : userInstructionPrompt
                }
            ]
        }
    ];
    // Object-form prompts may carry reference images; append them as extra
    // user messages.
    if ('string' != typeof targetElementDescription) {
        const addOns = await promptsToChatParam({
            images: targetElementDescription.images,
            convertHttpImage2Base64: targetElementDescription.convertHttpImage2Base64
        });
        msgs.push(...addOns);
    }
    if (isAutoGLM(modelFamily)) {
        // --- auto-GLM path: string response parsed into a point ---
        const { content: rawResponseContent, usage } = await callAIWithStringResponse(msgs, modelConfig, {
            abortSignal: options.abortSignal
        });
        debugInspect('auto-glm rawResponse:', rawResponseContent);
        const parsed = parseAutoGLMLocateResponse(rawResponseContent);
        debugInspect('auto-glm thinking:', parsed.think);
        debugInspect('auto-glm coordinates:', parsed.coordinates);
        let resRect;
        let matchedElements = [];
        let errors = [];
        if (parsed.error || !parsed.coordinates) {
            errors = [
                parsed.error || 'Failed to parse auto-glm response'
            ];
            debugInspect('auto-glm parse error:', errors[0]);
        } else {
            const { x, y } = parsed.coordinates;
            debugInspect('auto-glm coordinates [0-999]:', {
                x,
                y
            });
            // Scale from the model's 0-999 coordinate space to pixels.
            const pixelX = Math.round(x * imageWidth / 1000);
            const pixelY = Math.round(y * imageHeight / 1000);
            debugInspect('auto-glm pixel coordinates:', {
                pixelX,
                pixelY
            });
            let finalX = pixelX;
            let finalY = pixelY;
            if (options.searchConfig?.rect) {
                // Coordinates are relative to the cropped search area;
                // translate back into full-screenshot space.
                finalX += options.searchConfig.rect.left;
                finalY += options.searchConfig.rect.top;
            }
            const element = generateElementByPoint([
                finalX,
                finalY
            ], targetElementDescriptionText);
            resRect = element.rect;
            debugInspect('auto-glm resRect:', resRect);
            // NOTE(review): `element` was already dereferenced above
            // (`element.rect`), so this truthiness check is dead code.
            if (element) matchedElements = [
                element
            ];
        }
        return {
            rect: resRect,
            parseResult: {
                elements: matchedElements,
                errors
            },
            rawResponse: rawResponseContent,
            usage,
            reasoning_content: parsed.think
        };
    }
    // --- generic path: object response with a bbox ---
    let res;
    try {
        res = await callAIWithObjectResponse(msgs, modelConfig, {
            abortSignal: options.abortSignal
        });
    } catch (callError) {
        // AI-call failures are converted into an error result rather than
        // propagated; AIResponseParseError preserves the raw text and usage.
        const errorMessage = callError instanceof Error ? callError.message : String(callError);
        const rawResponse = callError instanceof AIResponseParseError ? callError.rawResponse : errorMessage;
        const usage = callError instanceof AIResponseParseError ? callError.usage : void 0;
        return {
            rect: void 0,
            parseResult: {
                elements: [],
                errors: [
                    `AI call error: ${errorMessage}`
                ]
            },
            rawResponse,
            usage,
            reasoning_content: void 0
        };
    }
    const rawResponse = JSON.stringify(res.content);
    let resRect;
    let matchedElements = [];
    // Start with any errors the model itself reported.
    let errors = 'errors' in res.content ? res.content.errors : [];
    try {
        if ('bbox' in res.content && Array.isArray(res.content.bbox) && res.content.bbox.length >= 1) {
            // Adapt the model-space bbox into a screenshot-space rect,
            // compensating for search-area offset, padding, and scale.
            resRect = adaptBboxToRect(res.content.bbox, imageWidth, imageHeight, options.searchConfig?.rect?.left, options.searchConfig?.rect?.top, originalImageWidth, originalImageHeight, modelFamily, options.searchConfig?.scale);
            debugInspect('resRect', resRect);
            const element = generateElementByRect(resRect, targetElementDescriptionText);
            // A usable bbox supersedes model-reported errors.
            errors = [];
            if (element) matchedElements = [
                element
            ];
        }
    } catch (e) {
        const msg = e instanceof Error ? `Failed to parse bbox: ${e.message}` : 'unknown error in locate';
        // NOTE(review): `errors && errors?.length` double-guards the same
        // value; `errors?.length` alone would suffice.
        if (errors && errors?.length !== 0) errors.push(`(${msg})`);
        else errors = [
            msg
        ];
    }
    return {
        rect: resRect,
        parseResult: {
            elements: matchedElements,
            errors: errors
        },
        rawResponse,
        usage: res.usage,
        reasoning_content: res.reasoning_content
    };
}
|
|
224
|
+
/**
 * Locate a screen section (search area) in the screenshot via an AI model,
 * then crop and upscale that area for a follow-up element-locate call.
 *
 * @param {object} options
 * @param {object} options.context - UI context; `context.screenshot.base64`
 *   and `context.shotSize` are read here.
 * @param {string|object} options.sectionDescription - text prompt, or an
 *   object with `prompt` plus optional reference `images`.
 * @param {object} options.modelConfig - model configuration; `modelFamily`
 *   selects prompt wording and bbox adaptation.
 * @param {AbortSignal} [options.abortSignal] - forwarded to the AI call.
 * @returns {Promise<object>} `{ rect, imageBase64, scale, error,
 *   rawResponse, usage }`. On AI-call failure, returns an error result
 *   (rect/imageBase64 undefined) instead of throwing. `rect.width/height`
 *   are the SCALED dimensions of the cropped image, while `rect.left/top`
 *   remain in original screenshot space.
 */
async function AiLocateSection(options) {
    const { context, sectionDescription, modelConfig } = options;
    const { modelFamily } = modelConfig;
    const screenshotBase64 = context.screenshot.base64;
    const systemPrompt = systemPromptToLocateSection(modelFamily);
    const sectionLocatorInstructionText = sectionLocatorInstruction(extraTextFromUserPrompt(sectionDescription));
    const msgs = [
        {
            role: 'system',
            content: systemPrompt
        },
        {
            role: 'user',
            content: [
                {
                    type: 'image_url',
                    image_url: {
                        url: screenshotBase64,
                        detail: 'high'
                    }
                },
                {
                    type: 'text',
                    text: sectionLocatorInstructionText
                }
            ]
        }
    ];
    // Object-form prompts may carry reference images; append them.
    if ('string' != typeof sectionDescription) {
        const addOns = await promptsToChatParam({
            images: sectionDescription.images,
            convertHttpImage2Base64: sectionDescription.convertHttpImage2Base64
        });
        msgs.push(...addOns);
    }
    let result;
    try {
        result = await callAIWithObjectResponse(msgs, modelConfig, {
            abortSignal: options.abortSignal
        });
    } catch (callError) {
        // Convert AI-call failures into an error result rather than throwing;
        // AIResponseParseError preserves the raw response text and usage.
        const errorMessage = callError instanceof Error ? callError.message : String(callError);
        const rawResponse = callError instanceof AIResponseParseError ? callError.rawResponse : errorMessage;
        const usage = callError instanceof AIResponseParseError ? callError.usage : void 0;
        return {
            rect: void 0,
            imageBase64: void 0,
            error: `AI call error: ${errorMessage}`,
            rawResponse,
            usage
        };
    }
    let sectionRect;
    const sectionBbox = result.content.bbox;
    if (sectionBbox) {
        // Adapt the model's bbox into screenshot space (no crop offset, so
        // the offset args are 0,0 and dimensions are the full shot size).
        const targetRect = adaptBboxToRect(sectionBbox, context.shotSize.width, context.shotSize.height, 0, 0, context.shotSize.width, context.shotSize.height, modelFamily);
        debugSection('original targetRect %j', targetRect);
        // The model may also return reference bboxes; merge them with the
        // target so the search area covers everything it pointed at.
        const referenceBboxList = result.content.references_bbox || [];
        debugSection('referenceBboxList %j', referenceBboxList);
        const referenceRects = referenceBboxList.filter((bbox)=>Array.isArray(bbox)).map((bbox)=>adaptBboxToRect(bbox, context.shotSize.width, context.shotSize.height, 0, 0, context.shotSize.width, context.shotSize.height, modelFamily));
        debugSection('referenceRects %j', referenceRects);
        const mergedRect = mergeRects([
            targetRect,
            ...referenceRects
        ]);
        debugSection('mergedRect %j', mergedRect);
        // Pad the merged rect (clamped to the shot bounds) to tolerate
        // slightly-off section boundaries.
        sectionRect = expandSearchArea(mergedRect, context.shotSize);
        debugSection('expanded sectionRect %j', sectionRect);
    }
    let imageBase64 = screenshotBase64;
    let scale;
    if (sectionRect) {
        const originalWidth = sectionRect.width;
        const originalHeight = sectionRect.height;
        // qwen2.5-vl gets a block-aligned crop (third arg).
        const croppedResult = await cropByRect(screenshotBase64, sectionRect, 'qwen2.5-vl' === modelFamily);
        // Upscale the crop 2x so small targets are easier for the model.
        const scaleRatio = 2;
        const scaledResult = await scaleImage(croppedResult.imageBase64, scaleRatio);
        imageBase64 = scaledResult.imageBase64;
        scale = scaleRatio;
        // In-place mutation: the returned rect carries the SCALED crop size
        // (callers use it together with `scale` to map coordinates back).
        sectionRect.width = scaledResult.width;
        sectionRect.height = scaledResult.height;
        debugSection('scaled sectionRect from %dx%d to %dx%d (scale=%d)', originalWidth, originalHeight, sectionRect.width, sectionRect.height, scale);
    }
    return {
        rect: sectionRect,
        imageBase64,
        scale,
        error: result.content.error,
        rawResponse: JSON.stringify(result.content),
        usage: result.usage
    };
}
|
|
316
|
+
/**
 * Extract structured data from the current UI via the AI model.
 *
 * Builds an extraction prompt (optionally with the screenshot attached and
 * extra reference images), calls the model, and parses its XML response.
 *
 * @param {object} options
 * @param {*} options.dataQuery - description of the data to extract.
 * @param {object} options.context - UI context; screenshot base64 is read.
 * @param {object} [options.extractOption] - `screenshotIncluded: false`
 *   omits the screenshot from the request.
 * @param {object} [options.multimodalPrompt] - extra reference images.
 * @param {object} options.modelConfig - model configuration.
 * @param {string} [options.pageDescription] - optional page context text.
 * @returns {Promise<object>} `{ parseResult, rawResponse, usage,
 *   reasoning_content }`.
 * @throws {AIResponseParseError} when the XML response cannot be parsed.
 */
async function AiExtractElementInfo(options) {
    const { dataQuery, context, extractOption, multimodalPrompt, modelConfig } = options;
    const systemPrompt = systemPromptToExtract();
    const screenshotBase64 = context.screenshot.base64;
    const extractDataPromptText = extractDataQueryPrompt(options.pageDescription || '', dataQuery);
    // Assemble the user message: screenshot (unless disabled) + query text.
    const userContent = [];
    if (extractOption?.screenshotIncluded !== false) {
        userContent.push({
            type: 'image_url',
            image_url: {
                url: screenshotBase64,
                detail: 'high'
            }
        });
    }
    userContent.push({
        type: 'text',
        text: extractDataPromptText
    });
    const msgs = [
        {
            role: 'system',
            content: systemPrompt
        },
        {
            role: 'user',
            content: userContent
        }
    ];
    // Optional reference images become extra user messages.
    if (multimodalPrompt) {
        msgs.push(...await promptsToChatParam({
            images: multimodalPrompt.images,
            convertHttpImage2Base64: multimodalPrompt.convertHttpImage2Base64
        }));
    }
    const { content: rawResponse, usage, reasoning_content } = await callAI(msgs, modelConfig);
    try {
        return {
            parseResult: parseXMLExtractionResponse(rawResponse),
            rawResponse,
            usage,
            reasoning_content
        };
    } catch (parseError) {
        // Wrap parse failures so callers still get the raw text and usage.
        const errorMessage = parseError instanceof Error ? parseError.message : String(parseError);
        throw new AIResponseParseError(`XML parse error: ${errorMessage}`, rawResponse, usage);
    }
}
|
|
365
|
+
/**
 * Ask the model whether the described task is sensitive to element order.
 *
 * @param {string} description - the task/query description to judge.
 * @param {Function} callAIFn - AI invoker receiving (messages, modelConfig)
 *   and resolving to `{ content, usage }`.
 * @param {object} modelConfig - model configuration forwarded to callAIFn.
 * @returns {Promise<{isOrderSensitive: boolean, usage: *}>} the judgement
 *   plus token usage.
 */
async function AiJudgeOrderSensitive(description, callAIFn, modelConfig) {
    const messages = [
        {
            role: 'system',
            content: systemPromptToJudgeOrderSensitive()
        },
        {
            role: 'user',
            content: orderSensitiveJudgePrompt(description)
        }
    ];
    const { content, usage } = await callAIFn(messages, modelConfig);
    // A missing/undefined judgement is treated as "not order sensitive".
    return {
        isOrderSensitive: content.isOrderSensitive ?? false,
        usage
    };
}
|
|
384
|
+
export { AiExtractElementInfo, AiJudgeOrderSensitive, AiLocateElement, AiLocateSection };
|
|
385
|
+
|
|
386
|
+
//# sourceMappingURL=inspect.mjs.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ai-model/inspect.mjs","sources":["../../../src/ai-model/inspect.ts"],"sourcesContent":["import type {\n AIDataExtractionResponse,\n AIElementResponse,\n AISectionLocatorResponse,\n AIUsageInfo,\n Rect,\n ServiceExtractOption,\n UIContext,\n} from '@/types';\nimport type { IModelConfig } from '@midscene/shared/env';\nimport {\n generateElementByPoint,\n generateElementByRect,\n} from '@midscene/shared/extractor/dom-util';\nimport {\n cropByRect,\n paddingToMatchBlockByBase64,\n preProcessImageUrl,\n scaleImage,\n} from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport type { LocateResultElement } from '@midscene/shared/types';\nimport { assert } from '@midscene/shared/utils';\nimport type {\n ChatCompletionSystemMessageParam,\n ChatCompletionUserMessageParam,\n} from 'openai/resources/index';\nimport type { TMultimodalPrompt, TUserPrompt } from '../common';\nimport { adaptBboxToRect, expandSearchArea, mergeRects } from '../common';\nimport { parseAutoGLMLocateResponse } from './auto-glm/parser';\nimport { getAutoGLMLocatePrompt } from './auto-glm/prompt';\nimport { isAutoGLM } from './auto-glm/util';\nimport {\n extractDataQueryPrompt,\n parseXMLExtractionResponse,\n systemPromptToExtract,\n} from './prompt/extraction';\nimport {\n findElementPrompt,\n systemPromptToLocateElement,\n} from './prompt/llm-locator';\nimport {\n sectionLocatorInstruction,\n systemPromptToLocateSection,\n} from './prompt/llm-section-locator';\nimport {\n orderSensitiveJudgePrompt,\n systemPromptToJudgeOrderSensitive,\n} from './prompt/order-sensitive-judge';\nimport {\n AIResponseParseError,\n callAI,\n callAIWithObjectResponse,\n callAIWithStringResponse,\n} from './service-caller/index';\n\nexport type AIArgs = [\n ChatCompletionSystemMessageParam,\n ...ChatCompletionUserMessageParam[],\n];\n\nconst debugInspect = getDebug('ai:inspect');\nconst debugSection = getDebug('ai:section');\n\nconst extraTextFromUserPrompt = (prompt: 
TUserPrompt): string => {\n if (typeof prompt === 'string') {\n return prompt;\n } else {\n return prompt.prompt;\n }\n};\n\nconst promptsToChatParam = async (\n multimodalPrompt: TMultimodalPrompt,\n): Promise<ChatCompletionUserMessageParam[]> => {\n const msgs: ChatCompletionUserMessageParam[] = [];\n if (multimodalPrompt?.images?.length) {\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'text',\n text: 'Next, I will provide all the reference images.',\n },\n ],\n });\n\n for (const item of multimodalPrompt.images) {\n const base64 = await preProcessImageUrl(\n item.url,\n !!multimodalPrompt.convertHttpImage2Base64,\n );\n\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'text',\n text: `this is the reference image named '${item.name}':`,\n },\n ],\n });\n\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: base64,\n detail: 'high',\n },\n },\n ],\n });\n }\n }\n return msgs;\n};\n\nexport async function AiLocateElement(options: {\n context: UIContext;\n targetElementDescription: TUserPrompt;\n searchConfig?: Awaited<ReturnType<typeof AiLocateSection>>;\n modelConfig: IModelConfig;\n abortSignal?: AbortSignal;\n}): Promise<{\n parseResult: {\n elements: LocateResultElement[];\n errors?: string[];\n };\n rect?: Rect;\n rawResponse: string;\n usage?: AIUsageInfo;\n reasoning_content?: string;\n}> {\n const { context, targetElementDescription, modelConfig } = options;\n const { modelFamily } = modelConfig;\n const screenshotBase64 = context.screenshot.base64;\n\n assert(\n targetElementDescription,\n 'cannot find the target element description',\n );\n const targetElementDescriptionText = extraTextFromUserPrompt(\n targetElementDescription,\n );\n const userInstructionPrompt = findElementPrompt(targetElementDescriptionText);\n const systemPrompt = isAutoGLM(modelFamily)\n ? 
getAutoGLMLocatePrompt(modelFamily)\n : systemPromptToLocateElement(modelFamily);\n\n let imagePayload = screenshotBase64;\n let imageWidth = context.shotSize.width;\n let imageHeight = context.shotSize.height;\n let originalImageWidth = imageWidth;\n let originalImageHeight = imageHeight;\n\n if (options.searchConfig) {\n assert(\n options.searchConfig.rect,\n 'searchArea is provided but its rect cannot be found. Failed to locate element',\n );\n assert(\n options.searchConfig.imageBase64,\n 'searchArea is provided but its imageBase64 cannot be found. Failed to locate element',\n );\n\n imagePayload = options.searchConfig.imageBase64;\n imageWidth = options.searchConfig.rect?.width;\n imageHeight = options.searchConfig.rect?.height;\n originalImageWidth = imageWidth;\n originalImageHeight = imageHeight;\n } else if (modelFamily === 'qwen2.5-vl') {\n const paddedResult = await paddingToMatchBlockByBase64(imagePayload);\n imageWidth = paddedResult.width;\n imageHeight = paddedResult.height;\n imagePayload = paddedResult.imageBase64;\n }\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: isAutoGLM(modelFamily)\n ? 
`Tap: ${userInstructionPrompt}`\n : userInstructionPrompt,\n },\n ],\n },\n ];\n\n if (typeof targetElementDescription !== 'string') {\n const addOns = await promptsToChatParam({\n images: targetElementDescription.images,\n convertHttpImage2Base64: targetElementDescription.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n if (isAutoGLM(modelFamily)) {\n const { content: rawResponseContent, usage } =\n await callAIWithStringResponse(msgs, modelConfig, {\n abortSignal: options.abortSignal,\n });\n\n debugInspect('auto-glm rawResponse:', rawResponseContent);\n\n const parsed = parseAutoGLMLocateResponse(rawResponseContent);\n\n debugInspect('auto-glm thinking:', parsed.think);\n debugInspect('auto-glm coordinates:', parsed.coordinates);\n\n let resRect: Rect | undefined;\n let matchedElements: LocateResultElement[] = [];\n let errors: string[] = [];\n\n if (parsed.error || !parsed.coordinates) {\n errors = [parsed.error || 'Failed to parse auto-glm response'];\n debugInspect('auto-glm parse error:', errors[0]);\n } else {\n const { x, y } = parsed.coordinates;\n\n debugInspect('auto-glm coordinates [0-999]:', { x, y });\n\n // Convert auto-glm coordinates [0,999] to pixel bbox\n // Map from [0,999] to pixel coordinates\n const pixelX = Math.round((x * imageWidth) / 1000);\n const pixelY = Math.round((y * imageHeight) / 1000);\n\n debugInspect('auto-glm pixel coordinates:', { pixelX, pixelY });\n\n // Apply offset if searching in a cropped area\n let finalX = pixelX;\n let finalY = pixelY;\n if (options.searchConfig?.rect) {\n finalX += options.searchConfig.rect.left;\n finalY += options.searchConfig.rect.top;\n }\n\n const element: LocateResultElement = generateElementByPoint(\n [finalX, finalY],\n targetElementDescriptionText as string,\n );\n\n resRect = element.rect;\n debugInspect('auto-glm resRect:', resRect);\n\n if (element) {\n matchedElements = [element];\n }\n }\n\n return {\n rect: resRect,\n parseResult: {\n elements: matchedElements,\n 
errors,\n },\n rawResponse: rawResponseContent,\n usage,\n reasoning_content: parsed.think,\n };\n }\n\n let res: Awaited<\n ReturnType<\n typeof callAIWithObjectResponse<AIElementResponse | [number, number]>\n >\n >;\n try {\n res = await callAIWithObjectResponse<AIElementResponse | [number, number]>(\n msgs,\n modelConfig,\n { abortSignal: options.abortSignal },\n );\n } catch (callError) {\n // Return error with usage and rawResponse if available\n const errorMessage =\n callError instanceof Error ? callError.message : String(callError);\n const rawResponse =\n callError instanceof AIResponseParseError\n ? callError.rawResponse\n : errorMessage;\n const usage =\n callError instanceof AIResponseParseError ? callError.usage : undefined;\n return {\n rect: undefined,\n parseResult: {\n elements: [],\n errors: [`AI call error: ${errorMessage}`],\n },\n rawResponse,\n usage,\n reasoning_content: undefined,\n };\n }\n\n const rawResponse = JSON.stringify(res.content);\n\n let resRect: Rect | undefined;\n let matchedElements: LocateResultElement[] = [];\n let errors: string[] | undefined =\n 'errors' in res.content ? res.content.errors : [];\n try {\n if (\n 'bbox' in res.content &&\n Array.isArray(res.content.bbox) &&\n res.content.bbox.length >= 1\n ) {\n resRect = adaptBboxToRect(\n res.content.bbox,\n imageWidth,\n imageHeight,\n options.searchConfig?.rect?.left,\n options.searchConfig?.rect?.top,\n originalImageWidth,\n originalImageHeight,\n modelFamily,\n options.searchConfig?.scale,\n );\n\n debugInspect('resRect', resRect);\n\n const element: LocateResultElement = generateElementByRect(\n resRect,\n targetElementDescriptionText as string,\n );\n errors = [];\n\n if (element) {\n matchedElements = [element];\n }\n }\n } catch (e) {\n const msg =\n e instanceof Error\n ? 
`Failed to parse bbox: ${e.message}`\n : 'unknown error in locate';\n if (!errors || errors?.length === 0) {\n errors = [msg];\n } else {\n errors.push(`(${msg})`);\n }\n }\n\n return {\n rect: resRect,\n parseResult: {\n elements: matchedElements as LocateResultElement[],\n errors: errors as string[],\n },\n rawResponse,\n usage: res.usage,\n reasoning_content: res.reasoning_content,\n };\n}\n\nexport async function AiLocateSection(options: {\n context: UIContext;\n sectionDescription: TUserPrompt;\n modelConfig: IModelConfig;\n abortSignal?: AbortSignal;\n}): Promise<{\n rect?: Rect;\n imageBase64?: string;\n scale?: number;\n error?: string;\n rawResponse: string;\n usage?: AIUsageInfo;\n}> {\n const { context, sectionDescription, modelConfig } = options;\n const { modelFamily } = modelConfig;\n const screenshotBase64 = context.screenshot.base64;\n\n const systemPrompt = systemPromptToLocateSection(modelFamily);\n const sectionLocatorInstructionText = sectionLocatorInstruction(\n extraTextFromUserPrompt(sectionDescription),\n );\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: screenshotBase64,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: sectionLocatorInstructionText,\n },\n ],\n },\n ];\n\n if (typeof sectionDescription !== 'string') {\n const addOns = await promptsToChatParam({\n images: sectionDescription.images,\n convertHttpImage2Base64: sectionDescription.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n let result: Awaited<\n ReturnType<typeof callAIWithObjectResponse<AISectionLocatorResponse>>\n >;\n try {\n result = await callAIWithObjectResponse<AISectionLocatorResponse>(\n msgs,\n modelConfig,\n { abortSignal: options.abortSignal },\n );\n } catch (callError) {\n // Return error with usage and rawResponse if available\n const errorMessage =\n callError instanceof Error ? 
callError.message : String(callError);\n const rawResponse =\n callError instanceof AIResponseParseError\n ? callError.rawResponse\n : errorMessage;\n const usage =\n callError instanceof AIResponseParseError ? callError.usage : undefined;\n return {\n rect: undefined,\n imageBase64: undefined,\n error: `AI call error: ${errorMessage}`,\n rawResponse,\n usage,\n };\n }\n\n let sectionRect: Rect | undefined;\n const sectionBbox = result.content.bbox;\n if (sectionBbox) {\n const targetRect = adaptBboxToRect(\n sectionBbox,\n context.shotSize.width,\n context.shotSize.height,\n 0,\n 0,\n context.shotSize.width,\n context.shotSize.height,\n modelFamily,\n );\n debugSection('original targetRect %j', targetRect);\n\n const referenceBboxList = result.content.references_bbox || [];\n debugSection('referenceBboxList %j', referenceBboxList);\n\n const referenceRects = referenceBboxList\n .filter((bbox) => Array.isArray(bbox))\n .map((bbox) => {\n return adaptBboxToRect(\n bbox,\n context.shotSize.width,\n context.shotSize.height,\n 0,\n 0,\n context.shotSize.width,\n context.shotSize.height,\n modelFamily,\n );\n });\n debugSection('referenceRects %j', referenceRects);\n\n // merge the sectionRect and referenceRects\n const mergedRect = mergeRects([targetRect, ...referenceRects]);\n debugSection('mergedRect %j', mergedRect);\n\n sectionRect = expandSearchArea(mergedRect, context.shotSize);\n debugSection('expanded sectionRect %j', sectionRect);\n }\n\n let imageBase64 = screenshotBase64;\n let scale: number | undefined;\n\n if (sectionRect) {\n const originalWidth = sectionRect.width;\n const originalHeight = sectionRect.height;\n\n const croppedResult = await cropByRect(\n screenshotBase64,\n sectionRect,\n modelFamily === 'qwen2.5-vl',\n );\n\n const scaleRatio = 2;\n const scaledResult = await scaleImage(\n croppedResult.imageBase64,\n scaleRatio,\n );\n\n imageBase64 = scaledResult.imageBase64;\n scale = scaleRatio;\n sectionRect.width = scaledResult.width;\n 
sectionRect.height = scaledResult.height;\n\n debugSection(\n 'scaled sectionRect from %dx%d to %dx%d (scale=%d)',\n originalWidth,\n originalHeight,\n sectionRect.width,\n sectionRect.height,\n scale,\n );\n }\n\n return {\n rect: sectionRect,\n imageBase64,\n scale,\n error: result.content.error,\n rawResponse: JSON.stringify(result.content),\n usage: result.usage,\n };\n}\n\nexport async function AiExtractElementInfo<T>(options: {\n dataQuery: string | Record<string, string>;\n multimodalPrompt?: TMultimodalPrompt;\n context: UIContext;\n pageDescription?: string;\n extractOption?: ServiceExtractOption;\n modelConfig: IModelConfig;\n}) {\n const { dataQuery, context, extractOption, multimodalPrompt, modelConfig } =\n options;\n const systemPrompt = systemPromptToExtract();\n const screenshotBase64 = context.screenshot.base64;\n\n const extractDataPromptText = extractDataQueryPrompt(\n options.pageDescription || '',\n dataQuery,\n );\n\n const userContent: ChatCompletionUserMessageParam['content'] = [];\n\n if (extractOption?.screenshotIncluded !== false) {\n userContent.push({\n type: 'image_url',\n image_url: {\n url: screenshotBase64,\n detail: 'high',\n },\n });\n }\n\n userContent.push({\n type: 'text',\n text: extractDataPromptText,\n });\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: userContent,\n },\n ];\n\n if (multimodalPrompt) {\n const addOns = await promptsToChatParam({\n images: multimodalPrompt.images,\n convertHttpImage2Base64: multimodalPrompt.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n const {\n content: rawResponse,\n usage,\n reasoning_content,\n } = await callAI(msgs, modelConfig);\n\n // Parse XML response to JSON object\n let parseResult: AIDataExtractionResponse<T>;\n try {\n parseResult = parseXMLExtractionResponse<T>(rawResponse);\n } catch (parseError) {\n // Throw AIResponseParseError with usage and rawResponse preserved\n const errorMessage =\n parseError 
instanceof Error ? parseError.message : String(parseError);\n throw new AIResponseParseError(\n `XML parse error: ${errorMessage}`,\n rawResponse,\n usage,\n );\n }\n\n return {\n parseResult,\n rawResponse,\n usage,\n reasoning_content,\n };\n}\n\nexport async function AiJudgeOrderSensitive(\n description: string,\n callAIFn: typeof callAIWithObjectResponse<{ isOrderSensitive: boolean }>,\n modelConfig: IModelConfig,\n): Promise<{\n isOrderSensitive: boolean;\n usage?: AIUsageInfo;\n}> {\n const systemPrompt = systemPromptToJudgeOrderSensitive();\n const userPrompt = orderSensitiveJudgePrompt(description);\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: userPrompt,\n },\n ];\n\n const result = await callAIFn(msgs, modelConfig);\n\n return {\n isOrderSensitive: result.content.isOrderSensitive ?? false,\n usage: result.usage,\n };\n}\n"],"names":["debugInspect","getDebug","debugSection","extraTextFromUserPrompt","prompt","promptsToChatParam","multimodalPrompt","msgs","item","base64","preProcessImageUrl","AiLocateElement","options","context","targetElementDescription","modelConfig","modelFamily","screenshotBase64","assert","targetElementDescriptionText","userInstructionPrompt","findElementPrompt","systemPrompt","isAutoGLM","getAutoGLMLocatePrompt","systemPromptToLocateElement","imagePayload","imageWidth","imageHeight","originalImageWidth","originalImageHeight","paddedResult","paddingToMatchBlockByBase64","addOns","rawResponseContent","usage","callAIWithStringResponse","parsed","parseAutoGLMLocateResponse","resRect","matchedElements","errors","x","y","pixelX","Math","pixelY","finalX","finalY","element","generateElementByPoint","res","callAIWithObjectResponse","callError","errorMessage","Error","String","rawResponse","AIResponseParseError","undefined","JSON","Array","adaptBboxToRect","generateElementByRect","e","msg","AiLocateSection","sectionDescription","systemPromptToLocateSection","sectionLocatorInstructionText"
,"sectionLocatorInstruction","result","sectionRect","sectionBbox","targetRect","referenceBboxList","referenceRects","bbox","mergedRect","mergeRects","expandSearchArea","imageBase64","scale","originalWidth","originalHeight","croppedResult","cropByRect","scaleRatio","scaledResult","scaleImage","AiExtractElementInfo","dataQuery","extractOption","systemPromptToExtract","extractDataPromptText","extractDataQueryPrompt","userContent","reasoning_content","callAI","parseResult","parseXMLExtractionResponse","parseError","AiJudgeOrderSensitive","description","callAIFn","systemPromptToJudgeOrderSensitive","userPrompt","orderSensitiveJudgePrompt"],"mappings":";;;;;;;;;;;;;AA6DA,MAAMA,eAAeC,SAAS;AAC9B,MAAMC,eAAeD,SAAS;AAE9B,MAAME,0BAA0B,CAACC;IAC/B,IAAI,AAAkB,YAAlB,OAAOA,QACT,OAAOA;IAEP,OAAOA,OAAO,MAAM;AAExB;AAEA,MAAMC,qBAAqB,OACzBC;IAEA,MAAMC,OAAyC,EAAE;IACjD,IAAID,kBAAkB,QAAQ,QAAQ;QACpCC,KAAK,IAAI,CAAC;YACR,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM;gBACR;aACD;QACH;QAEA,KAAK,MAAMC,QAAQF,iBAAiB,MAAM,CAAE;YAC1C,MAAMG,SAAS,MAAMC,mBACnBF,KAAK,GAAG,EACR,CAAC,CAACF,iBAAiB,uBAAuB;YAG5CC,KAAK,IAAI,CAAC;gBACR,MAAM;gBACN,SAAS;oBACP;wBACE,MAAM;wBACN,MAAM,CAAC,mCAAmC,EAAEC,KAAK,IAAI,CAAC,EAAE,CAAC;oBAC3D;iBACD;YACH;YAEAD,KAAK,IAAI,CAAC;gBACR,MAAM;gBACN,SAAS;oBACP;wBACE,MAAM;wBACN,WAAW;4BACT,KAAKE;4BACL,QAAQ;wBACV;oBACF;iBACD;YACH;QACF;IACF;IACA,OAAOF;AACT;AAEO,eAAeI,gBAAgBC,OAMrC;IAUC,MAAM,EAAEC,OAAO,EAAEC,wBAAwB,EAAEC,WAAW,EAAE,GAAGH;IAC3D,MAAM,EAAEI,WAAW,EAAE,GAAGD;IACxB,MAAME,mBAAmBJ,QAAQ,UAAU,CAAC,MAAM;IAElDK,OACEJ,0BACA;IAEF,MAAMK,+BAA+BhB,wBACnCW;IAEF,MAAMM,wBAAwBC,kBAAkBF;IAChD,MAAMG,eAAeC,UAAUP,eAC3BQ,uBAAuBR,eACvBS,4BAA4BT;IAEhC,IAAIU,eAAeT;IACnB,IAAIU,aAAad,QAAQ,QAAQ,CAAC,KAAK;IACvC,IAAIe,cAAcf,QAAQ,QAAQ,CAAC,MAAM;IACzC,IAAIgB,qBAAqBF;IACzB,IAAIG,sBAAsBF;IAE1B,IAAIhB,QAAQ,YAAY,EAAE;QACxBM,OACEN,QAAQ,YAAY,CAAC,IAAI,EACzB;QAEFM,OACEN,QAAQ,YAAY,CAAC,WAAW,EAChC;QAGFc,eAAed,QAAQ,YAAY,CAAC,WAAW;QAC/Ce,aAAaf,QAAQ,YAAY,CAAC,IAAI,EAAE;QACxCgB,cAAchB,QAAQ,YAAY,CAAC,IAAI,EAAE;QACzCiB,qBAAqB
F;QACrBG,sBAAsBF;IACxB,OAAO,IAAIZ,AAAgB,iBAAhBA,aAA8B;QACvC,MAAMe,eAAe,MAAMC,4BAA4BN;QACvDC,aAAaI,aAAa,KAAK;QAC/BH,cAAcG,aAAa,MAAM;QACjCL,eAAeK,aAAa,WAAW;IACzC;IAEA,MAAMxB,OAAe;QACnB;YAAE,MAAM;YAAU,SAASe;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKI;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAMH,UAAUP,eACZ,CAAC,KAAK,EAAEI,uBAAuB,GAC/BA;gBACN;aACD;QACH;KACD;IAED,IAAI,AAAoC,YAApC,OAAON,0BAAuC;QAChD,MAAMmB,SAAS,MAAM5B,mBAAmB;YACtC,QAAQS,yBAAyB,MAAM;YACvC,yBAAyBA,yBAAyB,uBAAuB;QAC3E;QACAP,KAAK,IAAI,IAAI0B;IACf;IAEA,IAAIV,UAAUP,cAAc;QAC1B,MAAM,EAAE,SAASkB,kBAAkB,EAAEC,KAAK,EAAE,GAC1C,MAAMC,yBAAyB7B,MAAMQ,aAAa;YAChD,aAAaH,QAAQ,WAAW;QAClC;QAEFZ,aAAa,yBAAyBkC;QAEtC,MAAMG,SAASC,2BAA2BJ;QAE1ClC,aAAa,sBAAsBqC,OAAO,KAAK;QAC/CrC,aAAa,yBAAyBqC,OAAO,WAAW;QAExD,IAAIE;QACJ,IAAIC,kBAAyC,EAAE;QAC/C,IAAIC,SAAmB,EAAE;QAEzB,IAAIJ,OAAO,KAAK,IAAI,CAACA,OAAO,WAAW,EAAE;YACvCI,SAAS;gBAACJ,OAAO,KAAK,IAAI;aAAoC;YAC9DrC,aAAa,yBAAyByC,MAAM,CAAC,EAAE;QACjD,OAAO;YACL,MAAM,EAAEC,CAAC,EAAEC,CAAC,EAAE,GAAGN,OAAO,WAAW;YAEnCrC,aAAa,iCAAiC;gBAAE0C;gBAAGC;YAAE;YAIrD,MAAMC,SAASC,KAAK,KAAK,CAAEH,IAAIf,aAAc;YAC7C,MAAMmB,SAASD,KAAK,KAAK,CAAEF,IAAIf,cAAe;YAE9C5B,aAAa,+BAA+B;gBAAE4C;gBAAQE;YAAO;YAG7D,IAAIC,SAASH;YACb,IAAII,SAASF;YACb,IAAIlC,QAAQ,YAAY,EAAE,MAAM;gBAC9BmC,UAAUnC,QAAQ,YAAY,CAAC,IAAI,CAAC,IAAI;gBACxCoC,UAAUpC,QAAQ,YAAY,CAAC,IAAI,CAAC,GAAG;YACzC;YAEA,MAAMqC,UAA+BC,uBACnC;gBAACH;gBAAQC;aAAO,EAChB7B;YAGFoB,UAAUU,QAAQ,IAAI;YACtBjD,aAAa,qBAAqBuC;YAElC,IAAIU,SACFT,kBAAkB;gBAACS;aAAQ;QAE/B;QAEA,OAAO;YACL,MAAMV;YACN,aAAa;gBACX,UAAUC;gBACVC;YACF;YACA,aAAaP;YACbC;YACA,mBAAmBE,OAAO,KAAK;QACjC;IACF;IAEA,IAAIc;IAKJ,IAAI;QACFA,MAAM,MAAMC,yBACV7C,MACAQ,aACA;YAAE,aAAaH,QAAQ,WAAW;QAAC;IAEvC,EAAE,OAAOyC,WAAW;QAElB,MAAMC,eACJD,qBAAqBE,QAAQF,UAAU,OAAO,GAAGG,OAAOH;QAC1D,MAAMI,cACJJ,qBAAqBK,uBACjBL,UAAU,WAAW,GACrBC;QACN,MAAMnB,QACJkB,qBAAqBK,uBAAuBL,UAAU,KAAK,GAAGM;QAChE,OAAO;YACL,MAAMA;YACN,aAAa;gBACX,UAAU,EAAE;gBACZ,QAAQ;oBAAC,CAAC,eAAe,EAAEL,cAAc;iBAAC;YAC5C;YACAG;YACAtB;YACA,mBAAmBwB;QAC
rB;IACF;IAEA,MAAMF,cAAcG,KAAK,SAAS,CAACT,IAAI,OAAO;IAE9C,IAAIZ;IACJ,IAAIC,kBAAyC,EAAE;IAC/C,IAAIC,SACF,YAAYU,IAAI,OAAO,GAAGA,IAAI,OAAO,CAAC,MAAM,GAAG,EAAE;IACnD,IAAI;QACF,IACE,UAAUA,IAAI,OAAO,IACrBU,MAAM,OAAO,CAACV,IAAI,OAAO,CAAC,IAAI,KAC9BA,IAAI,OAAO,CAAC,IAAI,CAAC,MAAM,IAAI,GAC3B;YACAZ,UAAUuB,gBACRX,IAAI,OAAO,CAAC,IAAI,EAChBxB,YACAC,aACAhB,QAAQ,YAAY,EAAE,MAAM,MAC5BA,QAAQ,YAAY,EAAE,MAAM,KAC5BiB,oBACAC,qBACAd,aACAJ,QAAQ,YAAY,EAAE;YAGxBZ,aAAa,WAAWuC;YAExB,MAAMU,UAA+Bc,sBACnCxB,SACApB;YAEFsB,SAAS,EAAE;YAEX,IAAIQ,SACFT,kBAAkB;gBAACS;aAAQ;QAE/B;IACF,EAAE,OAAOe,GAAG;QACV,MAAMC,MACJD,aAAaT,QACT,CAAC,sBAAsB,EAAES,EAAE,OAAO,EAAE,GACpC;QACN,IAAI,AAACvB,UAAUA,QAAQ,WAAW,GAGhCA,OAAO,IAAI,CAAC,CAAC,CAAC,EAAEwB,IAAI,CAAC,CAAC;aAFtBxB,SAAS;YAACwB;SAAI;IAIlB;IAEA,OAAO;QACL,MAAM1B;QACN,aAAa;YACX,UAAUC;YACV,QAAQC;QACV;QACAgB;QACA,OAAON,IAAI,KAAK;QAChB,mBAAmBA,IAAI,iBAAiB;IAC1C;AACF;AAEO,eAAee,gBAAgBtD,OAKrC;IAQC,MAAM,EAAEC,OAAO,EAAEsD,kBAAkB,EAAEpD,WAAW,EAAE,GAAGH;IACrD,MAAM,EAAEI,WAAW,EAAE,GAAGD;IACxB,MAAME,mBAAmBJ,QAAQ,UAAU,CAAC,MAAM;IAElD,MAAMS,eAAe8C,4BAA4BpD;IACjD,MAAMqD,gCAAgCC,0BACpCnE,wBAAwBgE;IAE1B,MAAM5D,OAAe;QACnB;YAAE,MAAM;YAAU,SAASe;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKL;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAMoD;gBACR;aACD;QACH;KACD;IAED,IAAI,AAA8B,YAA9B,OAAOF,oBAAiC;QAC1C,MAAMlC,SAAS,MAAM5B,mBAAmB;YACtC,QAAQ8D,mBAAmB,MAAM;YACjC,yBAAyBA,mBAAmB,uBAAuB;QACrE;QACA5D,KAAK,IAAI,IAAI0B;IACf;IAEA,IAAIsC;IAGJ,IAAI;QACFA,SAAS,MAAMnB,yBACb7C,MACAQ,aACA;YAAE,aAAaH,QAAQ,WAAW;QAAC;IAEvC,EAAE,OAAOyC,WAAW;QAElB,MAAMC,eACJD,qBAAqBE,QAAQF,UAAU,OAAO,GAAGG,OAAOH;QAC1D,MAAMI,cACJJ,qBAAqBK,uBACjBL,UAAU,WAAW,GACrBC;QACN,MAAMnB,QACJkB,qBAAqBK,uBAAuBL,UAAU,KAAK,GAAGM;QAChE,OAAO;YACL,MAAMA;YACN,aAAaA;YACb,OAAO,CAAC,eAAe,EAAEL,cAAc;YACvCG;YACAtB;QACF;IACF;IAEA,IAAIqC;IACJ,MAAMC,cAAcF,OAAO,OAAO,CAAC,IAAI;IACvC,IAAIE,aAAa;QACf,MAAMC,aAAaZ,gBACjBW,aACA5D,QAAQ,QAAQ,CAAC,KAAK,EACtBA,QAAQ,QAAQ,CAAC,MAAM,EACvB,GACA,GACAA,QAAQ,QAAQ,CAAC,KAAK,EACtBA,QAAQ,QAAQ,CAA
C,MAAM,EACvBG;QAEFd,aAAa,0BAA0BwE;QAEvC,MAAMC,oBAAoBJ,OAAO,OAAO,CAAC,eAAe,IAAI,EAAE;QAC9DrE,aAAa,wBAAwByE;QAErC,MAAMC,iBAAiBD,kBACpB,MAAM,CAAC,CAACE,OAAShB,MAAM,OAAO,CAACgB,OAC/B,GAAG,CAAC,CAACA,OACGf,gBACLe,MACAhE,QAAQ,QAAQ,CAAC,KAAK,EACtBA,QAAQ,QAAQ,CAAC,MAAM,EACvB,GACA,GACAA,QAAQ,QAAQ,CAAC,KAAK,EACtBA,QAAQ,QAAQ,CAAC,MAAM,EACvBG;QAGNd,aAAa,qBAAqB0E;QAGlC,MAAME,aAAaC,WAAW;YAACL;eAAeE;SAAe;QAC7D1E,aAAa,iBAAiB4E;QAE9BN,cAAcQ,iBAAiBF,YAAYjE,QAAQ,QAAQ;QAC3DX,aAAa,2BAA2BsE;IAC1C;IAEA,IAAIS,cAAchE;IAClB,IAAIiE;IAEJ,IAAIV,aAAa;QACf,MAAMW,gBAAgBX,YAAY,KAAK;QACvC,MAAMY,iBAAiBZ,YAAY,MAAM;QAEzC,MAAMa,gBAAgB,MAAMC,WAC1BrE,kBACAuD,aACAxD,AAAgB,iBAAhBA;QAGF,MAAMuE,aAAa;QACnB,MAAMC,eAAe,MAAMC,WACzBJ,cAAc,WAAW,EACzBE;QAGFN,cAAcO,aAAa,WAAW;QACtCN,QAAQK;QACRf,YAAY,KAAK,GAAGgB,aAAa,KAAK;QACtChB,YAAY,MAAM,GAAGgB,aAAa,MAAM;QAExCtF,aACE,qDACAiF,eACAC,gBACAZ,YAAY,KAAK,EACjBA,YAAY,MAAM,EAClBU;IAEJ;IAEA,OAAO;QACL,MAAMV;QACNS;QACAC;QACA,OAAOX,OAAO,OAAO,CAAC,KAAK;QAC3B,aAAaX,KAAK,SAAS,CAACW,OAAO,OAAO;QAC1C,OAAOA,OAAO,KAAK;IACrB;AACF;AAEO,eAAemB,qBAAwB9E,OAO7C;IACC,MAAM,EAAE+E,SAAS,EAAE9E,OAAO,EAAE+E,aAAa,EAAEtF,gBAAgB,EAAES,WAAW,EAAE,GACxEH;IACF,MAAMU,eAAeuE;IACrB,MAAM5E,mBAAmBJ,QAAQ,UAAU,CAAC,MAAM;IAElD,MAAMiF,wBAAwBC,uBAC5BnF,QAAQ,eAAe,IAAI,IAC3B+E;IAGF,MAAMK,cAAyD,EAAE;IAEjE,IAAIJ,eAAe,uBAAuB,OACxCI,YAAY,IAAI,CAAC;QACf,MAAM;QACN,WAAW;YACT,KAAK/E;YACL,QAAQ;QACV;IACF;IAGF+E,YAAY,IAAI,CAAC;QACf,MAAM;QACN,MAAMF;IACR;IAEA,MAAMvF,OAAe;QACnB;YAAE,MAAM;YAAU,SAASe;QAAa;QACxC;YACE,MAAM;YACN,SAAS0E;QACX;KACD;IAED,IAAI1F,kBAAkB;QACpB,MAAM2B,SAAS,MAAM5B,mBAAmB;YACtC,QAAQC,iBAAiB,MAAM;YAC/B,yBAAyBA,iBAAiB,uBAAuB;QACnE;QACAC,KAAK,IAAI,IAAI0B;IACf;IAEA,MAAM,EACJ,SAASwB,WAAW,EACpBtB,KAAK,EACL8D,iBAAiB,EAClB,GAAG,MAAMC,OAAO3F,MAAMQ;IAGvB,IAAIoF;IACJ,IAAI;QACFA,cAAcC,2BAA8B3C;IAC9C,EAAE,OAAO4C,YAAY;QAEnB,MAAM/C,eACJ+C,sBAAsB9C,QAAQ8C,WAAW,OAAO,GAAG7C,OAAO6C;QAC5D,MAAM,IAAI3C,qBACR,CAAC,iBAAiB,EAAEJ,cAAc,EAClCG,aACAtB;IAEJ;IAEA,OAAO;QACLgE;QACA1C;QACAtB;QACA8D;IACF;AACF;AAEO,eAAeK,sBACpBC,WAAmB,EACnBC,QAAwE,
EACxEzF,WAAyB;IAKzB,MAAMO,eAAemF;IACrB,MAAMC,aAAaC,0BAA0BJ;IAE7C,MAAMhG,OAAe;QACnB;YAAE,MAAM;YAAU,SAASe;QAAa;QACxC;YACE,MAAM;YACN,SAASoF;QACX;KACD;IAED,MAAMnC,SAAS,MAAMiC,SAASjG,MAAMQ;IAEpC,OAAO;QACL,kBAAkBwD,OAAO,OAAO,CAAC,gBAAgB,IAAI;QACrD,OAAOA,OAAO,KAAK;IACrB;AACF"}
|
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
import { paddingToMatchBlockByBase64 } from "@midscene/shared/img";
|
|
2
|
+
import { getDebug } from "@midscene/shared/logger";
|
|
3
|
+
import { assert } from "@midscene/shared/utils";
|
|
4
|
+
import { buildYamlFlowFromPlans, fillBboxParam, findAllMidsceneLocatorField } from "../common.mjs";
|
|
5
|
+
import { systemPromptToTaskPlanning } from "./prompt/llm-planning.mjs";
|
|
6
|
+
import { extractXMLTag, parseMarkFinishedIndexes, parseSubGoalsFromXML } from "./prompt/util.mjs";
|
|
7
|
+
import { AIResponseParseError, callAI, safeParseJson } from "./service-caller/index.mjs";
|
|
8
|
+
// Namespaced debug logger for the planning module; silent unless the
// 'planning' debug namespace is enabled.
const debug = getDebug('planning');
// Second logger for warnings; passes `{ console: true }` — presumably this
// mirrors output to the console regardless of the debug namespace, but the
// option's semantics live in @midscene/shared/logger — TODO confirm.
const warnLog = getDebug('planning', {
    console: true
});
|
|
12
|
+
/**
 * Parse the XML-style planning response emitted by the model into a plain
 * planning object.
 *
 * Recognized tags: <thought>, <memory>, <log>, <error>, <action-type>,
 * <action-param-json>, <complete success="true|false">, <update-plan-content>
 * and <mark-sub-goal-done>. Optional fields are only present on the result
 * when they carry a value; `log` and `action` are always present.
 *
 * @param {string} xmlString - raw model output.
 * @param {string} modelFamily - forwarded to `safeParseJson` for
 *   model-specific JSON repair.
 * @returns {object} structured planning result.
 * @throws {Error} when <action-param-json> exists but cannot be parsed.
 */
function parseXMLPlanningResponse(xmlString, modelFamily) {
    // Small local shorthand over the shared tag extractor.
    const tag = (name) => extractXMLTag(xmlString, name);

    const thought = tag('thought');
    const memory = tag('memory');
    const log = tag('log') || '';
    const error = tag('error');
    const actionType = tag('action-type');
    const actionParamStr = tag('action-param-json');

    // <complete success="true|false">optional message</complete>
    let finalizeMessage;
    let finalizeSuccess;
    const completeMatch = xmlString.match(
        /<complete\s+success="(true|false)">([\s\S]*?)<\/complete>/i
    );
    if (completeMatch) {
        finalizeSuccess = completeMatch[1] === 'true';
        // An empty/whitespace-only message collapses to undefined.
        finalizeMessage = completeMatch[2]?.trim() || void 0;
    }

    const updatePlanContent = tag('update-plan-content');
    const markSubGoalDone = tag('mark-sub-goal-done');
    const updateSubGoals = updatePlanContent
        ? parseSubGoalsFromXML(updatePlanContent)
        : void 0;
    const markFinishedIndexes = markSubGoalDone
        ? parseMarkFinishedIndexes(markSubGoalDone)
        : void 0;

    // Build the action; a literal "null" action type means "no action".
    let action = null;
    if (actionType && actionType.toLowerCase() !== 'null') {
        let param;
        if (actionParamStr) {
            try {
                param = safeParseJson(actionParamStr, modelFamily);
            } catch (e) {
                throw new Error(`Failed to parse action-param-json: ${e}`);
            }
        }
        action = { type: actionType.trim() };
        if (param !== void 0) {
            action.param = param;
        }
    }

    // Assemble the result, keeping the historical key insertion order and
    // omitting optional keys that have no value.
    const result = {};
    if (thought) result.thought = thought;
    if (memory) result.memory = memory;
    result.log = log;
    if (error) result.error = error;
    result.action = action;
    if (finalizeMessage !== void 0) result.finalizeMessage = finalizeMessage;
    if (finalizeSuccess !== void 0) result.finalizeSuccess = finalizeSuccess;
    if (updateSubGoals?.length) result.updateSubGoals = updateSubGoals;
    if (markFinishedIndexes?.length) {
        result.markFinishedIndexes = markFinishedIndexes;
    }
    return result;
}
|
|
73
|
+
async function plan(userInstruction, opts) {
|
|
74
|
+
const { context, modelConfig, conversationHistory } = opts;
|
|
75
|
+
const { shotSize } = context;
|
|
76
|
+
const screenshotBase64 = context.screenshot.base64;
|
|
77
|
+
const { modelFamily } = modelConfig;
|
|
78
|
+
const includeSubGoals = true === opts.deepThink;
|
|
79
|
+
const systemPrompt = await systemPromptToTaskPlanning({
|
|
80
|
+
actionSpace: opts.actionSpace,
|
|
81
|
+
modelFamily,
|
|
82
|
+
includeBbox: opts.includeBbox,
|
|
83
|
+
includeThought: true,
|
|
84
|
+
includeSubGoals
|
|
85
|
+
});
|
|
86
|
+
let imagePayload = screenshotBase64;
|
|
87
|
+
let imageWidth = shotSize.width;
|
|
88
|
+
let imageHeight = shotSize.height;
|
|
89
|
+
if ('qwen2.5-vl' === modelFamily) {
|
|
90
|
+
const paddedResult = await paddingToMatchBlockByBase64(imagePayload);
|
|
91
|
+
imageWidth = paddedResult.width;
|
|
92
|
+
imageHeight = paddedResult.height;
|
|
93
|
+
imagePayload = paddedResult.imageBase64;
|
|
94
|
+
}
|
|
95
|
+
const actionContext = opts.actionContext ? `<high_priority_knowledge>${opts.actionContext}</high_priority_knowledge>\n` : '';
|
|
96
|
+
const instruction = [
|
|
97
|
+
{
|
|
98
|
+
role: 'user',
|
|
99
|
+
content: [
|
|
100
|
+
{
|
|
101
|
+
type: 'text',
|
|
102
|
+
text: `${actionContext}<user_instruction>${userInstruction}</user_instruction>`
|
|
103
|
+
}
|
|
104
|
+
]
|
|
105
|
+
}
|
|
106
|
+
];
|
|
107
|
+
let latestFeedbackMessage;
|
|
108
|
+
const subGoalsText = includeSubGoals ? conversationHistory.subGoalsToText() : conversationHistory.historicalLogsToText();
|
|
109
|
+
const subGoalsSection = subGoalsText ? `\n\n${subGoalsText}` : '';
|
|
110
|
+
const memoriesText = conversationHistory.memoriesToText();
|
|
111
|
+
const memoriesSection = memoriesText ? `\n\n${memoriesText}` : '';
|
|
112
|
+
if (conversationHistory.pendingFeedbackMessage) {
|
|
113
|
+
latestFeedbackMessage = {
|
|
114
|
+
role: 'user',
|
|
115
|
+
content: [
|
|
116
|
+
{
|
|
117
|
+
type: 'text',
|
|
118
|
+
text: `${conversationHistory.pendingFeedbackMessage}. The previous action has been executed, here is the latest screenshot. Please continue according to the instruction.${memoriesSection}${subGoalsSection}`
|
|
119
|
+
},
|
|
120
|
+
{
|
|
121
|
+
type: 'image_url',
|
|
122
|
+
image_url: {
|
|
123
|
+
url: imagePayload,
|
|
124
|
+
detail: 'high'
|
|
125
|
+
}
|
|
126
|
+
}
|
|
127
|
+
]
|
|
128
|
+
};
|
|
129
|
+
conversationHistory.resetPendingFeedbackMessageIfExists();
|
|
130
|
+
} else latestFeedbackMessage = {
|
|
131
|
+
role: 'user',
|
|
132
|
+
content: [
|
|
133
|
+
{
|
|
134
|
+
type: 'text',
|
|
135
|
+
text: `this is the latest screenshot${memoriesSection}${subGoalsSection}`
|
|
136
|
+
},
|
|
137
|
+
{
|
|
138
|
+
type: 'image_url',
|
|
139
|
+
image_url: {
|
|
140
|
+
url: imagePayload,
|
|
141
|
+
detail: 'high'
|
|
142
|
+
}
|
|
143
|
+
}
|
|
144
|
+
]
|
|
145
|
+
};
|
|
146
|
+
conversationHistory.append(latestFeedbackMessage);
|
|
147
|
+
conversationHistory.compressHistory(50, 20);
|
|
148
|
+
const historyLog = conversationHistory.snapshot(opts.imagesIncludeCount);
|
|
149
|
+
const msgs = [
|
|
150
|
+
{
|
|
151
|
+
role: 'system',
|
|
152
|
+
content: systemPrompt
|
|
153
|
+
},
|
|
154
|
+
...instruction,
|
|
155
|
+
...historyLog
|
|
156
|
+
];
|
|
157
|
+
let { content: rawResponse, usage, reasoning_content } = await callAI(msgs, modelConfig, {
|
|
158
|
+
deepThink: 'unset' === opts.deepThink ? void 0 : opts.deepThink,
|
|
159
|
+
abortSignal: opts.abortSignal
|
|
160
|
+
});
|
|
161
|
+
let planFromAI;
|
|
162
|
+
try {
|
|
163
|
+
try {
|
|
164
|
+
planFromAI = parseXMLPlanningResponse(rawResponse, modelFamily);
|
|
165
|
+
} catch {
|
|
166
|
+
const retry = await callAI(msgs, modelConfig, {
|
|
167
|
+
deepThink: 'unset' === opts.deepThink ? void 0 : opts.deepThink,
|
|
168
|
+
abortSignal: opts.abortSignal
|
|
169
|
+
});
|
|
170
|
+
rawResponse = retry.content;
|
|
171
|
+
usage = retry.usage;
|
|
172
|
+
reasoning_content = retry.reasoning_content;
|
|
173
|
+
planFromAI = parseXMLPlanningResponse(rawResponse, modelFamily);
|
|
174
|
+
}
|
|
175
|
+
if (planFromAI.action && void 0 !== planFromAI.finalizeSuccess) {
|
|
176
|
+
warnLog('Planning response included both an action and <complete>; ignoring <complete> output.');
|
|
177
|
+
planFromAI.finalizeMessage = void 0;
|
|
178
|
+
planFromAI.finalizeSuccess = void 0;
|
|
179
|
+
}
|
|
180
|
+
const actions = planFromAI.action ? [
|
|
181
|
+
planFromAI.action
|
|
182
|
+
] : [];
|
|
183
|
+
let shouldContinuePlanning = true;
|
|
184
|
+
if (void 0 !== planFromAI.finalizeSuccess) {
|
|
185
|
+
debug('task completed via <complete> tag, stop planning');
|
|
186
|
+
shouldContinuePlanning = false;
|
|
187
|
+
if (includeSubGoals) conversationHistory.markAllSubGoalsFinished();
|
|
188
|
+
}
|
|
189
|
+
const returnValue = {
|
|
190
|
+
...planFromAI,
|
|
191
|
+
actions,
|
|
192
|
+
rawResponse,
|
|
193
|
+
usage,
|
|
194
|
+
reasoning_content,
|
|
195
|
+
yamlFlow: buildYamlFlowFromPlans(actions, opts.actionSpace),
|
|
196
|
+
shouldContinuePlanning
|
|
197
|
+
};
|
|
198
|
+
assert(planFromAI, "can't get plans from AI");
|
|
199
|
+
actions.forEach((action)=>{
|
|
200
|
+
const type = action.type;
|
|
201
|
+
const actionInActionSpace = opts.actionSpace.find((action)=>action.name === type);
|
|
202
|
+
debug('actionInActionSpace matched', actionInActionSpace);
|
|
203
|
+
const locateFields = actionInActionSpace ? findAllMidsceneLocatorField(actionInActionSpace.paramSchema) : [];
|
|
204
|
+
debug('locateFields', locateFields);
|
|
205
|
+
locateFields.forEach((field)=>{
|
|
206
|
+
const locateResult = action.param[field];
|
|
207
|
+
if (locateResult && void 0 !== modelFamily) action.param[field] = fillBboxParam(locateResult, imageWidth, imageHeight, modelFamily);
|
|
208
|
+
});
|
|
209
|
+
});
|
|
210
|
+
if (includeSubGoals) {
|
|
211
|
+
if (planFromAI.updateSubGoals?.length) conversationHistory.mergeSubGoals(planFromAI.updateSubGoals);
|
|
212
|
+
if (planFromAI.markFinishedIndexes?.length) for (const index of planFromAI.markFinishedIndexes)conversationHistory.markSubGoalFinished(index);
|
|
213
|
+
if (planFromAI.log) conversationHistory.appendSubGoalLog(planFromAI.log);
|
|
214
|
+
} else if (planFromAI.log) conversationHistory.appendHistoricalLog(planFromAI.log);
|
|
215
|
+
if (planFromAI.memory) conversationHistory.appendMemory(planFromAI.memory);
|
|
216
|
+
conversationHistory.append({
|
|
217
|
+
role: 'assistant',
|
|
218
|
+
content: [
|
|
219
|
+
{
|
|
220
|
+
type: 'text',
|
|
221
|
+
text: rawResponse
|
|
222
|
+
}
|
|
223
|
+
]
|
|
224
|
+
});
|
|
225
|
+
return returnValue;
|
|
226
|
+
} catch (parseError) {
|
|
227
|
+
const errorMessage = parseError instanceof Error ? parseError.message : String(parseError);
|
|
228
|
+
throw new AIResponseParseError(`XML parse error: ${errorMessage}`, rawResponse, usage);
|
|
229
|
+
}
|
|
230
|
+
}
|
|
231
|
+
export { parseXMLPlanningResponse, plan };
|
|
232
|
+
|
|
233
|
+
//# sourceMappingURL=llm-planning.mjs.map
|