@midscene/core 1.2.1-beta-20260113073450.0 → 1.2.1-beta-20260114072539.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/es/agent/agent.mjs +19 -15
- package/dist/es/agent/agent.mjs.map +1 -1
- package/dist/es/agent/tasks.mjs +33 -17
- package/dist/es/agent/tasks.mjs.map +1 -1
- package/dist/es/agent/utils.mjs +1 -1
- package/dist/es/ai-model/auto-glm/actions.mjs +217 -0
- package/dist/es/ai-model/auto-glm/actions.mjs.map +1 -0
- package/dist/es/ai-model/auto-glm/index.mjs +5 -0
- package/dist/es/ai-model/auto-glm/parser.mjs +239 -0
- package/dist/es/ai-model/auto-glm/parser.mjs.map +1 -0
- package/dist/es/ai-model/auto-glm/planning.mjs +63 -0
- package/dist/es/ai-model/auto-glm/planning.mjs.map +1 -0
- package/dist/es/ai-model/auto-glm/prompt.mjs +222 -0
- package/dist/es/ai-model/auto-glm/prompt.mjs.map +1 -0
- package/dist/es/ai-model/auto-glm/util.mjs +22 -0
- package/dist/es/ai-model/auto-glm/util.mjs.map +1 -0
- package/dist/es/ai-model/index.mjs +2 -1
- package/dist/es/ai-model/inspect.mjs +68 -3
- package/dist/es/ai-model/inspect.mjs.map +1 -1
- package/dist/es/ai-model/latest-locate-recorder.mjs +29 -0
- package/dist/es/ai-model/latest-locate-recorder.mjs.map +1 -0
- package/dist/es/ai-model/llm-planning.mjs +3 -12
- package/dist/es/ai-model/llm-planning.mjs.map +1 -1
- package/dist/es/ai-model/prompt/llm-planning.mjs +8 -40
- package/dist/es/ai-model/prompt/llm-planning.mjs.map +1 -1
- package/dist/es/ai-model/service-caller/index.mjs +5 -0
- package/dist/es/ai-model/service-caller/index.mjs.map +1 -1
- package/dist/es/ai-model/ui-tars-planning.mjs +42 -30
- package/dist/es/ai-model/ui-tars-planning.mjs.map +1 -1
- package/dist/es/common.mjs +7 -15
- package/dist/es/common.mjs.map +1 -1
- package/dist/es/device/index.mjs +3 -28
- package/dist/es/device/index.mjs.map +1 -1
- package/dist/es/service/index.mjs +5 -0
- package/dist/es/service/index.mjs.map +1 -1
- package/dist/es/types.mjs.map +1 -1
- package/dist/es/utils.mjs +2 -2
- package/dist/lib/agent/agent.js +18 -14
- package/dist/lib/agent/agent.js.map +1 -1
- package/dist/lib/agent/tasks.js +32 -16
- package/dist/lib/agent/tasks.js.map +1 -1
- package/dist/lib/agent/utils.js +1 -1
- package/dist/lib/ai-model/auto-glm/actions.js +251 -0
- package/dist/lib/ai-model/auto-glm/actions.js.map +1 -0
- package/dist/lib/ai-model/auto-glm/index.js +59 -0
- package/dist/lib/ai-model/auto-glm/index.js.map +1 -0
- package/dist/lib/ai-model/auto-glm/parser.js +282 -0
- package/dist/lib/ai-model/auto-glm/parser.js.map +1 -0
- package/dist/lib/ai-model/auto-glm/planning.js +97 -0
- package/dist/lib/ai-model/auto-glm/planning.js.map +1 -0
- package/dist/lib/ai-model/auto-glm/prompt.js +259 -0
- package/dist/lib/ai-model/auto-glm/prompt.js.map +1 -0
- package/dist/lib/ai-model/auto-glm/util.js +62 -0
- package/dist/lib/ai-model/auto-glm/util.js.map +1 -0
- package/dist/lib/ai-model/index.js +15 -11
- package/dist/lib/ai-model/inspect.js +67 -2
- package/dist/lib/ai-model/inspect.js.map +1 -1
- package/dist/lib/ai-model/latest-locate-recorder.js +63 -0
- package/dist/lib/ai-model/latest-locate-recorder.js.map +1 -0
- package/dist/lib/ai-model/llm-planning.js +2 -11
- package/dist/lib/ai-model/llm-planning.js.map +1 -1
- package/dist/lib/ai-model/prompt/llm-planning.js +8 -40
- package/dist/lib/ai-model/prompt/llm-planning.js.map +1 -1
- package/dist/lib/ai-model/service-caller/index.js +5 -0
- package/dist/lib/ai-model/service-caller/index.js.map +1 -1
- package/dist/lib/ai-model/ui-tars-planning.js +42 -30
- package/dist/lib/ai-model/ui-tars-planning.js.map +1 -1
- package/dist/lib/common.js +6 -20
- package/dist/lib/common.js.map +1 -1
- package/dist/lib/device/index.js +15 -52
- package/dist/lib/device/index.js.map +1 -1
- package/dist/lib/service/index.js +5 -0
- package/dist/lib/service/index.js.map +1 -1
- package/dist/lib/types.js.map +1 -1
- package/dist/lib/utils.js +2 -2
- package/dist/types/agent/agent.d.ts +15 -4
- package/dist/types/agent/tasks.d.ts +4 -4
- package/dist/types/ai-model/auto-glm/actions.d.ts +77 -0
- package/dist/types/ai-model/auto-glm/index.d.ts +5 -0
- package/dist/types/ai-model/auto-glm/parser.d.ts +18 -0
- package/dist/types/ai-model/auto-glm/planning.d.ts +9 -0
- package/dist/types/ai-model/auto-glm/prompt.d.ts +27 -0
- package/dist/types/ai-model/auto-glm/util.d.ts +16 -0
- package/dist/types/ai-model/index.d.ts +1 -0
- package/dist/types/ai-model/latest-locate-recorder.d.ts +14 -0
- package/dist/types/common.d.ts +1 -8
- package/dist/types/device/index.d.ts +0 -22
- package/dist/types/types.d.ts +2 -2
- package/package.json +2 -2
package/dist/es/ai-model/index.mjs
@@ -5,7 +5,8 @@ import { generatePlaywrightTest, generatePlaywrightTestStream } from "./prompt/p
 import { generateYamlTest, generateYamlTestStream } from "./prompt/yaml-generator.mjs";
 import { AiExtractElementInfo, AiJudgeOrderSensitive, AiLocateElement, AiLocateSection } from "./inspect.mjs";
 import { plan } from "./llm-planning.mjs";
+import { autoGLMPlanning } from "./auto-glm/planning.mjs";
 import { PointSchema, RectSchema, SizeSchema, TMultimodalPromptSchema, TUserPromptSchema, adaptBboxToRect, dumpActionParam, findAllMidsceneLocatorField, getMidsceneLocationSchema, loadActionParam, parseActionParam } from "../common.mjs";
 import { resizeImageForUiTars, uiTarsPlanning } from "./ui-tars-planning.mjs";
 import { ConversationHistory } from "./conversation-history.mjs";
-export { AiExtractElementInfo, AiJudgeOrderSensitive, AiLocateElement, AiLocateSection, ConversationHistory, PointSchema, RectSchema, SizeSchema, TMultimodalPromptSchema, TUserPromptSchema, adaptBboxToRect, callAI, callAIWithObjectResponse, callAIWithStringResponse, describeUserPage, dumpActionParam, findAllMidsceneLocatorField, generatePlaywrightTest, generatePlaywrightTestStream, generateYamlTest, generateYamlTestStream, getMidsceneLocationSchema, loadActionParam, parseActionParam, plan, resizeImageForUiTars, systemPromptToLocateElement, uiTarsPlanning };
+export { AiExtractElementInfo, AiJudgeOrderSensitive, AiLocateElement, AiLocateSection, ConversationHistory, PointSchema, RectSchema, SizeSchema, TMultimodalPromptSchema, TUserPromptSchema, adaptBboxToRect, autoGLMPlanning, callAI, callAIWithObjectResponse, callAIWithStringResponse, describeUserPage, dumpActionParam, findAllMidsceneLocatorField, generatePlaywrightTest, generatePlaywrightTestStream, generateYamlTest, generateYamlTestStream, getMidsceneLocationSchema, loadActionParam, parseActionParam, plan, resizeImageForUiTars, systemPromptToLocateElement, uiTarsPlanning };
package/dist/es/ai-model/inspect.mjs
@@ -3,11 +3,14 @@ import { cropByRect, paddingToMatchBlockByBase64, preProcessImageUrl } from "@mi
 import { getDebug } from "@midscene/shared/logger";
 import { assert } from "@midscene/shared/utils";
 import { adaptBboxToRect, expandSearchArea, mergeRects } from "../common.mjs";
+import { parseAutoGLMLocateResponse } from "./auto-glm/parser.mjs";
+import { getAutoGLMLocatePrompt } from "./auto-glm/prompt.mjs";
+import { isAutoGLM } from "./auto-glm/util.mjs";
 import { extractDataQueryPrompt, systemPromptToExtract } from "./prompt/extraction.mjs";
 import { findElementPrompt, systemPromptToLocateElement } from "./prompt/llm-locator.mjs";
 import { sectionLocatorInstruction, systemPromptToLocateSection } from "./prompt/llm-section-locator.mjs";
 import { orderSensitiveJudgePrompt, systemPromptToJudgeOrderSensitive } from "./prompt/order-sensitive-judge.mjs";
-import { callAIWithObjectResponse } from "./service-caller/index.mjs";
+import { callAIWithObjectResponse, callAIWithStringResponse } from "./service-caller/index.mjs";
 const debugInspect = getDebug('ai:inspect');
 const debugSection = getDebug('ai:section');
 const extraTextFromUserPrompt = (prompt)=>{
@@ -60,7 +63,7 @@ async function AiLocateElement(options) {
     assert(targetElementDescription, "cannot find the target element description");
     const targetElementDescriptionText = extraTextFromUserPrompt(targetElementDescription);
     const userInstructionPrompt = findElementPrompt(targetElementDescriptionText);
-    const systemPrompt = systemPromptToLocateElement(vlMode);
+    const systemPrompt = isAutoGLM(vlMode) ? getAutoGLMLocatePrompt(vlMode) : systemPromptToLocateElement(vlMode);
     let imagePayload = screenshotBase64;
     let imageWidth = context.size.width;
     let imageHeight = context.size.height;
@@ -97,7 +100,7 @@ async function AiLocateElement(options) {
             },
             {
                 type: 'text',
-                text: userInstructionPrompt
+                text: isAutoGLM(vlMode) ? `Tap: ${userInstructionPrompt}` : userInstructionPrompt
             }
         ]
     }
@@ -109,6 +112,68 @@ async function AiLocateElement(options) {
         });
         msgs.push(...addOns);
     }
+    if (isAutoGLM(vlMode)) {
+        const { content: rawResponseContent, usage } = await callAIWithStringResponse(msgs, modelConfig);
+        debugInspect('auto-glm rawResponse:', rawResponseContent);
+        const parsed = parseAutoGLMLocateResponse(rawResponseContent);
+        debugInspect('auto-glm thinking:', parsed.think);
+        debugInspect('auto-glm coordinates:', parsed.coordinates);
+        let resRect;
+        let matchedElements = [];
+        let errors = [];
+        if (parsed.error || !parsed.coordinates) {
+            errors = [
+                parsed.error || 'Failed to parse auto-glm response'
+            ];
+            debugInspect('auto-glm parse error:', errors[0]);
+        } else {
+            const { x, y } = parsed.coordinates;
+            debugInspect('auto-glm coordinates [0-999]:', {
+                x,
+                y
+            });
+            const pixelX = Math.round(x * imageWidth / 1000);
+            const pixelY = Math.round(y * imageHeight / 1000);
+            debugInspect('auto-glm pixel coordinates:', {
+                pixelX,
+                pixelY
+            });
+            const bboxSize = 10;
+            const x1 = Math.max(pixelX - bboxSize / 2, 0);
+            const y1 = Math.max(pixelY - bboxSize / 2, 0);
+            const x2 = Math.min(pixelX + bboxSize / 2, imageWidth);
+            const y2 = Math.min(pixelY + bboxSize / 2, imageHeight);
+            resRect = {
+                left: x1,
+                top: y1,
+                width: x2 - x1,
+                height: y2 - y1
+            };
+            if (options.searchConfig?.rect) {
+                resRect.left += options.searchConfig.rect.left;
+                resRect.top += options.searchConfig.rect.top;
+            }
+            debugInspect('auto-glm resRect:', resRect);
+            const rectCenter = {
+                x: resRect.left + resRect.width / 2,
+                y: resRect.top + resRect.height / 2
+            };
+            const element = generateElementByPosition(rectCenter, targetElementDescriptionText);
+            if (element) matchedElements = [
+                element
+            ];
+        }
+        return {
+            rect: resRect,
+            parseResult: {
+                elements: matchedElements,
+                errors
+            },
+            rawResponse: rawResponseContent,
+            usage,
+            reasoning_content: parsed.think
+        };
+    }
     const res = await callAIFn(msgs, modelConfig);
     const rawResponse = JSON.stringify(res.content);
     let resRect;
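
A note on the new locate path: auto-glm replies with a single point on a normalized [0, 999] grid rather than a bbox, so the branch above scales the point to pixel space and wraps it in a fixed 10-pixel rect before handing it to the usual element pipeline. A minimal standalone sketch of that mapping (TypeScript; the function name is illustrative, not part of the package API):

interface Rect {
    left: number;
    top: number;
    width: number;
    height: number;
}

// Scale a [0, 999] grid point onto a width x height screenshot and wrap it
// in a bboxSize-pixel rect clamped to the image bounds, mirroring the
// arithmetic in the hunk above.
function autoGLMPointToRect(x: number, y: number, imageWidth: number, imageHeight: number, bboxSize = 10): Rect {
    const pixelX = Math.round((x * imageWidth) / 1000);
    const pixelY = Math.round((y * imageHeight) / 1000);
    const x1 = Math.max(pixelX - bboxSize / 2, 0);
    const y1 = Math.max(pixelY - bboxSize / 2, 0);
    const x2 = Math.min(pixelX + bboxSize / 2, imageWidth);
    const y2 = Math.min(pixelY + bboxSize / 2, imageHeight);
    return { left: x1, top: y1, width: x2 - x1, height: y2 - y1 };
}

// Example: the grid midpoint on a 1280x720 screenshot.
// autoGLMPointToRect(500, 500, 1280, 720) -> { left: 635, top: 355, width: 10, height: 10 }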
package/dist/es/ai-model/inspect.mjs.map
@@ -1 +1 @@
-{"version":3,"file":"ai-model/inspect.mjs","sources":["../../../src/ai-model/inspect.ts"],…}
+{"version":3,"file":"ai-model/inspect.mjs","sources":["../../../src/ai-model/inspect.ts"],…}
package/dist/es/ai-model/latest-locate-recorder.mjs
@@ -0,0 +1,29 @@
+function _define_property(obj, key, value) {
+    if (key in obj) Object.defineProperty(obj, key, {
+        value: value,
+        enumerable: true,
+        configurable: true,
+        writable: true
+    });
+    else obj[key] = value;
+    return obj;
+}
+class LatestLocateRecorder {
+    recordLocate(locate, source) {
+        this.latestLocate = locate;
+        this.source = source;
+    }
+    getLatestLocate() {
+        return {
+            locate: this.latestLocate,
+            source: this.source
+        };
+    }
+    constructor(){
+        _define_property(this, "latestLocate", void 0);
+        _define_property(this, "source", '');
+    }
+}
+export { LatestLocateRecorder };
+
+//# sourceMappingURL=latest-locate-recorder.mjs.map
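
The new latest-locate-recorder module is a small mutable holder for the most recent locate result; its source map below preserves the original TypeScript, where ILocate is { prompt: string; bbox: [number, number, number, number] }. A quick usage sketch (the relative dist path is taken from this diff; the diff does not show the class being re-exported from the package root):

import { LatestLocateRecorder } from './ai-model/latest-locate-recorder.mjs';

const recorder = new LatestLocateRecorder();
// bbox values are made up for illustration
recorder.recordLocate({ prompt: 'the submit button', bbox: [100, 200, 140, 230] }, 'ai-locate');
const { locate, source } = recorder.getLatestLocate();
// locate?.bbox -> [100, 200, 140, 230]; source -> 'ai-locate'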
package/dist/es/ai-model/latest-locate-recorder.mjs.map
@@ -0,0 +1 @@
+{"version":3,"file":"ai-model/latest-locate-recorder.mjs","sources":["../../../src/ai-model/latest-locate-recorder.ts"],"sourcesContent":["interface ILocate {\n prompt: string;\n bbox: [number, number, number, number];\n}\n\nexport class LatestLocateRecorder {\n latestLocate: ILocate | undefined = undefined;\n source = '';\n\n recordLocate(locate: ILocate, source: string) {\n this.latestLocate = locate;\n this.source = source;\n }\n\n getLatestLocate(): { locate: ILocate | undefined; source: string } {\n return {\n locate: this.latestLocate,\n source: this.source,\n };\n }\n}\n"],"names":["LatestLocateRecorder","locate","source","undefined"],"mappings":";;;;;;;;;;AAKO,MAAMA;IAIX,aAAaC,MAAe,EAAEC,MAAc,EAAE;QAC5C,IAAI,CAAC,YAAY,GAAGD;QACpB,IAAI,CAAC,MAAM,GAAGC;IAChB;IAEA,kBAAmE;QACjE,OAAO;YACL,QAAQ,IAAI,CAAC,YAAY;YACzB,QAAQ,IAAI,CAAC,MAAM;QACrB;IACF;;QAbA,uCAAoCC;QACpC,iCAAS;;AAaX"}
package/dist/es/ai-model/llm-planning.mjs
@@ -1,7 +1,7 @@
 import { paddingToMatchBlockByBase64 } from "@midscene/shared/img";
 import { getDebug } from "@midscene/shared/logger";
 import { assert } from "@midscene/shared/utils";
-import { buildYamlFlowFromPlans, fillBboxParam,
+import { buildYamlFlowFromPlans, fillBboxParam, findAllMidsceneLocatorField } from "../common.mjs";
 import { systemPromptToTaskPlanning } from "./prompt/llm-planning.mjs";
 import { callAIWithObjectResponse } from "./service-caller/index.mjs";
 const debug = getDebug('planning');
@@ -88,23 +88,13 @@ async function plan(userInstruction, opts) {
     const actions = planFromAI.action ? [
         planFromAI.action
     ] : [];
-    let shouldContinuePlanning = true;
-    if (!actions.length) {
-        debug('no actions planned and no sleep instruction, stop planning');
-        shouldContinuePlanning = false;
-    }
-    if (actions[0]?.type === finalizeActionName) {
-        debug('finalize action planned, stop planning');
-        shouldContinuePlanning = false;
-    }
     const returnValue = {
         ...planFromAI,
         actions,
         rawResponse,
         usage,
         reasoning_content,
-        yamlFlow: buildYamlFlowFromPlans(actions, opts.actionSpace)
-        shouldContinuePlanning
+        yamlFlow: buildYamlFlowFromPlans(actions, opts.actionSpace, planFromAI.sleep)
     };
     assert(planFromAI, "can't get plans from AI");
     actions.forEach((action)=>{
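
Note that this hunk drops the shouldContinuePlanning flag from the planning result (along with the finalize-action check that computed it) and instead threads planFromAI.sleep into buildYamlFlowFromPlans. How callers now decide whether to keep planning is not shown in this diff; one plausible caller-side derivation from the fields that remain (an assumption, not the package's confirmed behavior):

// Hypothetical check built only from fields visible in this diff.
const shouldContinue =
    result.actions.length > 0 ||
    Boolean(result.more_actions_needed_by_instruction) ||
    Boolean(result.sleep);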
@@ -118,6 +108,7 @@ async function plan(userInstruction, opts) {
             if (locateResult && void 0 !== vlMode) action.param[field] = fillBboxParam(locateResult, imageWidth, imageHeight, rightLimit, bottomLimit, vlMode);
         });
     });
+    if (0 === actions.length && returnValue.more_actions_needed_by_instruction && !returnValue.sleep) console.warn('No actions planned for the prompt, but model said more actions are needed:', userInstruction);
     conversationHistory.append({
         role: 'assistant',
         content: [
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"ai-model/llm-planning.mjs","sources":["../../../src/ai-model/llm-planning.ts"],"sourcesContent":["import type {\n DeepThinkOption,\n DeviceAction,\n InterfaceType,\n PlanningAIResponse,\n RawResponsePlanningAIResponse,\n UIContext,\n} from '@/types';\nimport type { IModelConfig } from '@midscene/shared/env';\nimport { paddingToMatchBlockByBase64 } from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { assert } from '@midscene/shared/utils';\nimport type { ChatCompletionMessageParam } from 'openai/resources/index';\nimport {\n buildYamlFlowFromPlans,\n fillBboxParam,\n
+
{"version":3,"file":"ai-model/llm-planning.mjs","sources":["../../../src/ai-model/llm-planning.ts"],"sourcesContent":["import type {\n DeepThinkOption,\n DeviceAction,\n InterfaceType,\n PlanningAIResponse,\n RawResponsePlanningAIResponse,\n UIContext,\n} from '@/types';\nimport type { IModelConfig } from '@midscene/shared/env';\nimport { paddingToMatchBlockByBase64 } from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { assert } from '@midscene/shared/utils';\nimport type { ChatCompletionMessageParam } from 'openai/resources/index';\nimport {\n buildYamlFlowFromPlans,\n fillBboxParam,\n findAllMidsceneLocatorField,\n} from '../common';\nimport type { ConversationHistory } from './conversation-history';\nimport { systemPromptToTaskPlanning } from './prompt/llm-planning';\nimport { callAIWithObjectResponse } from './service-caller/index';\n\nconst debug = getDebug('planning');\n\nexport async function plan(\n userInstruction: string,\n opts: {\n context: UIContext;\n interfaceType: InterfaceType;\n actionSpace: DeviceAction<any>[];\n actionContext?: string;\n modelConfig: IModelConfig;\n conversationHistory: ConversationHistory;\n includeBbox: boolean;\n imagesIncludeCount?: number;\n deepThink?: DeepThinkOption;\n },\n): Promise<PlanningAIResponse> {\n const { context, modelConfig, conversationHistory } = opts;\n const { screenshotBase64, size } = context;\n\n const { vlMode } = modelConfig;\n\n const systemPrompt = await systemPromptToTaskPlanning({\n actionSpace: opts.actionSpace,\n vlMode,\n includeBbox: opts.includeBbox,\n });\n\n let imagePayload = screenshotBase64;\n let imageWidth = size.width;\n let imageHeight = size.height;\n const rightLimit = imageWidth;\n const bottomLimit = imageHeight;\n\n // Process image based on VL mode requirements\n if (vlMode === 'qwen2.5-vl') {\n const paddedResult = await paddingToMatchBlockByBase64(imagePayload);\n imageWidth = paddedResult.width;\n imageHeight = paddedResult.height;\n imagePayload = paddedResult.imageBase64;\n }\n\n const actionContext = opts.actionContext\n ? `<high_priority_knowledge>${opts.actionContext}</high_priority_knowledge>\\n`\n : '';\n\n const instruction: ChatCompletionMessageParam[] = [\n {\n role: 'user',\n content: [\n {\n type: 'text',\n text: `${actionContext}<user_instruction>${userInstruction}</user_instruction>`,\n },\n ],\n },\n ];\n\n let latestFeedbackMessage: ChatCompletionMessageParam;\n\n if (conversationHistory.pendingFeedbackMessage) {\n latestFeedbackMessage = {\n role: 'user',\n content: [\n {\n type: 'text',\n text: `${conversationHistory.pendingFeedbackMessage}. The last screenshot is attached. 
Please going on according to the instruction.`,\n },\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n ],\n };\n\n conversationHistory.resetPendingFeedbackMessageIfExists();\n } else {\n latestFeedbackMessage = {\n role: 'user',\n content: [\n {\n type: 'text',\n text: 'this is the latest screenshot',\n },\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n ],\n };\n }\n conversationHistory.append(latestFeedbackMessage);\n const historyLog = conversationHistory.snapshot(opts.imagesIncludeCount);\n\n const msgs: ChatCompletionMessageParam[] = [\n { role: 'system', content: systemPrompt },\n ...instruction,\n ...historyLog,\n ];\n\n const {\n content: planFromAI,\n contentString: rawResponse,\n usage,\n reasoning_content,\n } = await callAIWithObjectResponse<RawResponsePlanningAIResponse>(\n msgs,\n modelConfig,\n {\n deepThink: opts.deepThink === 'unset' ? undefined : opts.deepThink,\n },\n );\n\n const actions = planFromAI.action ? [planFromAI.action] : [];\n const returnValue: PlanningAIResponse = {\n ...planFromAI,\n actions,\n rawResponse,\n usage,\n reasoning_content,\n yamlFlow: buildYamlFlowFromPlans(\n actions,\n opts.actionSpace,\n planFromAI.sleep,\n ),\n };\n\n assert(planFromAI, \"can't get plans from AI\");\n\n actions.forEach((action) => {\n const type = action.type;\n const actionInActionSpace = opts.actionSpace.find(\n (action) => action.name === type,\n );\n\n debug('actionInActionSpace matched', actionInActionSpace);\n const locateFields = actionInActionSpace\n ? findAllMidsceneLocatorField(actionInActionSpace.paramSchema)\n : [];\n\n debug('locateFields', locateFields);\n\n locateFields.forEach((field) => {\n const locateResult = action.param[field];\n if (locateResult && vlMode !== undefined) {\n // Always use VL mode to fill bbox parameters\n action.param[field] = fillBboxParam(\n locateResult,\n imageWidth,\n imageHeight,\n rightLimit,\n bottomLimit,\n vlMode,\n );\n }\n });\n });\n\n if (\n actions.length === 0 &&\n returnValue.more_actions_needed_by_instruction &&\n !returnValue.sleep\n ) {\n console.warn(\n 'No actions planned for the prompt, but model said more actions are needed:',\n userInstruction,\n );\n }\n\n conversationHistory.append({\n role: 'assistant',\n content: [\n {\n type: 'text',\n text: rawResponse,\n },\n ],\n });\n\n return 
returnValue;\n}\n"],"names":["debug","getDebug","plan","userInstruction","opts","context","modelConfig","conversationHistory","screenshotBase64","size","vlMode","systemPrompt","systemPromptToTaskPlanning","imagePayload","imageWidth","imageHeight","rightLimit","bottomLimit","paddedResult","paddingToMatchBlockByBase64","actionContext","instruction","latestFeedbackMessage","historyLog","msgs","planFromAI","rawResponse","usage","reasoning_content","callAIWithObjectResponse","undefined","actions","returnValue","buildYamlFlowFromPlans","assert","action","type","actionInActionSpace","locateFields","findAllMidsceneLocatorField","field","locateResult","fillBboxParam","console"],"mappings":";;;;;;AAsBA,MAAMA,QAAQC,SAAS;AAEhB,eAAeC,KACpBC,eAAuB,EACvBC,IAUC;IAED,MAAM,EAAEC,OAAO,EAAEC,WAAW,EAAEC,mBAAmB,EAAE,GAAGH;IACtD,MAAM,EAAEI,gBAAgB,EAAEC,IAAI,EAAE,GAAGJ;IAEnC,MAAM,EAAEK,MAAM,EAAE,GAAGJ;IAEnB,MAAMK,eAAe,MAAMC,2BAA2B;QACpD,aAAaR,KAAK,WAAW;QAC7BM;QACA,aAAaN,KAAK,WAAW;IAC/B;IAEA,IAAIS,eAAeL;IACnB,IAAIM,aAAaL,KAAK,KAAK;IAC3B,IAAIM,cAAcN,KAAK,MAAM;IAC7B,MAAMO,aAAaF;IACnB,MAAMG,cAAcF;IAGpB,IAAIL,AAAW,iBAAXA,QAAyB;QAC3B,MAAMQ,eAAe,MAAMC,4BAA4BN;QACvDC,aAAaI,aAAa,KAAK;QAC/BH,cAAcG,aAAa,MAAM;QACjCL,eAAeK,aAAa,WAAW;IACzC;IAEA,MAAME,gBAAgBhB,KAAK,aAAa,GACpC,CAAC,yBAAyB,EAAEA,KAAK,aAAa,CAAC,4BAA4B,CAAC,GAC5E;IAEJ,MAAMiB,cAA4C;QAChD;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM,GAAGD,cAAc,kBAAkB,EAAEjB,gBAAgB,mBAAmB,CAAC;gBACjF;aACD;QACH;KACD;IAED,IAAImB;IAEJ,IAAIf,oBAAoB,sBAAsB,EAAE;QAC9Ce,wBAAwB;YACtB,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM,GAAGf,oBAAoB,sBAAsB,CAAC,gFAAgF,CAAC;gBACvI;gBACA;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKM;wBACL,QAAQ;oBACV;gBACF;aACD;QACH;QAEAN,oBAAoB,mCAAmC;IACzD,OACEe,wBAAwB;QACtB,MAAM;QACN,SAAS;YACP;gBACE,MAAM;gBACN,MAAM;YACR;YACA;gBACE,MAAM;gBACN,WAAW;oBACT,KAAKT;oBACL,QAAQ;gBACV;YACF;SACD;IACH;IAEFN,oBAAoB,MAAM,CAACe;IAC3B,MAAMC,aAAahB,oBAAoB,QAAQ,CAACH,KAAK,kBAAkB;IAEvE,MAAMoB,OAAqC;QACzC;YAAE,MAAM;YAAU,SAASb;QAAa;WACrCU;WACAE;KACJ;IAED,MAAM,EACJ,SAASE,UAAU,EACnB,eAAeC,WAAW,EAC1BC,KAAK,EACLC,iBAAiB,EAClB,GAAG,MAAMC,yBACRL,MACAlB,aACA;QACE,WAAWF,AAAmB,YAAnBA,KAAK,SAAS,GAAe0B,SAAY1B,KAAK,SAAS;IACpE;IAGF,MAAM2B,UAAUN,WAAW,MAAM,GAAG;QAACA,WAAW,MAAM;KAAC,GAAG,EAAE;IAC5D,MAAMO,cAAkC;QACtC,GAAGP,UAAU;QACbM;QACAL;QACAC;QACAC;QACA,UAAUK,uBACRF,SACA3B,KAAK,WAAW,EAChBqB,WAAW,KAAK;IAEpB;IAEAS,OAAOT,YAAY;IAEnBM,QAAQ,OAAO,CAAC,CAACI;QACf,MAAMC,OAAOD,OAAO,IAAI;QACxB,MAAME,sBAAsBjC,KAAK,WAAW,CAAC,IAAI,CAC/C,CAAC+B,SAAWA,OAAO,IAAI,KAAKC;QAG9BpC,MAAM,+BAA+BqC;QACrC,MAAMC,eAAeD,sBACjBE,4BAA4BF,oBAAoB,WAAW,IAC3D,EAAE;QAENrC,MAAM,gBAAgBsC;QAEtBA,aAAa,OAAO,CAAC,CAACE;YACpB,MAAMC,eAAeN,OAAO,KAAK,CAACK,MAAM;YACxC,IAAIC,gBAAgB/B,AAAWoB,WAAXpB,QAElByB,OAAO,KAAK,CAACK,MAAM,GAAGE,cACpBD,cACA3B,YACAC,aACAC,YACAC,aACAP;QAGN;IACF;IAEA,IACEqB,AAAmB,MAAnBA,QAAQ,MAAM,IACdC,YAAY,kCAAkC,IAC9C,CAACA,YAAY,KAAK,EAElBW,QAAQ,IAAI,CACV,8EACAxC;IAIJI,oBAAoB,MAAM,CAAC;QACzB,MAAM;QACN,SAAS;YACP;gBACE,MAAM;gBACN,MAAMmB;YACR;SACD;IACH;IAEA,OAAOM;AACT"}
package/dist/es/ai-model/prompt/llm-planning.mjs

@@ -1,22 +1,11 @@
 import { getZodDescription, getZodTypeName } from "@midscene/shared/zod-schema-utils";
 import { bboxDescription } from "./common.mjs";
-const commonOutputFields =
+const commonOutputFields = `"error"?: string, // Error messages about unexpected situations, if any. Only think it is an error when the situation is not foreseeable according to the instruction. Use the same language as the user's instruction.
+  "more_actions_needed_by_instruction": boolean, // Consider if there is still more action(s) to do after the action in "Log" is done, according to the instruction. If so, set this field to true. Otherwise, set it to false.`;
 const vlLocateParam = (vlMode)=>{
     if (vlMode) return `{bbox: [number, number, number, number], prompt: string } // ${bboxDescription(vlMode)}`;
     return "{ prompt: string /* description of the target element */ }";
 };
-const findDefaultValue = (field)=>{
-    let current = field;
-    const visited = new Set();
-    while(current && !visited.has(current)){
-        visited.add(current);
-        const currentWithDef = current;
-        if (!currentWithDef._def?.typeName) break;
-        if ('ZodDefault' === currentWithDef._def.typeName) return currentWithDef._def.defaultValue?.();
-        if ('ZodOptional' === currentWithDef._def.typeName || 'ZodNullable' === currentWithDef._def.typeName) current = currentWithDef._def.innerType;
-        else break;
-    }
-};
 const descriptionForAction = (action, locatorSchemaTypeDescription)=>{
     const tab = ' ';
     const fields = [];
@@ -32,16 +21,8 @@ const descriptionForAction = (action, locatorSchemaTypeDescription)=>{
                 const keyWithOptional = isOptional ? `${key}?` : key;
                 const typeName = getZodTypeName(field, locatorSchemaTypeDescription);
                 const description = getZodDescription(field);
-                const defaultValue = findDefaultValue(field);
-                const hasDefault = void 0 !== defaultValue;
                 let paramLine = `${keyWithOptional}: ${typeName}`;
-                const comments = [];
-                if (description) comments.push(description);
-                if (hasDefault) {
-                    const defaultStr = 'string' == typeof defaultValue ? `"${defaultValue}"` : JSON.stringify(defaultValue);
-                    comments.push(`default: ${defaultStr}`);
-                }
-                if (comments.length > 0) paramLine += ` // ${comments.join(', ')}`;
+                if (description) paramLine += ` // ${description}`;
                 paramLines.push(paramLine);
             }
         if (paramLines.length > 0) {
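The effect of this hunk: generated action descriptions no longer advertise Zod default values, so only the field description survives as a trailing comment. A standalone sketch of the simplified rendering; the helper name and the sample inputs are illustrative, not taken from the package:

// Simplified param-line rendering after the change; getZodTypeName and
// getZodDescription live in @midscene/shared/zod-schema-utils, and the
// inputs below are made up for illustration.
function renderParamLine(keyWithOptional: string, typeName: string, description?: string): string {
  let paramLine = `${keyWithOptional}: ${typeName}`;
  if (description) paramLine += ` // ${description}`;
  return paramLine;
}

renderParamLine('direction?', 'string', 'the direction to scroll');
// before this change: 'direction?: string // the direction to scroll, default: "down"'
// after this change:  'direction?: string // the direction to scroll'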
@@ -95,6 +76,7 @@ Please tell what the next one action is (or null if no action should be done) to
 - Consider the current screenshot and give the action that is most likely to accomplish the instruction. For example, if the next step is to click a button but it's not visible in the screenshot, you should try to find it first instead of give a click action.
 - Make sure the previous actions are completed successfully before performing the next step
 - If there are some error messages reported by the previous actions, don't give up, try parse a new action to recover. If the error persists for more than 5 times, you should think this is an error and set the "error" field to the error message.
+- If there is nothing to do but waiting, set the "sleep" field to the positive waiting time in milliseconds and null for the "action" field.
 - Assertions are also important steps. When getting the assertion instruction, a solid conclusion is required. You should explicitly state your conclusion by calling the "Print_Assert_Result" action.
 
 ## Supporting actions
@@ -106,7 +88,6 @@ ${logFieldInstruction}
 
 Return in JSON format:
 {
-  "note"?: string, // some important notes to finish the follow-up action should be written here, and the agent executing the subsequent steps will focus on this information. For example, the data extracted from the current screenshot which will be used in the follow-up action.
   "log": string, // a brief preamble to the user explaining what you’re about to do
   ${commonOutputFields}
   "action": 
@@ -115,13 +96,16 @@ Return in JSON format:
     "param"?: { // The parameter of the action, if any
       // k-v style parameter fields
     }, 
-  } | null
+  } | null,
+  ,
+  "sleep"?: number, // The sleep time after the action, in milliseconds.
 }
 
 For example, if the instruction is to login and the form has already been filled, this is a valid return value:
 
 {
   "log": "Click the login button",
+  "more_actions_needed_by_instruction": false,
   "action": {
     "type": "Tap",
     "param": {
@@ -131,22 +115,6 @@ For example, if the instruction is to login and the form has already been filled
     }
   }
 }
-
-For example, if the instruction is to find out every title in the screenshot, the return value should be:
-
-{
-  "note": "The titles in the screenshot are: 'Hello, world!', 'Midscene 101', 'Model strategy'",
-  "log": "Scroll to find more titles",
-  "action": {
-    "type": "Scroll",
-    "param": {
-      "locate": {
-        "prompt": "The page content area"
-      },
-      "direction": "down"
-    }
-  }
-}
 `;
 }
 export { descriptionForAction, systemPromptToTaskPlanning };
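Per the updated prompt, the model may now return a sleep value instead of an action when it should simply wait. A hedged illustration of such a response, shaped after the return format above; the concrete values are made up:

// Illustrative planning response using the new fields; not taken from the
// package, just an example consistent with the prompt's return format.
const waitingResponse = {
  log: 'Page is still loading, waiting before the next action',
  more_actions_needed_by_instruction: true,
  action: null,
  sleep: 2000, // positive waiting time in milliseconds, per the new rule
};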
package/dist/es/ai-model/prompt/llm-planning.mjs.map

@@ -1 +1 @@
-
{"version":3,"file":"ai-model/prompt/llm-planning.mjs","sources":["../../../../src/ai-model/prompt/llm-planning.ts"],"sourcesContent":["import type { DeviceAction } from '@/types';\nimport type { TVlModeTypes } from '@midscene/shared/env';\nimport {\n getZodDescription,\n getZodTypeName,\n} from '@midscene/shared/zod-schema-utils';\nimport type { ResponseFormatJSONSchema } from 'openai/resources/index';\nimport type { z } from 'zod';\nimport { bboxDescription } from './common';\n\n// Note: put the log field first to trigger the CoT\n\nconst commonOutputFields = `\"error\"?: string, // Error messages about unexpected situations, if any. Only think it is an error when the situation is not foreseeable according to the instruction. Use the same language as the user's instruction.`;\n\nconst vlLocateParam = (vlMode: TVlModeTypes | undefined) => {\n if (vlMode) {\n return `{bbox: [number, number, number, number], prompt: string } // ${bboxDescription(vlMode)}`;\n }\n return '{ prompt: string /* description of the target element */ }';\n};\n\n/**\n * Find ZodDefault in the wrapper chain and return its default value\n */\nconst findDefaultValue = (field: unknown): any | undefined => {\n let current = field;\n const visited = new Set<unknown>();\n\n while (current && !visited.has(current)) {\n visited.add(current);\n const currentWithDef = current as {\n _def?: {\n typeName?: string;\n defaultValue?: () => any;\n innerType?: unknown;\n };\n };\n\n if (!currentWithDef._def?.typeName) break;\n\n if (currentWithDef._def.typeName === 'ZodDefault') {\n return currentWithDef._def.defaultValue?.();\n }\n\n // Continue unwrapping if it's a wrapper type\n if (\n currentWithDef._def.typeName === 'ZodOptional' ||\n currentWithDef._def.typeName === 'ZodNullable'\n ) {\n current = currentWithDef._def.innerType;\n } else {\n break;\n }\n }\n\n return undefined;\n};\n\nexport const descriptionForAction = (\n action: DeviceAction<any>,\n locatorSchemaTypeDescription: string,\n) => {\n const tab = ' ';\n const fields: string[] = [];\n\n // Add the action type field\n fields.push(`- type: \"${action.name}\"`);\n\n // Handle paramSchema if it exists\n if (action.paramSchema) {\n const paramLines: string[] = [];\n\n // Check if paramSchema is a ZodObject with shape\n const schema = action.paramSchema as {\n _def?: { typeName?: string };\n shape?: Record<string, unknown>;\n };\n const isZodObject = schema._def?.typeName === 'ZodObject';\n\n if (isZodObject && schema.shape) {\n // Original logic for ZodObject schemas\n const shape = schema.shape;\n\n for (const [key, field] of Object.entries(shape)) {\n if (field && typeof field === 'object') {\n // Check if field is optional\n const isOptional =\n typeof (field as { isOptional?: () => boolean }).isOptional ===\n 'function' &&\n (field as { isOptional: () => boolean }).isOptional();\n const keyWithOptional = isOptional ? 
`${key}?` : key;\n\n // Get the type name using extracted helper\n const typeName = getZodTypeName(field, locatorSchemaTypeDescription);\n\n // Get description using extracted helper\n const description = getZodDescription(field as z.ZodTypeAny);\n\n // Check if field has a default value by searching the wrapper chain\n const defaultValue = findDefaultValue(field);\n const hasDefault = defaultValue !== undefined;\n\n // Build param line for this field\n let paramLine = `${keyWithOptional}: ${typeName}`;\n const comments: string[] = [];\n if (description) {\n comments.push(description);\n }\n if (hasDefault) {\n const defaultStr =\n typeof defaultValue === 'string'\n ? `\"${defaultValue}\"`\n : JSON.stringify(defaultValue);\n comments.push(`default: ${defaultStr}`);\n }\n if (comments.length > 0) {\n paramLine += ` // ${comments.join(', ')}`;\n }\n\n paramLines.push(paramLine);\n }\n }\n\n // Add the param section to fields if there are paramLines\n if (paramLines.length > 0) {\n fields.push('- param:');\n paramLines.forEach((line) => {\n fields.push(` - ${line}`);\n });\n }\n } else {\n // Handle non-object schemas (string, number, etc.)\n const typeName = getZodTypeName(schema);\n const description = getZodDescription(schema as z.ZodTypeAny);\n\n // For simple types, indicate that param should be the direct value, not an object\n let paramDescription = `- param: ${typeName}`;\n if (description) {\n paramDescription += ` // ${description}`;\n }\n paramDescription += ' (pass the value directly, not as an object)';\n\n fields.push(paramDescription);\n }\n }\n\n return `- ${action.name}, ${action.description || 'No description provided'}\n${tab}${fields.join(`\\n${tab}`)}\n`.trim();\n};\n\nexport async function systemPromptToTaskPlanning({\n actionSpace,\n vlMode,\n includeBbox,\n}: {\n actionSpace: DeviceAction<any>[];\n vlMode: TVlModeTypes | undefined;\n includeBbox: boolean;\n}) {\n // Validate parameters: if includeBbox is true, vlMode must be defined\n if (includeBbox && !vlMode) {\n throw new Error(\n 'vlMode cannot be undefined when includeBbox is true. A valid vlMode is required for bbox-based location.',\n );\n }\n\n const actionDescriptionList = actionSpace.map((action) => {\n return descriptionForAction(\n action,\n vlLocateParam(includeBbox ? vlMode : undefined),\n );\n });\n const actionList = actionDescriptionList.join('\\n');\n\n const logFieldInstruction = `\n## About the \\`log\\` field (preamble message)\n\nThe \\`log\\` field is a brief preamble message to the user explaining what you’re about to do. It should follow these principles and examples:\n\n- **Use the same language as the user's instruction**\n- **Keep it concise**: be no more than 1-2 sentences, focused on immediate, tangible next steps. (8–12 words or Chinese characters for quick updates).\n- **Build on prior context**: if this is not the first action to be done, use the preamble message to connect the dots with what’s been done so far and create a sense of momentum and clarity for the user to understand your next actions.\n- **Keep your tone light, friendly and curious**: add small touches of personality in preambles feel collaborative and engaging.\n\n**Examples:**\n- \"Click the login button\"\n- \"Scroll to find the 'Yes' button in popup\"\n- \"Previous actions failed to find the 'Yes' button, i will try again\"\n- \"Go back to find the login button\"\n`;\n\n return `\nTarget: User will give you an instruction, some screenshots and previous logs indicating what have been done. 
Your task is to plan the next one action according to current situation to accomplish the instruction.\n\nPlease tell what the next one action is (or null if no action should be done) to do the tasks the instruction requires. \n\n## Rules\n\n- Don't give extra actions or plans beyond the instruction. For example, don't try to submit the form if the instruction is only to fill something.\n- Give just the next ONE action you should do\n- Consider the current screenshot and give the action that is most likely to accomplish the instruction. For example, if the next step is to click a button but it's not visible in the screenshot, you should try to find it first instead of give a click action.\n- Make sure the previous actions are completed successfully before performing the next step\n- If there are some error messages reported by the previous actions, don't give up, try parse a new action to recover. If the error persists for more than 5 times, you should think this is an error and set the \"error\" field to the error message.\n- Assertions are also important steps. When getting the assertion instruction, a solid conclusion is required. You should explicitly state your conclusion by calling the \"Print_Assert_Result\" action.\n\n## Supporting actions\n${actionList}\n\n${logFieldInstruction}\n\n## Return format\n\nReturn in JSON format:\n{\n \"note\"?: string, // some important notes to finish the follow-up action should be written here, and the agent executing the subsequent steps will focus on this information. For example, the data extracted from the current screenshot which will be used in the follow-up action.\n \"log\": string, // a brief preamble to the user explaining what you’re about to do\n ${commonOutputFields}\n \"action\": \n {\n \"type\": string, // the type of the action\n \"param\"?: { // The parameter of the action, if any\n // k-v style parameter fields\n }, \n } | null\n}\n\nFor example, if the instruction is to login and the form has already been filled, this is a valid return value:\n\n{\n \"log\": \"Click the login button\",\n \"action\": {\n \"type\": \"Tap\",\n \"param\": {\n \"locate\": { \n \"prompt\": \"The login button\"${vlMode && includeBbox ? 
`, \"bbox\": [100, 200, 300, 400]` : ''}\n }\n }\n }\n}\n\nFor example, if the instruction is to find out every title in the screenshot, the return value should be:\n\n{\n \"note\": \"The titles in the screenshot are: 'Hello, world!', 'Midscene 101', 'Model strategy'\",\n \"log\": \"Scroll to find more titles\",\n \"action\": {\n \"type\": \"Scroll\",\n \"param\": {\n \"locate\": {\n \"prompt\": \"The page content area\"\n },\n \"direction\": \"down\"\n }\n }\n}\n`;\n}\n"],"names":["commonOutputFields","vlLocateParam","vlMode","bboxDescription","findDefaultValue","field","current","visited","Set","currentWithDef","descriptionForAction","action","locatorSchemaTypeDescription","tab","fields","paramLines","schema","isZodObject","shape","key","Object","isOptional","keyWithOptional","typeName","getZodTypeName","description","getZodDescription","defaultValue","hasDefault","undefined","paramLine","comments","defaultStr","JSON","line","paramDescription","systemPromptToTaskPlanning","actionSpace","includeBbox","Error","actionDescriptionList","actionList","logFieldInstruction"],"mappings":";;AAYA,MAAMA,qBAAqB;AAE3B,MAAMC,gBAAgB,CAACC;IACrB,IAAIA,QACF,OAAO,CAAC,6DAA6D,EAAEC,gBAAgBD,SAAS;IAElG,OAAO;AACT;AAKA,MAAME,mBAAmB,CAACC;IACxB,IAAIC,UAAUD;IACd,MAAME,UAAU,IAAIC;IAEpB,MAAOF,WAAW,CAACC,QAAQ,GAAG,CAACD,SAAU;QACvCC,QAAQ,GAAG,CAACD;QACZ,MAAMG,iBAAiBH;QAQvB,IAAI,CAACG,eAAe,IAAI,EAAE,UAAU;QAEpC,IAAIA,AAAiC,iBAAjCA,eAAe,IAAI,CAAC,QAAQ,EAC9B,OAAOA,eAAe,IAAI,CAAC,YAAY;QAIzC,IACEA,AAAiC,kBAAjCA,eAAe,IAAI,CAAC,QAAQ,IAC5BA,AAAiC,kBAAjCA,eAAe,IAAI,CAAC,QAAQ,EAE5BH,UAAUG,eAAe,IAAI,CAAC,SAAS;aAEvC;IAEJ;AAGF;AAEO,MAAMC,uBAAuB,CAClCC,QACAC;IAEA,MAAMC,MAAM;IACZ,MAAMC,SAAmB,EAAE;IAG3BA,OAAO,IAAI,CAAC,CAAC,SAAS,EAAEH,OAAO,IAAI,CAAC,CAAC,CAAC;IAGtC,IAAIA,OAAO,WAAW,EAAE;QACtB,MAAMI,aAAuB,EAAE;QAG/B,MAAMC,SAASL,OAAO,WAAW;QAIjC,MAAMM,cAAcD,OAAO,IAAI,EAAE,aAAa;QAE9C,IAAIC,eAAeD,OAAO,KAAK,EAAE;YAE/B,MAAME,QAAQF,OAAO,KAAK;YAE1B,KAAK,MAAM,CAACG,KAAKd,MAAM,IAAIe,OAAO,OAAO,CAACF,OACxC,IAAIb,SAAS,AAAiB,YAAjB,OAAOA,OAAoB;gBAEtC,MAAMgB,aACJ,AACE,cADF,OAAQhB,MAAyC,UAAU,IAE1DA,MAAwC,UAAU;gBACrD,MAAMiB,kBAAkBD,aAAa,GAAGF,IAAI,CAAC,CAAC,GAAGA;gBAGjD,MAAMI,WAAWC,eAAenB,OAAOO;gBAGvC,MAAMa,cAAcC,kBAAkBrB;gBAGtC,MAAMsB,eAAevB,iBAAiBC;gBACtC,MAAMuB,aAAaD,AAAiBE,WAAjBF;gBAGnB,IAAIG,YAAY,GAAGR,gBAAgB,EAAE,EAAEC,UAAU;gBACjD,MAAMQ,WAAqB,EAAE;gBAC7B,IAAIN,aACFM,SAAS,IAAI,CAACN;gBAEhB,IAAIG,YAAY;oBACd,MAAMI,aACJ,AAAwB,YAAxB,OAAOL,eACH,CAAC,CAAC,EAAEA,aAAa,CAAC,CAAC,GACnBM,KAAK,SAAS,CAACN;oBACrBI,SAAS,IAAI,CAAC,CAAC,SAAS,EAAEC,YAAY;gBACxC;gBACA,IAAID,SAAS,MAAM,GAAG,GACpBD,aAAa,CAAC,IAAI,EAAEC,SAAS,IAAI,CAAC,OAAO;gBAG3ChB,WAAW,IAAI,CAACe;YAClB;YAIF,IAAIf,WAAW,MAAM,GAAG,GAAG;gBACzBD,OAAO,IAAI,CAAC;gBACZC,WAAW,OAAO,CAAC,CAACmB;oBAClBpB,OAAO,IAAI,CAAC,CAAC,IAAI,EAAEoB,MAAM;gBAC3B;YACF;QACF,OAAO;YAEL,MAAMX,WAAWC,eAAeR;YAChC,MAAMS,cAAcC,kBAAkBV;YAGtC,IAAImB,mBAAmB,CAAC,SAAS,EAAEZ,UAAU;YAC7C,IAAIE,aACFU,oBAAoB,CAAC,IAAI,EAAEV,aAAa;YAE1CU,oBAAoB;YAEpBrB,OAAO,IAAI,CAACqB;QACd;IACF;IAEA,OAAO,CAAC,EAAE,EAAExB,OAAO,IAAI,CAAC,EAAE,EAAEA,OAAO,WAAW,IAAI,0BAA0B;AAC9E,EAAEE,MAAMC,OAAO,IAAI,CAAC,CAAC,EAAE,EAAED,KAAK,EAAE;AAChC,CAAC,CAAC,IAAI;AACN;AAEO,eAAeuB,2BAA2B,EAC/CC,WAAW,EACXnC,MAAM,EACNoC,WAAW,EAKZ;IAEC,IAAIA,eAAe,CAACpC,QAClB,MAAM,IAAIqC,MACR;IAIJ,MAAMC,wBAAwBH,YAAY,GAAG,CAAC,CAAC1B,SACtCD,qBACLC,QACAV,cAAcqC,cAAcpC,SAAS2B;IAGzC,MAAMY,aAAaD,sBAAsB,IAAI,CAAC;IAE9C,MAAME,sBAAsB,CAAC;;;;;;;;;;;;;;;AAe/B,CAAC;IAEC,OAAO,CAAC;;;;;;;;;;;;;;;AAeV,EAAED,WAAW;;AAEb,EAAEC,oBAAoB;;;;;;;;EAQpB,EAAE1C,mBAAmB;;;;;;;;;;;;;;;;;;oCAkBa,EAAEE,UAAUoC,cAAc,mCAAmC,GAAG;;;;;;;;;
;;;;;;;;;;;;AAqBpG,CAAC;AACD"}
+
{"version":3,"file":"ai-model/prompt/llm-planning.mjs","sources":["../../../../src/ai-model/prompt/llm-planning.ts"],"sourcesContent":["import type { DeviceAction } from '@/types';\nimport type { TVlModeTypes } from '@midscene/shared/env';\nimport {\n getZodDescription,\n getZodTypeName,\n} from '@midscene/shared/zod-schema-utils';\nimport type { ResponseFormatJSONSchema } from 'openai/resources/index';\nimport type { z } from 'zod';\nimport { bboxDescription } from './common';\n\n// Note: put the log field first to trigger the CoT\n\nconst commonOutputFields = `\"error\"?: string, // Error messages about unexpected situations, if any. Only think it is an error when the situation is not foreseeable according to the instruction. Use the same language as the user's instruction.\n \"more_actions_needed_by_instruction\": boolean, // Consider if there is still more action(s) to do after the action in \"Log\" is done, according to the instruction. If so, set this field to true. Otherwise, set it to false.`;\n\nconst vlLocateParam = (vlMode: TVlModeTypes | undefined) => {\n if (vlMode) {\n return `{bbox: [number, number, number, number], prompt: string } // ${bboxDescription(vlMode)}`;\n }\n return '{ prompt: string /* description of the target element */ }';\n};\n\nexport const descriptionForAction = (\n action: DeviceAction<any>,\n locatorSchemaTypeDescription: string,\n) => {\n const tab = ' ';\n const fields: string[] = [];\n\n // Add the action type field\n fields.push(`- type: \"${action.name}\"`);\n\n // Handle paramSchema if it exists\n if (action.paramSchema) {\n const paramLines: string[] = [];\n\n // Check if paramSchema is a ZodObject with shape\n const schema = action.paramSchema as {\n _def?: { typeName?: string };\n shape?: Record<string, unknown>;\n };\n const isZodObject = schema._def?.typeName === 'ZodObject';\n\n if (isZodObject && schema.shape) {\n // Original logic for ZodObject schemas\n const shape = schema.shape;\n\n for (const [key, field] of Object.entries(shape)) {\n if (field && typeof field === 'object') {\n // Check if field is optional\n const isOptional =\n typeof (field as { isOptional?: () => boolean }).isOptional ===\n 'function' &&\n (field as { isOptional: () => boolean }).isOptional();\n const keyWithOptional = isOptional ? 
`${key}?` : key;\n\n // Get the type name using extracted helper\n const typeName = getZodTypeName(field, locatorSchemaTypeDescription);\n\n // Get description using extracted helper\n const description = getZodDescription(field as z.ZodTypeAny);\n\n // Build param line for this field\n let paramLine = `${keyWithOptional}: ${typeName}`;\n if (description) {\n paramLine += ` // ${description}`;\n }\n\n paramLines.push(paramLine);\n }\n }\n\n // Add the param section to fields if there are paramLines\n if (paramLines.length > 0) {\n fields.push('- param:');\n paramLines.forEach((line) => {\n fields.push(` - ${line}`);\n });\n }\n } else {\n // Handle non-object schemas (string, number, etc.)\n const typeName = getZodTypeName(schema);\n const description = getZodDescription(schema as z.ZodTypeAny);\n\n // For simple types, indicate that param should be the direct value, not an object\n let paramDescription = `- param: ${typeName}`;\n if (description) {\n paramDescription += ` // ${description}`;\n }\n paramDescription += ' (pass the value directly, not as an object)';\n\n fields.push(paramDescription);\n }\n }\n\n return `- ${action.name}, ${action.description || 'No description provided'}\n${tab}${fields.join(`\\n${tab}`)}\n`.trim();\n};\n\nexport async function systemPromptToTaskPlanning({\n actionSpace,\n vlMode,\n includeBbox,\n}: {\n actionSpace: DeviceAction<any>[];\n vlMode: TVlModeTypes | undefined;\n includeBbox: boolean;\n}) {\n // Validate parameters: if includeBbox is true, vlMode must be defined\n if (includeBbox && !vlMode) {\n throw new Error(\n 'vlMode cannot be undefined when includeBbox is true. A valid vlMode is required for bbox-based location.',\n );\n }\n\n const actionDescriptionList = actionSpace.map((action) => {\n return descriptionForAction(\n action,\n vlLocateParam(includeBbox ? vlMode : undefined),\n );\n });\n const actionList = actionDescriptionList.join('\\n');\n\n const logFieldInstruction = `\n## About the \\`log\\` field (preamble message)\n\nThe \\`log\\` field is a brief preamble message to the user explaining what you’re about to do. It should follow these principles and examples:\n\n- **Use the same language as the user's instruction**\n- **Keep it concise**: be no more than 1-2 sentences, focused on immediate, tangible next steps. (8–12 words or Chinese characters for quick updates).\n- **Build on prior context**: if this is not the first action to be done, use the preamble message to connect the dots with what’s been done so far and create a sense of momentum and clarity for the user to understand your next actions.\n- **Keep your tone light, friendly and curious**: add small touches of personality in preambles feel collaborative and engaging.\n\n**Examples:**\n- \"Click the login button\"\n- \"Scroll to find the 'Yes' button in popup\"\n- \"Previous actions failed to find the 'Yes' button, i will try again\"\n- \"Go back to find the login button\"\n`;\n\n return `\nTarget: User will give you an instruction, some screenshots and previous logs indicating what have been done. Your task is to plan the next one action according to current situation to accomplish the instruction.\n\nPlease tell what the next one action is (or null if no action should be done) to do the tasks the instruction requires. \n\n## Rules\n\n- Don't give extra actions or plans beyond the instruction. 
For example, don't try to submit the form if the instruction is only to fill something.\n- Give just the next ONE action you should do\n- Consider the current screenshot and give the action that is most likely to accomplish the instruction. For example, if the next step is to click a button but it's not visible in the screenshot, you should try to find it first instead of give a click action.\n- Make sure the previous actions are completed successfully before performing the next step\n- If there are some error messages reported by the previous actions, don't give up, try parse a new action to recover. If the error persists for more than 5 times, you should think this is an error and set the \"error\" field to the error message.\n- If there is nothing to do but waiting, set the \"sleep\" field to the positive waiting time in milliseconds and null for the \"action\" field.\n- Assertions are also important steps. When getting the assertion instruction, a solid conclusion is required. You should explicitly state your conclusion by calling the \"Print_Assert_Result\" action.\n\n## Supporting actions\n${actionList}\n\n${logFieldInstruction}\n\n## Return format\n\nReturn in JSON format:\n{\n \"log\": string, // a brief preamble to the user explaining what you’re about to do\n ${commonOutputFields}\n \"action\": \n {\n \"type\": string, // the type of the action\n \"param\"?: { // The parameter of the action, if any\n // k-v style parameter fields\n }, \n } | null,\n ,\n \"sleep\"?: number, // The sleep time after the action, in milliseconds.\n}\n\nFor example, if the instruction is to login and the form has already been filled, this is a valid return value:\n\n{\n \"log\": \"Click the login button\",\n \"more_actions_needed_by_instruction\": false,\n \"action\": {\n \"type\": \"Tap\",\n \"param\": {\n \"locate\": { \n \"prompt\": \"The login button\"${vlMode && includeBbox ? 
`, \"bbox\": [100, 200, 300, 400]` : ''}\n }\n }\n }\n}\n`;\n}\n"],"names":["commonOutputFields","vlLocateParam","vlMode","bboxDescription","descriptionForAction","action","locatorSchemaTypeDescription","tab","fields","paramLines","schema","isZodObject","shape","key","field","Object","isOptional","keyWithOptional","typeName","getZodTypeName","description","getZodDescription","paramLine","line","paramDescription","systemPromptToTaskPlanning","actionSpace","includeBbox","Error","actionDescriptionList","undefined","actionList","logFieldInstruction"],"mappings":";;AAYA,MAAMA,qBAAqB,CAAC;+NACmM,CAAC;AAEhO,MAAMC,gBAAgB,CAACC;IACrB,IAAIA,QACF,OAAO,CAAC,6DAA6D,EAAEC,gBAAgBD,SAAS;IAElG,OAAO;AACT;AAEO,MAAME,uBAAuB,CAClCC,QACAC;IAEA,MAAMC,MAAM;IACZ,MAAMC,SAAmB,EAAE;IAG3BA,OAAO,IAAI,CAAC,CAAC,SAAS,EAAEH,OAAO,IAAI,CAAC,CAAC,CAAC;IAGtC,IAAIA,OAAO,WAAW,EAAE;QACtB,MAAMI,aAAuB,EAAE;QAG/B,MAAMC,SAASL,OAAO,WAAW;QAIjC,MAAMM,cAAcD,OAAO,IAAI,EAAE,aAAa;QAE9C,IAAIC,eAAeD,OAAO,KAAK,EAAE;YAE/B,MAAME,QAAQF,OAAO,KAAK;YAE1B,KAAK,MAAM,CAACG,KAAKC,MAAM,IAAIC,OAAO,OAAO,CAACH,OACxC,IAAIE,SAAS,AAAiB,YAAjB,OAAOA,OAAoB;gBAEtC,MAAME,aACJ,AACE,cADF,OAAQF,MAAyC,UAAU,IAE1DA,MAAwC,UAAU;gBACrD,MAAMG,kBAAkBD,aAAa,GAAGH,IAAI,CAAC,CAAC,GAAGA;gBAGjD,MAAMK,WAAWC,eAAeL,OAAOR;gBAGvC,MAAMc,cAAcC,kBAAkBP;gBAGtC,IAAIQ,YAAY,GAAGL,gBAAgB,EAAE,EAAEC,UAAU;gBACjD,IAAIE,aACFE,aAAa,CAAC,IAAI,EAAEF,aAAa;gBAGnCX,WAAW,IAAI,CAACa;YAClB;YAIF,IAAIb,WAAW,MAAM,GAAG,GAAG;gBACzBD,OAAO,IAAI,CAAC;gBACZC,WAAW,OAAO,CAAC,CAACc;oBAClBf,OAAO,IAAI,CAAC,CAAC,IAAI,EAAEe,MAAM;gBAC3B;YACF;QACF,OAAO;YAEL,MAAML,WAAWC,eAAeT;YAChC,MAAMU,cAAcC,kBAAkBX;YAGtC,IAAIc,mBAAmB,CAAC,SAAS,EAAEN,UAAU;YAC7C,IAAIE,aACFI,oBAAoB,CAAC,IAAI,EAAEJ,aAAa;YAE1CI,oBAAoB;YAEpBhB,OAAO,IAAI,CAACgB;QACd;IACF;IAEA,OAAO,CAAC,EAAE,EAAEnB,OAAO,IAAI,CAAC,EAAE,EAAEA,OAAO,WAAW,IAAI,0BAA0B;AAC9E,EAAEE,MAAMC,OAAO,IAAI,CAAC,CAAC,EAAE,EAAED,KAAK,EAAE;AAChC,CAAC,CAAC,IAAI;AACN;AAEO,eAAekB,2BAA2B,EAC/CC,WAAW,EACXxB,MAAM,EACNyB,WAAW,EAKZ;IAEC,IAAIA,eAAe,CAACzB,QAClB,MAAM,IAAI0B,MACR;IAIJ,MAAMC,wBAAwBH,YAAY,GAAG,CAAC,CAACrB,SACtCD,qBACLC,QACAJ,cAAc0B,cAAczB,SAAS4B;IAGzC,MAAMC,aAAaF,sBAAsB,IAAI,CAAC;IAE9C,MAAMG,sBAAsB,CAAC;;;;;;;;;;;;;;;AAe/B,CAAC;IAEC,OAAO,CAAC;;;;;;;;;;;;;;;;AAgBV,EAAED,WAAW;;AAEb,EAAEC,oBAAoB;;;;;;;EAOpB,EAAEhC,mBAAmB;;;;;;;;;;;;;;;;;;;;;oCAqBa,EAAEE,UAAUyB,cAAc,mCAAmC,GAAG;;;;;AAKpG,CAAC;AACD"}
package/dist/es/ai-model/service-caller/index.mjs

@@ -3,6 +3,7 @@ import { getDebug } from "@midscene/shared/logger";
 import { assert, ifInBrowser } from "@midscene/shared/utils";
 import { jsonrepair } from "jsonrepair";
 import openai_0 from "openai";
+import { isAutoGLM } from "../auto-glm/util.mjs";
 async function createChatClient({ modelConfig }) {
     const { socksProxy, httpProxy, modelName, openaiBaseURL, openaiApiKey, openaiExtraConfig, modelDescription, uiTarsModelVersion: uiTarsVersion, vlMode, createOpenAIClient, timeout } = modelConfig;
     let proxyAgent;
@@ -140,6 +141,10 @@ async function callAI(messages, modelConfig, options) {
             vl_high_resolution_images: true
         } : {}
     };
+    if (isAutoGLM(vlMode)) {
+        commonConfig.top_p = 0.85;
+        commonConfig.frequency_penalty = 0.2;
+    }
     const { config: deepThinkConfig, debugMessage, warningMessage } = resolveDeepThinkConfig({
         deepThink: options?.deepThink,
         vlMode
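Finally, callAI now pins two sampling parameters whenever the configured model is an AutoGLM variant. A sketch of the override in isolation; the two values come from the diff above, while the signature of isAutoGLM and the shape of commonConfig are assumptions:

// Sketch of the AutoGLM-specific overrides added above; isAutoGLM is provided
// by the new ai-model/auto-glm/util module, its exact signature is assumed.
declare function isAutoGLM(vlMode: string | undefined): boolean;

function withAutoGlmSampling(commonConfig: Record<string, unknown>, vlMode?: string) {
  if (isAutoGLM(vlMode)) {
    commonConfig.top_p = 0.85; // narrower nucleus sampling
    commonConfig.frequency_penalty = 0.2; // mild penalty against repetitive output
  }
  return commonConfig;
}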