@midscene/core 1.4.3 → 1.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/es/agent/utils.mjs +1 -1
- package/dist/es/ai-model/llm-planning.mjs +7 -23
- package/dist/es/ai-model/llm-planning.mjs.map +1 -1
- package/dist/es/ai-model/prompt/llm-planning.mjs +25 -239
- package/dist/es/ai-model/prompt/llm-planning.mjs.map +1 -1
- package/dist/es/utils.mjs +2 -2
- package/dist/lib/agent/utils.js +1 -1
- package/dist/lib/ai-model/llm-planning.js +6 -22
- package/dist/lib/ai-model/llm-planning.js.map +1 -1
- package/dist/lib/ai-model/prompt/llm-planning.js +25 -239
- package/dist/lib/ai-model/prompt/llm-planning.js.map +1 -1
- package/dist/lib/utils.js +2 -2
- package/dist/types/ai-model/prompt/llm-planning.d.ts +2 -2
- package/package.json +2 -2
package/dist/lib/agent/utils.js
CHANGED
|
@@ -153,7 +153,7 @@ async function matchElementFromCache(context, cacheEntry, cachePrompt, cacheable
|
|
|
153
153
|
return;
|
|
154
154
|
}
|
|
155
155
|
}
|
|
156
|
-
const getMidsceneVersion = ()=>"1.4.3";
|
|
156
|
+
const getMidsceneVersion = ()=>"1.4.4";
|
|
157
157
|
const parsePrompt = (prompt)=>{
|
|
158
158
|
if ('string' == typeof prompt) return {
|
|
159
159
|
textPrompt: prompt,
|
|
package/dist/lib/ai-model/llm-planning.js
CHANGED
|
@@ -53,10 +53,6 @@ function parseXMLPlanningResponse(xmlString, modelFamily) {
|
|
|
53
53
|
finalizeSuccess = 'true' === completeGoalMatch[1];
|
|
54
54
|
finalizeMessage = completeGoalMatch[2]?.trim() || void 0;
|
|
55
55
|
}
|
|
56
|
-
const updatePlanContent = (0, util_js_namespaceObject.extractXMLTag)(xmlString, 'update-plan-content');
|
|
57
|
-
const markSubGoalDone = (0, util_js_namespaceObject.extractXMLTag)(xmlString, 'mark-sub-goal-done');
|
|
58
|
-
const updateSubGoals = updatePlanContent ? (0, util_js_namespaceObject.parseSubGoalsFromXML)(updatePlanContent) : void 0;
|
|
59
|
-
const markFinishedIndexes = markSubGoalDone ? (0, util_js_namespaceObject.parseMarkFinishedIndexes)(markSubGoalDone) : void 0;
|
|
60
56
|
let action = null;
|
|
61
57
|
if (actionType && 'null' !== actionType.toLowerCase()) {
|
|
62
58
|
const type = actionType.trim();
|
|
@@ -90,12 +86,6 @@ function parseXMLPlanningResponse(xmlString, modelFamily) {
|
|
|
90
86
|
} : {},
|
|
91
87
|
...void 0 !== finalizeSuccess ? {
|
|
92
88
|
finalizeSuccess
|
|
93
|
-
} : {},
|
|
94
|
-
...updateSubGoals?.length ? {
|
|
95
|
-
updateSubGoals
|
|
96
|
-
} : {},
|
|
97
|
-
...markFinishedIndexes?.length ? {
|
|
98
|
-
markFinishedIndexes
|
|
99
89
|
} : {}
|
|
100
90
|
};
|
|
101
91
|
}
|
|
@@ -104,13 +94,12 @@ async function plan(userInstruction, opts) {
|
|
|
104
94
|
const { size } = context;
|
|
105
95
|
const screenshotBase64 = context.screenshot.base64;
|
|
106
96
|
const { modelFamily } = modelConfig;
|
|
107
|
-
const includeSubGoals = true === opts.deepThink;
|
|
108
97
|
const systemPrompt = await (0, llm_planning_js_namespaceObject.systemPromptToTaskPlanning)({
|
|
109
98
|
actionSpace: opts.actionSpace,
|
|
110
99
|
modelFamily,
|
|
111
100
|
includeBbox: opts.includeBbox,
|
|
112
101
|
includeThought: true,
|
|
113
|
-
includeSubGoals
|
|
102
|
+
deepThink: true === opts.deepThink
|
|
114
103
|
});
|
|
115
104
|
let imagePayload = screenshotBase64;
|
|
116
105
|
let imageWidth = size.width;
|
|
@@ -134,8 +123,8 @@ async function plan(userInstruction, opts) {
|
|
|
134
123
|
}
|
|
135
124
|
];
|
|
136
125
|
let latestFeedbackMessage;
|
|
137
|
-
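const subGoalsText = includeSubGoals ? conversationHistory.subGoalsToText() : conversationHistory.historicalLogsToText();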
|
|
138
|
-
const subGoalsSection = subGoalsText ? `\n\n${subGoalsText}` : '';
|
|
126
|
+
const historicalLogsText = conversationHistory.historicalLogsToText();
|
|
127
|
+
const historicalLogsSection = historicalLogsText ? `\n\n${historicalLogsText}` : '';
|
|
139
128
|
const memoriesText = conversationHistory.memoriesToText();
|
|
140
129
|
const memoriesSection = memoriesText ? `\n\n${memoriesText}` : '';
|
|
141
130
|
if (conversationHistory.pendingFeedbackMessage) {
|
|
@@ -144,7 +133,7 @@ async function plan(userInstruction, opts) {
|
|
|
144
133
|
content: [
|
|
145
134
|
{
|
|
146
135
|
type: 'text',
|
|
147
|
-
text: `${conversationHistory.pendingFeedbackMessage}. The previous action has been executed, here is the latest screenshot. Please continue according to the instruction.${memoriesSection}${subGoalsSection}`
|
|
136
|
+
text: `${conversationHistory.pendingFeedbackMessage}. The previous action has been executed, here is the latest screenshot. Please continue according to the instruction.${memoriesSection}${historicalLogsSection}`
|
|
148
137
|
},
|
|
149
138
|
{
|
|
150
139
|
type: 'image_url',
|
|
@@ -161,7 +150,7 @@ async function plan(userInstruction, opts) {
|
|
|
161
150
|
content: [
|
|
162
151
|
{
|
|
163
152
|
type: 'text',
|
|
164
|
-
text: `this is the latest screenshot${memoriesSection}${subGoalsSection}`
|
|
153
|
+
text: `this is the latest screenshot${memoriesSection}${historicalLogsSection}`
|
|
165
154
|
},
|
|
166
155
|
{
|
|
167
156
|
type: 'image_url',
|
|
@@ -201,7 +190,6 @@ async function plan(userInstruction, opts) {
|
|
|
201
190
|
if (void 0 !== planFromAI.finalizeSuccess) {
|
|
202
191
|
debug('task completed via <complete> tag, stop planning');
|
|
203
192
|
shouldContinuePlanning = false;
|
|
204
|
-
if (includeSubGoals) conversationHistory.markAllSubGoalsFinished();
|
|
205
193
|
}
|
|
206
194
|
const returnValue = {
|
|
207
195
|
...planFromAI,
|
|
@@ -224,11 +212,7 @@ async function plan(userInstruction, opts) {
|
|
|
224
212
|
if (locateResult && void 0 !== modelFamily) action.param[field] = (0, external_common_js_namespaceObject.fillBboxParam)(locateResult, imageWidth, imageHeight, modelFamily);
|
|
225
213
|
});
|
|
226
214
|
});
|
|
227
|
-
if (includeSubGoals) {
|
|
228
|
-
if (planFromAI.updateSubGoals?.length) conversationHistory.setSubGoals(planFromAI.updateSubGoals);
|
|
229
|
-
if (planFromAI.markFinishedIndexes?.length) for (const index of planFromAI.markFinishedIndexes)conversationHistory.markSubGoalFinished(index);
|
|
230
|
-
if (planFromAI.log) conversationHistory.appendSubGoalLog(planFromAI.log);
|
|
231
|
-
} else if (planFromAI.log) conversationHistory.appendHistoricalLog(planFromAI.log);
|
|
215
|
+
if (planFromAI.log) conversationHistory.appendHistoricalLog(planFromAI.log);
|
|
232
216
|
if (planFromAI.memory) conversationHistory.appendMemory(planFromAI.memory);
|
|
233
217
|
conversationHistory.append({
|
|
234
218
|
role: 'assistant',
|
|
package/dist/lib/ai-model/llm-planning.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"ai-model/llm-planning.js","sources":["webpack/runtime/define_property_getters","webpack/runtime/has_own_property","webpack/runtime/make_namespace_object","../../../src/ai-model/llm-planning.ts"],"sourcesContent":["__webpack_require__.d = (exports, definition) => {\n\tfor(var key in definition) {\n if(__webpack_require__.o(definition, key) && !__webpack_require__.o(exports, key)) {\n Object.defineProperty(exports, key, { enumerable: true, get: definition[key] });\n }\n }\n};","__webpack_require__.o = (obj, prop) => (Object.prototype.hasOwnProperty.call(obj, prop))","// define __esModule on exports\n__webpack_require__.r = (exports) => {\n\tif(typeof Symbol !== 'undefined' && Symbol.toStringTag) {\n\t\tObject.defineProperty(exports, Symbol.toStringTag, { value: 'Module' });\n\t}\n\tObject.defineProperty(exports, '__esModule', { value: true });\n};","import type {\n DeepThinkOption,\n DeviceAction,\n InterfaceType,\n PlanningAIResponse,\n RawResponsePlanningAIResponse,\n UIContext,\n} from '@/types';\nimport type { IModelConfig, TModelFamily } from '@midscene/shared/env';\nimport { paddingToMatchBlockByBase64 } from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { assert } from '@midscene/shared/utils';\nimport type { ChatCompletionMessageParam } from 'openai/resources/index';\nimport {\n buildYamlFlowFromPlans,\n fillBboxParam,\n findAllMidsceneLocatorField,\n} from '../common';\nimport type { ConversationHistory } from './conversation-history';\nimport { systemPromptToTaskPlanning } from './prompt/llm-planning';\nimport {\n extractXMLTag,\n parseMarkFinishedIndexes,\n parseSubGoalsFromXML,\n} from './prompt/util';\nimport {\n AIResponseParseError,\n callAI,\n safeParseJson,\n} from './service-caller/index';\n\nconst debug = getDebug('planning');\nconst warnLog = getDebug('planning', { console: true });\n\n/**\n * Parse XML response from LLM and convert to RawResponsePlanningAIResponse\n */\nexport function 
parseXMLPlanningResponse(\n xmlString: string,\n modelFamily: TModelFamily | undefined,\n): RawResponsePlanningAIResponse {\n const thought = extractXMLTag(xmlString, 'thought');\n const memory = extractXMLTag(xmlString, 'memory');\n const log = extractXMLTag(xmlString, 'log') || '';\n const error = extractXMLTag(xmlString, 'error');\n const actionType = extractXMLTag(xmlString, 'action-type');\n const actionParamStr = extractXMLTag(xmlString, 'action-param-json');\n\n // Parse <complete> tag with success attribute\n const completeGoalRegex =\n /<complete\\s+success=\"(true|false)\">([\\s\\S]*?)<\\/complete>/i;\n const completeGoalMatch = xmlString.match(completeGoalRegex);\n let finalizeMessage: string | undefined;\n let finalizeSuccess: boolean | undefined;\n\n if (completeGoalMatch) {\n finalizeSuccess = completeGoalMatch[1] === 'true';\n finalizeMessage = completeGoalMatch[2]?.trim() || undefined;\n }\n\n // Parse sub-goal related tags\n const updatePlanContent = extractXMLTag(xmlString, 'update-plan-content');\n const markSubGoalDone = extractXMLTag(xmlString, 'mark-sub-goal-done');\n\n const updateSubGoals = updatePlanContent\n ? parseSubGoalsFromXML(updatePlanContent)\n : undefined;\n const markFinishedIndexes = markSubGoalDone\n ? parseMarkFinishedIndexes(markSubGoalDone)\n : undefined;\n\n // Parse action\n let action: any = null;\n if (actionType && actionType.toLowerCase() !== 'null') {\n const type = actionType.trim();\n let param: any = undefined;\n\n if (actionParamStr) {\n try {\n // Parse the JSON string in action-param-json\n param = safeParseJson(actionParamStr, modelFamily);\n } catch (e) {\n throw new Error(`Failed to parse action-param-json: ${e}`);\n }\n }\n\n action = {\n type,\n ...(param !== undefined ? { param } : {}),\n };\n }\n\n return {\n ...(thought ? { thought } : {}),\n ...(memory ? { memory } : {}),\n log,\n ...(error ? { error } : {}),\n action,\n ...(finalizeMessage !== undefined ? 
{ finalizeMessage } : {}),\n ...(finalizeSuccess !== undefined ? { finalizeSuccess } : {}),\n ...(updateSubGoals?.length ? { updateSubGoals } : {}),\n ...(markFinishedIndexes?.length ? { markFinishedIndexes } : {}),\n };\n}\n\nexport async function plan(\n userInstruction: string,\n opts: {\n context: UIContext;\n interfaceType: InterfaceType;\n actionSpace: DeviceAction<any>[];\n actionContext?: string;\n modelConfig: IModelConfig;\n conversationHistory: ConversationHistory;\n includeBbox: boolean;\n imagesIncludeCount?: number;\n deepThink?: DeepThinkOption;\n },\n): Promise<PlanningAIResponse> {\n const { context, modelConfig, conversationHistory } = opts;\n const { size } = context;\n const screenshotBase64 = context.screenshot.base64;\n\n const { modelFamily } = modelConfig;\n\n // Only enable sub-goals when deepThink is true\n const includeSubGoals = opts.deepThink === true;\n\n const systemPrompt = await systemPromptToTaskPlanning({\n actionSpace: opts.actionSpace,\n modelFamily,\n includeBbox: opts.includeBbox,\n includeThought: true, // always include thought\n includeSubGoals,\n });\n\n let imagePayload = screenshotBase64;\n let imageWidth = size.width;\n let imageHeight = size.height;\n const rightLimit = imageWidth;\n const bottomLimit = imageHeight;\n\n // Process image based on VL mode requirements\n if (modelFamily === 'qwen2.5-vl') {\n const paddedResult = await paddingToMatchBlockByBase64(imagePayload);\n imageWidth = paddedResult.width;\n imageHeight = paddedResult.height;\n imagePayload = paddedResult.imageBase64;\n }\n\n const actionContext = opts.actionContext\n ? 
`<high_priority_knowledge>${opts.actionContext}</high_priority_knowledge>\\n`\n : '';\n\n const instruction: ChatCompletionMessageParam[] = [\n {\n role: 'user',\n content: [\n {\n type: 'text',\n text: `${actionContext}<user_instruction>${userInstruction}</user_instruction>`,\n },\n ],\n },\n ];\n\n let latestFeedbackMessage: ChatCompletionMessageParam;\n\n // Build sub-goal status text to include in the message\n // In deepThink mode: show full sub-goals with logs\n // In non-deepThink mode: show historical execution logs\n const subGoalsText = includeSubGoals\n ? conversationHistory.subGoalsToText()\n : conversationHistory.historicalLogsToText();\n const subGoalsSection = subGoalsText ? `\\n\\n${subGoalsText}` : '';\n\n // Build memories text to include in the message\n const memoriesText = conversationHistory.memoriesToText();\n const memoriesSection = memoriesText ? `\\n\\n${memoriesText}` : '';\n\n if (conversationHistory.pendingFeedbackMessage) {\n latestFeedbackMessage = {\n role: 'user',\n content: [\n {\n type: 'text',\n text: `${conversationHistory.pendingFeedbackMessage}. The previous action has been executed, here is the latest screenshot. 
Please continue according to the instruction.${memoriesSection}${subGoalsSection}`,\n },\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n ],\n };\n\n conversationHistory.resetPendingFeedbackMessageIfExists();\n } else {\n latestFeedbackMessage = {\n role: 'user',\n content: [\n {\n type: 'text',\n text: `this is the latest screenshot${memoriesSection}${subGoalsSection}`,\n },\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n ],\n };\n }\n conversationHistory.append(latestFeedbackMessage);\n\n // Compress history if it exceeds the threshold to avoid context overflow\n conversationHistory.compressHistory(50, 20);\n\n const historyLog = conversationHistory.snapshot(opts.imagesIncludeCount);\n\n const msgs: ChatCompletionMessageParam[] = [\n { role: 'system', content: systemPrompt },\n ...instruction,\n ...historyLog,\n ];\n\n const {\n content: rawResponse,\n usage,\n reasoning_content,\n } = await callAI(msgs, modelConfig, {\n deepThink: opts.deepThink === 'unset' ? undefined : opts.deepThink,\n });\n\n // Parse XML response to JSON object, capture parsing errors\n let planFromAI: RawResponsePlanningAIResponse;\n try {\n planFromAI = parseXMLPlanningResponse(rawResponse, modelFamily);\n\n if (planFromAI.action && planFromAI.finalizeSuccess !== undefined) {\n warnLog(\n 'Planning response included both an action and <complete>; ignoring <complete> output.',\n );\n planFromAI.finalizeMessage = undefined;\n planFromAI.finalizeSuccess = undefined;\n }\n\n const actions = planFromAI.action ? 
[planFromAI.action] : [];\n let shouldContinuePlanning = true;\n\n // Check if task is completed via <complete> tag\n if (planFromAI.finalizeSuccess !== undefined) {\n debug('task completed via <complete> tag, stop planning');\n shouldContinuePlanning = false;\n // Mark all sub-goals as finished when goal is completed (only when deepThink is enabled)\n if (includeSubGoals) {\n conversationHistory.markAllSubGoalsFinished();\n }\n }\n\n const returnValue: PlanningAIResponse = {\n ...planFromAI,\n actions,\n rawResponse,\n usage,\n reasoning_content,\n yamlFlow: buildYamlFlowFromPlans(actions, opts.actionSpace),\n shouldContinuePlanning,\n };\n\n assert(planFromAI, \"can't get plans from AI\");\n\n actions.forEach((action) => {\n const type = action.type;\n const actionInActionSpace = opts.actionSpace.find(\n (action) => action.name === type,\n );\n\n debug('actionInActionSpace matched', actionInActionSpace);\n const locateFields = actionInActionSpace\n ? findAllMidsceneLocatorField(actionInActionSpace.paramSchema)\n : [];\n\n debug('locateFields', locateFields);\n\n locateFields.forEach((field) => {\n const locateResult = action.param[field];\n if (locateResult && modelFamily !== undefined) {\n // Always use model family to fill bbox parameters\n action.param[field] = fillBboxParam(\n locateResult,\n imageWidth,\n imageHeight,\n modelFamily,\n );\n }\n });\n });\n\n // Update sub-goals in conversation history based on response (only when deepThink is enabled)\n if (includeSubGoals) {\n if (planFromAI.updateSubGoals?.length) {\n conversationHistory.setSubGoals(planFromAI.updateSubGoals);\n }\n if (planFromAI.markFinishedIndexes?.length) {\n for (const index of planFromAI.markFinishedIndexes) {\n conversationHistory.markSubGoalFinished(index);\n }\n }\n // Append the planning log to the currently running sub-goal\n if (planFromAI.log) {\n conversationHistory.appendSubGoalLog(planFromAI.log);\n }\n } else {\n // In non-deepThink mode, accumulate logs as historical 
execution steps\n if (planFromAI.log) {\n conversationHistory.appendHistoricalLog(planFromAI.log);\n }\n }\n\n // Append memory to conversation history if present\n if (planFromAI.memory) {\n conversationHistory.appendMemory(planFromAI.memory);\n }\n\n conversationHistory.append({\n role: 'assistant',\n content: [\n {\n type: 'text',\n text: rawResponse,\n },\n ],\n });\n\n return returnValue;\n } catch (parseError) {\n // Throw AIResponseParseError with usage and rawResponse preserved\n const errorMessage =\n parseError instanceof Error ? parseError.message : String(parseError);\n throw new AIResponseParseError(\n `XML parse error: ${errorMessage}`,\n rawResponse,\n usage,\n );\n }\n}\n"],"names":["__webpack_require__","definition","key","Object","obj","prop","Symbol","debug","getDebug","warnLog","parseXMLPlanningResponse","xmlString","modelFamily","thought","extractXMLTag","memory","log","error","actionType","actionParamStr","completeGoalRegex","completeGoalMatch","finalizeMessage","finalizeSuccess","undefined","updatePlanContent","markSubGoalDone","updateSubGoals","parseSubGoalsFromXML","markFinishedIndexes","parseMarkFinishedIndexes","action","type","param","safeParseJson","e","Error","plan","userInstruction","opts","context","modelConfig","conversationHistory","size","screenshotBase64","includeSubGoals","systemPrompt","systemPromptToTaskPlanning","imagePayload","imageWidth","imageHeight","paddedResult","paddingToMatchBlockByBase64","actionContext","instruction","latestFeedbackMessage","subGoalsText","subGoalsSection","memoriesText","memoriesSection","historyLog","msgs","rawResponse","usage","reasoning_content","callAI","planFromAI","actions","shouldContinuePlanning","returnValue","buildYamlFlowFromPlans","assert","actionInActionSpace","locateFields","findAllMidsceneLocatorField","field","locateResult","fillBboxParam","index","parseError","errorMessage","String","AIResponseParseError"],"mappings":";;;IAAAA,oBAAoB,CAAC,GAAG,CAAC,UAASC;QACjC,IAAI,IAAIC,OAAOD,WACR,
IAAGD,oBAAoB,CAAC,CAACC,YAAYC,QAAQ,CAACF,oBAAoB,CAAC,CAAC,UAASE,MACzEC,OAAO,cAAc,CAAC,UAASD,KAAK;YAAE,YAAY;YAAM,KAAKD,UAAU,CAACC,IAAI;QAAC;IAGzF;;;ICNAF,oBAAoB,CAAC,GAAG,CAACI,KAAKC,OAAUF,OAAO,SAAS,CAAC,cAAc,CAAC,IAAI,CAACC,KAAKC;;;ICClFL,oBAAoB,CAAC,GAAG,CAAC;QACxB,IAAG,AAAkB,eAAlB,OAAOM,UAA0BA,OAAO,WAAW,EACrDH,OAAO,cAAc,CAAC,UAASG,OAAO,WAAW,EAAE;YAAE,OAAO;QAAS;QAEtEH,OAAO,cAAc,CAAC,UAAS,cAAc;YAAE,OAAO;QAAK;IAC5D;;;;;;;;;;;;;;;ACyBA,MAAMI,QAAQC,AAAAA,IAAAA,uBAAAA,QAAAA,AAAAA,EAAS;AACvB,MAAMC,UAAUD,AAAAA,IAAAA,uBAAAA,QAAAA,AAAAA,EAAS,YAAY;IAAE,SAAS;AAAK;AAK9C,SAASE,yBACdC,SAAiB,EACjBC,WAAqC;IAErC,MAAMC,UAAUC,AAAAA,IAAAA,wBAAAA,aAAAA,AAAAA,EAAcH,WAAW;IACzC,MAAMI,SAASD,AAAAA,IAAAA,wBAAAA,aAAAA,AAAAA,EAAcH,WAAW;IACxC,MAAMK,MAAMF,AAAAA,IAAAA,wBAAAA,aAAAA,AAAAA,EAAcH,WAAW,UAAU;IAC/C,MAAMM,QAAQH,AAAAA,IAAAA,wBAAAA,aAAAA,AAAAA,EAAcH,WAAW;IACvC,MAAMO,aAAaJ,AAAAA,IAAAA,wBAAAA,aAAAA,AAAAA,EAAcH,WAAW;IAC5C,MAAMQ,iBAAiBL,AAAAA,IAAAA,wBAAAA,aAAAA,AAAAA,EAAcH,WAAW;IAGhD,MAAMS,oBACJ;IACF,MAAMC,oBAAoBV,UAAU,KAAK,CAACS;IAC1C,IAAIE;IACJ,IAAIC;IAEJ,IAAIF,mBAAmB;QACrBE,kBAAkBF,AAAyB,WAAzBA,iBAAiB,CAAC,EAAE;QACtCC,kBAAkBD,iBAAiB,CAAC,EAAE,EAAE,UAAUG;IACpD;IAGA,MAAMC,oBAAoBX,AAAAA,IAAAA,wBAAAA,aAAAA,AAAAA,EAAcH,WAAW;IACnD,MAAMe,kBAAkBZ,AAAAA,IAAAA,wBAAAA,aAAAA,AAAAA,EAAcH,WAAW;IAEjD,MAAMgB,iBAAiBF,oBACnBG,AAAAA,IAAAA,wBAAAA,oBAAAA,AAAAA,EAAqBH,qBACrBD;IACJ,MAAMK,sBAAsBH,kBACxBI,AAAAA,IAAAA,wBAAAA,wBAAAA,AAAAA,EAAyBJ,mBACzBF;IAGJ,IAAIO,SAAc;IAClB,IAAIb,cAAcA,AAA6B,WAA7BA,WAAW,WAAW,IAAe;QACrD,MAAMc,OAAOd,WAAW,IAAI;QAC5B,IAAIe;QAEJ,IAAId,gBACF,IAAI;YAEFc,QAAQC,AAAAA,IAAAA,yBAAAA,aAAAA,AAAAA,EAAcf,gBAAgBP;QACxC,EAAE,OAAOuB,GAAG;YACV,MAAM,IAAIC,MAAM,CAAC,mCAAmC,EAAED,GAAG;QAC3D;QAGFJ,SAAS;YACPC;YACA,GAAIC,AAAUT,WAAVS,QAAsB;gBAAEA;YAAM,IAAI,CAAC,CAAC;QAC1C;IACF;IAEA,OAAO;QACL,GAAIpB,UAAU;YAAEA;QAAQ,IAAI,CAAC,CAAC;QAC9B,GAAIE,SAAS;YAAEA;QAAO,IAAI,CAAC,CAAC;QAC5BC;QACA,GAAIC,QAAQ;YAAEA;QAAM,IAAI,CAAC,CAAC;QAC1Bc;QACA,GAAIT,AAAoBE,WAApBF,kBAAgC;YAAEA;QAAgB,IAAI,CAAC,CAAC;QAC5D,GAAIC,AAAoBC,WAApBD
,kBAAgC;YAAEA;QAAgB,IAAI,CAAC,CAAC;QAC5D,GAAII,gBAAgB,SAAS;YAAEA;QAAe,IAAI,CAAC,CAAC;QACpD,GAAIE,qBAAqB,SAAS;YAAEA;QAAoB,IAAI,CAAC,CAAC;IAChE;AACF;AAEO,eAAeQ,KACpBC,eAAuB,EACvBC,IAUC;IAED,MAAM,EAAEC,OAAO,EAAEC,WAAW,EAAEC,mBAAmB,EAAE,GAAGH;IACtD,MAAM,EAAEI,IAAI,EAAE,GAAGH;IACjB,MAAMI,mBAAmBJ,QAAQ,UAAU,CAAC,MAAM;IAElD,MAAM,EAAE5B,WAAW,EAAE,GAAG6B;IAGxB,MAAMI,kBAAkBN,AAAmB,SAAnBA,KAAK,SAAS;IAEtC,MAAMO,eAAe,MAAMC,AAAAA,IAAAA,gCAAAA,0BAAAA,AAAAA,EAA2B;QACpD,aAAaR,KAAK,WAAW;QAC7B3B;QACA,aAAa2B,KAAK,WAAW;QAC7B,gBAAgB;QAChBM;IACF;IAEA,IAAIG,eAAeJ;IACnB,IAAIK,aAAaN,KAAK,KAAK;IAC3B,IAAIO,cAAcP,KAAK,MAAM;IAK7B,IAAI/B,AAAgB,iBAAhBA,aAA8B;QAChC,MAAMuC,eAAe,MAAMC,AAAAA,IAAAA,oBAAAA,2BAAAA,AAAAA,EAA4BJ;QACvDC,aAAaE,aAAa,KAAK;QAC/BD,cAAcC,aAAa,MAAM;QACjCH,eAAeG,aAAa,WAAW;IACzC;IAEA,MAAME,gBAAgBd,KAAK,aAAa,GACpC,CAAC,yBAAyB,EAAEA,KAAK,aAAa,CAAC,4BAA4B,CAAC,GAC5E;IAEJ,MAAMe,cAA4C;QAChD;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM,GAAGD,cAAc,kBAAkB,EAAEf,gBAAgB,mBAAmB,CAAC;gBACjF;aACD;QACH;KACD;IAED,IAAIiB;IAKJ,MAAMC,eAAeX,kBACjBH,oBAAoB,cAAc,KAClCA,oBAAoB,oBAAoB;IAC5C,MAAMe,kBAAkBD,eAAe,CAAC,IAAI,EAAEA,cAAc,GAAG;IAG/D,MAAME,eAAehB,oBAAoB,cAAc;IACvD,MAAMiB,kBAAkBD,eAAe,CAAC,IAAI,EAAEA,cAAc,GAAG;IAE/D,IAAIhB,oBAAoB,sBAAsB,EAAE;QAC9Ca,wBAAwB;YACtB,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM,GAAGb,oBAAoB,sBAAsB,CAAC,qHAAqH,EAAEiB,kBAAkBF,iBAAiB;gBAChN;gBACA;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKT;wBACL,QAAQ;oBACV;gBACF;aACD;QACH;QAEAN,oBAAoB,mCAAmC;IACzD,OACEa,wBAAwB;QACtB,MAAM;QACN,SAAS;YACP;gBACE,MAAM;gBACN,MAAM,CAAC,6BAA6B,EAAEI,kBAAkBF,iBAAiB;YAC3E;YACA;gBACE,MAAM;gBACN,WAAW;oBACT,KAAKT;oBACL,QAAQ;gBACV;YACF;SACD;IACH;IAEFN,oBAAoB,MAAM,CAACa;IAG3Bb,oBAAoB,eAAe,CAAC,IAAI;IAExC,MAAMkB,aAAalB,oBAAoB,QAAQ,CAACH,KAAK,kBAAkB;IAEvE,MAAMsB,OAAqC;QACzC;YAAE,MAAM;YAAU,SAASf;QAAa;WACrCQ;WACAM;KACJ;IAED,MAAM,EACJ,SAASE,WAAW,EACpBC,KAAK,EACLC,iBAAiB,EAClB,GAAG,MAAMC,AAAAA,IAAAA,yBAAAA,MAAAA,AAAAA,EAAOJ,MAAMpB,aAAa;QAClC,WAAWF,AAAmB,YAAnBA,KAAK,SAAS,GAAef,SAAYe,KAAK,SAAS;IACpE;IAGA,IAAI2B;IACJ,IAAI;QA
CFA,aAAaxD,yBAAyBoD,aAAalD;QAEnD,IAAIsD,WAAW,MAAM,IAAIA,AAA+B1C,WAA/B0C,WAAW,eAAe,EAAgB;YACjEzD,QACE;YAEFyD,WAAW,eAAe,GAAG1C;YAC7B0C,WAAW,eAAe,GAAG1C;QAC/B;QAEA,MAAM2C,UAAUD,WAAW,MAAM,GAAG;YAACA,WAAW,MAAM;SAAC,GAAG,EAAE;QAC5D,IAAIE,yBAAyB;QAG7B,IAAIF,AAA+B1C,WAA/B0C,WAAW,eAAe,EAAgB;YAC5C3D,MAAM;YACN6D,yBAAyB;YAEzB,IAAIvB,iBACFH,oBAAoB,uBAAuB;QAE/C;QAEA,MAAM2B,cAAkC;YACtC,GAAGH,UAAU;YACbC;YACAL;YACAC;YACAC;YACA,UAAUM,AAAAA,IAAAA,mCAAAA,sBAAAA,AAAAA,EAAuBH,SAAS5B,KAAK,WAAW;YAC1D6B;QACF;QAEAG,IAAAA,sBAAAA,MAAAA,AAAAA,EAAOL,YAAY;QAEnBC,QAAQ,OAAO,CAAC,CAACpC;YACf,MAAMC,OAAOD,OAAO,IAAI;YACxB,MAAMyC,sBAAsBjC,KAAK,WAAW,CAAC,IAAI,CAC/C,CAACR,SAAWA,OAAO,IAAI,KAAKC;YAG9BzB,MAAM,+BAA+BiE;YACrC,MAAMC,eAAeD,sBACjBE,AAAAA,IAAAA,mCAAAA,2BAAAA,AAAAA,EAA4BF,oBAAoB,WAAW,IAC3D,EAAE;YAENjE,MAAM,gBAAgBkE;YAEtBA,aAAa,OAAO,CAAC,CAACE;gBACpB,MAAMC,eAAe7C,OAAO,KAAK,CAAC4C,MAAM;gBACxC,IAAIC,gBAAgBhE,AAAgBY,WAAhBZ,aAElBmB,OAAO,KAAK,CAAC4C,MAAM,GAAGE,AAAAA,IAAAA,mCAAAA,aAAAA,AAAAA,EACpBD,cACA3B,YACAC,aACAtC;YAGN;QACF;QAGA,IAAIiC,iBAAiB;YACnB,IAAIqB,WAAW,cAAc,EAAE,QAC7BxB,oBAAoB,WAAW,CAACwB,WAAW,cAAc;YAE3D,IAAIA,WAAW,mBAAmB,EAAE,QAClC,KAAK,MAAMY,SAASZ,WAAW,mBAAmB,CAChDxB,oBAAoB,mBAAmB,CAACoC;YAI5C,IAAIZ,WAAW,GAAG,EAChBxB,oBAAoB,gBAAgB,CAACwB,WAAW,GAAG;QAEvD,OAEE,IAAIA,WAAW,GAAG,EAChBxB,oBAAoB,mBAAmB,CAACwB,WAAW,GAAG;QAK1D,IAAIA,WAAW,MAAM,EACnBxB,oBAAoB,YAAY,CAACwB,WAAW,MAAM;QAGpDxB,oBAAoB,MAAM,CAAC;YACzB,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAMoB;gBACR;aACD;QACH;QAEA,OAAOO;IACT,EAAE,OAAOU,YAAY;QAEnB,MAAMC,eACJD,sBAAsB3C,QAAQ2C,WAAW,OAAO,GAAGE,OAAOF;QAC5D,MAAM,IAAIG,yBAAAA,oBAAoBA,CAC5B,CAAC,iBAAiB,EAAEF,cAAc,EAClClB,aACAC;IAEJ;AACF"}
|
|
1
|
+
{"version":3,"file":"ai-model/llm-planning.js","sources":["webpack/runtime/define_property_getters","webpack/runtime/has_own_property","webpack/runtime/make_namespace_object","../../../src/ai-model/llm-planning.ts"],"sourcesContent":["__webpack_require__.d = (exports, definition) => {\n\tfor(var key in definition) {\n if(__webpack_require__.o(definition, key) && !__webpack_require__.o(exports, key)) {\n Object.defineProperty(exports, key, { enumerable: true, get: definition[key] });\n }\n }\n};","__webpack_require__.o = (obj, prop) => (Object.prototype.hasOwnProperty.call(obj, prop))","// define __esModule on exports\n__webpack_require__.r = (exports) => {\n\tif(typeof Symbol !== 'undefined' && Symbol.toStringTag) {\n\t\tObject.defineProperty(exports, Symbol.toStringTag, { value: 'Module' });\n\t}\n\tObject.defineProperty(exports, '__esModule', { value: true });\n};","import type {\n DeepThinkOption,\n DeviceAction,\n InterfaceType,\n PlanningAIResponse,\n RawResponsePlanningAIResponse,\n UIContext,\n} from '@/types';\nimport type { IModelConfig, TModelFamily } from '@midscene/shared/env';\nimport { paddingToMatchBlockByBase64 } from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { assert } from '@midscene/shared/utils';\nimport type { ChatCompletionMessageParam } from 'openai/resources/index';\nimport {\n buildYamlFlowFromPlans,\n fillBboxParam,\n findAllMidsceneLocatorField,\n} from '../common';\nimport type { ConversationHistory } from './conversation-history';\nimport { systemPromptToTaskPlanning } from './prompt/llm-planning';\nimport { extractXMLTag } from './prompt/util';\nimport {\n AIResponseParseError,\n callAI,\n safeParseJson,\n} from './service-caller/index';\n\nconst debug = getDebug('planning');\nconst warnLog = getDebug('planning', { console: true });\n\n/**\n * Parse XML response from LLM and convert to RawResponsePlanningAIResponse\n */\nexport function parseXMLPlanningResponse(\n xmlString: string,\n 
modelFamily: TModelFamily | undefined,\n): RawResponsePlanningAIResponse {\n const thought = extractXMLTag(xmlString, 'thought');\n const memory = extractXMLTag(xmlString, 'memory');\n const log = extractXMLTag(xmlString, 'log') || '';\n const error = extractXMLTag(xmlString, 'error');\n const actionType = extractXMLTag(xmlString, 'action-type');\n const actionParamStr = extractXMLTag(xmlString, 'action-param-json');\n\n // Parse <complete> tag with success attribute\n const completeGoalRegex =\n /<complete\\s+success=\"(true|false)\">([\\s\\S]*?)<\\/complete>/i;\n const completeGoalMatch = xmlString.match(completeGoalRegex);\n let finalizeMessage: string | undefined;\n let finalizeSuccess: boolean | undefined;\n\n if (completeGoalMatch) {\n finalizeSuccess = completeGoalMatch[1] === 'true';\n finalizeMessage = completeGoalMatch[2]?.trim() || undefined;\n }\n\n // Parse action\n let action: any = null;\n if (actionType && actionType.toLowerCase() !== 'null') {\n const type = actionType.trim();\n let param: any = undefined;\n\n if (actionParamStr) {\n try {\n // Parse the JSON string in action-param-json\n param = safeParseJson(actionParamStr, modelFamily);\n } catch (e) {\n throw new Error(`Failed to parse action-param-json: ${e}`);\n }\n }\n\n action = {\n type,\n ...(param !== undefined ? { param } : {}),\n };\n }\n\n return {\n ...(thought ? { thought } : {}),\n ...(memory ? { memory } : {}),\n log,\n ...(error ? { error } : {}),\n action,\n ...(finalizeMessage !== undefined ? { finalizeMessage } : {}),\n ...(finalizeSuccess !== undefined ? 
{ finalizeSuccess } : {}),\n };\n}\n\nexport async function plan(\n userInstruction: string,\n opts: {\n context: UIContext;\n interfaceType: InterfaceType;\n actionSpace: DeviceAction<any>[];\n actionContext?: string;\n modelConfig: IModelConfig;\n conversationHistory: ConversationHistory;\n includeBbox: boolean;\n imagesIncludeCount?: number;\n deepThink?: DeepThinkOption;\n },\n): Promise<PlanningAIResponse> {\n const { context, modelConfig, conversationHistory } = opts;\n const { size } = context;\n const screenshotBase64 = context.screenshot.base64;\n\n const { modelFamily } = modelConfig;\n\n const systemPrompt = await systemPromptToTaskPlanning({\n actionSpace: opts.actionSpace,\n modelFamily,\n includeBbox: opts.includeBbox,\n includeThought: true, // always include thought\n deepThink: opts.deepThink === true,\n });\n\n let imagePayload = screenshotBase64;\n let imageWidth = size.width;\n let imageHeight = size.height;\n const rightLimit = imageWidth;\n const bottomLimit = imageHeight;\n\n // Process image based on VL mode requirements\n if (modelFamily === 'qwen2.5-vl') {\n const paddedResult = await paddingToMatchBlockByBase64(imagePayload);\n imageWidth = paddedResult.width;\n imageHeight = paddedResult.height;\n imagePayload = paddedResult.imageBase64;\n }\n\n const actionContext = opts.actionContext\n ? `<high_priority_knowledge>${opts.actionContext}</high_priority_knowledge>\\n`\n : '';\n\n const instruction: ChatCompletionMessageParam[] = [\n {\n role: 'user',\n content: [\n {\n type: 'text',\n text: `${actionContext}<user_instruction>${userInstruction}</user_instruction>`,\n },\n ],\n },\n ];\n\n let latestFeedbackMessage: ChatCompletionMessageParam;\n\n // Build historical execution logs text to include in the message\n const historicalLogsText = conversationHistory.historicalLogsToText();\n const historicalLogsSection = historicalLogsText\n ? 
`\\n\\n${historicalLogsText}`\n : '';\n\n // Build memories text to include in the message\n const memoriesText = conversationHistory.memoriesToText();\n const memoriesSection = memoriesText ? `\\n\\n${memoriesText}` : '';\n\n if (conversationHistory.pendingFeedbackMessage) {\n latestFeedbackMessage = {\n role: 'user',\n content: [\n {\n type: 'text',\n text: `${conversationHistory.pendingFeedbackMessage}. The previous action has been executed, here is the latest screenshot. Please continue according to the instruction.${memoriesSection}${historicalLogsSection}`,\n },\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n ],\n };\n\n conversationHistory.resetPendingFeedbackMessageIfExists();\n } else {\n latestFeedbackMessage = {\n role: 'user',\n content: [\n {\n type: 'text',\n text: `this is the latest screenshot${memoriesSection}${historicalLogsSection}`,\n },\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n ],\n };\n }\n conversationHistory.append(latestFeedbackMessage);\n\n // Compress history if it exceeds the threshold to avoid context overflow\n conversationHistory.compressHistory(50, 20);\n\n const historyLog = conversationHistory.snapshot(opts.imagesIncludeCount);\n\n const msgs: ChatCompletionMessageParam[] = [\n { role: 'system', content: systemPrompt },\n ...instruction,\n ...historyLog,\n ];\n\n const {\n content: rawResponse,\n usage,\n reasoning_content,\n } = await callAI(msgs, modelConfig, {\n deepThink: opts.deepThink === 'unset' ? 
undefined : opts.deepThink,\n });\n\n // Parse XML response to JSON object, capture parsing errors\n let planFromAI: RawResponsePlanningAIResponse;\n try {\n planFromAI = parseXMLPlanningResponse(rawResponse, modelFamily);\n\n if (planFromAI.action && planFromAI.finalizeSuccess !== undefined) {\n warnLog(\n 'Planning response included both an action and <complete>; ignoring <complete> output.',\n );\n planFromAI.finalizeMessage = undefined;\n planFromAI.finalizeSuccess = undefined;\n }\n\n const actions = planFromAI.action ? [planFromAI.action] : [];\n let shouldContinuePlanning = true;\n\n // Check if task is completed via <complete> tag\n if (planFromAI.finalizeSuccess !== undefined) {\n debug('task completed via <complete> tag, stop planning');\n shouldContinuePlanning = false;\n }\n\n const returnValue: PlanningAIResponse = {\n ...planFromAI,\n actions,\n rawResponse,\n usage,\n reasoning_content,\n yamlFlow: buildYamlFlowFromPlans(actions, opts.actionSpace),\n shouldContinuePlanning,\n };\n\n assert(planFromAI, \"can't get plans from AI\");\n\n actions.forEach((action) => {\n const type = action.type;\n const actionInActionSpace = opts.actionSpace.find(\n (action) => action.name === type,\n );\n\n debug('actionInActionSpace matched', actionInActionSpace);\n const locateFields = actionInActionSpace\n ? 
findAllMidsceneLocatorField(actionInActionSpace.paramSchema)\n : [];\n\n debug('locateFields', locateFields);\n\n locateFields.forEach((field) => {\n const locateResult = action.param[field];\n if (locateResult && modelFamily !== undefined) {\n // Always use model family to fill bbox parameters\n action.param[field] = fillBboxParam(\n locateResult,\n imageWidth,\n imageHeight,\n modelFamily,\n );\n }\n });\n });\n\n // Accumulate logs as historical execution steps\n if (planFromAI.log) {\n conversationHistory.appendHistoricalLog(planFromAI.log);\n }\n\n // Append memory to conversation history if present\n if (planFromAI.memory) {\n conversationHistory.appendMemory(planFromAI.memory);\n }\n\n conversationHistory.append({\n role: 'assistant',\n content: [\n {\n type: 'text',\n text: rawResponse,\n },\n ],\n });\n\n return returnValue;\n } catch (parseError) {\n // Throw AIResponseParseError with usage and rawResponse preserved\n const errorMessage =\n parseError instanceof Error ? parseError.message : String(parseError);\n throw new AIResponseParseError(\n `XML parse error: ${errorMessage}`,\n rawResponse,\n usage,\n );\n 
}\n}\n"],"names":["__webpack_require__","definition","key","Object","obj","prop","Symbol","debug","getDebug","warnLog","parseXMLPlanningResponse","xmlString","modelFamily","thought","extractXMLTag","memory","log","error","actionType","actionParamStr","completeGoalRegex","completeGoalMatch","finalizeMessage","finalizeSuccess","undefined","action","type","param","safeParseJson","e","Error","plan","userInstruction","opts","context","modelConfig","conversationHistory","size","screenshotBase64","systemPrompt","systemPromptToTaskPlanning","imagePayload","imageWidth","imageHeight","paddedResult","paddingToMatchBlockByBase64","actionContext","instruction","latestFeedbackMessage","historicalLogsText","historicalLogsSection","memoriesText","memoriesSection","historyLog","msgs","rawResponse","usage","reasoning_content","callAI","planFromAI","actions","shouldContinuePlanning","returnValue","buildYamlFlowFromPlans","assert","actionInActionSpace","locateFields","findAllMidsceneLocatorField","field","locateResult","fillBboxParam","parseError","errorMessage","String","AIResponseParseError"],"mappings":";;;IAAAA,oBAAoB,CAAC,GAAG,CAAC,UAASC;QACjC,IAAI,IAAIC,OAAOD,WACR,IAAGD,oBAAoB,CAAC,CAACC,YAAYC,QAAQ,CAACF,oBAAoB,CAAC,CAAC,UAASE,MACzEC,OAAO,cAAc,CAAC,UAASD,KAAK;YAAE,YAAY;YAAM,KAAKD,UAAU,CAACC,IAAI;QAAC;IAGzF;;;ICNAF,oBAAoB,CAAC,GAAG,CAACI,KAAKC,OAAUF,OAAO,SAAS,CAAC,cAAc,CAAC,IAAI,CAACC,KAAKC;;;ICClFL,oBAAoB,CAAC,GAAG,CAAC;QACxB,IAAG,AAAkB,eAAlB,OAAOM,UAA0BA,OAAO,WAAW,EACrDH,OAAO,cAAc,CAAC,UAASG,OAAO,WAAW,EAAE;YAAE,OAAO;QAAS;QAEtEH,OAAO,cAAc,CAAC,UAAS,cAAc;YAAE,OAAO;QAAK;IAC5D;;;;;;;;;;;;;;;ACqBA,MAAMI,QAAQC,AAAAA,IAAAA,uBAAAA,QAAAA,AAAAA,EAAS;AACvB,MAAMC,UAAUD,AAAAA,IAAAA,uBAAAA,QAAAA,AAAAA,EAAS,YAAY;IAAE,SAAS;AAAK;AAK9C,SAASE,yBACdC,SAAiB,EACjBC,WAAqC;IAErC,MAAMC,UAAUC,AAAAA,IAAAA,wBAAAA,aAAAA,AAAAA,EAAcH,WAAW;IACzC,MAAMI,SAASD,AAAAA,IAAAA,wBAAAA,aAAAA,AAAAA,EAAcH,WAAW;IACxC,MAAMK,MAAMF,AAAAA,IAAAA,wBAAAA,aAAAA,AAAAA,EAAcH,WAAW,UAAU;IAC/C,MAAMM,QAAQH,AAAAA,IAAAA,wBAAAA,aAAAA,AAAAA
,EAAcH,WAAW;IACvC,MAAMO,aAAaJ,AAAAA,IAAAA,wBAAAA,aAAAA,AAAAA,EAAcH,WAAW;IAC5C,MAAMQ,iBAAiBL,AAAAA,IAAAA,wBAAAA,aAAAA,AAAAA,EAAcH,WAAW;IAGhD,MAAMS,oBACJ;IACF,MAAMC,oBAAoBV,UAAU,KAAK,CAACS;IAC1C,IAAIE;IACJ,IAAIC;IAEJ,IAAIF,mBAAmB;QACrBE,kBAAkBF,AAAyB,WAAzBA,iBAAiB,CAAC,EAAE;QACtCC,kBAAkBD,iBAAiB,CAAC,EAAE,EAAE,UAAUG;IACpD;IAGA,IAAIC,SAAc;IAClB,IAAIP,cAAcA,AAA6B,WAA7BA,WAAW,WAAW,IAAe;QACrD,MAAMQ,OAAOR,WAAW,IAAI;QAC5B,IAAIS;QAEJ,IAAIR,gBACF,IAAI;YAEFQ,QAAQC,AAAAA,IAAAA,yBAAAA,aAAAA,AAAAA,EAAcT,gBAAgBP;QACxC,EAAE,OAAOiB,GAAG;YACV,MAAM,IAAIC,MAAM,CAAC,mCAAmC,EAAED,GAAG;QAC3D;QAGFJ,SAAS;YACPC;YACA,GAAIC,AAAUH,WAAVG,QAAsB;gBAAEA;YAAM,IAAI,CAAC,CAAC;QAC1C;IACF;IAEA,OAAO;QACL,GAAId,UAAU;YAAEA;QAAQ,IAAI,CAAC,CAAC;QAC9B,GAAIE,SAAS;YAAEA;QAAO,IAAI,CAAC,CAAC;QAC5BC;QACA,GAAIC,QAAQ;YAAEA;QAAM,IAAI,CAAC,CAAC;QAC1BQ;QACA,GAAIH,AAAoBE,WAApBF,kBAAgC;YAAEA;QAAgB,IAAI,CAAC,CAAC;QAC5D,GAAIC,AAAoBC,WAApBD,kBAAgC;YAAEA;QAAgB,IAAI,CAAC,CAAC;IAC9D;AACF;AAEO,eAAeQ,KACpBC,eAAuB,EACvBC,IAUC;IAED,MAAM,EAAEC,OAAO,EAAEC,WAAW,EAAEC,mBAAmB,EAAE,GAAGH;IACtD,MAAM,EAAEI,IAAI,EAAE,GAAGH;IACjB,MAAMI,mBAAmBJ,QAAQ,UAAU,CAAC,MAAM;IAElD,MAAM,EAAEtB,WAAW,EAAE,GAAGuB;IAExB,MAAMI,eAAe,MAAMC,AAAAA,IAAAA,gCAAAA,0BAAAA,AAAAA,EAA2B;QACpD,aAAaP,KAAK,WAAW;QAC7BrB;QACA,aAAaqB,KAAK,WAAW;QAC7B,gBAAgB;QAChB,WAAWA,AAAmB,SAAnBA,KAAK,SAAS;IAC3B;IAEA,IAAIQ,eAAeH;IACnB,IAAII,aAAaL,KAAK,KAAK;IAC3B,IAAIM,cAAcN,KAAK,MAAM;IAK7B,IAAIzB,AAAgB,iBAAhBA,aAA8B;QAChC,MAAMgC,eAAe,MAAMC,AAAAA,IAAAA,oBAAAA,2BAAAA,AAAAA,EAA4BJ;QACvDC,aAAaE,aAAa,KAAK;QAC/BD,cAAcC,aAAa,MAAM;QACjCH,eAAeG,aAAa,WAAW;IACzC;IAEA,MAAME,gBAAgBb,KAAK,aAAa,GACpC,CAAC,yBAAyB,EAAEA,KAAK,aAAa,CAAC,4BAA4B,CAAC,GAC5E;IAEJ,MAAMc,cAA4C;QAChD;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM,GAAGD,cAAc,kBAAkB,EAAEd,gBAAgB,mBAAmB,CAAC;gBACjF;aACD;QACH;KACD;IAED,IAAIgB;IAGJ,MAAMC,qBAAqBb,oBAAoB,oBAAoB;IACnE,MAAMc,wBAAwBD,qBAC1B,CAAC,IAAI,EAAEA,oBAAoB,GAC3B;IAGJ,MAAME,eAAef,oBAAoB,cAAc;IACvD,MAAMgB,kBAAkBD,eAAe,CAAC,IAAI,EAAEA,cAAc,GAAG;IAE/D,IAAIf,oBAAoB,sBAAsB,EAAE;QAC9CY,wBAAwB
;YACtB,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM,GAAGZ,oBAAoB,sBAAsB,CAAC,qHAAqH,EAAEgB,kBAAkBF,uBAAuB;gBACtN;gBACA;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKT;wBACL,QAAQ;oBACV;gBACF;aACD;QACH;QAEAL,oBAAoB,mCAAmC;IACzD,OACEY,wBAAwB;QACtB,MAAM;QACN,SAAS;YACP;gBACE,MAAM;gBACN,MAAM,CAAC,6BAA6B,EAAEI,kBAAkBF,uBAAuB;YACjF;YACA;gBACE,MAAM;gBACN,WAAW;oBACT,KAAKT;oBACL,QAAQ;gBACV;YACF;SACD;IACH;IAEFL,oBAAoB,MAAM,CAACY;IAG3BZ,oBAAoB,eAAe,CAAC,IAAI;IAExC,MAAMiB,aAAajB,oBAAoB,QAAQ,CAACH,KAAK,kBAAkB;IAEvE,MAAMqB,OAAqC;QACzC;YAAE,MAAM;YAAU,SAASf;QAAa;WACrCQ;WACAM;KACJ;IAED,MAAM,EACJ,SAASE,WAAW,EACpBC,KAAK,EACLC,iBAAiB,EAClB,GAAG,MAAMC,AAAAA,IAAAA,yBAAAA,MAAAA,AAAAA,EAAOJ,MAAMnB,aAAa;QAClC,WAAWF,AAAmB,YAAnBA,KAAK,SAAS,GAAeT,SAAYS,KAAK,SAAS;IACpE;IAGA,IAAI0B;IACJ,IAAI;QACFA,aAAajD,yBAAyB6C,aAAa3C;QAEnD,IAAI+C,WAAW,MAAM,IAAIA,AAA+BnC,WAA/BmC,WAAW,eAAe,EAAgB;YACjElD,QACE;YAEFkD,WAAW,eAAe,GAAGnC;YAC7BmC,WAAW,eAAe,GAAGnC;QAC/B;QAEA,MAAMoC,UAAUD,WAAW,MAAM,GAAG;YAACA,WAAW,MAAM;SAAC,GAAG,EAAE;QAC5D,IAAIE,yBAAyB;QAG7B,IAAIF,AAA+BnC,WAA/BmC,WAAW,eAAe,EAAgB;YAC5CpD,MAAM;YACNsD,yBAAyB;QAC3B;QAEA,MAAMC,cAAkC;YACtC,GAAGH,UAAU;YACbC;YACAL;YACAC;YACAC;YACA,UAAUM,AAAAA,IAAAA,mCAAAA,sBAAAA,AAAAA,EAAuBH,SAAS3B,KAAK,WAAW;YAC1D4B;QACF;QAEAG,IAAAA,sBAAAA,MAAAA,AAAAA,EAAOL,YAAY;QAEnBC,QAAQ,OAAO,CAAC,CAACnC;YACf,MAAMC,OAAOD,OAAO,IAAI;YACxB,MAAMwC,sBAAsBhC,KAAK,WAAW,CAAC,IAAI,CAC/C,CAACR,SAAWA,OAAO,IAAI,KAAKC;YAG9BnB,MAAM,+BAA+B0D;YACrC,MAAMC,eAAeD,sBACjBE,AAAAA,IAAAA,mCAAAA,2BAAAA,AAAAA,EAA4BF,oBAAoB,WAAW,IAC3D,EAAE;YAEN1D,MAAM,gBAAgB2D;YAEtBA,aAAa,OAAO,CAAC,CAACE;gBACpB,MAAMC,eAAe5C,OAAO,KAAK,CAAC2C,MAAM;gBACxC,IAAIC,gBAAgBzD,AAAgBY,WAAhBZ,aAElBa,OAAO,KAAK,CAAC2C,MAAM,GAAGE,AAAAA,IAAAA,mCAAAA,aAAAA,AAAAA,EACpBD,cACA3B,YACAC,aACA/B;YAGN;QACF;QAGA,IAAI+C,WAAW,GAAG,EAChBvB,oBAAoB,mBAAmB,CAACuB,WAAW,GAAG;QAIxD,IAAIA,WAAW,MAAM,EACnBvB,oBAAoB,YAAY,CAACuB,WAAW,MAAM;QAGpDvB,oBAAoB,MAAM,CAAC;YACzB,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAMmB;gBACR;aACD;QACH;QAEA,OAAOO;IACT,EAAE,OAAOS,YAAY;QAEnB,MAAMC,eACJD,sBAAsBzC,QAAQ
yC,WAAW,OAAO,GAAGE,OAAOF;QAC5D,MAAM,IAAIG,yBAAAA,oBAAoBA,CAC5B,CAAC,iBAAiB,EAAEF,cAAc,EAClCjB,aACAC;IAEJ;AACF"}
|
|
@@ -92,12 +92,11 @@ const descriptionForAction = (action, locatorSchemaTypeDescription)=>{
|
|
|
92
92
|
${tab}${fields.join(`\n${tab}`)}
|
|
93
93
|
`.trim();
|
|
94
94
|
};
|
|
95
|
-
async function systemPromptToTaskPlanning({ actionSpace, modelFamily, includeBbox, includeThought,
|
|
95
|
+
async function systemPromptToTaskPlanning({ actionSpace, modelFamily, includeBbox, includeThought, deepThink }) {
|
|
96
96
|
const preferredLanguage = (0, env_namespaceObject.getPreferredLanguage)();
|
|
97
97
|
if (includeBbox && !modelFamily) throw new Error('modelFamily cannot be undefined when includeBbox is true. A valid modelFamily is required for bbox-based location.');
|
|
98
98
|
const actionDescriptionList = actionSpace.map((action)=>descriptionForAction(action, vlLocateParam(includeBbox ? modelFamily : void 0)));
|
|
99
99
|
const actionList = actionDescriptionList.join('\n');
|
|
100
|
-
const shouldIncludeSubGoals = includeSubGoals ?? false;
|
|
101
100
|
const locateExample1 = includeBbox ? `{
|
|
102
101
|
"prompt": "Add to cart button for Sauce Labs Backpack",
|
|
103
102
|
"bbox": [345, 442, 458, 483]
|
|
@@ -116,98 +115,26 @@ async function systemPromptToTaskPlanning({ actionSpace, modelFamily, includeBbo
|
|
|
116
115
|
}` : `{
|
|
117
116
|
"prompt": "Email input field in the registration form"
|
|
118
117
|
}`;
|
|
119
|
-
const step1Title = shouldIncludeSubGoals ? '## Step 1: Observe and Plan (related tags: <thought>, <update-plan-content>, <mark-sub-goal-done>)' : '## Step 1: Observe (related tags: <thought>)';
|
|
120
|
-
const step1Description = shouldIncludeSubGoals ? "First, observe the current screenshot and previous logs, then break down the user's instruction into multiple high-level sub-goals. Update the status of sub-goals based on what you see in the current screenshot." : 'First, observe the current screenshot and previous logs to understand the current state.';
|
|
121
118
|
const explicitInstructionRule = 'CRITICAL - Following Explicit Instructions: When the user gives you specific operation steps (not high-level goals), you MUST execute ONLY those exact steps - nothing more, nothing less. Do NOT add extra actions even if they seem logical. For example: "fill out the form" means only fill fields, do NOT submit; "click the button" means only click, do NOT wait for page load or verify results; "type \'hello\'" means only type, do NOT press Enter.';
|
|
122
|
-
const thoughtTagDescription =
|
|
119
|
+
const thoughtTagDescription = `REQUIRED: You MUST always output the <thought> tag. Never skip it.
|
|
123
120
|
|
|
124
|
-
Include your thought process in the <thought> tag. It should answer: What is the user's requirement? What is the current state based on the screenshot?
|
|
125
|
-
|
|
126
|
-
${explicitInstructionRule}` : `REQUIRED: You MUST always output the <thought> tag. Never skip it.
|
|
127
|
-
|
|
128
|
-
Include your thought process in the <thought> tag. It should answer: What is the current state based on the screenshot? What should be the next action? Write your thoughts naturally without numbering or section headers.
|
|
121
|
+
Include your thought process in the <thought> tag. It should answer: What is the user's requirement? What is the current state based on the screenshot? What should be the next action? Write your thoughts naturally without numbering or section headers.
|
|
129
122
|
|
|
130
123
|
${explicitInstructionRule}`;
|
|
131
|
-
const subGoalTags = shouldIncludeSubGoals ? `
|
|
132
|
-
|
|
133
|
-
* <update-plan-content> tag
|
|
134
|
-
|
|
135
|
-
Use this structure to give or update your plan:
|
|
136
|
-
|
|
137
|
-
<update-plan-content>
|
|
138
|
-
<sub-goal index="1" status="finished|pending">sub goal description</sub-goal>
|
|
139
|
-
<sub-goal index="2" status="finished|pending">sub goal description</sub-goal>
|
|
140
|
-
...
|
|
141
|
-
</update-plan-content>
|
|
142
|
-
|
|
143
|
-
* <mark-sub-goal-done> tag
|
|
144
|
-
|
|
145
|
-
Use this structure to mark a sub-goal as done:
|
|
146
|
-
|
|
147
|
-
<mark-sub-goal-done>
|
|
148
|
-
<sub-goal index="1" status="finished" />
|
|
149
|
-
</mark-sub-goal-done>
|
|
150
|
-
|
|
151
|
-
IMPORTANT: You MUST only mark a sub-goal as "finished" AFTER you have confirmed the task is actually completed by observing the result in the screenshot. Do NOT mark a sub-goal as done just because you expect the next action will complete it. Wait until you see visual confirmation in the screenshot that the sub-goal has been achieved.
|
|
152
|
-
|
|
153
|
-
* Note
|
|
154
|
-
|
|
155
|
-
During execution, you can call <update-plan-content> at any time to update the plan based on the latest screenshot and completed sub-goals.
|
|
156
|
-
|
|
157
|
-
### Example
|
|
158
|
-
|
|
159
|
-
If the user wants to "log in to a system using username and password, complete all to-do items, and submit a registration form", you can break it down into the following sub-goals:
|
|
160
|
-
|
|
161
|
-
<thought>...</thought>
|
|
162
|
-
<update-plan-content>
|
|
163
|
-
<sub-goal index="1" status="pending">Log in to the system</sub-goal>
|
|
164
|
-
<sub-goal index="2" status="pending">Complete all to-do items</sub-goal>
|
|
165
|
-
<sub-goal index="3" status="pending">Submit the registration form</sub-goal>
|
|
166
|
-
</update-plan-content>
|
|
167
|
-
|
|
168
|
-
After logging in and seeing the to-do items, you can mark the sub-goal as done:
|
|
169
|
-
|
|
170
|
-
<mark-sub-goal-done>
|
|
171
|
-
<sub-goal index="1" status="finished" />
|
|
172
|
-
</mark-sub-goal-done>
|
|
173
|
-
|
|
174
|
-
At this point, the status of all sub-goals is:
|
|
175
|
-
|
|
176
|
-
<update-plan-content>
|
|
177
|
-
<sub-goal index="1" status="finished" />
|
|
178
|
-
<sub-goal index="2" status="pending" />
|
|
179
|
-
<sub-goal index="3" status="pending" />
|
|
180
|
-
</update-plan-content>
|
|
181
|
-
|
|
182
|
-
After some time, when the last sub-goal is also completed, you can mark it as done as well:
|
|
183
|
-
|
|
184
|
-
<mark-sub-goal-done>
|
|
185
|
-
<sub-goal index="3" status="finished" />
|
|
186
|
-
</mark-sub-goal-done>` : '';
|
|
187
|
-
const memoryStepNumber = 2;
|
|
188
|
-
const checkGoalStepNumber = shouldIncludeSubGoals ? 3 : 2;
|
|
189
|
-
const actionStepNumber = shouldIncludeSubGoals ? 4 : 3;
|
|
190
124
|
return `
|
|
191
125
|
Target: You are an expert to manipulate the UI to accomplish the user's instruction. User will give you an instruction, some screenshots, background knowledge and previous logs indicating what have been done. Your task is to accomplish the instruction by thinking through the path to complete the task and give the next action to execute.
|
|
192
126
|
|
|
193
|
-
|
|
127
|
+
## Step 1: Observe (related tags: <thought>)
|
|
194
128
|
|
|
195
|
-
|
|
129
|
+
First, observe the current screenshot and previous logs to understand the current state.
|
|
196
130
|
|
|
197
131
|
* <thought> tag (REQUIRED)
|
|
198
132
|
|
|
199
133
|
${thoughtTagDescription}
|
|
200
|
-
${subGoalTags}
|
|
201
|
-
${shouldIncludeSubGoals ? `
|
|
202
|
-
## Step ${memoryStepNumber}: Memory Data from Current Screenshot (related tags: <memory>)
|
|
203
134
|
|
|
204
|
-
|
|
135
|
+
## Step 2: Check if the Instruction is Fulfilled (related tags: <complete>)
|
|
205
136
|
|
|
206
|
-
|
|
207
|
-
` : ''}
|
|
208
|
-
## Step ${checkGoalStepNumber}: ${shouldIncludeSubGoals ? 'Check if Goal is Accomplished' : 'Check if the Instruction is Fulfilled'} (related tags: <complete>)
|
|
209
|
-
|
|
210
|
-
${shouldIncludeSubGoals ? 'Based on the current screenshot and the status of all sub-goals, determine' : 'Determine'} if the entire task is completed.
|
|
137
|
+
Determine if the entire task is completed.
|
|
211
138
|
|
|
212
139
|
### CRITICAL: The User's Instruction is the Supreme Authority
|
|
213
140
|
|
|
@@ -217,36 +144,36 @@ The user's instruction defines the EXACT scope of what you must accomplish. You
|
|
|
217
144
|
- If the user gives you **explicit operation steps** (e.g., "click X", "type Y", "fill out the form"), treat them as exact commands. Execute ONLY those steps, nothing more.
|
|
218
145
|
- If the user gives you a **high-level goal** (e.g., "log in to the system", "complete the purchase"), you may determine the necessary steps to achieve it.
|
|
219
146
|
|
|
220
|
-
**What "
|
|
221
|
-
- The
|
|
147
|
+
**What "instruction fulfilled" means:**
|
|
148
|
+
- The instruction is fulfilled when you have done EXACTLY what the user asked - no extra steps, no assumptions.
|
|
222
149
|
- Do NOT perform any action beyond the explicit instruction, even if it seems logical or helpful.
|
|
223
150
|
|
|
224
151
|
**Examples - Explicit instructions (execute exactly, no extra steps):**
|
|
225
|
-
- "fill out the form" →
|
|
226
|
-
- "click the login button" →
|
|
227
|
-
- "type 'hello' in the search box" →
|
|
228
|
-
- "select the first item" →
|
|
152
|
+
- "fill out the form" → Instruction fulfilled when all fields are filled. Do NOT submit the form.
|
|
153
|
+
- "click the login button" → Instruction fulfilled once clicked. Do NOT wait for page load or verify login success.
|
|
154
|
+
- "type 'hello' in the search box" → Instruction fulfilled when 'hello' is typed. Do NOT press Enter or trigger search.
|
|
155
|
+
- "select the first item" → Instruction fulfilled when selected. Do NOT proceed to checkout.
|
|
229
156
|
|
|
230
157
|
**Special case - Assertion instructions:**
|
|
231
|
-
- If the user's instruction includes an assertion (e.g., "verify that...", "check that...", "assert..."), and you observe from the screenshot that the assertion condition is NOT satisfied and cannot be satisfied, mark
|
|
158
|
+
- If the user's instruction includes an assertion (e.g., "verify that...", "check that...", "assert..."), and you observe from the screenshot that the assertion condition is NOT satisfied and cannot be satisfied, mark it as failed (success="false").
|
|
232
159
|
- If the page is still loading (e.g., you see a loading spinner, skeleton screen, or progress bar), do NOT assert yet. Wait for the page to finish loading before evaluating the assertion.
|
|
233
|
-
|
|
160
|
+
|
|
234
161
|
**Page navigation restriction:**
|
|
235
162
|
- Unless the user's instruction explicitly asks you to click a link, jump to another page, or navigate to a URL, you MUST complete the task on the current page only.
|
|
236
163
|
- Do NOT navigate away from the current page on your own initiative (e.g., do not click links that lead to other pages, do not use browser back/forward, do not open new URLs).
|
|
237
164
|
- If the task cannot be accomplished on the current page and the user has not instructed you to navigate, report it as a failure (success="false") instead of attempting to navigate to other pages.
|
|
238
|
-
|
|
165
|
+
|
|
239
166
|
### Output Rules
|
|
240
167
|
|
|
241
|
-
- If the task is NOT complete, skip this section and continue to Step
|
|
168
|
+
- If the task is NOT complete, skip this section and continue to Step 3.
|
|
242
169
|
- Use the <complete success="true|false">message</complete> tag to output the result if the goal is accomplished or failed.
|
|
243
|
-
- the 'success' attribute is required.
|
|
170
|
+
- the 'success' attribute is required. No matter what actions were executed or what errors occurred during execution, if the instruction is fulfilled, set success="true". If the instruction is not fulfilled and cannot be fulfilled, set success="false".
|
|
244
171
|
- the 'message' is the information that will be provided to the user. If the user asks for a specific format, strictly follow that.
|
|
245
172
|
- If you output <complete>, do NOT output <action-type> or <action-param-json>. The task ends here.
|
|
246
173
|
|
|
247
|
-
## Step
|
|
174
|
+
## Step 3: Determine Next Action (related tags: <log>, <action-type>, <action-param-json>, <error>)
|
|
248
175
|
|
|
249
|
-
ONLY if the task is not complete: Think what the next action is according to the current screenshot
|
|
176
|
+
ONLY if the task is not complete: Think what the next action is according to the current screenshot.
|
|
250
177
|
|
|
251
178
|
- Don't give extra actions or plans beyond the instruction or the plan. For example, don't try to submit the form if the instruction is only to fill something.
|
|
252
179
|
- Consider the current screenshot and give the action that is most likely to accomplish the instruction. For example, if the next step is to click a button but it's not visible in the screenshot, you should try to find it first instead of give a click action.
|
|
@@ -301,26 +228,15 @@ For example:
|
|
|
301
228
|
Return in XML format following this decision flow:
|
|
302
229
|
|
|
303
230
|
**Always include (REQUIRED):**
|
|
304
|
-
<!-- Step 1: Observe
|
|
231
|
+
<!-- Step 1: Observe -->
|
|
305
232
|
<thought>Your thought process here. NEVER skip this tag.</thought>
|
|
306
|
-
|
|
307
|
-
<!-- required when no update-plan-content is provided in the previous response -->
|
|
308
|
-
<update-plan-content>...</update-plan-content>
|
|
309
|
-
|
|
310
|
-
<!-- required when any sub-goal is completed -->
|
|
311
|
-
<mark-sub-goal-done>
|
|
312
|
-
<sub-goal index="1" status="finished" />
|
|
313
|
-
</mark-sub-goal-done>
|
|
314
|
-
` : ''}${shouldIncludeSubGoals ? `
|
|
315
|
-
<!-- Step ${memoryStepNumber}: Memory data from current screenshot if needed -->
|
|
316
|
-
<memory>...</memory>
|
|
317
|
-
` : ''}
|
|
233
|
+
|
|
318
234
|
**Then choose ONE of the following paths:**
|
|
319
235
|
|
|
320
|
-
**Path A: If the
|
|
236
|
+
**Path A: If the instruction is fulfilled or failed (Step 2)**
|
|
321
237
|
<complete success="true|false">...</complete>
|
|
322
238
|
|
|
323
|
-
**Path B: If the
|
|
239
|
+
**Path B: If the instruction is NOT fulfilled yet (Step 3)**
|
|
324
240
|
<!-- Determine next action -->
|
|
325
241
|
<log>...</log>
|
|
326
242
|
<action-type>...</action-type>
|
|
@@ -328,137 +244,7 @@ ${shouldIncludeSubGoals ? `
|
|
|
328
244
|
|
|
329
245
|
<!-- OR if there's an error -->
|
|
330
246
|
<error>...</error>
|
|
331
|
-
${shouldIncludeSubGoals ? `
|
|
332
|
-
## Multi-turn Conversation Example
|
|
333
|
-
|
|
334
|
-
Below is an example of a multi-turn conversation for "fill out the registration form with name 'John' and email 'john@example.com', then return the filled email address":
|
|
335
|
-
|
|
336
|
-
### Turn 1 - Initial instruction
|
|
337
|
-
|
|
338
|
-
**User message:**
|
|
339
|
-
<user_instruction>fill out the registration form with name 'John' and email 'john@example.com', then return the filled email address</user_instruction>
|
|
340
|
-
|
|
341
|
-
**Screenshot:** [Shows a registration form with empty Name and Email fields]
|
|
342
|
-
|
|
343
|
-
**Your response:**
|
|
344
|
-
<thought>The user wants me to fill out the registration form with specific values and return the email address. I can see the form has two fields: Name and Email. Both are currently empty. I'll break this down into sub-goals and start with the Name field. Note: The instruction is to fill the form only (not submit), and return the email at the end.</thought>
|
|
345
|
-
<update-plan-content>
|
|
346
|
-
<sub-goal index="1" status="pending">Fill in the Name field with 'John'</sub-goal>
|
|
347
|
-
<sub-goal index="2" status="pending">Fill in the Email field with 'john@example.com'</sub-goal>
|
|
348
|
-
<sub-goal index="3" status="pending">Return the filled email address</sub-goal>
|
|
349
|
-
</update-plan-content>
|
|
350
|
-
<log>Click on the Name field to start filling the form</log>
|
|
351
|
-
<action-type>Tap</action-type>
|
|
352
|
-
<action-param-json>
|
|
353
|
-
{
|
|
354
|
-
"locate": ${locateNameField}
|
|
355
|
-
}
|
|
356
|
-
</action-param-json>
|
|
357
247
|
|
|
358
|
-
### Turn 2 - After clicking Name field
|
|
359
|
-
|
|
360
|
-
**User message:**
|
|
361
|
-
The previous action has been executed, here is the latest screenshot. Please continue according to the instruction.
|
|
362
|
-
|
|
363
|
-
Sub-goals:
|
|
364
|
-
1. Fill in the Name field with 'John' (running)
|
|
365
|
-
2. Fill in the Email field with 'john@example.com' (pending)
|
|
366
|
-
3. Return the filled email address (pending)
|
|
367
|
-
Current sub-goal is: Fill in the Name field with 'John'
|
|
368
|
-
Actions performed for current sub-goal:
|
|
369
|
-
- Click on the Name field to start filling the form
|
|
370
|
-
|
|
371
|
-
**Screenshot:** [Shows the form with Name field now focused/active]
|
|
372
|
-
|
|
373
|
-
**Your response:**
|
|
374
|
-
<thought>The Name field is now focused. I need to type 'John' into this field. Current sub-goal is running, will be completed after input.</thought>
|
|
375
|
-
<log>Typing 'John' into the Name field</log>
|
|
376
|
-
<action-type>Input</action-type>
|
|
377
|
-
<action-param-json>
|
|
378
|
-
{
|
|
379
|
-
"value": "John"
|
|
380
|
-
}
|
|
381
|
-
</action-param-json>
|
|
382
|
-
|
|
383
|
-
### Turn 3 - After entering name
|
|
384
|
-
|
|
385
|
-
**User message:**
|
|
386
|
-
The previous action has been executed, here is the latest screenshot. Please continue according to the instruction.
|
|
387
|
-
|
|
388
|
-
Sub-goals:
|
|
389
|
-
1. Fill in the Name field with 'John' (running)
|
|
390
|
-
2. Fill in the Email field with 'john@example.com' (pending)
|
|
391
|
-
3. Return the filled email address (pending)
|
|
392
|
-
Current sub-goal is: Fill in the Name field with 'John'
|
|
393
|
-
Actions performed for current sub-goal:
|
|
394
|
-
- Click on the Name field to start filling the form
|
|
395
|
-
- Typing 'John' into the Name field
|
|
396
|
-
|
|
397
|
-
**Screenshot:** [Shows the form with Name field containing 'John']
|
|
398
|
-
|
|
399
|
-
**Your response:**
|
|
400
|
-
<thought>The Name field now shows 'John'. Sub-goal 1 is complete. Next, I need to fill the Email field for sub-goal 2.</thought>
|
|
401
|
-
<mark-sub-goal-done>
|
|
402
|
-
<sub-goal index="1" status="finished" />
|
|
403
|
-
</mark-sub-goal-done>
|
|
404
|
-
<memory>Name field has been filled with 'John'</memory>
|
|
405
|
-
<log>Moving to the Email field</log>
|
|
406
|
-
<action-type>Tap</action-type>
|
|
407
|
-
<action-param-json>
|
|
408
|
-
{
|
|
409
|
-
"locate": ${locateEmailField}
|
|
410
|
-
}
|
|
411
|
-
</action-param-json>
|
|
412
|
-
|
|
413
|
-
### Turn 4 - After clicking Email field
|
|
414
|
-
|
|
415
|
-
**User message:**
|
|
416
|
-
The previous action has been executed, here is the latest screenshot. Please continue according to the instruction.
|
|
417
|
-
|
|
418
|
-
Sub-goals:
|
|
419
|
-
1. Fill in the Name field with 'John' (finished)
|
|
420
|
-
2. Fill in the Email field with 'john@example.com' (running)
|
|
421
|
-
3. Return the filled email address (pending)
|
|
422
|
-
Current sub-goal is: Fill in the Email field with 'john@example.com'
|
|
423
|
-
Actions performed for current sub-goal:
|
|
424
|
-
- Moving to the Email field
|
|
425
|
-
|
|
426
|
-
**Screenshot:** [Shows the form with Name='John' and Email field focused]
|
|
427
|
-
|
|
428
|
-
**Your response:**
|
|
429
|
-
<thought>The Email field is now focused. I'll enter 'john@example.com'. After this, sub-goal 2 will be complete and my task will be done.</thought>
|
|
430
|
-
<log>Typing email address into the Email field</log>
|
|
431
|
-
<action-type>Input</action-type>
|
|
432
|
-
<action-param-json>
|
|
433
|
-
{
|
|
434
|
-
"value": "john@example.com"
|
|
435
|
-
}
|
|
436
|
-
</action-param-json>
|
|
437
|
-
|
|
438
|
-
### Turn 5 - After entering email (Goal accomplished)
|
|
439
|
-
|
|
440
|
-
**User message:**
|
|
441
|
-
The previous action has been executed, here is the latest screenshot. Please continue according to the instruction.
|
|
442
|
-
|
|
443
|
-
Sub-goals:
|
|
444
|
-
1. Fill in the Name field with 'John' (finished)
|
|
445
|
-
2. Fill in the Email field with 'john@example.com' (running)
|
|
446
|
-
3. Return the filled email address (pending)
|
|
447
|
-
Current sub-goal is: Fill in the Email field with 'john@example.com'
|
|
448
|
-
Actions performed for current sub-goal:
|
|
449
|
-
- Moving to the Email field
|
|
450
|
-
- Typing email address into the Email field
|
|
451
|
-
|
|
452
|
-
**Screenshot:** [Shows the form with Name='John' and Email='john@example.com']
|
|
453
|
-
|
|
454
|
-
**Your response:**
|
|
455
|
-
<thought>Both fields are now filled: Name shows 'John' and Email shows 'john@example.com'. Sub-goal 2 is complete. The user asked me to return the filled email address, so I need to include 'john@example.com' in my response. All sub-goals are now finished.</thought>
|
|
456
|
-
<mark-sub-goal-done>
|
|
457
|
-
<sub-goal index="2" status="finished" />
|
|
458
|
-
<sub-goal index="3" status="finished" />
|
|
459
|
-
</mark-sub-goal-done>
|
|
460
|
-
<complete success="true">john@example.com</complete>
|
|
461
|
-
` : `
|
|
462
248
|
## Multi-turn Conversation Example
|
|
463
249
|
|
|
464
250
|
Below is an example of a multi-turn conversation for "fill out the registration form with name 'John' and email 'john@example.com', then return the filled email address":
|
|
@@ -541,7 +327,7 @@ The previous action has been executed, here is the latest screenshot. Please con
|
|
|
541
327
|
**Your response:**
|
|
542
328
|
<thought>Both fields are now filled: Name shows 'John' and Email shows 'john@example.com'. The user asked me to return the filled email address, so I should include 'john@example.com' in my response. The instruction has been fulfilled.</thought>
|
|
543
329
|
<complete success="true">john@example.com</complete>
|
|
544
|
-
|
|
330
|
+
`;
|
|
545
331
|
}
|
|
546
332
|
exports.descriptionForAction = __webpack_exports__.descriptionForAction;
|
|
547
333
|
exports.systemPromptToTaskPlanning = __webpack_exports__.systemPromptToTaskPlanning;
|