@midscene/core 1.2.2-beta-20260116114131.0 → 1.2.2-beta-20260119114334.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/es/agent/agent.mjs +4 -4
- package/dist/es/agent/agent.mjs.map +1 -1
- package/dist/es/agent/tasks.mjs +3 -4
- package/dist/es/agent/tasks.mjs.map +1 -1
- package/dist/es/agent/utils.mjs +1 -1
- package/dist/es/ai-model/auto-glm/index.mjs +2 -2
- package/dist/es/ai-model/auto-glm/planning.mjs +1 -1
- package/dist/es/ai-model/auto-glm/planning.mjs.map +1 -1
- package/dist/es/ai-model/auto-glm/prompt.mjs +8 -8
- package/dist/es/ai-model/auto-glm/prompt.mjs.map +1 -1
- package/dist/es/ai-model/auto-glm/util.mjs +6 -3
- package/dist/es/ai-model/auto-glm/util.mjs.map +1 -1
- package/dist/es/ai-model/index.mjs +2 -2
- package/dist/es/ai-model/inspect.mjs +12 -12
- package/dist/es/ai-model/inspect.mjs.map +1 -1
- package/dist/es/ai-model/llm-planning.mjs +51 -7
- package/dist/es/ai-model/llm-planning.mjs.map +1 -1
- package/dist/es/ai-model/prompt/common.mjs +2 -2
- package/dist/es/ai-model/prompt/common.mjs.map +1 -1
- package/dist/es/ai-model/prompt/extraction.mjs +3 -1
- package/dist/es/ai-model/prompt/extraction.mjs.map +1 -1
- package/dist/es/ai-model/prompt/llm-locator.mjs +5 -3
- package/dist/es/ai-model/prompt/llm-locator.mjs.map +1 -1
- package/dist/es/ai-model/prompt/llm-planning.mjs +48 -52
- package/dist/es/ai-model/prompt/llm-planning.mjs.map +1 -1
- package/dist/es/ai-model/prompt/llm-section-locator.mjs +5 -3
- package/dist/es/ai-model/prompt/llm-section-locator.mjs.map +1 -1
- package/dist/es/ai-model/service-caller/index.mjs +12 -13
- package/dist/es/ai-model/service-caller/index.mjs.map +1 -1
- package/dist/es/ai-model/ui-tars-planning.mjs +2 -24
- package/dist/es/ai-model/ui-tars-planning.mjs.map +1 -1
- package/dist/es/common.mjs +10 -9
- package/dist/es/common.mjs.map +1 -1
- package/dist/es/service/index.mjs +6 -6
- package/dist/es/service/index.mjs.map +1 -1
- package/dist/es/utils.mjs +2 -2
- package/dist/lib/agent/agent.js +3 -3
- package/dist/lib/agent/agent.js.map +1 -1
- package/dist/lib/agent/tasks.js +2 -3
- package/dist/lib/agent/tasks.js.map +1 -1
- package/dist/lib/agent/utils.js +1 -1
- package/dist/lib/ai-model/auto-glm/index.js +3 -0
- package/dist/lib/ai-model/auto-glm/planning.js +1 -1
- package/dist/lib/ai-model/auto-glm/planning.js.map +1 -1
- package/dist/lib/ai-model/auto-glm/prompt.js +8 -8
- package/dist/lib/ai-model/auto-glm/prompt.js.map +1 -1
- package/dist/lib/ai-model/auto-glm/util.js +10 -4
- package/dist/lib/ai-model/auto-glm/util.js.map +1 -1
- package/dist/lib/ai-model/index.js +0 -3
- package/dist/lib/ai-model/inspect.js +12 -12
- package/dist/lib/ai-model/inspect.js.map +1 -1
- package/dist/lib/ai-model/llm-planning.js +52 -5
- package/dist/lib/ai-model/llm-planning.js.map +1 -1
- package/dist/lib/ai-model/prompt/common.js +2 -2
- package/dist/lib/ai-model/prompt/common.js.map +1 -1
- package/dist/lib/ai-model/prompt/extraction.js +5 -3
- package/dist/lib/ai-model/prompt/extraction.js.map +1 -1
- package/dist/lib/ai-model/prompt/llm-locator.js +5 -3
- package/dist/lib/ai-model/prompt/llm-locator.js.map +1 -1
- package/dist/lib/ai-model/prompt/llm-planning.js +48 -52
- package/dist/lib/ai-model/prompt/llm-planning.js.map +1 -1
- package/dist/lib/ai-model/prompt/llm-section-locator.js +5 -3
- package/dist/lib/ai-model/prompt/llm-section-locator.js.map +1 -1
- package/dist/lib/ai-model/service-caller/index.js +11 -12
- package/dist/lib/ai-model/service-caller/index.js.map +1 -1
- package/dist/lib/ai-model/ui-tars-planning.js +1 -26
- package/dist/lib/ai-model/ui-tars-planning.js.map +1 -1
- package/dist/lib/common.js +10 -9
- package/dist/lib/common.js.map +1 -1
- package/dist/lib/service/index.js +6 -6
- package/dist/lib/service/index.js.map +1 -1
- package/dist/lib/utils.js +2 -2
- package/dist/types/ai-model/auto-glm/index.d.ts +1 -1
- package/dist/types/ai-model/auto-glm/prompt.d.ts +3 -3
- package/dist/types/ai-model/auto-glm/util.d.ts +11 -5
- package/dist/types/ai-model/index.d.ts +1 -1
- package/dist/types/ai-model/llm-planning.d.ts +5 -1
- package/dist/types/ai-model/prompt/common.d.ts +2 -2
- package/dist/types/ai-model/prompt/llm-locator.d.ts +2 -2
- package/dist/types/ai-model/prompt/llm-planning.d.ts +3 -3
- package/dist/types/ai-model/prompt/llm-section-locator.d.ts +2 -2
- package/dist/types/ai-model/service-caller/index.d.ts +2 -2
- package/dist/types/ai-model/ui-tars-planning.d.ts +2 -3
- package/dist/types/common.d.ts +5 -5
- package/package.json +2 -2
- package/dist/es/ai-model/prompt/assertion.mjs +0 -31
- package/dist/es/ai-model/prompt/assertion.mjs.map +0 -1
- package/dist/lib/ai-model/prompt/assertion.js +0 -65
- package/dist/lib/ai-model/prompt/assertion.js.map +0 -1
- package/dist/types/ai-model/prompt/assertion.d.ts +0 -2
package/dist/lib/ai-model/auto-glm/index.js
@@ -26,6 +26,7 @@ __webpack_require__.r(__webpack_exports__);
 __webpack_require__.d(__webpack_exports__, {
     parseAction: ()=>external_parser_js_namespaceObject.parseAction,
     isAutoGLM: ()=>external_util_js_namespaceObject.isAutoGLM,
+    isUITars: ()=>external_util_js_namespaceObject.isUITars,
     autoGLMPlanning: ()=>external_planning_js_namespaceObject.autoGLMPlanning,
     getAutoGLMPlanPrompt: ()=>external_prompt_js_namespaceObject.getAutoGLMPlanPrompt,
     getAutoGLMLocatePrompt: ()=>external_prompt_js_namespaceObject.getAutoGLMLocatePrompt,
@@ -42,6 +43,7 @@ exports.autoGLMPlanning = __webpack_exports__.autoGLMPlanning;
 exports.getAutoGLMLocatePrompt = __webpack_exports__.getAutoGLMLocatePrompt;
 exports.getAutoGLMPlanPrompt = __webpack_exports__.getAutoGLMPlanPrompt;
 exports.isAutoGLM = __webpack_exports__.isAutoGLM;
+exports.isUITars = __webpack_exports__.isUITars;
 exports.parseAction = __webpack_exports__.parseAction;
 exports.parseAutoGLMLocateResponse = __webpack_exports__.parseAutoGLMLocateResponse;
 exports.parseAutoGLMResponse = __webpack_exports__.parseAutoGLMResponse;
@@ -51,6 +53,7 @@ for(var __rspack_i in __webpack_exports__)if (-1 === [
     "getAutoGLMLocatePrompt",
     "getAutoGLMPlanPrompt",
     "isAutoGLM",
+    "isUITars",
     "parseAction",
     "parseAutoGLMLocateResponse",
     "parseAutoGLMResponse",
package/dist/lib/ai-model/auto-glm/planning.js
@@ -34,7 +34,7 @@ const external_prompt_js_namespaceObject = require("./prompt.js");
 const debug = (0, logger_namespaceObject.getDebug)('auto-glm-planning');
 async function autoGLMPlanning(userInstruction, options) {
     const { conversationHistory, context, modelConfig, actionContext } = options;
-    const systemPrompt = (0, external_prompt_js_namespaceObject.getAutoGLMPlanPrompt)(modelConfig.
+    const systemPrompt = (0, external_prompt_js_namespaceObject.getAutoGLMPlanPrompt)(modelConfig.modelFamily) + (actionContext ? `<high_priority_knowledge>${actionContext}</high_priority_knowledge>` : '');
     const imagePayload = context.screenshot;
     conversationHistory.append({
         role: 'user',
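For readability, the same change in uncompiled form. `getAutoGLMPlanPrompt` and `TModelFamily` are the names used in the package source (visible in the sourcemap below); `buildPlanSystemPrompt` is a hypothetical wrapper added only to illustrate the shape of the new expression.

```ts
import type { TModelFamily } from '@midscene/shared/env';

// Exists in ./prompt (see the prompt.js hunks below); declared here so the
// sketch stands alone.
declare function getAutoGLMPlanPrompt(
  modelFamily: TModelFamily | undefined,
): string;

// Hypothetical helper mirroring the new systemPrompt expression above: pick
// the base plan prompt by model family, then append any caller-supplied
// action context wrapped in a <high_priority_knowledge> tag.
function buildPlanSystemPrompt(
  modelFamily: TModelFamily | undefined,
  actionContext?: string,
): string {
  return (
    getAutoGLMPlanPrompt(modelFamily) +
    (actionContext
      ? `<high_priority_knowledge>${actionContext}</high_priority_knowledge>`
      : '')
  );
}
```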
package/dist/lib/ai-model/auto-glm/planning.js.map
@@ -1 +1 @@
-
{"version":3,"file":"ai-model/auto-glm/planning.js","sources":["webpack/runtime/define_property_getters","webpack/runtime/has_own_property","webpack/runtime/make_namespace_object","../../../../src/ai-model/auto-glm/planning.ts"],"sourcesContent":["__webpack_require__.d = (exports, definition) => {\n\tfor(var key in definition) {\n if(__webpack_require__.o(definition, key) && !__webpack_require__.o(exports, key)) {\n Object.defineProperty(exports, key, { enumerable: true, get: definition[key] });\n }\n }\n};","__webpack_require__.o = (obj, prop) => (Object.prototype.hasOwnProperty.call(obj, prop))","// define __esModule on exports\n__webpack_require__.r = (exports) => {\n\tif(typeof Symbol !== 'undefined' && Symbol.toStringTag) {\n\t\tObject.defineProperty(exports, Symbol.toStringTag, { value: 'Module' });\n\t}\n\tObject.defineProperty(exports, '__esModule', { value: true });\n};","import type { PlanningAIResponse, UIContext } from '@/types';\nimport type { IModelConfig } from '@midscene/shared/env';\nimport { getDebug } from '@midscene/shared/logger';\nimport type { ChatCompletionMessageParam } from 'openai/resources/index';\nimport type { ConversationHistory } from '../conversation-history';\nimport { callAIWithStringResponse } from '../service-caller/index';\nimport { transformAutoGLMAction } from './actions';\nimport { parseAction, parseAutoGLMResponse } from './parser';\nimport { getAutoGLMPlanPrompt } from './prompt';\n\nconst debug = getDebug('auto-glm-planning');\n\nexport async function autoGLMPlanning(\n userInstruction: string,\n options: {\n conversationHistory: ConversationHistory;\n context: UIContext;\n modelConfig: IModelConfig;\n actionContext?: string;\n },\n): Promise<PlanningAIResponse> {\n const { conversationHistory, context, modelConfig, actionContext } = options;\n\n const systemPrompt =\n getAutoGLMPlanPrompt(modelConfig.
+
{"version":3,"file":"ai-model/auto-glm/planning.js","sources":["webpack/runtime/define_property_getters","webpack/runtime/has_own_property","webpack/runtime/make_namespace_object","../../../../src/ai-model/auto-glm/planning.ts"],"sourcesContent":["__webpack_require__.d = (exports, definition) => {\n\tfor(var key in definition) {\n if(__webpack_require__.o(definition, key) && !__webpack_require__.o(exports, key)) {\n Object.defineProperty(exports, key, { enumerable: true, get: definition[key] });\n }\n }\n};","__webpack_require__.o = (obj, prop) => (Object.prototype.hasOwnProperty.call(obj, prop))","// define __esModule on exports\n__webpack_require__.r = (exports) => {\n\tif(typeof Symbol !== 'undefined' && Symbol.toStringTag) {\n\t\tObject.defineProperty(exports, Symbol.toStringTag, { value: 'Module' });\n\t}\n\tObject.defineProperty(exports, '__esModule', { value: true });\n};","import type { PlanningAIResponse, UIContext } from '@/types';\nimport type { IModelConfig } from '@midscene/shared/env';\nimport { getDebug } from '@midscene/shared/logger';\nimport type { ChatCompletionMessageParam } from 'openai/resources/index';\nimport type { ConversationHistory } from '../conversation-history';\nimport { callAIWithStringResponse } from '../service-caller/index';\nimport { transformAutoGLMAction } from './actions';\nimport { parseAction, parseAutoGLMResponse } from './parser';\nimport { getAutoGLMPlanPrompt } from './prompt';\n\nconst debug = getDebug('auto-glm-planning');\n\nexport async function autoGLMPlanning(\n userInstruction: string,\n options: {\n conversationHistory: ConversationHistory;\n context: UIContext;\n modelConfig: IModelConfig;\n actionContext?: string;\n },\n): Promise<PlanningAIResponse> {\n const { conversationHistory, context, modelConfig, actionContext } = options;\n\n const systemPrompt =\n getAutoGLMPlanPrompt(modelConfig.modelFamily) +\n (actionContext\n ? 
`<high_priority_knowledge>${actionContext}</high_priority_knowledge>`\n : '');\n\n const imagePayload = context.screenshot;\n\n conversationHistory.append({\n role: 'user',\n content: [{ type: 'text', text: userInstruction }],\n });\n conversationHistory.append({\n role: 'user',\n content: [{ type: 'image_url', image_url: { url: imagePayload.base64 } }],\n });\n\n const msgs: ChatCompletionMessageParam[] = [\n { role: 'system', content: systemPrompt },\n ...conversationHistory.snapshot(1),\n ];\n\n const { content: rawResponse, usage } = await callAIWithStringResponse(\n msgs,\n modelConfig,\n );\n\n debug('autoGLMPlanning rawResponse:', rawResponse);\n\n const parsedResponse = parseAutoGLMResponse(rawResponse);\n debug('thinking in response:', parsedResponse.think);\n debug('action in response:', parsedResponse.content);\n\n let transformedActions = [] as ReturnType<typeof transformAutoGLMAction>;\n\n const parsedAction = parseAction(parsedResponse);\n debug('Parsed action object:', parsedAction);\n transformedActions = transformAutoGLMAction(parsedAction, context.size);\n debug('Transformed actions:', transformedActions);\n\n conversationHistory.append({\n role: 'assistant',\n content: `<think>${parsedResponse.think}</think><answer>${parsedResponse.content}</answer>`,\n });\n\n const shouldContinuePlanning = !parsedResponse.content.startsWith('finish(');\n\n return {\n actions: transformedActions,\n log: rawResponse,\n usage,\n shouldContinuePlanning,\n rawResponse: JSON.stringify(rawResponse, undefined, 2),\n };\n}\n"],"names":["__webpack_require__","definition","key","Object","obj","prop","Symbol","debug","getDebug","autoGLMPlanning","userInstruction","options","conversationHistory","context","modelConfig","actionContext","systemPrompt","getAutoGLMPlanPrompt","imagePayload","msgs","rawResponse","usage","callAIWithStringResponse","parsedResponse","parseAutoGLMResponse","transformedActions","parsedAction","parseAction","transformAutoGLMAction","shouldContinuePlanning","JSON","undefined"],"mappings":";;;IAAAA,oBAAoB,CAAC,GAAG,CAAC,UAASC;QACjC,IAAI,IAAIC,OAAOD,WACR,IAAGD,oBAAoB,CAAC,CAACC,YAAYC,QAAQ,CAACF,oBAAoB,CAAC,CAAC,UAASE,MACzEC,OAAO,cAAc,CAAC,UAASD,KAAK;YAAE,YAAY;YAAM,KAAKD,UAAU,CAACC,IAAI;QAAC;IAGzF;;;ICNAF,oBAAoB,CAAC,GAAG,CAACI,KAAKC,OAAUF,OAAO,SAAS,CAAC,cAAc,CAAC,IAAI,CAACC,KAAKC;;;ICClFL,oBAAoB,CAAC,GAAG,CAAC;QACxB,IAAG,AAAkB,eAAlB,OAAOM,UAA0BA,OAAO,WAAW,EACrDH,OAAO,cAAc,CAAC,UAASG,OAAO,WAAW,EAAE;YAAE,OAAO;QAAS;QAEtEH,OAAO,cAAc,CAAC,UAAS,cAAc;YAAE,OAAO;QAAK;IAC5D;;;;;;;;;;;;ACIA,MAAMI,QAAQC,AAAAA,IAAAA,uBAAAA,QAAAA,AAAAA,EAAS;AAEhB,eAAeC,gBACpBC,eAAuB,EACvBC,OAKC;IAED,MAAM,EAAEC,mBAAmB,EAAEC,OAAO,EAAEC,WAAW,EAAEC,aAAa,EAAE,GAAGJ;IAErE,MAAMK,eACJC,AAAAA,IAAAA,mCAAAA,oBAAAA,AAAAA,EAAqBH,YAAY,WAAW,IAC3CC,CAAAA,gBACG,CAAC,yBAAyB,EAAEA,cAAc,0BAA0B,CAAC,GACrE,EAAC;IAEP,MAAMG,eAAeL,QAAQ,UAAU;IAEvCD,oBAAoB,MAAM,CAAC;QACzB,MAAM;QACN,SAAS;YAAC;gBAAE,MAAM;gBAAQ,MAAMF;YAAgB;SAAE;IACpD;IACAE,oBAAoB,MAAM,CAAC;QACzB,MAAM;QACN,SAAS;YAAC;gBAAE,MAAM;gBAAa,WAAW;oBAAE,KAAKM,aAAa,MAAM;gBAAC;YAAE;SAAE;IAC3E;IAEA,MAAMC,OAAqC;QACzC;YAAE,MAAM;YAAU,SAASH;QAAa;WACrCJ,oBAAoB,QAAQ,CAAC;KACjC;IAED,MAAM,EAAE,SAASQ,WAAW,EAAEC,KAAK,EAAE,GAAG,MAAMC,AAAAA,IAAAA,yBAAAA,wBAAAA,AAAAA,EAC5CH,MACAL;IAGFP,MAAM,gCAAgCa;IAEtC,MAAMG,iBAAiBC,AAAAA,IAAAA,mCAAAA,oBAAAA,AAAAA,EAAqBJ;IAC5Cb,MAAM,yBAAyBgB,eAAe,KAAK;IACnDhB,MAAM,uBAAuBgB,eAAe,OAAO;IAEnD,IAAIE,qBAAqB,EAAE;IAE3B,MAAMC,eAAeC,AAAAA,IAAAA,mCAAAA,WAAAA,AAAAA,EAAYJ;IACjChB,MAAM,yBAAyBmB;IAC/BD,qBAAqBG,AAAAA,IAAAA,oCAAAA,sBAAAA,AAAAA,EAAuBF,cAAcb,QAAQ,IAAI;IACtEN,MAAM,w
BAAwBkB;IAE9Bb,oBAAoB,MAAM,CAAC;QACzB,MAAM;QACN,SAAS,CAAC,OAAO,EAAEW,eAAe,KAAK,CAAC,gBAAgB,EAAEA,eAAe,OAAO,CAAC,SAAS,CAAC;IAC7F;IAEA,MAAMM,yBAAyB,CAACN,eAAe,OAAO,CAAC,UAAU,CAAC;IAElE,OAAO;QACL,SAASE;QACT,KAAKL;QACLC;QACAQ;QACA,aAAaC,KAAK,SAAS,CAACV,aAAaW,QAAW;IACtD;AACF"}
package/dist/lib/ai-model/auto-glm/prompt.js
@@ -189,13 +189,13 @@ const getAutoGLMChinesePlanPrompt = ()=>`
 17. 如果没有合适的搜索结果,可能是因为搜索页面不对,请返回到搜索页面的上一级尝试重新搜索,如果尝试三次返回上一级搜索后仍然没有符合要求的结果,执行 finish(message="原因")。
 18. 在结束任务前请一定要仔细检查任务是否完整准确的完成,如果出现错选、漏选、多选的情况,请返回之前的步骤进行纠正。
 `;
-const getAutoGLMPlanPrompt = (
-    if ('auto-glm-multilingual' ===
-    if ('auto-glm' ===
-    throw new Error(`Unsupported
+const getAutoGLMPlanPrompt = (modelFamily)=>{
+    if ('auto-glm-multilingual' === modelFamily) return getAutoGLMMultilingualPlanPrompt();
+    if ('auto-glm' === modelFamily) return getAutoGLMChinesePlanPrompt();
+    throw new Error(`Unsupported modelFamily for Auto-GLM plan prompt: ${modelFamily}`);
 };
-const getAutoGLMLocatePrompt = (
-    if ('auto-glm-multilingual' ===
+const getAutoGLMLocatePrompt = (modelFamily)=>{
+    if ('auto-glm-multilingual' === modelFamily) return `
 The current date: ${getMultilingualFormattedDate()}

 # Setup
@@ -225,7 +225,7 @@ Your output should STRICTLY follow the format:
 REMEMBER:
 - Your goal is to locate and tap the UI element specified by the user (e.g., button, icon, link, etc.). Do not attempt any other actions.
 `;
-    if ('auto-glm' ===
+    if ('auto-glm' === modelFamily) return `
 今天的日期是: ${getChineseFormattedDate()}

 你是一个智能体分析专家,可以根据操作历史和当前状态图执行一系列操作来完成任务。
@@ -244,7 +244,7 @@ REMEMBER:
 必须遵循的规则:
 - 你的目标是定位并点击用户指定的UI元素(例如按钮、图标、链接等),请不要尝试任何其他的操作。
 `;
-    throw new Error(`Unsupported
+    throw new Error(`Unsupported modelFamily for Auto-GLM locate prompt: ${modelFamily}`);
 };
 exports.getAutoGLMLocatePrompt = __webpack_exports__.getAutoGLMLocatePrompt;
 exports.getAutoGLMPlanPrompt = __webpack_exports__.getAutoGLMPlanPrompt;
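The uncompiled form of this dispatch, as recorded in the `sourcesContent` of the updated sourcemap below. The two prompt builders are stubbed with `declare` so the sketch compiles on its own.

```ts
import type { TModelFamily } from '@midscene/shared/env';

// Defined earlier in prompt.ts; stubbed here for a self-contained sketch.
declare function getAutoGLMMultilingualPlanPrompt(): string;
declare function getAutoGLMChinesePlanPrompt(): string;

export const getAutoGLMPlanPrompt = (
  modelFamily: TModelFamily | undefined,
): string => {
  if (modelFamily === 'auto-glm-multilingual') {
    return getAutoGLMMultilingualPlanPrompt();
  } else if (modelFamily === 'auto-glm') {
    return getAutoGLMChinesePlanPrompt();
  }
  throw new Error(
    `Unsupported modelFamily for Auto-GLM plan prompt: ${modelFamily}`,
  );
};
```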
package/dist/lib/ai-model/auto-glm/prompt.js.map
@@ -1 +1 @@
-
{"version":3,"file":"ai-model/auto-glm/prompt.js","sources":["webpack/runtime/define_property_getters","webpack/runtime/has_own_property","webpack/runtime/make_namespace_object","../../../../src/ai-model/auto-glm/prompt.ts"],"sourcesContent":["__webpack_require__.d = (exports, definition) => {\n\tfor(var key in definition) {\n if(__webpack_require__.o(definition, key) && !__webpack_require__.o(exports, key)) {\n Object.defineProperty(exports, key, { enumerable: true, get: definition[key] });\n }\n }\n};","__webpack_require__.o = (obj, prop) => (Object.prototype.hasOwnProperty.call(obj, prop))","// define __esModule on exports\n__webpack_require__.r = (exports) => {\n\tif(typeof Symbol !== 'undefined' && Symbol.toStringTag) {\n\t\tObject.defineProperty(exports, Symbol.toStringTag, { value: 'Module' });\n\t}\n\tObject.defineProperty(exports, '__esModule', { value: true });\n};","/**\n * Auto-GLM Prompt Templates\n *\n * Portions of this file are derived from Open-AutoGLM\n * Copyright (c) 2024 zai-org\n * Licensed under the Apache License, Version 2.0\n *\n * Source: https://github.com/zai-org/Open-AutoGLM\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n * http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n *\n * Modifications:\n * - Adapted prompts for Midscene.js integration\n */\n\nimport type { TVlModeTypes } from '@midscene/shared/env';\n\n/**\n * Get formatted date string for system prompts\n * @returns Formatted date string like \"2026-01-12, Sunday\"\n */\nfunction getMultilingualFormattedDate(): string {\n const today = new Date();\n const year = today.getFullYear();\n const month = String(today.getMonth() + 1).padStart(2, '0');\n const date = String(today.getDate()).padStart(2, '0');\n const dayOfWeek = [\n 'Sunday',\n 'Monday',\n 'Tuesday',\n 'Wednesday',\n 'Thursday',\n 'Friday',\n 'Saturday',\n ][today.getDay()];\n\n return `${year}-${month}-${date}, ${dayOfWeek}`;\n}\n\n/**\n * Get formatted Chinese date (e.g., \"2026年01月13日 星期一\")\n */\nfunction getChineseFormattedDate(): string {\n const today = new Date();\n const year = today.getFullYear();\n const month = String(today.getMonth() + 1).padStart(2, '0');\n const date = String(today.getDate()).padStart(2, '0');\n const weekdayNames = [\n '星期日',\n '星期一',\n '星期二',\n '星期三',\n '星期四',\n '星期五',\n '星期六',\n ];\n const weekday = weekdayNames[today.getDay()];\n\n return `${year}年${month}月${date}日 ${weekday}`;\n}\n\nconst getAutoGLMMultilingualPlanPrompt = (): string => {\n return `\nThe current date: ${getMultilingualFormattedDate()}\n\n# Setup\nYou are a professional Android operation agent assistant that can fulfill the user's high-level instructions. 
Given a screenshot of the Android interface at each step, you first analyze the situation, then plan the best course of action using Python-style pseudo-code.\n\n# More details about the code\nYour response format must be structured as follows:\n\nThink first: Use <think>...</think> to analyze the current screen, identify key elements, and determine the most efficient action.\nProvide the action: Use <answer>...</answer> to return a single line of pseudo-code representing the operation.\n\nYour output should STRICTLY follow the format:\n<think>\n[Your thought]\n</think>\n<answer>\n[Your operation code]\n</answer>\n\n- **Tap**\n Perform a tap action on a specified screen area. The element is a list of 2 integers, representing the coordinates of the tap point.\n **Example**:\n <answer>\n do(action=\"Tap\", element=[x,y])\n </answer>\n- **Type**\n Enter text into the currently focused input field.\n **Example**:\n <answer>\n do(action=\"Type\", text=\"Hello World\")\n </answer>\n- **Swipe**\n Perform a swipe action with start point and end point.\n **Examples**:\n <answer>\n do(action=\"Swipe\", start=[x1,y1], end=[x2,y2])\n </answer>\n- **Long Press**\n Perform a long press action on a specified screen area.\n You can add the element to the action to specify the long press area. The element is a list of 2 integers, representing the coordinates of the long press point.\n **Example**:\n <answer>\n do(action=\"Long Press\", element=[x,y])\n </answer>\n- **Launch**\n Launch an app. Try to use launch action when you need to launch an app. Check the instruction to choose the right app before you use this action.\n **Example**:\n <answer>\n do(action=\"Launch\", app=\"Settings\")\n </answer>\n- **Back**\n Press the Back button to navigate to the previous screen.\n **Example**:\n <answer>\n do(action=\"Back\")\n </answer>\n- **Finish**\n Terminate the program and optionally print a message.\n **Example**:\n <answer>\n finish(message=\"Task completed.\")\n </answer>\n\n\nREMEMBER:\n- Think before you act: Always analyze the current UI and the best course of action before executing any step, and output in <think> part.\n- Only ONE LINE of action in <answer> part per response: Each step must contain exactly one line of executable code.\n- Generate execution code strictly according to format requirements.\n `;\n};\n\nconst getAutoGLMChinesePlanPrompt = (): string => {\n return `\n今天的日期是: ${getChineseFormattedDate()}\n\n你是一个智能体分析专家,可以根据操作历史和当前状态图执行一系列操作来完成任务。\n你必须严格按照要求输出以下格式:\n<think>{think}</think>\n<answer>{action}</answer>\n\n其中:\n- {think} 是对你为什么选择这个操作的简短推理说明。\n- {action} 是本次执行的具体操作指令,必须严格遵循下方定义的指令格式。\n\n操作指令及其作用如下:\n- do(action=\"Launch\", app=\"xxx\") \n Launch是启动目标app的操作,这比通过主屏幕导航更快。此操作完成后,您将自动收到结果状态的截图。\n- do(action=\"Tap\", element=[x,y]) \n Tap是点击操作,点击屏幕上的特定点。可用此操作点击按钮、选择项目、从主屏幕打开应用程序,或与任何可点击的用户界面元素进行交互。坐标系统从左上角 (0,0) 开始到右下角(999,999)结束。此操作完成后,您将自动收到结果状态的截图。\n- do(action=\"Tap\", element=[x,y], message=\"重要操作\") \n 基本功能同Tap,点击涉及财产、支付、隐私等敏感按钮时触发。\n- do(action=\"Type\", text=\"xxx\") \n Type是输入操作,在当前聚焦的输入框中输入文本。使用此操作前,请确保输入框已被聚焦(先点击它)。输入的文本将像使用键盘输入一样输入。重要提示:手机可能正在使用 ADB 键盘,该键盘不会像普通键盘那样占用屏幕空间。要确认键盘已激活,请查看屏幕底部是否显示 'ADB Keyboard {ON}' 类似的文本,或者检查输入框是否处于激活/高亮状态。不要仅仅依赖视觉上的键盘显示。自动清除文本:当你使用输入操作时,输入框中现有的任何文本(包括占位符文本和实际输入)都会在输入新文本前自动清除。你无需在输入前手动清除文本——直接使用输入操作输入所需文本即可。操作完成后,你将自动收到结果状态的截图。\n- do(action=\"Type_Name\", text=\"xxx\") \n Type_Name是输入人名的操作,基本功能同Type。\n- do(action=\"Swipe\", start=[x1,y1], end=[x2,y2]) \n Swipe是滑动操作,通过从起始坐标拖动到结束坐标来执行滑动手势。可用于滚动内容、在屏幕之间导航、下拉通知栏以及项目栏或进行基于手势的导航。坐标系统从左上角 (0,0) 
开始到右下角(999,999)结束。滑动持续时间会自动调整以实现自然的移动。此操作完成后,您将自动收到结果状态的截图。\n- do(action=\"Long Press\", element=[x,y]) \n Long Pres是长按操作,在屏幕上的特定点长按指定时间。可用于触发上下文菜单、选择文本或激活长按交互。坐标系统从左上角 (0,0) 开始到右下角(999,999)结束。此操作完成后,您将自动收到结果状态的屏幕截图。\n- do(action=\"Double Tap\", element=[x,y]) \n Double Tap在屏幕上的特定点快速连续点按两次。使用此操作可以激活双击交互,如缩放、选择文本或打开项目。坐标系统从左上角 (0,0) 开始到右下角(999,999)结束。此操作完成后,您将自动收到结果状态的截图。\n- do(action=\"Back\") \n 导航返回到上一个屏幕或关闭当前对话框。相当于按下 Android 的返回按钮。使用此操作可以从更深的屏幕返回、关闭弹出窗口或退出当前上下文。此操作完成后,您将自动收到结果状态的截图。\n- do(action=\"Home\") \n Home是回到系统桌面的操作,相当于按下 Android 主屏幕按钮。使用此操作可退出当前应用并返回启动器,或从已知状态启动新任务。此操作完成后,您将自动收到结果状态的截图。\n- do(action=\"Wait\", duration=\"x seconds\") \n 等待页面加载,x为需要等待多少秒。\n- finish(message=\"xxx\") \n finish是结束任务的操作,表示准确完整完成任务,message是终止信息。 \n\n必须遵循的规则:\n0. 严禁调用 Interact、Take_over、Note、Call_API 这四个操作,这些操作暂不支持。\n1. 在执行任何操作前,先检查当前app是否是目标app,如果不是,先执行 Launch。\n2. 如果进入到了无关页面,先执行 Back。如果执行Back后页面没有变化,请点击页面左上角的返回键进行返回,或者右上角的X号关闭。\n3. 如果页面未加载出内容,最多连续 Wait 三次,否则执行 Back重新进入。\n4. 如果页面显示网络问题,需要重新加载,请点击重新加载。\n5. 如果当前页面找不到目标联系人、商品、店铺等信息,可以尝试 Swipe 滑动查找。\n6. 遇到价格区间、时间区间等筛选条件,如果没有完全符合的,可以放宽要求。\n7. 在做小红书总结类任务时一定要筛选图文笔记。\n8. 购物车全选后再点击全选可以把状态设为全不选,在做购物车任务时,如果购物车里已经有商品被选中时,你需要点击全选后再点击取消全选,再去找需要购买或者删除的商品。\n9. 在做外卖任务时,如果相应店铺购物车里已经有其他商品你需要先把购物车清空再去购买用户指定的外卖。\n10. 在做点外卖任务时,如果用户需要点多个外卖,请尽量在同一店铺进行购买,如果无法找到可以下单,并说明某个商品未找到。\n11. 请严格遵循用户意图执行任务,用户的特殊要求可以执行多次搜索,滑动查找。比如(i)用户要求点一杯咖啡,要咸的,你可以直接搜索咸咖啡,或者搜索咖啡后滑动查找咸的咖啡,比如海盐咖啡。(ii)用户要找到XX群,发一条消息,你可以先搜索XX群,找不到结果后,将\"群\"字去掉,搜索XX重试。(iii)用户要找到宠物友好的餐厅,你可以搜索餐厅,找到筛选,找到设施,选择可带宠物,或者直接搜索可带宠物,必要时可以使用AI搜索。\n12. 在选择日期时,如果原滑动方向与预期日期越来越远,请向反方向滑动查找。\n13. 执行任务过程中如果有多个可选择的项目栏,请逐个查找每个项目栏,直到完成任务,一定不要在同一项目栏多次查找,从而陷入死循环。\n14. 在执行下一步操作前请一定要检查上一步的操作是否生效,如果点击没生效,可能因为app反应较慢,请先稍微等待一下,如果还是不生效请调整一下点击位置重试,如果仍然不生效请跳过这一步继续任务,并在finish message说明点击不生效。\n15. 在执行任务中如果遇到滑动不生效的情况,请调整一下起始点位置,增大滑动距离重试,如果还是不生效,有可能是已经滑到底了,请继续向反方向滑动,直到顶部或底部,如果仍然没有符合要求的结果,请跳过这一步继续任务,并在finish message说明但没找到要求的项目。\n16. 在做游戏任务时如果在战斗页面如果有自动战斗一定要开启自动战斗,如果多轮历史状态相似要检查自动战斗是否开启。\n17. 如果没有合适的搜索结果,可能是因为搜索页面不对,请返回到搜索页面的上一级尝试重新搜索,如果尝试三次返回上一级搜索后仍然没有符合要求的结果,执行 finish(message=\"原因\")。\n18. 在结束任务前请一定要仔细检查任务是否完整准确的完成,如果出现错选、漏选、多选的情况,请返回之前的步骤进行纠正。\n`;\n};\n\nexport const getAutoGLMPlanPrompt = (\n vlMode: TVlModeTypes | undefined,\n): string => {\n if (vlMode === 'auto-glm-multilingual') {\n return getAutoGLMMultilingualPlanPrompt();\n } else if (vlMode === 'auto-glm') {\n return getAutoGLMChinesePlanPrompt();\n }\n throw new Error(`Unsupported vlMode for Auto-GLM plan prompt: ${vlMode}`);\n};\n\nexport const getAutoGLMLocatePrompt = (\n vlMode: TVlModeTypes | undefined,\n): string => {\n if (vlMode === 'auto-glm-multilingual') {\n return `\nThe current date: ${getMultilingualFormattedDate()}\n\n# Setup\nYou are a professional Android operation agent assistant that can fulfill the user's high-level instructions. Given a screenshot of the Android interface at each step, you first analyze the situation, then plan the best course of action using Python-style pseudo-code.\n\n# More details about the code\nYour response format must be structured as follows:\n\nThink first: Use <think>...</think> to analyze the current screen, identify key elements, and determine the most efficient action.\nProvide the action: Use <answer>...</answer> to return a single line of pseudo-code representing the operation.\n\nYour output should STRICTLY follow the format:\n<think>\n[Your thought]\n</think>\n<answer>\n[Your operation code]\n</answer>\n\n- **Tap**\n Perform a tap action on a specified screen area. 
The element is a list of 2 integers, representing the coordinates of the tap point.\n **Example**:\n <answer>\n do(action=\"Tap\", element=[x,y])\n </answer>\n\nREMEMBER:\n- Your goal is to locate and tap the UI element specified by the user (e.g., button, icon, link, etc.). Do not attempt any other actions.\n `;\n } else if (vlMode === 'auto-glm') {\n return `\n今天的日期是: ${getChineseFormattedDate()}\n\n你是一个智能体分析专家,可以根据操作历史和当前状态图执行一系列操作来完成任务。\n你必须严格按照要求输出以下格式:\n<think>{think}</think>\n<answer>{action}</answer>\n\n其中:\n- {think} 是对你为什么选择这个操作的简短推理说明。\n- {action} 是本次执行的具体操作指令,必须严格遵循下方定义的指令格式。\n\n操作指令及其作用如下:\n- do(action=\"Tap\", element=[x,y]) \n Tap是点击操作,点击屏幕上的特定点。可用此操作点击按钮、选择项目、从主屏幕打开应用程序,或与任何可点击的用户界面元素进行交互。坐标系统从左上角 (0,0) 开始到右下角(999,999)结束。此操作完成后,您将自动收到结果状态的截图。\n\n必须遵循的规则:\n- 你的目标是定位并点击用户指定的UI元素(例如按钮、图标、链接等),请不要尝试任何其他的操作。\n `;\n }\n throw new Error(`Unsupported vlMode for Auto-GLM locate prompt: ${vlMode}`);\n};\n"],"names":["__webpack_require__","definition","key","Object","obj","prop","Symbol","getMultilingualFormattedDate","today","Date","year","month","String","date","dayOfWeek","getChineseFormattedDate","weekdayNames","weekday","getAutoGLMMultilingualPlanPrompt","getAutoGLMChinesePlanPrompt","getAutoGLMPlanPrompt","vlMode","Error","getAutoGLMLocatePrompt"],"mappings":";;;IAAAA,oBAAoB,CAAC,GAAG,CAAC,UAASC;QACjC,IAAI,IAAIC,OAAOD,WACR,IAAGD,oBAAoB,CAAC,CAACC,YAAYC,QAAQ,CAACF,oBAAoB,CAAC,CAAC,UAASE,MACzEC,OAAO,cAAc,CAAC,UAASD,KAAK;YAAE,YAAY;YAAM,KAAKD,UAAU,CAACC,IAAI;QAAC;IAGzF;;;ICNAF,oBAAoB,CAAC,GAAG,CAACI,KAAKC,OAAUF,OAAO,SAAS,CAAC,cAAc,CAAC,IAAI,CAACC,KAAKC;;;ICClFL,oBAAoB,CAAC,GAAG,CAAC;QACxB,IAAG,AAAkB,eAAlB,OAAOM,UAA0BA,OAAO,WAAW,EACrDH,OAAO,cAAc,CAAC,UAASG,OAAO,WAAW,EAAE;YAAE,OAAO;QAAS;QAEtEH,OAAO,cAAc,CAAC,UAAS,cAAc;YAAE,OAAO;QAAK;IAC5D;;;;;;;;ACyBA,SAASI;IACP,MAAMC,QAAQ,IAAIC;IAClB,MAAMC,OAAOF,MAAM,WAAW;IAC9B,MAAMG,QAAQC,OAAOJ,MAAM,QAAQ,KAAK,GAAG,QAAQ,CAAC,GAAG;IACvD,MAAMK,OAAOD,OAAOJ,MAAM,OAAO,IAAI,QAAQ,CAAC,GAAG;IACjD,MAAMM,YAAY;QAChB;QACA;QACA;QACA;QACA;QACA;QACA;KACD,CAACN,MAAM,MAAM,GAAG;IAEjB,OAAO,GAAGE,KAAK,CAAC,EAAEC,MAAM,CAAC,EAAEE,KAAK,EAAE,EAAEC,WAAW;AACjD;AAKA,SAASC;IACP,MAAMP,QAAQ,IAAIC;IAClB,MAAMC,OAAOF,MAAM,WAAW;IAC9B,MAAMG,QAAQC,OAAOJ,MAAM,QAAQ,KAAK,GAAG,QAAQ,CAAC,GAAG;IACvD,MAAMK,OAAOD,OAAOJ,MAAM,OAAO,IAAI,QAAQ,CAAC,GAAG;IACjD,MAAMQ,eAAe;QACnB;QACA;QACA;QACA;QACA;QACA;QACA;KACD;IACD,MAAMC,UAAUD,YAAY,CAACR,MAAM,MAAM,GAAG;IAE5C,OAAO,GAAGE,KAAK,CAAC,EAAEC,MAAM,CAAC,EAAEE,KAAK,EAAE,EAAEI,SAAS;AAC/C;AAEA,MAAMC,mCAAmC,IAChC,CAAC;kBACQ,EAAEX,+BAA+B;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EAoEjD,CAAC;AAGH,MAAMY,8BAA8B,IAC3B,CAAC;QACF,EAAEJ,0BAA0B;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAyDpC,CAAC;AAGM,MAAMK,uBAAuB,CAClCC;IAEA,IAAIA,AAAW,4BAAXA,QACF,OAAOH;IACF,IAAIG,AAAW,eAAXA,QACT,OAAOF;IAET,MAAM,IAAIG,MAAM,CAAC,6CAA6C,EAAED,QAAQ;AAC1E;AAEO,MAAME,yBAAyB,CACpCF;IAEA,IAAIA,AAAW,4BAAXA,QACF,OAAO,CAAC;kBACM,EAAEd,+BAA+B;;;;;;;;;;;;;;;;;;;;;;;;;;;;EA4BjD,CAAC;IACM,IAAIc,AAAW,eAAXA,QACT,OAAO,CAAC;QACJ,EAAEN,0BAA0B;;;;;;;;;;;;;;;;;IAiBhC,CAAC;IAEH,MAAM,IAAIO,MAAM,CAAC,+CAA+C,EAAED,QAAQ;AAC5E"}
+
{"version":3,"file":"ai-model/auto-glm/prompt.js","sources":["webpack/runtime/define_property_getters","webpack/runtime/has_own_property","webpack/runtime/make_namespace_object","../../../../src/ai-model/auto-glm/prompt.ts"],"sourcesContent":["__webpack_require__.d = (exports, definition) => {\n\tfor(var key in definition) {\n if(__webpack_require__.o(definition, key) && !__webpack_require__.o(exports, key)) {\n Object.defineProperty(exports, key, { enumerable: true, get: definition[key] });\n }\n }\n};","__webpack_require__.o = (obj, prop) => (Object.prototype.hasOwnProperty.call(obj, prop))","// define __esModule on exports\n__webpack_require__.r = (exports) => {\n\tif(typeof Symbol !== 'undefined' && Symbol.toStringTag) {\n\t\tObject.defineProperty(exports, Symbol.toStringTag, { value: 'Module' });\n\t}\n\tObject.defineProperty(exports, '__esModule', { value: true });\n};","/**\n * Auto-GLM Prompt Templates\n *\n * Portions of this file are derived from Open-AutoGLM\n * Copyright (c) 2024 zai-org\n * Licensed under the Apache License, Version 2.0\n *\n * Source: https://github.com/zai-org/Open-AutoGLM\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n * http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n *\n * Modifications:\n * - Adapted prompts for Midscene.js integration\n */\n\nimport type { TModelFamily } from '@midscene/shared/env';\n\n/**\n * Get formatted date string for system prompts\n * @returns Formatted date string like \"2026-01-12, Sunday\"\n */\nfunction getMultilingualFormattedDate(): string {\n const today = new Date();\n const year = today.getFullYear();\n const month = String(today.getMonth() + 1).padStart(2, '0');\n const date = String(today.getDate()).padStart(2, '0');\n const dayOfWeek = [\n 'Sunday',\n 'Monday',\n 'Tuesday',\n 'Wednesday',\n 'Thursday',\n 'Friday',\n 'Saturday',\n ][today.getDay()];\n\n return `${year}-${month}-${date}, ${dayOfWeek}`;\n}\n\n/**\n * Get formatted Chinese date (e.g., \"2026年01月13日 星期一\")\n */\nfunction getChineseFormattedDate(): string {\n const today = new Date();\n const year = today.getFullYear();\n const month = String(today.getMonth() + 1).padStart(2, '0');\n const date = String(today.getDate()).padStart(2, '0');\n const weekdayNames = [\n '星期日',\n '星期一',\n '星期二',\n '星期三',\n '星期四',\n '星期五',\n '星期六',\n ];\n const weekday = weekdayNames[today.getDay()];\n\n return `${year}年${month}月${date}日 ${weekday}`;\n}\n\nconst getAutoGLMMultilingualPlanPrompt = (): string => {\n return `\nThe current date: ${getMultilingualFormattedDate()}\n\n# Setup\nYou are a professional Android operation agent assistant that can fulfill the user's high-level instructions. 
Given a screenshot of the Android interface at each step, you first analyze the situation, then plan the best course of action using Python-style pseudo-code.\n\n# More details about the code\nYour response format must be structured as follows:\n\nThink first: Use <think>...</think> to analyze the current screen, identify key elements, and determine the most efficient action.\nProvide the action: Use <answer>...</answer> to return a single line of pseudo-code representing the operation.\n\nYour output should STRICTLY follow the format:\n<think>\n[Your thought]\n</think>\n<answer>\n[Your operation code]\n</answer>\n\n- **Tap**\n Perform a tap action on a specified screen area. The element is a list of 2 integers, representing the coordinates of the tap point.\n **Example**:\n <answer>\n do(action=\"Tap\", element=[x,y])\n </answer>\n- **Type**\n Enter text into the currently focused input field.\n **Example**:\n <answer>\n do(action=\"Type\", text=\"Hello World\")\n </answer>\n- **Swipe**\n Perform a swipe action with start point and end point.\n **Examples**:\n <answer>\n do(action=\"Swipe\", start=[x1,y1], end=[x2,y2])\n </answer>\n- **Long Press**\n Perform a long press action on a specified screen area.\n You can add the element to the action to specify the long press area. The element is a list of 2 integers, representing the coordinates of the long press point.\n **Example**:\n <answer>\n do(action=\"Long Press\", element=[x,y])\n </answer>\n- **Launch**\n Launch an app. Try to use launch action when you need to launch an app. Check the instruction to choose the right app before you use this action.\n **Example**:\n <answer>\n do(action=\"Launch\", app=\"Settings\")\n </answer>\n- **Back**\n Press the Back button to navigate to the previous screen.\n **Example**:\n <answer>\n do(action=\"Back\")\n </answer>\n- **Finish**\n Terminate the program and optionally print a message.\n **Example**:\n <answer>\n finish(message=\"Task completed.\")\n </answer>\n\n\nREMEMBER:\n- Think before you act: Always analyze the current UI and the best course of action before executing any step, and output in <think> part.\n- Only ONE LINE of action in <answer> part per response: Each step must contain exactly one line of executable code.\n- Generate execution code strictly according to format requirements.\n `;\n};\n\nconst getAutoGLMChinesePlanPrompt = (): string => {\n return `\n今天的日期是: ${getChineseFormattedDate()}\n\n你是一个智能体分析专家,可以根据操作历史和当前状态图执行一系列操作来完成任务。\n你必须严格按照要求输出以下格式:\n<think>{think}</think>\n<answer>{action}</answer>\n\n其中:\n- {think} 是对你为什么选择这个操作的简短推理说明。\n- {action} 是本次执行的具体操作指令,必须严格遵循下方定义的指令格式。\n\n操作指令及其作用如下:\n- do(action=\"Launch\", app=\"xxx\") \n Launch是启动目标app的操作,这比通过主屏幕导航更快。此操作完成后,您将自动收到结果状态的截图。\n- do(action=\"Tap\", element=[x,y]) \n Tap是点击操作,点击屏幕上的特定点。可用此操作点击按钮、选择项目、从主屏幕打开应用程序,或与任何可点击的用户界面元素进行交互。坐标系统从左上角 (0,0) 开始到右下角(999,999)结束。此操作完成后,您将自动收到结果状态的截图。\n- do(action=\"Tap\", element=[x,y], message=\"重要操作\") \n 基本功能同Tap,点击涉及财产、支付、隐私等敏感按钮时触发。\n- do(action=\"Type\", text=\"xxx\") \n Type是输入操作,在当前聚焦的输入框中输入文本。使用此操作前,请确保输入框已被聚焦(先点击它)。输入的文本将像使用键盘输入一样输入。重要提示:手机可能正在使用 ADB 键盘,该键盘不会像普通键盘那样占用屏幕空间。要确认键盘已激活,请查看屏幕底部是否显示 'ADB Keyboard {ON}' 类似的文本,或者检查输入框是否处于激活/高亮状态。不要仅仅依赖视觉上的键盘显示。自动清除文本:当你使用输入操作时,输入框中现有的任何文本(包括占位符文本和实际输入)都会在输入新文本前自动清除。你无需在输入前手动清除文本——直接使用输入操作输入所需文本即可。操作完成后,你将自动收到结果状态的截图。\n- do(action=\"Type_Name\", text=\"xxx\") \n Type_Name是输入人名的操作,基本功能同Type。\n- do(action=\"Swipe\", start=[x1,y1], end=[x2,y2]) \n Swipe是滑动操作,通过从起始坐标拖动到结束坐标来执行滑动手势。可用于滚动内容、在屏幕之间导航、下拉通知栏以及项目栏或进行基于手势的导航。坐标系统从左上角 (0,0) 
开始到右下角(999,999)结束。滑动持续时间会自动调整以实现自然的移动。此操作完成后,您将自动收到结果状态的截图。\n- do(action=\"Long Press\", element=[x,y]) \n Long Pres是长按操作,在屏幕上的特定点长按指定时间。可用于触发上下文菜单、选择文本或激活长按交互。坐标系统从左上角 (0,0) 开始到右下角(999,999)结束。此操作完成后,您将自动收到结果状态的屏幕截图。\n- do(action=\"Double Tap\", element=[x,y]) \n Double Tap在屏幕上的特定点快速连续点按两次。使用此操作可以激活双击交互,如缩放、选择文本或打开项目。坐标系统从左上角 (0,0) 开始到右下角(999,999)结束。此操作完成后,您将自动收到结果状态的截图。\n- do(action=\"Back\") \n 导航返回到上一个屏幕或关闭当前对话框。相当于按下 Android 的返回按钮。使用此操作可以从更深的屏幕返回、关闭弹出窗口或退出当前上下文。此操作完成后,您将自动收到结果状态的截图。\n- do(action=\"Home\") \n Home是回到系统桌面的操作,相当于按下 Android 主屏幕按钮。使用此操作可退出当前应用并返回启动器,或从已知状态启动新任务。此操作完成后,您将自动收到结果状态的截图。\n- do(action=\"Wait\", duration=\"x seconds\") \n 等待页面加载,x为需要等待多少秒。\n- finish(message=\"xxx\") \n finish是结束任务的操作,表示准确完整完成任务,message是终止信息。 \n\n必须遵循的规则:\n0. 严禁调用 Interact、Take_over、Note、Call_API 这四个操作,这些操作暂不支持。\n1. 在执行任何操作前,先检查当前app是否是目标app,如果不是,先执行 Launch。\n2. 如果进入到了无关页面,先执行 Back。如果执行Back后页面没有变化,请点击页面左上角的返回键进行返回,或者右上角的X号关闭。\n3. 如果页面未加载出内容,最多连续 Wait 三次,否则执行 Back重新进入。\n4. 如果页面显示网络问题,需要重新加载,请点击重新加载。\n5. 如果当前页面找不到目标联系人、商品、店铺等信息,可以尝试 Swipe 滑动查找。\n6. 遇到价格区间、时间区间等筛选条件,如果没有完全符合的,可以放宽要求。\n7. 在做小红书总结类任务时一定要筛选图文笔记。\n8. 购物车全选后再点击全选可以把状态设为全不选,在做购物车任务时,如果购物车里已经有商品被选中时,你需要点击全选后再点击取消全选,再去找需要购买或者删除的商品。\n9. 在做外卖任务时,如果相应店铺购物车里已经有其他商品你需要先把购物车清空再去购买用户指定的外卖。\n10. 在做点外卖任务时,如果用户需要点多个外卖,请尽量在同一店铺进行购买,如果无法找到可以下单,并说明某个商品未找到。\n11. 请严格遵循用户意图执行任务,用户的特殊要求可以执行多次搜索,滑动查找。比如(i)用户要求点一杯咖啡,要咸的,你可以直接搜索咸咖啡,或者搜索咖啡后滑动查找咸的咖啡,比如海盐咖啡。(ii)用户要找到XX群,发一条消息,你可以先搜索XX群,找不到结果后,将\"群\"字去掉,搜索XX重试。(iii)用户要找到宠物友好的餐厅,你可以搜索餐厅,找到筛选,找到设施,选择可带宠物,或者直接搜索可带宠物,必要时可以使用AI搜索。\n12. 在选择日期时,如果原滑动方向与预期日期越来越远,请向反方向滑动查找。\n13. 执行任务过程中如果有多个可选择的项目栏,请逐个查找每个项目栏,直到完成任务,一定不要在同一项目栏多次查找,从而陷入死循环。\n14. 在执行下一步操作前请一定要检查上一步的操作是否生效,如果点击没生效,可能因为app反应较慢,请先稍微等待一下,如果还是不生效请调整一下点击位置重试,如果仍然不生效请跳过这一步继续任务,并在finish message说明点击不生效。\n15. 在执行任务中如果遇到滑动不生效的情况,请调整一下起始点位置,增大滑动距离重试,如果还是不生效,有可能是已经滑到底了,请继续向反方向滑动,直到顶部或底部,如果仍然没有符合要求的结果,请跳过这一步继续任务,并在finish message说明但没找到要求的项目。\n16. 在做游戏任务时如果在战斗页面如果有自动战斗一定要开启自动战斗,如果多轮历史状态相似要检查自动战斗是否开启。\n17. 如果没有合适的搜索结果,可能是因为搜索页面不对,请返回到搜索页面的上一级尝试重新搜索,如果尝试三次返回上一级搜索后仍然没有符合要求的结果,执行 finish(message=\"原因\")。\n18. 在结束任务前请一定要仔细检查任务是否完整准确的完成,如果出现错选、漏选、多选的情况,请返回之前的步骤进行纠正。\n`;\n};\n\nexport const getAutoGLMPlanPrompt = (\n modelFamily: TModelFamily | undefined,\n): string => {\n if (modelFamily === 'auto-glm-multilingual') {\n return getAutoGLMMultilingualPlanPrompt();\n } else if (modelFamily === 'auto-glm') {\n return getAutoGLMChinesePlanPrompt();\n }\n throw new Error(\n `Unsupported modelFamily for Auto-GLM plan prompt: ${modelFamily}`,\n );\n};\n\nexport const getAutoGLMLocatePrompt = (\n modelFamily: TModelFamily | undefined,\n): string => {\n if (modelFamily === 'auto-glm-multilingual') {\n return `\nThe current date: ${getMultilingualFormattedDate()}\n\n# Setup\nYou are a professional Android operation agent assistant that can fulfill the user's high-level instructions. Given a screenshot of the Android interface at each step, you first analyze the situation, then plan the best course of action using Python-style pseudo-code.\n\n# More details about the code\nYour response format must be structured as follows:\n\nThink first: Use <think>...</think> to analyze the current screen, identify key elements, and determine the most efficient action.\nProvide the action: Use <answer>...</answer> to return a single line of pseudo-code representing the operation.\n\nYour output should STRICTLY follow the format:\n<think>\n[Your thought]\n</think>\n<answer>\n[Your operation code]\n</answer>\n\n- **Tap**\n Perform a tap action on a specified screen area. 
The element is a list of 2 integers, representing the coordinates of the tap point.\n **Example**:\n <answer>\n do(action=\"Tap\", element=[x,y])\n </answer>\n\nREMEMBER:\n- Your goal is to locate and tap the UI element specified by the user (e.g., button, icon, link, etc.). Do not attempt any other actions.\n `;\n } else if (modelFamily === 'auto-glm') {\n return `\n今天的日期是: ${getChineseFormattedDate()}\n\n你是一个智能体分析专家,可以根据操作历史和当前状态图执行一系列操作来完成任务。\n你必须严格按照要求输出以下格式:\n<think>{think}</think>\n<answer>{action}</answer>\n\n其中:\n- {think} 是对你为什么选择这个操作的简短推理说明。\n- {action} 是本次执行的具体操作指令,必须严格遵循下方定义的指令格式。\n\n操作指令及其作用如下:\n- do(action=\"Tap\", element=[x,y]) \n Tap是点击操作,点击屏幕上的特定点。可用此操作点击按钮、选择项目、从主屏幕打开应用程序,或与任何可点击的用户界面元素进行交互。坐标系统从左上角 (0,0) 开始到右下角(999,999)结束。此操作完成后,您将自动收到结果状态的截图。\n\n必须遵循的规则:\n- 你的目标是定位并点击用户指定的UI元素(例如按钮、图标、链接等),请不要尝试任何其他的操作。\n `;\n }\n throw new Error(\n `Unsupported modelFamily for Auto-GLM locate prompt: ${modelFamily}`,\n );\n};\n"],"names":["__webpack_require__","definition","key","Object","obj","prop","Symbol","getMultilingualFormattedDate","today","Date","year","month","String","date","dayOfWeek","getChineseFormattedDate","weekdayNames","weekday","getAutoGLMMultilingualPlanPrompt","getAutoGLMChinesePlanPrompt","getAutoGLMPlanPrompt","modelFamily","Error","getAutoGLMLocatePrompt"],"mappings":";;;IAAAA,oBAAoB,CAAC,GAAG,CAAC,UAASC;QACjC,IAAI,IAAIC,OAAOD,WACR,IAAGD,oBAAoB,CAAC,CAACC,YAAYC,QAAQ,CAACF,oBAAoB,CAAC,CAAC,UAASE,MACzEC,OAAO,cAAc,CAAC,UAASD,KAAK;YAAE,YAAY;YAAM,KAAKD,UAAU,CAACC,IAAI;QAAC;IAGzF;;;ICNAF,oBAAoB,CAAC,GAAG,CAACI,KAAKC,OAAUF,OAAO,SAAS,CAAC,cAAc,CAAC,IAAI,CAACC,KAAKC;;;ICClFL,oBAAoB,CAAC,GAAG,CAAC;QACxB,IAAG,AAAkB,eAAlB,OAAOM,UAA0BA,OAAO,WAAW,EACrDH,OAAO,cAAc,CAAC,UAASG,OAAO,WAAW,EAAE;YAAE,OAAO;QAAS;QAEtEH,OAAO,cAAc,CAAC,UAAS,cAAc;YAAE,OAAO;QAAK;IAC5D;;;;;;;;ACyBA,SAASI;IACP,MAAMC,QAAQ,IAAIC;IAClB,MAAMC,OAAOF,MAAM,WAAW;IAC9B,MAAMG,QAAQC,OAAOJ,MAAM,QAAQ,KAAK,GAAG,QAAQ,CAAC,GAAG;IACvD,MAAMK,OAAOD,OAAOJ,MAAM,OAAO,IAAI,QAAQ,CAAC,GAAG;IACjD,MAAMM,YAAY;QAChB;QACA;QACA;QACA;QACA;QACA;QACA;KACD,CAACN,MAAM,MAAM,GAAG;IAEjB,OAAO,GAAGE,KAAK,CAAC,EAAEC,MAAM,CAAC,EAAEE,KAAK,EAAE,EAAEC,WAAW;AACjD;AAKA,SAASC;IACP,MAAMP,QAAQ,IAAIC;IAClB,MAAMC,OAAOF,MAAM,WAAW;IAC9B,MAAMG,QAAQC,OAAOJ,MAAM,QAAQ,KAAK,GAAG,QAAQ,CAAC,GAAG;IACvD,MAAMK,OAAOD,OAAOJ,MAAM,OAAO,IAAI,QAAQ,CAAC,GAAG;IACjD,MAAMQ,eAAe;QACnB;QACA;QACA;QACA;QACA;QACA;QACA;KACD;IACD,MAAMC,UAAUD,YAAY,CAACR,MAAM,MAAM,GAAG;IAE5C,OAAO,GAAGE,KAAK,CAAC,EAAEC,MAAM,CAAC,EAAEE,KAAK,EAAE,EAAEI,SAAS;AAC/C;AAEA,MAAMC,mCAAmC,IAChC,CAAC;kBACQ,EAAEX,+BAA+B;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EAoEjD,CAAC;AAGH,MAAMY,8BAA8B,IAC3B,CAAC;QACF,EAAEJ,0BAA0B;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAyDpC,CAAC;AAGM,MAAMK,uBAAuB,CAClCC;IAEA,IAAIA,AAAgB,4BAAhBA,aACF,OAAOH;IACF,IAAIG,AAAgB,eAAhBA,aACT,OAAOF;IAET,MAAM,IAAIG,MACR,CAAC,kDAAkD,EAAED,aAAa;AAEtE;AAEO,MAAME,yBAAyB,CACpCF;IAEA,IAAIA,AAAgB,4BAAhBA,aACF,OAAO,CAAC;kBACM,EAAEd,+BAA+B;;;;;;;;;;;;;;;;;;;;;;;;;;;;EA4BjD,CAAC;IACM,IAAIc,AAAgB,eAAhBA,aACT,OAAO,CAAC;QACJ,EAAEN,0BAA0B;;;;;;;;;;;;;;;;;IAiBhC,CAAC;IAEH,MAAM,IAAIO,MACR,CAAC,oDAAoD,EAAED,aAAa;AAExE"}
package/dist/lib/ai-model/auto-glm/util.js
@@ -24,14 +24,20 @@ var __webpack_require__ = {};
 var __webpack_exports__ = {};
 __webpack_require__.r(__webpack_exports__);
 __webpack_require__.d(__webpack_exports__, {
-    isAutoGLM: ()=>isAutoGLM
+    isAutoGLM: ()=>isAutoGLM,
+    isUITars: ()=>isUITars
 });
-function isAutoGLM(
-    return 'auto-glm' ===
+function isAutoGLM(modelFamily) {
+    return 'auto-glm' === modelFamily || 'auto-glm-multilingual' === modelFamily;
+}
+function isUITars(modelFamily) {
+    return 'vlm-ui-tars' === modelFamily || 'vlm-ui-tars-doubao' === modelFamily || 'vlm-ui-tars-doubao-1.5' === modelFamily;
 }
 exports.isAutoGLM = __webpack_exports__.isAutoGLM;
+exports.isUITars = __webpack_exports__.isUITars;
 for(var __rspack_i in __webpack_exports__)if (-1 === [
-    "isAutoGLM"
+    "isAutoGLM",
+    "isUITars"
 ].indexOf(__rspack_i)) exports[__rspack_i] = __webpack_exports__[__rspack_i];
 Object.defineProperty(exports, '__esModule', {
     value: true
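The uncompiled TypeScript for the two guards, copied from the `sourcesContent` embedded in the updated sourcemap below, followed by a small usage sketch (the sample strings are the exact family names the guards test for).

```ts
import type { TModelFamily } from '@midscene/shared/env';

// True for either Auto-GLM family.
export function isAutoGLM(modelFamily: TModelFamily | undefined): boolean {
  return modelFamily === 'auto-glm' || modelFamily === 'auto-glm-multilingual';
}

// True for any UI-TARS variant.
export function isUITars(modelFamily: TModelFamily | undefined): boolean {
  return (
    modelFamily === 'vlm-ui-tars' ||
    modelFamily === 'vlm-ui-tars-doubao' ||
    modelFamily === 'vlm-ui-tars-doubao-1.5'
  );
}

// Usage sketch:
isAutoGLM('auto-glm-multilingual'); // true
isUITars('vlm-ui-tars-doubao-1.5'); // true
isUITars('auto-glm'); // false
```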
package/dist/lib/ai-model/auto-glm/util.js.map
@@ -1 +1 @@
-
{"version":3,"file":"ai-model/auto-glm/util.js","sources":["webpack/runtime/define_property_getters","webpack/runtime/has_own_property","webpack/runtime/make_namespace_object","../../../../src/ai-model/auto-glm/util.ts"],"sourcesContent":["__webpack_require__.d = (exports, definition) => {\n\tfor(var key in definition) {\n if(__webpack_require__.o(definition, key) && !__webpack_require__.o(exports, key)) {\n Object.defineProperty(exports, key, { enumerable: true, get: definition[key] });\n }\n }\n};","__webpack_require__.o = (obj, prop) => (Object.prototype.hasOwnProperty.call(obj, prop))","// define __esModule on exports\n__webpack_require__.r = (exports) => {\n\tif(typeof Symbol !== 'undefined' && Symbol.toStringTag) {\n\t\tObject.defineProperty(exports, Symbol.toStringTag, { value: 'Module' });\n\t}\n\tObject.defineProperty(exports, '__esModule', { value: true });\n};","import type {
+
{"version":3,"file":"ai-model/auto-glm/util.js","sources":["webpack/runtime/define_property_getters","webpack/runtime/has_own_property","webpack/runtime/make_namespace_object","../../../../src/ai-model/auto-glm/util.ts"],"sourcesContent":["__webpack_require__.d = (exports, definition) => {\n\tfor(var key in definition) {\n if(__webpack_require__.o(definition, key) && !__webpack_require__.o(exports, key)) {\n Object.defineProperty(exports, key, { enumerable: true, get: definition[key] });\n }\n }\n};","__webpack_require__.o = (obj, prop) => (Object.prototype.hasOwnProperty.call(obj, prop))","// define __esModule on exports\n__webpack_require__.r = (exports) => {\n\tif(typeof Symbol !== 'undefined' && Symbol.toStringTag) {\n\t\tObject.defineProperty(exports, Symbol.toStringTag, { value: 'Module' });\n\t}\n\tObject.defineProperty(exports, '__esModule', { value: true });\n};","import type { TModelFamily } from '@midscene/shared/env';\n\n/**\n * Check if the modelFamily is auto-glm or auto-glm-multilingual\n * @param modelFamily The model family to check\n * @returns true if modelFamily is auto-glm or auto-glm-multilingual\n */\nexport function isAutoGLM(modelFamily: TModelFamily | undefined): boolean {\n return modelFamily === 'auto-glm' || modelFamily === 'auto-glm-multilingual';\n}\n\n/**\n * Check if the modelFamily is a UI-TARS variant\n * @param modelFamily The model family to check\n * @returns true if modelFamily is any UI-TARS variant\n */\nexport function isUITars(modelFamily: TModelFamily | undefined): boolean {\n return (\n modelFamily === 'vlm-ui-tars' ||\n modelFamily === 'vlm-ui-tars-doubao' ||\n modelFamily === 'vlm-ui-tars-doubao-1.5'\n );\n}\n"],"names":["__webpack_require__","definition","key","Object","obj","prop","Symbol","isAutoGLM","modelFamily","isUITars"],"mappings":";;;IAAAA,oBAAoB,CAAC,GAAG,CAAC,UAASC;QACjC,IAAI,IAAIC,OAAOD,WACR,IAAGD,oBAAoB,CAAC,CAACC,YAAYC,QAAQ,CAACF,oBAAoB,CAAC,CAAC,UAASE,MACzEC,OAAO,cAAc,CAAC,UAASD,KAAK;YAAE,YAAY;YAAM,KAAKD,UAAU,CAACC,IAAI;QAAC;IAGzF;;;ICNAF,oBAAoB,CAAC,GAAG,CAACI,KAAKC,OAAUF,OAAO,SAAS,CAAC,cAAc,CAAC,IAAI,CAACC,KAAKC;;;ICClFL,oBAAoB,CAAC,GAAG,CAAC;QACxB,IAAG,AAAkB,eAAlB,OAAOM,UAA0BA,OAAO,WAAW,EACrDH,OAAO,cAAc,CAAC,UAASG,OAAO,WAAW,EAAE;YAAE,OAAO;QAAS;QAEtEH,OAAO,cAAc,CAAC,UAAS,cAAc;YAAE,OAAO;QAAK;IAC5D;;;;;;;;ACCO,SAASI,UAAUC,WAAqC;IAC7D,OAAOA,AAAgB,eAAhBA,eAA8BA,AAAgB,4BAAhBA;AACvC;AAOO,SAASC,SAASD,WAAqC;IAC5D,OACEA,AAAgB,kBAAhBA,eACAA,AAAgB,yBAAhBA,eACAA,AAAgB,6BAAhBA;AAEJ"}
package/dist/lib/ai-model/index.js
@@ -24,7 +24,6 @@ var __webpack_require__ = {};
 var __webpack_exports__ = {};
 __webpack_require__.r(__webpack_exports__);
 __webpack_require__.d(__webpack_exports__, {
-    resizeImageForUiTars: ()=>external_ui_tars_planning_js_namespaceObject.resizeImageForUiTars,
     callAIWithObjectResponse: ()=>index_js_namespaceObject.callAIWithObjectResponse,
     TUserPromptSchema: ()=>external_common_js_namespaceObject.TUserPromptSchema,
     generatePlaywrightTest: ()=>playwright_generator_js_namespaceObject.generatePlaywrightTest,
@@ -89,7 +88,6 @@ exports.generateYamlTestStream = __webpack_exports__.generateYamlTestStream;
 exports.getMidsceneLocationSchema = __webpack_exports__.getMidsceneLocationSchema;
 exports.parseActionParam = __webpack_exports__.parseActionParam;
 exports.plan = __webpack_exports__.plan;
-exports.resizeImageForUiTars = __webpack_exports__.resizeImageForUiTars;
 exports.systemPromptToLocateElement = __webpack_exports__.systemPromptToLocateElement;
 exports.uiTarsPlanning = __webpack_exports__.uiTarsPlanning;
 for(var __rspack_i in __webpack_exports__)if (-1 === [
@@ -118,7 +116,6 @@ for(var __rspack_i in __webpack_exports__)if (-1 === [
     "getMidsceneLocationSchema",
     "parseActionParam",
     "plan",
-    "resizeImageForUiTars",
     "systemPromptToLocateElement",
     "uiTarsPlanning"
 ].indexOf(__rspack_i)) exports[__rspack_i] = __webpack_exports__[__rspack_i];
package/dist/lib/ai-model/inspect.js
@@ -89,12 +89,12 @@ const promptsToChatParam = async (multimodalPrompt)=>{
 };
 async function AiLocateElement(options) {
     const { context, targetElementDescription, callAIFn, modelConfig } = options;
-    const {
+    const { modelFamily } = modelConfig;
     const screenshotBase64 = context.screenshot.base64;
     (0, utils_namespaceObject.assert)(targetElementDescription, "cannot find the target element description");
     const targetElementDescriptionText = extraTextFromUserPrompt(targetElementDescription);
     const userInstructionPrompt = (0, llm_locator_js_namespaceObject.findElementPrompt)(targetElementDescriptionText);
-    const systemPrompt = (0, util_js_namespaceObject.isAutoGLM)(
+    const systemPrompt = (0, util_js_namespaceObject.isAutoGLM)(modelFamily) ? (0, prompt_js_namespaceObject.getAutoGLMLocatePrompt)(modelFamily) : (0, llm_locator_js_namespaceObject.systemPromptToLocateElement)(modelFamily);
     let imagePayload = screenshotBase64;
     let imageWidth = context.size.width;
     let imageHeight = context.size.height;
@@ -108,7 +108,7 @@ async function AiLocateElement(options) {
         imageHeight = options.searchConfig.rect?.height;
         originalImageWidth = imageWidth;
         originalImageHeight = imageHeight;
-    } else if ('qwen2.5-vl' ===
+    } else if ('qwen2.5-vl' === modelFamily) {
         const paddedResult = await (0, img_namespaceObject.paddingToMatchBlockByBase64)(imagePayload);
         imageWidth = paddedResult.width;
         imageHeight = paddedResult.height;
@@ -131,7 +131,7 @@ async function AiLocateElement(options) {
             },
             {
                 type: 'text',
-                text: (0, util_js_namespaceObject.isAutoGLM)(
+                text: (0, util_js_namespaceObject.isAutoGLM)(modelFamily) ? `Tap: ${userInstructionPrompt}` : userInstructionPrompt
             }
         ]
     }
@@ -143,7 +143,7 @@ async function AiLocateElement(options) {
         });
         msgs.push(...addOns);
     }
-    if ((0, util_js_namespaceObject.isAutoGLM)(
+    if ((0, util_js_namespaceObject.isAutoGLM)(modelFamily)) {
         const { content: rawResponseContent, usage } = await (0, index_js_namespaceObject.callAIWithStringResponse)(msgs, modelConfig);
         debugInspect('auto-glm rawResponse:', rawResponseContent);
         const parsed = (0, parser_js_namespaceObject.parseAutoGLMLocateResponse)(rawResponseContent);
@@ -212,7 +212,7 @@ async function AiLocateElement(options) {
     let errors = 'errors' in res.content ? res.content.errors : [];
     try {
         if ('bbox' in res.content && Array.isArray(res.content.bbox) && res.content.bbox.length >= 1) {
-            resRect = (0, external_common_js_namespaceObject.adaptBboxToRect)(res.content.bbox, imageWidth, imageHeight, options.searchConfig?.rect?.left, options.searchConfig?.rect?.top, originalImageWidth, originalImageHeight,
+            resRect = (0, external_common_js_namespaceObject.adaptBboxToRect)(res.content.bbox, imageWidth, imageHeight, options.searchConfig?.rect?.left, options.searchConfig?.rect?.top, originalImageWidth, originalImageHeight, modelFamily);
             debugInspect('resRect', resRect);
             const rectCenter = {
                 x: resRect.left + resRect.width / 2,
@@ -244,9 +244,9 @@ async function AiLocateElement(options) {
 }
 async function AiLocateSection(options) {
     const { context, sectionDescription, modelConfig } = options;
-    const {
+    const { modelFamily } = modelConfig;
     const screenshotBase64 = context.screenshot.base64;
-    const systemPrompt = (0, llm_section_locator_js_namespaceObject.systemPromptToLocateSection)(
+    const systemPrompt = (0, llm_section_locator_js_namespaceObject.systemPromptToLocateSection)(modelFamily);
     const sectionLocatorInstructionText = (0, llm_section_locator_js_namespaceObject.sectionLocatorInstruction)(extraTextFromUserPrompt(sectionDescription));
     const msgs = [
         {
@@ -281,23 +281,23 @@ async function AiLocateSection(options) {
     let sectionRect;
     const sectionBbox = result.content.bbox;
     if (sectionBbox) {
-        const targetRect = (0, external_common_js_namespaceObject.adaptBboxToRect)(sectionBbox, context.size.width, context.size.height, 0, 0, context.size.width, context.size.height,
+        const targetRect = (0, external_common_js_namespaceObject.adaptBboxToRect)(sectionBbox, context.size.width, context.size.height, 0, 0, context.size.width, context.size.height, modelFamily);
         debugSection('original targetRect %j', targetRect);
         const referenceBboxList = result.content.references_bbox || [];
         debugSection('referenceBboxList %j', referenceBboxList);
-        const referenceRects = referenceBboxList.filter((bbox)=>Array.isArray(bbox)).map((bbox)=>(0, external_common_js_namespaceObject.adaptBboxToRect)(bbox, context.size.width, context.size.height, 0, 0, context.size.width, context.size.height,
+        const referenceRects = referenceBboxList.filter((bbox)=>Array.isArray(bbox)).map((bbox)=>(0, external_common_js_namespaceObject.adaptBboxToRect)(bbox, context.size.width, context.size.height, 0, 0, context.size.width, context.size.height, modelFamily));
         debugSection('referenceRects %j', referenceRects);
         const mergedRect = (0, external_common_js_namespaceObject.mergeRects)([
             targetRect,
             ...referenceRects
         ]);
         debugSection('mergedRect %j', mergedRect);
-        sectionRect = (0, external_common_js_namespaceObject.expandSearchArea)(mergedRect, context.size,
+        sectionRect = (0, external_common_js_namespaceObject.expandSearchArea)(mergedRect, context.size, modelFamily);
         debugSection('expanded sectionRect %j', sectionRect);
     }
     let imageBase64 = screenshotBase64;
     if (sectionRect) {
-        const croppedResult = await (0, img_namespaceObject.cropByRect)(screenshotBase64, sectionRect, 'qwen2.5-vl' ===
+        const croppedResult = await (0, img_namespaceObject.cropByRect)(screenshotBase64, sectionRect, 'qwen2.5-vl' === modelFamily);
         imageBase64 = croppedResult.imageBase64;
         sectionRect.width = croppedResult.width;
         sectionRect.height = croppedResult.height;
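A condensed sketch of the threading change in these hunks: `modelFamily` is destructured from `modelConfig` once per function and passed as the new trailing argument to `adaptBboxToRect`, `expandSearchArea`, and the `cropByRect` qwen2.5-vl check. The declaration below is inferred from the call sites above; parameter names and types are illustrative, not authoritative.

```ts
import type { TModelFamily } from '@midscene/shared/env';

interface Rect {
  left: number;
  top: number;
  width: number;
  height: number;
}

// Shape inferred from the AiLocateElement call site above; the trailing
// modelFamily parameter replaces the previous vlMode argument.
declare function adaptBboxToRect(
  bbox: number[],
  imageWidth: number,
  imageHeight: number,
  offsetLeft: number | undefined,
  offsetTop: number | undefined,
  originalImageWidth: number,
  originalImageHeight: number,
  modelFamily: TModelFamily | undefined,
): Rect;

declare const modelConfig: { modelFamily: TModelFamily | undefined };

// Hypothetical call with made-up bbox and screen dimensions.
const { modelFamily } = modelConfig;
const rect = adaptBboxToRect(
  [120, 340, 220, 410],
  1080,
  1920,
  undefined,
  undefined,
  1080,
  1920,
  modelFamily,
);
```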
package/dist/lib/ai-model/inspect.js.map
@@ -1 +1 @@
-
{"version":3,"file":"ai-model/inspect.js","sources":["webpack/runtime/define_property_getters","webpack/runtime/has_own_property","webpack/runtime/make_namespace_object","../../../src/ai-model/inspect.ts"],"sourcesContent":["__webpack_require__.d = (exports, definition) => {\n\tfor(var key in definition) {\n if(__webpack_require__.o(definition, key) && !__webpack_require__.o(exports, key)) {\n Object.defineProperty(exports, key, { enumerable: true, get: definition[key] });\n }\n }\n};","__webpack_require__.o = (obj, prop) => (Object.prototype.hasOwnProperty.call(obj, prop))","// define __esModule on exports\n__webpack_require__.r = (exports) => {\n\tif(typeof Symbol !== 'undefined' && Symbol.toStringTag) {\n\t\tObject.defineProperty(exports, Symbol.toStringTag, { value: 'Module' });\n\t}\n\tObject.defineProperty(exports, '__esModule', { value: true });\n};","import type {\n AIDataExtractionResponse,\n AIElementResponse,\n AISectionLocatorResponse,\n AIUsageInfo,\n Rect,\n ReferenceImage,\n ServiceExtractOption,\n UIContext,\n} from '@/types';\nimport type { IModelConfig } from '@midscene/shared/env';\nimport { generateElementByPosition } from '@midscene/shared/extractor/dom-util';\nimport {\n cropByRect,\n paddingToMatchBlockByBase64,\n preProcessImageUrl,\n} from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport type { LocateResultElement } from '@midscene/shared/types';\nimport { assert } from '@midscene/shared/utils';\nimport type {\n ChatCompletionSystemMessageParam,\n ChatCompletionUserMessageParam,\n} from 'openai/resources/index';\nimport type { TMultimodalPrompt, TUserPrompt } from '../common';\nimport { adaptBboxToRect, expandSearchArea, mergeRects } from '../common';\nimport { parseAutoGLMLocateResponse } from './auto-glm/parser';\nimport { getAutoGLMLocatePrompt } from './auto-glm/prompt';\nimport { isAutoGLM } from './auto-glm/util';\nimport {\n extractDataQueryPrompt,\n systemPromptToExtract,\n} from './prompt/extraction';\nimport {\n findElementPrompt,\n systemPromptToLocateElement,\n} from './prompt/llm-locator';\nimport {\n sectionLocatorInstruction,\n systemPromptToLocateSection,\n} from './prompt/llm-section-locator';\nimport {\n orderSensitiveJudgePrompt,\n systemPromptToJudgeOrderSensitive,\n} from './prompt/order-sensitive-judge';\nimport {\n callAIWithObjectResponse,\n callAIWithStringResponse,\n} from './service-caller/index';\n\nexport type AIArgs = [\n ChatCompletionSystemMessageParam,\n ...ChatCompletionUserMessageParam[],\n];\n\nconst debugInspect = getDebug('ai:inspect');\nconst debugSection = getDebug('ai:section');\n\nconst extraTextFromUserPrompt = (prompt: TUserPrompt): string => {\n if (typeof prompt === 'string') {\n return prompt;\n } else {\n return prompt.prompt;\n }\n};\n\nconst promptsToChatParam = async (\n multimodalPrompt: TMultimodalPrompt,\n): Promise<ChatCompletionUserMessageParam[]> => {\n const msgs: ChatCompletionUserMessageParam[] = [];\n if (multimodalPrompt?.images?.length) {\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'text',\n text: 'Next, I will provide all the reference images.',\n },\n ],\n });\n\n for (const item of multimodalPrompt.images) {\n const base64 = await preProcessImageUrl(\n item.url,\n !!multimodalPrompt.convertHttpImage2Base64,\n );\n\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'text',\n text: `this is the reference image named '${item.name}':`,\n },\n ],\n });\n\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: base64,\n 
detail: 'high',\n },\n },\n ],\n });\n }\n }\n return msgs;\n};\n\nexport async function AiLocateElement(options: {\n context: UIContext;\n targetElementDescription: TUserPrompt;\n callAIFn: typeof callAIWithObjectResponse<\n AIElementResponse | [number, number]\n >;\n searchConfig?: Awaited<ReturnType<typeof AiLocateSection>>;\n modelConfig: IModelConfig;\n}): Promise<{\n parseResult: {\n elements: LocateResultElement[];\n errors?: string[];\n };\n rect?: Rect;\n rawResponse: string;\n usage?: AIUsageInfo;\n reasoning_content?: string;\n}> {\n const { context, targetElementDescription, callAIFn, modelConfig } = options;\n const { vlMode } = modelConfig;\n const screenshotBase64 = context.screenshot.base64;\n\n assert(\n targetElementDescription,\n 'cannot find the target element description',\n );\n const targetElementDescriptionText = extraTextFromUserPrompt(\n targetElementDescription,\n );\n const userInstructionPrompt = findElementPrompt(targetElementDescriptionText);\n const systemPrompt = isAutoGLM(vlMode)\n ? getAutoGLMLocatePrompt(vlMode)\n : systemPromptToLocateElement(vlMode);\n\n let imagePayload = screenshotBase64;\n let imageWidth = context.size.width;\n let imageHeight = context.size.height;\n let originalImageWidth = imageWidth;\n let originalImageHeight = imageHeight;\n\n if (options.searchConfig) {\n assert(\n options.searchConfig.rect,\n 'searchArea is provided but its rect cannot be found. Failed to locate element',\n );\n assert(\n options.searchConfig.imageBase64,\n 'searchArea is provided but its imageBase64 cannot be found. Failed to locate element',\n );\n\n imagePayload = options.searchConfig.imageBase64;\n imageWidth = options.searchConfig.rect?.width;\n imageHeight = options.searchConfig.rect?.height;\n originalImageWidth = imageWidth;\n originalImageHeight = imageHeight;\n } else if (vlMode === 'qwen2.5-vl') {\n const paddedResult = await paddingToMatchBlockByBase64(imagePayload);\n imageWidth = paddedResult.width;\n imageHeight = paddedResult.height;\n imagePayload = paddedResult.imageBase64;\n }\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: isAutoGLM(vlMode)\n ? 
`Tap: ${userInstructionPrompt}`\n : userInstructionPrompt,\n },\n ],\n },\n ];\n\n if (typeof targetElementDescription !== 'string') {\n const addOns = await promptsToChatParam({\n images: targetElementDescription.images,\n convertHttpImage2Base64: targetElementDescription.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n if (isAutoGLM(vlMode)) {\n const { content: rawResponseContent, usage } =\n await callAIWithStringResponse(msgs, modelConfig);\n\n debugInspect('auto-glm rawResponse:', rawResponseContent);\n\n const parsed = parseAutoGLMLocateResponse(rawResponseContent);\n\n debugInspect('auto-glm thinking:', parsed.think);\n debugInspect('auto-glm coordinates:', parsed.coordinates);\n\n let resRect: Rect | undefined;\n let matchedElements: LocateResultElement[] = [];\n let errors: string[] = [];\n\n if (parsed.error || !parsed.coordinates) {\n errors = [parsed.error || 'Failed to parse auto-glm response'];\n debugInspect('auto-glm parse error:', errors[0]);\n } else {\n const { x, y } = parsed.coordinates;\n\n debugInspect('auto-glm coordinates [0-999]:', { x, y });\n\n // Convert auto-glm coordinates [0,999] to pixel bbox\n // Map from [0,999] to pixel coordinates\n const pixelX = Math.round((x * imageWidth) / 1000);\n const pixelY = Math.round((y * imageHeight) / 1000);\n\n debugInspect('auto-glm pixel coordinates:', { pixelX, pixelY });\n\n // Create a small bbox around the point\n const bboxSize = 10;\n const x1 = Math.max(pixelX - bboxSize / 2, 0);\n const y1 = Math.max(pixelY - bboxSize / 2, 0);\n const x2 = Math.min(pixelX + bboxSize / 2, imageWidth);\n const y2 = Math.min(pixelY + bboxSize / 2, imageHeight);\n\n // Convert to Rect format\n resRect = {\n left: x1,\n top: y1,\n width: x2 - x1,\n height: y2 - y1,\n };\n\n // Apply offset if searching in a cropped area\n if (options.searchConfig?.rect) {\n resRect.left += options.searchConfig.rect.left;\n resRect.top += options.searchConfig.rect.top;\n }\n\n debugInspect('auto-glm resRect:', resRect);\n\n const rectCenter = {\n x: resRect.left + resRect.width / 2,\n y: resRect.top + resRect.height / 2,\n };\n\n const element: LocateResultElement = generateElementByPosition(\n rectCenter,\n targetElementDescriptionText as string,\n );\n\n if (element) {\n matchedElements = [element];\n }\n }\n\n return {\n rect: resRect,\n parseResult: {\n elements: matchedElements,\n errors,\n },\n rawResponse: rawResponseContent,\n usage,\n reasoning_content: parsed.think,\n };\n }\n\n const res = await callAIFn(msgs, modelConfig);\n\n const rawResponse = JSON.stringify(res.content);\n\n let resRect: Rect | undefined;\n let matchedElements: LocateResultElement[] = [];\n let errors: string[] | undefined =\n 'errors' in res.content ? res.content.errors : [];\n try {\n if (\n 'bbox' in res.content &&\n Array.isArray(res.content.bbox) &&\n res.content.bbox.length >= 1\n ) {\n resRect = adaptBboxToRect(\n res.content.bbox,\n imageWidth,\n imageHeight,\n options.searchConfig?.rect?.left,\n options.searchConfig?.rect?.top,\n originalImageWidth,\n originalImageHeight,\n vlMode,\n );\n\n debugInspect('resRect', resRect);\n\n const rectCenter = {\n x: resRect.left + resRect.width / 2,\n y: resRect.top + resRect.height / 2,\n };\n\n const element: LocateResultElement = generateElementByPosition(\n rectCenter,\n targetElementDescriptionText as string,\n );\n errors = [];\n\n if (element) {\n matchedElements = [element];\n }\n }\n } catch (e) {\n const msg =\n e instanceof Error\n ? 
`Failed to parse bbox: ${e.message}`\n : 'unknown error in locate';\n if (!errors || errors?.length === 0) {\n errors = [msg];\n } else {\n errors.push(`(${msg})`);\n }\n }\n\n return {\n rect: resRect,\n parseResult: {\n elements: matchedElements as LocateResultElement[],\n errors: errors as string[],\n },\n rawResponse,\n usage: res.usage,\n reasoning_content: res.reasoning_content,\n };\n}\n\nexport async function AiLocateSection(options: {\n context: UIContext;\n sectionDescription: TUserPrompt;\n modelConfig: IModelConfig;\n}): Promise<{\n rect?: Rect;\n imageBase64?: string;\n error?: string;\n rawResponse: string;\n usage?: AIUsageInfo;\n}> {\n const { context, sectionDescription, modelConfig } = options;\n const { vlMode } = modelConfig;\n const screenshotBase64 = context.screenshot.base64;\n\n const systemPrompt = systemPromptToLocateSection(vlMode);\n const sectionLocatorInstructionText = sectionLocatorInstruction(\n extraTextFromUserPrompt(sectionDescription),\n );\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: screenshotBase64,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: sectionLocatorInstructionText,\n },\n ],\n },\n ];\n\n if (typeof sectionDescription !== 'string') {\n const addOns = await promptsToChatParam({\n images: sectionDescription.images,\n convertHttpImage2Base64: sectionDescription.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n const result = await callAIWithObjectResponse<AISectionLocatorResponse>(\n msgs,\n modelConfig,\n );\n\n let sectionRect: Rect | undefined;\n const sectionBbox = result.content.bbox;\n if (sectionBbox) {\n const targetRect = adaptBboxToRect(\n sectionBbox,\n context.size.width,\n context.size.height,\n 0,\n 0,\n context.size.width,\n context.size.height,\n vlMode,\n );\n debugSection('original targetRect %j', targetRect);\n\n const referenceBboxList = result.content.references_bbox || [];\n debugSection('referenceBboxList %j', referenceBboxList);\n\n const referenceRects = referenceBboxList\n .filter((bbox) => Array.isArray(bbox))\n .map((bbox) => {\n return adaptBboxToRect(\n bbox,\n context.size.width,\n context.size.height,\n 0,\n 0,\n context.size.width,\n context.size.height,\n vlMode,\n );\n });\n debugSection('referenceRects %j', referenceRects);\n\n // merge the sectionRect and referenceRects\n const mergedRect = mergeRects([targetRect, ...referenceRects]);\n debugSection('mergedRect %j', mergedRect);\n\n // expand search area to at least 200 x 200\n sectionRect = expandSearchArea(mergedRect, context.size, vlMode);\n debugSection('expanded sectionRect %j', sectionRect);\n }\n\n let imageBase64 = screenshotBase64;\n if (sectionRect) {\n const croppedResult = await cropByRect(\n screenshotBase64,\n sectionRect,\n vlMode === 'qwen2.5-vl',\n );\n imageBase64 = croppedResult.imageBase64;\n sectionRect.width = croppedResult.width;\n sectionRect.height = croppedResult.height;\n }\n\n return {\n rect: sectionRect,\n imageBase64,\n error: result.content.error,\n rawResponse: JSON.stringify(result.content),\n usage: result.usage,\n };\n}\n\nexport async function AiExtractElementInfo<T>(options: {\n dataQuery: string | Record<string, string>;\n multimodalPrompt?: TMultimodalPrompt;\n context: UIContext;\n pageDescription?: string;\n extractOption?: ServiceExtractOption;\n modelConfig: IModelConfig;\n}) {\n const { dataQuery, context, extractOption, multimodalPrompt, modelConfig } =\n options;\n const systemPrompt = 
systemPromptToExtract();\n const screenshotBase64 = context.screenshot.base64;\n\n const extractDataPromptText = extractDataQueryPrompt(\n options.pageDescription || '',\n dataQuery,\n );\n\n const userContent: ChatCompletionUserMessageParam['content'] = [];\n\n if (extractOption?.screenshotIncluded !== false) {\n userContent.push({\n type: 'image_url',\n image_url: {\n url: screenshotBase64,\n detail: 'high',\n },\n });\n }\n\n userContent.push({\n type: 'text',\n text: extractDataPromptText,\n });\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: userContent,\n },\n ];\n\n if (multimodalPrompt) {\n const addOns = await promptsToChatParam({\n images: multimodalPrompt.images,\n convertHttpImage2Base64: multimodalPrompt.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n const result = await callAIWithObjectResponse<AIDataExtractionResponse<T>>(\n msgs,\n modelConfig,\n );\n return {\n parseResult: result.content,\n usage: result.usage,\n reasoning_content: result.reasoning_content,\n };\n}\n\nexport async function AiJudgeOrderSensitive(\n description: string,\n callAIFn: typeof callAIWithObjectResponse<{ isOrderSensitive: boolean }>,\n modelConfig: IModelConfig,\n): Promise<{\n isOrderSensitive: boolean;\n usage?: AIUsageInfo;\n}> {\n const systemPrompt = systemPromptToJudgeOrderSensitive();\n const userPrompt = orderSensitiveJudgePrompt(description);\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: userPrompt,\n },\n ];\n\n const result = await callAIFn(msgs, modelConfig);\n\n return {\n isOrderSensitive: result.content.isOrderSensitive ?? false,\n usage: result.usage,\n };\n}\n"],"names":["__webpack_require__","definition","key","Object","obj","prop","Symbol","debugInspect","getDebug","debugSection","extraTextFromUserPrompt","prompt","promptsToChatParam","multimodalPrompt","msgs","item","base64","preProcessImageUrl","AiLocateElement","options","context","targetElementDescription","callAIFn","modelConfig","vlMode","screenshotBase64","assert","targetElementDescriptionText","userInstructionPrompt","findElementPrompt","systemPrompt","isAutoGLM","getAutoGLMLocatePrompt","systemPromptToLocateElement","imagePayload","imageWidth","imageHeight","originalImageWidth","originalImageHeight","paddedResult","paddingToMatchBlockByBase64","addOns","rawResponseContent","usage","callAIWithStringResponse","parsed","parseAutoGLMLocateResponse","resRect","matchedElements","errors","x","y","pixelX","Math","pixelY","bboxSize","x1","y1","x2","y2","rectCenter","element","generateElementByPosition","res","rawResponse","JSON","Array","adaptBboxToRect","e","msg","Error","AiLocateSection","sectionDescription","systemPromptToLocateSection","sectionLocatorInstructionText","sectionLocatorInstruction","result","callAIWithObjectResponse","sectionRect","sectionBbox","targetRect","referenceBboxList","referenceRects","bbox","mergedRect","mergeRects","expandSearchArea","imageBase64","croppedResult","cropByRect","AiExtractElementInfo","dataQuery","extractOption","systemPromptToExtract","extractDataPromptText","extractDataQueryPrompt","userContent","AiJudgeOrderSensitive","description","systemPromptToJudgeOrderSensitive","userPrompt","orderSensitiveJudgePrompt"],"mappings":";;;IAAAA,oBAAoB,CAAC,GAAG,CAAC,UAASC;QACjC,IAAI,IAAIC,OAAOD,WACR,IAAGD,oBAAoB,CAAC,CAACC,YAAYC,QAAQ,CAACF,oBAAoB,CAAC,CAAC,UAASE,MACzEC,OAAO,cAAc,CAAC,UAASD,KAAK;YAAE,YAAY;YAAM,KAAKD,UAAU,CAACC,IAAI;QAAC;IAGzF;;;ICNAF,oBAAoB,CAAC,GAAG,CAACI,KA
AKC,OAAUF,OAAO,SAAS,CAAC,cAAc,CAAC,IAAI,CAACC,KAAKC;;;ICClFL,oBAAoB,CAAC,GAAG,CAAC;QACxB,IAAG,AAAkB,eAAlB,OAAOM,UAA0BA,OAAO,WAAW,EACrDH,OAAO,cAAc,CAAC,UAASG,OAAO,WAAW,EAAE;YAAE,OAAO;QAAS;QAEtEH,OAAO,cAAc,CAAC,UAAS,cAAc;YAAE,OAAO;QAAK;IAC5D;;;;;;;;;;;;;;;;;;;;;;;ACiDA,MAAMI,eAAeC,AAAAA,IAAAA,uBAAAA,QAAAA,AAAAA,EAAS;AAC9B,MAAMC,eAAeD,AAAAA,IAAAA,uBAAAA,QAAAA,AAAAA,EAAS;AAE9B,MAAME,0BAA0B,CAACC;IAC/B,IAAI,AAAkB,YAAlB,OAAOA,QACT,OAAOA;IAEP,OAAOA,OAAO,MAAM;AAExB;AAEA,MAAMC,qBAAqB,OACzBC;IAEA,MAAMC,OAAyC,EAAE;IACjD,IAAID,kBAAkB,QAAQ,QAAQ;QACpCC,KAAK,IAAI,CAAC;YACR,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM;gBACR;aACD;QACH;QAEA,KAAK,MAAMC,QAAQF,iBAAiB,MAAM,CAAE;YAC1C,MAAMG,SAAS,MAAMC,AAAAA,IAAAA,oBAAAA,kBAAAA,AAAAA,EACnBF,KAAK,GAAG,EACR,CAAC,CAACF,iBAAiB,uBAAuB;YAG5CC,KAAK,IAAI,CAAC;gBACR,MAAM;gBACN,SAAS;oBACP;wBACE,MAAM;wBACN,MAAM,CAAC,mCAAmC,EAAEC,KAAK,IAAI,CAAC,EAAE,CAAC;oBAC3D;iBACD;YACH;YAEAD,KAAK,IAAI,CAAC;gBACR,MAAM;gBACN,SAAS;oBACP;wBACE,MAAM;wBACN,WAAW;4BACT,KAAKE;4BACL,QAAQ;wBACV;oBACF;iBACD;YACH;QACF;IACF;IACA,OAAOF;AACT;AAEO,eAAeI,gBAAgBC,OAQrC;IAUC,MAAM,EAAEC,OAAO,EAAEC,wBAAwB,EAAEC,QAAQ,EAAEC,WAAW,EAAE,GAAGJ;IACrE,MAAM,EAAEK,MAAM,EAAE,GAAGD;IACnB,MAAME,mBAAmBL,QAAQ,UAAU,CAAC,MAAM;IAElDM,IAAAA,sBAAAA,MAAAA,AAAAA,EACEL,0BACA;IAEF,MAAMM,+BAA+BjB,wBACnCW;IAEF,MAAMO,wBAAwBC,AAAAA,IAAAA,+BAAAA,iBAAAA,AAAAA,EAAkBF;IAChD,MAAMG,eAAeC,AAAAA,IAAAA,wBAAAA,SAAAA,AAAAA,EAAUP,UAC3BQ,AAAAA,IAAAA,0BAAAA,sBAAAA,AAAAA,EAAuBR,UACvBS,AAAAA,IAAAA,+BAAAA,2BAAAA,AAAAA,EAA4BT;IAEhC,IAAIU,eAAeT;IACnB,IAAIU,aAAaf,QAAQ,IAAI,CAAC,KAAK;IACnC,IAAIgB,cAAchB,QAAQ,IAAI,CAAC,MAAM;IACrC,IAAIiB,qBAAqBF;IACzB,IAAIG,sBAAsBF;IAE1B,IAAIjB,QAAQ,YAAY,EAAE;QACxBO,IAAAA,sBAAAA,MAAAA,AAAAA,EACEP,QAAQ,YAAY,CAAC,IAAI,EACzB;QAEFO,IAAAA,sBAAAA,MAAAA,AAAAA,EACEP,QAAQ,YAAY,CAAC,WAAW,EAChC;QAGFe,eAAef,QAAQ,YAAY,CAAC,WAAW;QAC/CgB,aAAahB,QAAQ,YAAY,CAAC,IAAI,EAAE;QACxCiB,cAAcjB,QAAQ,YAAY,CAAC,IAAI,EAAE;QACzCkB,qBAAqBF;QACrBG,sBAAsBF;IACxB,OAAO,IAAIZ,AAAW,iBAAXA,QAAyB;QAClC,MAAMe,eAAe,MAAMC,AAAAA,IAAAA,oBAAAA,2BAAAA,AAAAA,EAA4BN;QACvDC,aAAaI,aAAa,KAAK;QAC/BH,cAAcG,aAAa,MAAM;QACjCL,eAAeK,aAAa,WAAW;IACzC;IAEA,MAAMzB,OAAe;QACnB;YAAE,MAAM;YAAU,SAASgB;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKI;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAMH,AAAAA,IAAAA,wBAAAA,SAAAA,AAAAA,EAAUP,UACZ,CAAC,KAAK,EAAEI,uBAAuB,GAC/BA;gBACN;aACD;QACH;KACD;IAED,IAAI,AAAoC,YAApC,OAAOP,0BAAuC;QAChD,MAAMoB,SAAS,MAAM7B,mBAAmB;YACtC,QAAQS,yBAAyB,MAAM;YACvC,yBAAyBA,yBAAyB,uBAAuB;QAC3E;QACAP,KAAK,IAAI,IAAI2B;IACf;IAEA,IAAIV,AAAAA,IAAAA,wBAAAA,SAAAA,AAAAA,EAAUP,SAAS;QACrB,MAAM,EAAE,SAASkB,kBAAkB,EAAEC,KAAK,EAAE,GAC1C,MAAMC,AAAAA,IAAAA,yBAAAA,wBAAAA,AAAAA,EAAyB9B,MAAMS;QAEvChB,aAAa,yBAAyBmC;QAEtC,MAAMG,SAASC,AAAAA,IAAAA,0BAAAA,0BAAAA,AAAAA,EAA2BJ;QAE1CnC,aAAa,sBAAsBsC,OAAO,KAAK;QAC/CtC,aAAa,yBAAyBsC,OAAO,WAAW;QAExD,IAAIE;QACJ,IAAIC,kBAAyC,EAAE;QAC/C,IAAIC,SAAmB,EAAE;QAEzB,IAAIJ,OAAO,KAAK,IAAI,CAACA,OAAO,WAAW,EAAE;YACvCI,SAAS;gBAACJ,OAAO,KAAK,IAAI;aAAoC;YAC9DtC,aAAa,yBAAyB0C,MAAM,CAAC,EAAE;QACjD,OAAO;YACL,MAAM,EAAEC,CAAC,EAAEC,CAAC,EAAE,GAAGN,OAAO,WAAW;YAEnCtC,aAAa,iCAAiC;gBAAE2C;gBAAGC;YAAE;YAIrD,MAAMC,SAASC,KAAK,KAAK,CAAEH,IAAIf,aAAc;YAC7C,MAAMmB,SAASD,KAAK,KAAK,CAAEF,IAAIf,cAAe;YAE9C7B,aAAa,+BAA+B;gBAAE6C;gBAAQE;YAAO;YAG7D,MAAMC,WAAW;YACjB,MAAMC,KAAKH,KAAK,GAAG,CAACD,SAASG,WAAW,GAAG;YAC3C,MAAME,KAAKJ,KAAK,GAAG,CAACC,SAASC,WAAW,GAAG;YAC3C,MAAMG,KAAKL,KAAK,GAAG,CAACD,SAASG,WAAW,GAAGpB;YAC3C,MAAMwB,KAAKN,KAAK,GAAG,CAACC,SAASC,WAAW,GAAGnB;YAG3CW,UAAU;gBACR,MAAMS;gBACN,KAAKC;gBACL,OAAOC,KAAKF;gBACZ,QAAQG,KAAKF;YACf;YAGA,IAAItC,QAAQ,YAAY,EAAE,MAAM;gB
AC9B4B,QAAQ,IAAI,IAAI5B,QAAQ,YAAY,CAAC,IAAI,CAAC,IAAI;gBAC9C4B,QAAQ,GAAG,IAAI5B,QAAQ,YAAY,CAAC,IAAI,CAAC,GAAG;YAC9C;YAEAZ,aAAa,qBAAqBwC;YAElC,MAAMa,aAAa;gBACjB,GAAGb,QAAQ,IAAI,GAAGA,QAAQ,KAAK,GAAG;gBAClC,GAAGA,QAAQ,GAAG,GAAGA,QAAQ,MAAM,GAAG;YACpC;YAEA,MAAMc,UAA+BC,AAAAA,IAAAA,yBAAAA,yBAAAA,AAAAA,EACnCF,YACAjC;YAGF,IAAIkC,SACFb,kBAAkB;gBAACa;aAAQ;QAE/B;QAEA,OAAO;YACL,MAAMd;YACN,aAAa;gBACX,UAAUC;gBACVC;YACF;YACA,aAAaP;YACbC;YACA,mBAAmBE,OAAO,KAAK;QACjC;IACF;IAEA,MAAMkB,MAAM,MAAMzC,SAASR,MAAMS;IAEjC,MAAMyC,cAAcC,KAAK,SAAS,CAACF,IAAI,OAAO;IAE9C,IAAIhB;IACJ,IAAIC,kBAAyC,EAAE;IAC/C,IAAIC,SACF,YAAYc,IAAI,OAAO,GAAGA,IAAI,OAAO,CAAC,MAAM,GAAG,EAAE;IACnD,IAAI;QACF,IACE,UAAUA,IAAI,OAAO,IACrBG,MAAM,OAAO,CAACH,IAAI,OAAO,CAAC,IAAI,KAC9BA,IAAI,OAAO,CAAC,IAAI,CAAC,MAAM,IAAI,GAC3B;YACAhB,UAAUoB,AAAAA,IAAAA,mCAAAA,eAAAA,AAAAA,EACRJ,IAAI,OAAO,CAAC,IAAI,EAChB5B,YACAC,aACAjB,QAAQ,YAAY,EAAE,MAAM,MAC5BA,QAAQ,YAAY,EAAE,MAAM,KAC5BkB,oBACAC,qBACAd;YAGFjB,aAAa,WAAWwC;YAExB,MAAMa,aAAa;gBACjB,GAAGb,QAAQ,IAAI,GAAGA,QAAQ,KAAK,GAAG;gBAClC,GAAGA,QAAQ,GAAG,GAAGA,QAAQ,MAAM,GAAG;YACpC;YAEA,MAAMc,UAA+BC,AAAAA,IAAAA,yBAAAA,yBAAAA,AAAAA,EACnCF,YACAjC;YAEFsB,SAAS,EAAE;YAEX,IAAIY,SACFb,kBAAkB;gBAACa;aAAQ;QAE/B;IACF,EAAE,OAAOO,GAAG;QACV,MAAMC,MACJD,aAAaE,QACT,CAAC,sBAAsB,EAAEF,EAAE,OAAO,EAAE,GACpC;QACN,IAAI,AAACnB,UAAUA,QAAQ,WAAW,GAGhCA,OAAO,IAAI,CAAC,CAAC,CAAC,EAAEoB,IAAI,CAAC,CAAC;aAFtBpB,SAAS;YAACoB;SAAI;IAIlB;IAEA,OAAO;QACL,MAAMtB;QACN,aAAa;YACX,UAAUC;YACV,QAAQC;QACV;QACAe;QACA,OAAOD,IAAI,KAAK;QAChB,mBAAmBA,IAAI,iBAAiB;IAC1C;AACF;AAEO,eAAeQ,gBAAgBpD,OAIrC;IAOC,MAAM,EAAEC,OAAO,EAAEoD,kBAAkB,EAAEjD,WAAW,EAAE,GAAGJ;IACrD,MAAM,EAAEK,MAAM,EAAE,GAAGD;IACnB,MAAME,mBAAmBL,QAAQ,UAAU,CAAC,MAAM;IAElD,MAAMU,eAAe2C,AAAAA,IAAAA,uCAAAA,2BAAAA,AAAAA,EAA4BjD;IACjD,MAAMkD,gCAAgCC,AAAAA,IAAAA,uCAAAA,yBAAAA,AAAAA,EACpCjE,wBAAwB8D;IAE1B,MAAM1D,OAAe;QACnB;YAAE,MAAM;YAAU,SAASgB;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKL;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAMiD;gBACR;aACD;QACH;KACD;IAED,IAAI,AAA8B,YAA9B,OAAOF,oBAAiC;QAC1C,MAAM/B,SAAS,MAAM7B,mBAAmB;YACtC,QAAQ4D,mBAAmB,MAAM;YACjC,yBAAyBA,mBAAmB,uBAAuB;QACrE;QACA1D,KAAK,IAAI,IAAI2B;IACf;IAEA,MAAMmC,SAAS,MAAMC,AAAAA,IAAAA,yBAAAA,wBAAAA,AAAAA,EACnB/D,MACAS;IAGF,IAAIuD;IACJ,MAAMC,cAAcH,OAAO,OAAO,CAAC,IAAI;IACvC,IAAIG,aAAa;QACf,MAAMC,aAAab,AAAAA,IAAAA,mCAAAA,eAAAA,AAAAA,EACjBY,aACA3D,QAAQ,IAAI,CAAC,KAAK,EAClBA,QAAQ,IAAI,CAAC,MAAM,EACnB,GACA,GACAA,QAAQ,IAAI,CAAC,KAAK,EAClBA,QAAQ,IAAI,CAAC,MAAM,EACnBI;QAEFf,aAAa,0BAA0BuE;QAEvC,MAAMC,oBAAoBL,OAAO,OAAO,CAAC,eAAe,IAAI,EAAE;QAC9DnE,aAAa,wBAAwBwE;QAErC,MAAMC,iBAAiBD,kBACpB,MAAM,CAAC,CAACE,OAASjB,MAAM,OAAO,CAACiB,OAC/B,GAAG,CAAC,CAACA,OACGhB,AAAAA,IAAAA,mCAAAA,eAAAA,AAAAA,EACLgB,MACA/D,QAAQ,IAAI,CAAC,KAAK,EAClBA,QAAQ,IAAI,CAAC,MAAM,EACnB,GACA,GACAA,QAAQ,IAAI,CAAC,KAAK,EAClBA,QAAQ,IAAI,CAAC,MAAM,EACnBI;QAGNf,aAAa,qBAAqByE;QAGlC,MAAME,aAAaC,AAAAA,IAAAA,mCAAAA,UAAAA,AAAAA,EAAW;YAACL;eAAeE;SAAe;QAC7DzE,aAAa,iBAAiB2E;QAG9BN,cAAcQ,AAAAA,IAAAA,mCAAAA,gBAAAA,AAAAA,EAAiBF,YAAYhE,QAAQ,IAAI,EAAEI;QACzDf,aAAa,2BAA2BqE;IAC1C;IAEA,IAAIS,cAAc9D;IAClB,IAAIqD,aAAa;QACf,MAAMU,gBAAgB,MAAMC,AAAAA,IAAAA,oBAAAA,UAAAA,AAAAA,EAC1BhE,kBACAqD,aACAtD,AAAW,iBAAXA;QAEF+D,cAAcC,cAAc,WAAW;QACvCV,YAAY,KAAK,GAAGU,cAAc,KAAK;QACvCV,YAAY,MAAM,GAAGU,cAAc,MAAM;IAC3C;IAEA,OAAO;QACL,MAAMV;QACNS;QACA,OAAOX,OAAO,OAAO,CAAC,KAAK;QAC3B,aAAaX,KAAK,SAAS,CAACW,OAAO,OAAO;QAC1C,OAAOA,OAAO,KAAK;IACrB;AACF;AAEO,eAAec,qBAAwBvE,OAO7C;IACC,MAAM,EAAEwE,SAAS,EAAEvE,OAAO,EAAEwE,aAAa,EAAE/E,gBAAgB,EAAEU,WAAW,EAAE,GACxEJ;IACF,MAAMW,eAAe+D,AAAAA,IAAAA,8BAAAA,qBAAAA,AAAAA;IACrB,MAAMpE,mBA
AmBL,QAAQ,UAAU,CAAC,MAAM;IAElD,MAAM0E,wBAAwBC,AAAAA,IAAAA,8BAAAA,sBAAAA,AAAAA,EAC5B5E,QAAQ,eAAe,IAAI,IAC3BwE;IAGF,MAAMK,cAAyD,EAAE;IAEjE,IAAIJ,eAAe,uBAAuB,OACxCI,YAAY,IAAI,CAAC;QACf,MAAM;QACN,WAAW;YACT,KAAKvE;YACL,QAAQ;QACV;IACF;IAGFuE,YAAY,IAAI,CAAC;QACf,MAAM;QACN,MAAMF;IACR;IAEA,MAAMhF,OAAe;QACnB;YAAE,MAAM;YAAU,SAASgB;QAAa;QACxC;YACE,MAAM;YACN,SAASkE;QACX;KACD;IAED,IAAInF,kBAAkB;QACpB,MAAM4B,SAAS,MAAM7B,mBAAmB;YACtC,QAAQC,iBAAiB,MAAM;YAC/B,yBAAyBA,iBAAiB,uBAAuB;QACnE;QACAC,KAAK,IAAI,IAAI2B;IACf;IAEA,MAAMmC,SAAS,MAAMC,AAAAA,IAAAA,yBAAAA,wBAAAA,AAAAA,EACnB/D,MACAS;IAEF,OAAO;QACL,aAAaqD,OAAO,OAAO;QAC3B,OAAOA,OAAO,KAAK;QACnB,mBAAmBA,OAAO,iBAAiB;IAC7C;AACF;AAEO,eAAeqB,sBACpBC,WAAmB,EACnB5E,QAAwE,EACxEC,WAAyB;IAKzB,MAAMO,eAAeqE,AAAAA,IAAAA,yCAAAA,iCAAAA,AAAAA;IACrB,MAAMC,aAAaC,AAAAA,IAAAA,yCAAAA,yBAAAA,AAAAA,EAA0BH;IAE7C,MAAMpF,OAAe;QACnB;YAAE,MAAM;YAAU,SAASgB;QAAa;QACxC;YACE,MAAM;YACN,SAASsE;QACX;KACD;IAED,MAAMxB,SAAS,MAAMtD,SAASR,MAAMS;IAEpC,OAAO;QACL,kBAAkBqD,OAAO,OAAO,CAAC,gBAAgB,IAAI;QACrD,OAAOA,OAAO,KAAK;IACrB;AACF"}
|
|
1
|
+
{"version":3,"file":"ai-model/inspect.js","sources":["webpack/runtime/define_property_getters","webpack/runtime/has_own_property","webpack/runtime/make_namespace_object","../../../src/ai-model/inspect.ts"],"sourcesContent":["__webpack_require__.d = (exports, definition) => {\n\tfor(var key in definition) {\n if(__webpack_require__.o(definition, key) && !__webpack_require__.o(exports, key)) {\n Object.defineProperty(exports, key, { enumerable: true, get: definition[key] });\n }\n }\n};","__webpack_require__.o = (obj, prop) => (Object.prototype.hasOwnProperty.call(obj, prop))","// define __esModule on exports\n__webpack_require__.r = (exports) => {\n\tif(typeof Symbol !== 'undefined' && Symbol.toStringTag) {\n\t\tObject.defineProperty(exports, Symbol.toStringTag, { value: 'Module' });\n\t}\n\tObject.defineProperty(exports, '__esModule', { value: true });\n};","import type {\n AIDataExtractionResponse,\n AIElementResponse,\n AISectionLocatorResponse,\n AIUsageInfo,\n Rect,\n ReferenceImage,\n ServiceExtractOption,\n UIContext,\n} from '@/types';\nimport type { IModelConfig } from '@midscene/shared/env';\nimport { generateElementByPosition } from '@midscene/shared/extractor/dom-util';\nimport {\n cropByRect,\n paddingToMatchBlockByBase64,\n preProcessImageUrl,\n} from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport type { LocateResultElement } from '@midscene/shared/types';\nimport { assert } from '@midscene/shared/utils';\nimport type {\n ChatCompletionSystemMessageParam,\n ChatCompletionUserMessageParam,\n} from 'openai/resources/index';\nimport type { TMultimodalPrompt, TUserPrompt } from '../common';\nimport { adaptBboxToRect, expandSearchArea, mergeRects } from '../common';\nimport { parseAutoGLMLocateResponse } from './auto-glm/parser';\nimport { getAutoGLMLocatePrompt } from './auto-glm/prompt';\nimport { isAutoGLM } from './auto-glm/util';\nimport {\n extractDataQueryPrompt,\n systemPromptToExtract,\n} from './prompt/extraction';\nimport {\n findElementPrompt,\n systemPromptToLocateElement,\n} from './prompt/llm-locator';\nimport {\n sectionLocatorInstruction,\n systemPromptToLocateSection,\n} from './prompt/llm-section-locator';\nimport {\n orderSensitiveJudgePrompt,\n systemPromptToJudgeOrderSensitive,\n} from './prompt/order-sensitive-judge';\nimport {\n callAIWithObjectResponse,\n callAIWithStringResponse,\n} from './service-caller/index';\n\nexport type AIArgs = [\n ChatCompletionSystemMessageParam,\n ...ChatCompletionUserMessageParam[],\n];\n\nconst debugInspect = getDebug('ai:inspect');\nconst debugSection = getDebug('ai:section');\n\nconst extraTextFromUserPrompt = (prompt: TUserPrompt): string => {\n if (typeof prompt === 'string') {\n return prompt;\n } else {\n return prompt.prompt;\n }\n};\n\nconst promptsToChatParam = async (\n multimodalPrompt: TMultimodalPrompt,\n): Promise<ChatCompletionUserMessageParam[]> => {\n const msgs: ChatCompletionUserMessageParam[] = [];\n if (multimodalPrompt?.images?.length) {\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'text',\n text: 'Next, I will provide all the reference images.',\n },\n ],\n });\n\n for (const item of multimodalPrompt.images) {\n const base64 = await preProcessImageUrl(\n item.url,\n !!multimodalPrompt.convertHttpImage2Base64,\n );\n\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'text',\n text: `this is the reference image named '${item.name}':`,\n },\n ],\n });\n\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: base64,\n 
detail: 'high',\n },\n },\n ],\n });\n }\n }\n return msgs;\n};\n\nexport async function AiLocateElement(options: {\n context: UIContext;\n targetElementDescription: TUserPrompt;\n callAIFn: typeof callAIWithObjectResponse<\n AIElementResponse | [number, number]\n >;\n searchConfig?: Awaited<ReturnType<typeof AiLocateSection>>;\n modelConfig: IModelConfig;\n}): Promise<{\n parseResult: {\n elements: LocateResultElement[];\n errors?: string[];\n };\n rect?: Rect;\n rawResponse: string;\n usage?: AIUsageInfo;\n reasoning_content?: string;\n}> {\n const { context, targetElementDescription, callAIFn, modelConfig } = options;\n const { modelFamily } = modelConfig;\n const screenshotBase64 = context.screenshot.base64;\n\n assert(\n targetElementDescription,\n 'cannot find the target element description',\n );\n const targetElementDescriptionText = extraTextFromUserPrompt(\n targetElementDescription,\n );\n const userInstructionPrompt = findElementPrompt(targetElementDescriptionText);\n const systemPrompt = isAutoGLM(modelFamily)\n ? getAutoGLMLocatePrompt(modelFamily)\n : systemPromptToLocateElement(modelFamily);\n\n let imagePayload = screenshotBase64;\n let imageWidth = context.size.width;\n let imageHeight = context.size.height;\n let originalImageWidth = imageWidth;\n let originalImageHeight = imageHeight;\n\n if (options.searchConfig) {\n assert(\n options.searchConfig.rect,\n 'searchArea is provided but its rect cannot be found. Failed to locate element',\n );\n assert(\n options.searchConfig.imageBase64,\n 'searchArea is provided but its imageBase64 cannot be found. Failed to locate element',\n );\n\n imagePayload = options.searchConfig.imageBase64;\n imageWidth = options.searchConfig.rect?.width;\n imageHeight = options.searchConfig.rect?.height;\n originalImageWidth = imageWidth;\n originalImageHeight = imageHeight;\n } else if (modelFamily === 'qwen2.5-vl') {\n const paddedResult = await paddingToMatchBlockByBase64(imagePayload);\n imageWidth = paddedResult.width;\n imageHeight = paddedResult.height;\n imagePayload = paddedResult.imageBase64;\n }\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: isAutoGLM(modelFamily)\n ? 
`Tap: ${userInstructionPrompt}`\n : userInstructionPrompt,\n },\n ],\n },\n ];\n\n if (typeof targetElementDescription !== 'string') {\n const addOns = await promptsToChatParam({\n images: targetElementDescription.images,\n convertHttpImage2Base64: targetElementDescription.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n if (isAutoGLM(modelFamily)) {\n const { content: rawResponseContent, usage } =\n await callAIWithStringResponse(msgs, modelConfig);\n\n debugInspect('auto-glm rawResponse:', rawResponseContent);\n\n const parsed = parseAutoGLMLocateResponse(rawResponseContent);\n\n debugInspect('auto-glm thinking:', parsed.think);\n debugInspect('auto-glm coordinates:', parsed.coordinates);\n\n let resRect: Rect | undefined;\n let matchedElements: LocateResultElement[] = [];\n let errors: string[] = [];\n\n if (parsed.error || !parsed.coordinates) {\n errors = [parsed.error || 'Failed to parse auto-glm response'];\n debugInspect('auto-glm parse error:', errors[0]);\n } else {\n const { x, y } = parsed.coordinates;\n\n debugInspect('auto-glm coordinates [0-999]:', { x, y });\n\n // Convert auto-glm coordinates [0,999] to pixel bbox\n // Map from [0,999] to pixel coordinates\n const pixelX = Math.round((x * imageWidth) / 1000);\n const pixelY = Math.round((y * imageHeight) / 1000);\n\n debugInspect('auto-glm pixel coordinates:', { pixelX, pixelY });\n\n // Create a small bbox around the point\n const bboxSize = 10;\n const x1 = Math.max(pixelX - bboxSize / 2, 0);\n const y1 = Math.max(pixelY - bboxSize / 2, 0);\n const x2 = Math.min(pixelX + bboxSize / 2, imageWidth);\n const y2 = Math.min(pixelY + bboxSize / 2, imageHeight);\n\n // Convert to Rect format\n resRect = {\n left: x1,\n top: y1,\n width: x2 - x1,\n height: y2 - y1,\n };\n\n // Apply offset if searching in a cropped area\n if (options.searchConfig?.rect) {\n resRect.left += options.searchConfig.rect.left;\n resRect.top += options.searchConfig.rect.top;\n }\n\n debugInspect('auto-glm resRect:', resRect);\n\n const rectCenter = {\n x: resRect.left + resRect.width / 2,\n y: resRect.top + resRect.height / 2,\n };\n\n const element: LocateResultElement = generateElementByPosition(\n rectCenter,\n targetElementDescriptionText as string,\n );\n\n if (element) {\n matchedElements = [element];\n }\n }\n\n return {\n rect: resRect,\n parseResult: {\n elements: matchedElements,\n errors,\n },\n rawResponse: rawResponseContent,\n usage,\n reasoning_content: parsed.think,\n };\n }\n\n const res = await callAIFn(msgs, modelConfig);\n\n const rawResponse = JSON.stringify(res.content);\n\n let resRect: Rect | undefined;\n let matchedElements: LocateResultElement[] = [];\n let errors: string[] | undefined =\n 'errors' in res.content ? res.content.errors : [];\n try {\n if (\n 'bbox' in res.content &&\n Array.isArray(res.content.bbox) &&\n res.content.bbox.length >= 1\n ) {\n resRect = adaptBboxToRect(\n res.content.bbox,\n imageWidth,\n imageHeight,\n options.searchConfig?.rect?.left,\n options.searchConfig?.rect?.top,\n originalImageWidth,\n originalImageHeight,\n modelFamily,\n );\n\n debugInspect('resRect', resRect);\n\n const rectCenter = {\n x: resRect.left + resRect.width / 2,\n y: resRect.top + resRect.height / 2,\n };\n\n const element: LocateResultElement = generateElementByPosition(\n rectCenter,\n targetElementDescriptionText as string,\n );\n errors = [];\n\n if (element) {\n matchedElements = [element];\n }\n }\n } catch (e) {\n const msg =\n e instanceof Error\n ? 
`Failed to parse bbox: ${e.message}`\n : 'unknown error in locate';\n if (!errors || errors?.length === 0) {\n errors = [msg];\n } else {\n errors.push(`(${msg})`);\n }\n }\n\n return {\n rect: resRect,\n parseResult: {\n elements: matchedElements as LocateResultElement[],\n errors: errors as string[],\n },\n rawResponse,\n usage: res.usage,\n reasoning_content: res.reasoning_content,\n };\n}\n\nexport async function AiLocateSection(options: {\n context: UIContext;\n sectionDescription: TUserPrompt;\n modelConfig: IModelConfig;\n}): Promise<{\n rect?: Rect;\n imageBase64?: string;\n error?: string;\n rawResponse: string;\n usage?: AIUsageInfo;\n}> {\n const { context, sectionDescription, modelConfig } = options;\n const { modelFamily } = modelConfig;\n const screenshotBase64 = context.screenshot.base64;\n\n const systemPrompt = systemPromptToLocateSection(modelFamily);\n const sectionLocatorInstructionText = sectionLocatorInstruction(\n extraTextFromUserPrompt(sectionDescription),\n );\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: screenshotBase64,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: sectionLocatorInstructionText,\n },\n ],\n },\n ];\n\n if (typeof sectionDescription !== 'string') {\n const addOns = await promptsToChatParam({\n images: sectionDescription.images,\n convertHttpImage2Base64: sectionDescription.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n const result = await callAIWithObjectResponse<AISectionLocatorResponse>(\n msgs,\n modelConfig,\n );\n\n let sectionRect: Rect | undefined;\n const sectionBbox = result.content.bbox;\n if (sectionBbox) {\n const targetRect = adaptBboxToRect(\n sectionBbox,\n context.size.width,\n context.size.height,\n 0,\n 0,\n context.size.width,\n context.size.height,\n modelFamily,\n );\n debugSection('original targetRect %j', targetRect);\n\n const referenceBboxList = result.content.references_bbox || [];\n debugSection('referenceBboxList %j', referenceBboxList);\n\n const referenceRects = referenceBboxList\n .filter((bbox) => Array.isArray(bbox))\n .map((bbox) => {\n return adaptBboxToRect(\n bbox,\n context.size.width,\n context.size.height,\n 0,\n 0,\n context.size.width,\n context.size.height,\n modelFamily,\n );\n });\n debugSection('referenceRects %j', referenceRects);\n\n // merge the sectionRect and referenceRects\n const mergedRect = mergeRects([targetRect, ...referenceRects]);\n debugSection('mergedRect %j', mergedRect);\n\n // expand search area to at least 200 x 200\n sectionRect = expandSearchArea(mergedRect, context.size, modelFamily);\n debugSection('expanded sectionRect %j', sectionRect);\n }\n\n let imageBase64 = screenshotBase64;\n if (sectionRect) {\n const croppedResult = await cropByRect(\n screenshotBase64,\n sectionRect,\n modelFamily === 'qwen2.5-vl',\n );\n imageBase64 = croppedResult.imageBase64;\n sectionRect.width = croppedResult.width;\n sectionRect.height = croppedResult.height;\n }\n\n return {\n rect: sectionRect,\n imageBase64,\n error: result.content.error,\n rawResponse: JSON.stringify(result.content),\n usage: result.usage,\n };\n}\n\nexport async function AiExtractElementInfo<T>(options: {\n dataQuery: string | Record<string, string>;\n multimodalPrompt?: TMultimodalPrompt;\n context: UIContext;\n pageDescription?: string;\n extractOption?: ServiceExtractOption;\n modelConfig: IModelConfig;\n}) {\n const { dataQuery, context, extractOption, multimodalPrompt, modelConfig } =\n 
options;\n const systemPrompt = systemPromptToExtract();\n const screenshotBase64 = context.screenshot.base64;\n\n const extractDataPromptText = extractDataQueryPrompt(\n options.pageDescription || '',\n dataQuery,\n );\n\n const userContent: ChatCompletionUserMessageParam['content'] = [];\n\n if (extractOption?.screenshotIncluded !== false) {\n userContent.push({\n type: 'image_url',\n image_url: {\n url: screenshotBase64,\n detail: 'high',\n },\n });\n }\n\n userContent.push({\n type: 'text',\n text: extractDataPromptText,\n });\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: userContent,\n },\n ];\n\n if (multimodalPrompt) {\n const addOns = await promptsToChatParam({\n images: multimodalPrompt.images,\n convertHttpImage2Base64: multimodalPrompt.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n const result = await callAIWithObjectResponse<AIDataExtractionResponse<T>>(\n msgs,\n modelConfig,\n );\n return {\n parseResult: result.content,\n usage: result.usage,\n reasoning_content: result.reasoning_content,\n };\n}\n\nexport async function AiJudgeOrderSensitive(\n description: string,\n callAIFn: typeof callAIWithObjectResponse<{ isOrderSensitive: boolean }>,\n modelConfig: IModelConfig,\n): Promise<{\n isOrderSensitive: boolean;\n usage?: AIUsageInfo;\n}> {\n const systemPrompt = systemPromptToJudgeOrderSensitive();\n const userPrompt = orderSensitiveJudgePrompt(description);\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: userPrompt,\n },\n ];\n\n const result = await callAIFn(msgs, modelConfig);\n\n return {\n isOrderSensitive: result.content.isOrderSensitive ?? false,\n usage: result.usage,\n };\n}\n"],"names":["__webpack_require__","definition","key","Object","obj","prop","Symbol","debugInspect","getDebug","debugSection","extraTextFromUserPrompt","prompt","promptsToChatParam","multimodalPrompt","msgs","item","base64","preProcessImageUrl","AiLocateElement","options","context","targetElementDescription","callAIFn","modelConfig","modelFamily","screenshotBase64","assert","targetElementDescriptionText","userInstructionPrompt","findElementPrompt","systemPrompt","isAutoGLM","getAutoGLMLocatePrompt","systemPromptToLocateElement","imagePayload","imageWidth","imageHeight","originalImageWidth","originalImageHeight","paddedResult","paddingToMatchBlockByBase64","addOns","rawResponseContent","usage","callAIWithStringResponse","parsed","parseAutoGLMLocateResponse","resRect","matchedElements","errors","x","y","pixelX","Math","pixelY","bboxSize","x1","y1","x2","y2","rectCenter","element","generateElementByPosition","res","rawResponse","JSON","Array","adaptBboxToRect","e","msg","Error","AiLocateSection","sectionDescription","systemPromptToLocateSection","sectionLocatorInstructionText","sectionLocatorInstruction","result","callAIWithObjectResponse","sectionRect","sectionBbox","targetRect","referenceBboxList","referenceRects","bbox","mergedRect","mergeRects","expandSearchArea","imageBase64","croppedResult","cropByRect","AiExtractElementInfo","dataQuery","extractOption","systemPromptToExtract","extractDataPromptText","extractDataQueryPrompt","userContent","AiJudgeOrderSensitive","description","systemPromptToJudgeOrderSensitive","userPrompt","orderSensitiveJudgePrompt"],"mappings":";;;IAAAA,oBAAoB,CAAC,GAAG,CAAC,UAASC;QACjC,IAAI,IAAIC,OAAOD,WACR,IAAGD,oBAAoB,CAAC,CAACC,YAAYC,QAAQ,CAACF,oBAAoB,CAAC,CAAC,UAASE,MACzEC,OAAO,cAAc,CAAC,UAASD,KAAK;YAAE,YAAY;YAAM,KAAKD,UAAU,CAACC,IAAI;QAAC;IA
GzF;;;ICNAF,oBAAoB,CAAC,GAAG,CAACI,KAAKC,OAAUF,OAAO,SAAS,CAAC,cAAc,CAAC,IAAI,CAACC,KAAKC;;;ICClFL,oBAAoB,CAAC,GAAG,CAAC;QACxB,IAAG,AAAkB,eAAlB,OAAOM,UAA0BA,OAAO,WAAW,EACrDH,OAAO,cAAc,CAAC,UAASG,OAAO,WAAW,EAAE;YAAE,OAAO;QAAS;QAEtEH,OAAO,cAAc,CAAC,UAAS,cAAc;YAAE,OAAO;QAAK;IAC5D;;;;;;;;;;;;;;;;;;;;;;;ACiDA,MAAMI,eAAeC,AAAAA,IAAAA,uBAAAA,QAAAA,AAAAA,EAAS;AAC9B,MAAMC,eAAeD,AAAAA,IAAAA,uBAAAA,QAAAA,AAAAA,EAAS;AAE9B,MAAME,0BAA0B,CAACC;IAC/B,IAAI,AAAkB,YAAlB,OAAOA,QACT,OAAOA;IAEP,OAAOA,OAAO,MAAM;AAExB;AAEA,MAAMC,qBAAqB,OACzBC;IAEA,MAAMC,OAAyC,EAAE;IACjD,IAAID,kBAAkB,QAAQ,QAAQ;QACpCC,KAAK,IAAI,CAAC;YACR,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM;gBACR;aACD;QACH;QAEA,KAAK,MAAMC,QAAQF,iBAAiB,MAAM,CAAE;YAC1C,MAAMG,SAAS,MAAMC,AAAAA,IAAAA,oBAAAA,kBAAAA,AAAAA,EACnBF,KAAK,GAAG,EACR,CAAC,CAACF,iBAAiB,uBAAuB;YAG5CC,KAAK,IAAI,CAAC;gBACR,MAAM;gBACN,SAAS;oBACP;wBACE,MAAM;wBACN,MAAM,CAAC,mCAAmC,EAAEC,KAAK,IAAI,CAAC,EAAE,CAAC;oBAC3D;iBACD;YACH;YAEAD,KAAK,IAAI,CAAC;gBACR,MAAM;gBACN,SAAS;oBACP;wBACE,MAAM;wBACN,WAAW;4BACT,KAAKE;4BACL,QAAQ;wBACV;oBACF;iBACD;YACH;QACF;IACF;IACA,OAAOF;AACT;AAEO,eAAeI,gBAAgBC,OAQrC;IAUC,MAAM,EAAEC,OAAO,EAAEC,wBAAwB,EAAEC,QAAQ,EAAEC,WAAW,EAAE,GAAGJ;IACrE,MAAM,EAAEK,WAAW,EAAE,GAAGD;IACxB,MAAME,mBAAmBL,QAAQ,UAAU,CAAC,MAAM;IAElDM,IAAAA,sBAAAA,MAAAA,AAAAA,EACEL,0BACA;IAEF,MAAMM,+BAA+BjB,wBACnCW;IAEF,MAAMO,wBAAwBC,AAAAA,IAAAA,+BAAAA,iBAAAA,AAAAA,EAAkBF;IAChD,MAAMG,eAAeC,AAAAA,IAAAA,wBAAAA,SAAAA,AAAAA,EAAUP,eAC3BQ,AAAAA,IAAAA,0BAAAA,sBAAAA,AAAAA,EAAuBR,eACvBS,AAAAA,IAAAA,+BAAAA,2BAAAA,AAAAA,EAA4BT;IAEhC,IAAIU,eAAeT;IACnB,IAAIU,aAAaf,QAAQ,IAAI,CAAC,KAAK;IACnC,IAAIgB,cAAchB,QAAQ,IAAI,CAAC,MAAM;IACrC,IAAIiB,qBAAqBF;IACzB,IAAIG,sBAAsBF;IAE1B,IAAIjB,QAAQ,YAAY,EAAE;QACxBO,IAAAA,sBAAAA,MAAAA,AAAAA,EACEP,QAAQ,YAAY,CAAC,IAAI,EACzB;QAEFO,IAAAA,sBAAAA,MAAAA,AAAAA,EACEP,QAAQ,YAAY,CAAC,WAAW,EAChC;QAGFe,eAAef,QAAQ,YAAY,CAAC,WAAW;QAC/CgB,aAAahB,QAAQ,YAAY,CAAC,IAAI,EAAE;QACxCiB,cAAcjB,QAAQ,YAAY,CAAC,IAAI,EAAE;QACzCkB,qBAAqBF;QACrBG,sBAAsBF;IACxB,OAAO,IAAIZ,AAAgB,iBAAhBA,aAA8B;QACvC,MAAMe,eAAe,MAAMC,AAAAA,IAAAA,oBAAAA,2BAAAA,AAAAA,EAA4BN;QACvDC,aAAaI,aAAa,KAAK;QAC/BH,cAAcG,aAAa,MAAM;QACjCL,eAAeK,aAAa,WAAW;IACzC;IAEA,MAAMzB,OAAe;QACnB;YAAE,MAAM;YAAU,SAASgB;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKI;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAMH,AAAAA,IAAAA,wBAAAA,SAAAA,AAAAA,EAAUP,eACZ,CAAC,KAAK,EAAEI,uBAAuB,GAC/BA;gBACN;aACD;QACH;KACD;IAED,IAAI,AAAoC,YAApC,OAAOP,0BAAuC;QAChD,MAAMoB,SAAS,MAAM7B,mBAAmB;YACtC,QAAQS,yBAAyB,MAAM;YACvC,yBAAyBA,yBAAyB,uBAAuB;QAC3E;QACAP,KAAK,IAAI,IAAI2B;IACf;IAEA,IAAIV,AAAAA,IAAAA,wBAAAA,SAAAA,AAAAA,EAAUP,cAAc;QAC1B,MAAM,EAAE,SAASkB,kBAAkB,EAAEC,KAAK,EAAE,GAC1C,MAAMC,AAAAA,IAAAA,yBAAAA,wBAAAA,AAAAA,EAAyB9B,MAAMS;QAEvChB,aAAa,yBAAyBmC;QAEtC,MAAMG,SAASC,AAAAA,IAAAA,0BAAAA,0BAAAA,AAAAA,EAA2BJ;QAE1CnC,aAAa,sBAAsBsC,OAAO,KAAK;QAC/CtC,aAAa,yBAAyBsC,OAAO,WAAW;QAExD,IAAIE;QACJ,IAAIC,kBAAyC,EAAE;QAC/C,IAAIC,SAAmB,EAAE;QAEzB,IAAIJ,OAAO,KAAK,IAAI,CAACA,OAAO,WAAW,EAAE;YACvCI,SAAS;gBAACJ,OAAO,KAAK,IAAI;aAAoC;YAC9DtC,aAAa,yBAAyB0C,MAAM,CAAC,EAAE;QACjD,OAAO;YACL,MAAM,EAAEC,CAAC,EAAEC,CAAC,EAAE,GAAGN,OAAO,WAAW;YAEnCtC,aAAa,iCAAiC;gBAAE2C;gBAAGC;YAAE;YAIrD,MAAMC,SAASC,KAAK,KAAK,CAAEH,IAAIf,aAAc;YAC7C,MAAMmB,SAASD,KAAK,KAAK,CAAEF,IAAIf,cAAe;YAE9C7B,aAAa,+BAA+B;gBAAE6C;gBAAQE;YAAO;YAG7D,MAAMC,WAAW;YACjB,MAAMC,KAAKH,KAAK,GAAG,CAACD,SAASG,WAAW,GAAG;YAC3C,MAAME,KAAKJ,KAAK,GAAG,CAACC,SAASC,WAAW,GAAG;YAC3C,MAAMG,KAAKL,KAAK,GAAG,CAACD,SAASG,WAAW,GAAGpB;YAC3C,MAAMwB,KAAKN,KAAK,GAAG,CAACC,SAASC,WAAW,GAAGnB;YAG3CW,UAAU;gBACR,MAAMS;gBACN,KAAKC;gBACL,OAAOC,KAAKF;gBACZ,QAAQG,KAAKF;
YACf;YAGA,IAAItC,QAAQ,YAAY,EAAE,MAAM;gBAC9B4B,QAAQ,IAAI,IAAI5B,QAAQ,YAAY,CAAC,IAAI,CAAC,IAAI;gBAC9C4B,QAAQ,GAAG,IAAI5B,QAAQ,YAAY,CAAC,IAAI,CAAC,GAAG;YAC9C;YAEAZ,aAAa,qBAAqBwC;YAElC,MAAMa,aAAa;gBACjB,GAAGb,QAAQ,IAAI,GAAGA,QAAQ,KAAK,GAAG;gBAClC,GAAGA,QAAQ,GAAG,GAAGA,QAAQ,MAAM,GAAG;YACpC;YAEA,MAAMc,UAA+BC,AAAAA,IAAAA,yBAAAA,yBAAAA,AAAAA,EACnCF,YACAjC;YAGF,IAAIkC,SACFb,kBAAkB;gBAACa;aAAQ;QAE/B;QAEA,OAAO;YACL,MAAMd;YACN,aAAa;gBACX,UAAUC;gBACVC;YACF;YACA,aAAaP;YACbC;YACA,mBAAmBE,OAAO,KAAK;QACjC;IACF;IAEA,MAAMkB,MAAM,MAAMzC,SAASR,MAAMS;IAEjC,MAAMyC,cAAcC,KAAK,SAAS,CAACF,IAAI,OAAO;IAE9C,IAAIhB;IACJ,IAAIC,kBAAyC,EAAE;IAC/C,IAAIC,SACF,YAAYc,IAAI,OAAO,GAAGA,IAAI,OAAO,CAAC,MAAM,GAAG,EAAE;IACnD,IAAI;QACF,IACE,UAAUA,IAAI,OAAO,IACrBG,MAAM,OAAO,CAACH,IAAI,OAAO,CAAC,IAAI,KAC9BA,IAAI,OAAO,CAAC,IAAI,CAAC,MAAM,IAAI,GAC3B;YACAhB,UAAUoB,AAAAA,IAAAA,mCAAAA,eAAAA,AAAAA,EACRJ,IAAI,OAAO,CAAC,IAAI,EAChB5B,YACAC,aACAjB,QAAQ,YAAY,EAAE,MAAM,MAC5BA,QAAQ,YAAY,EAAE,MAAM,KAC5BkB,oBACAC,qBACAd;YAGFjB,aAAa,WAAWwC;YAExB,MAAMa,aAAa;gBACjB,GAAGb,QAAQ,IAAI,GAAGA,QAAQ,KAAK,GAAG;gBAClC,GAAGA,QAAQ,GAAG,GAAGA,QAAQ,MAAM,GAAG;YACpC;YAEA,MAAMc,UAA+BC,AAAAA,IAAAA,yBAAAA,yBAAAA,AAAAA,EACnCF,YACAjC;YAEFsB,SAAS,EAAE;YAEX,IAAIY,SACFb,kBAAkB;gBAACa;aAAQ;QAE/B;IACF,EAAE,OAAOO,GAAG;QACV,MAAMC,MACJD,aAAaE,QACT,CAAC,sBAAsB,EAAEF,EAAE,OAAO,EAAE,GACpC;QACN,IAAI,AAACnB,UAAUA,QAAQ,WAAW,GAGhCA,OAAO,IAAI,CAAC,CAAC,CAAC,EAAEoB,IAAI,CAAC,CAAC;aAFtBpB,SAAS;YAACoB;SAAI;IAIlB;IAEA,OAAO;QACL,MAAMtB;QACN,aAAa;YACX,UAAUC;YACV,QAAQC;QACV;QACAe;QACA,OAAOD,IAAI,KAAK;QAChB,mBAAmBA,IAAI,iBAAiB;IAC1C;AACF;AAEO,eAAeQ,gBAAgBpD,OAIrC;IAOC,MAAM,EAAEC,OAAO,EAAEoD,kBAAkB,EAAEjD,WAAW,EAAE,GAAGJ;IACrD,MAAM,EAAEK,WAAW,EAAE,GAAGD;IACxB,MAAME,mBAAmBL,QAAQ,UAAU,CAAC,MAAM;IAElD,MAAMU,eAAe2C,AAAAA,IAAAA,uCAAAA,2BAAAA,AAAAA,EAA4BjD;IACjD,MAAMkD,gCAAgCC,AAAAA,IAAAA,uCAAAA,yBAAAA,AAAAA,EACpCjE,wBAAwB8D;IAE1B,MAAM1D,OAAe;QACnB;YAAE,MAAM;YAAU,SAASgB;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKL;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAMiD;gBACR;aACD;QACH;KACD;IAED,IAAI,AAA8B,YAA9B,OAAOF,oBAAiC;QAC1C,MAAM/B,SAAS,MAAM7B,mBAAmB;YACtC,QAAQ4D,mBAAmB,MAAM;YACjC,yBAAyBA,mBAAmB,uBAAuB;QACrE;QACA1D,KAAK,IAAI,IAAI2B;IACf;IAEA,MAAMmC,SAAS,MAAMC,AAAAA,IAAAA,yBAAAA,wBAAAA,AAAAA,EACnB/D,MACAS;IAGF,IAAIuD;IACJ,MAAMC,cAAcH,OAAO,OAAO,CAAC,IAAI;IACvC,IAAIG,aAAa;QACf,MAAMC,aAAab,AAAAA,IAAAA,mCAAAA,eAAAA,AAAAA,EACjBY,aACA3D,QAAQ,IAAI,CAAC,KAAK,EAClBA,QAAQ,IAAI,CAAC,MAAM,EACnB,GACA,GACAA,QAAQ,IAAI,CAAC,KAAK,EAClBA,QAAQ,IAAI,CAAC,MAAM,EACnBI;QAEFf,aAAa,0BAA0BuE;QAEvC,MAAMC,oBAAoBL,OAAO,OAAO,CAAC,eAAe,IAAI,EAAE;QAC9DnE,aAAa,wBAAwBwE;QAErC,MAAMC,iBAAiBD,kBACpB,MAAM,CAAC,CAACE,OAASjB,MAAM,OAAO,CAACiB,OAC/B,GAAG,CAAC,CAACA,OACGhB,AAAAA,IAAAA,mCAAAA,eAAAA,AAAAA,EACLgB,MACA/D,QAAQ,IAAI,CAAC,KAAK,EAClBA,QAAQ,IAAI,CAAC,MAAM,EACnB,GACA,GACAA,QAAQ,IAAI,CAAC,KAAK,EAClBA,QAAQ,IAAI,CAAC,MAAM,EACnBI;QAGNf,aAAa,qBAAqByE;QAGlC,MAAME,aAAaC,AAAAA,IAAAA,mCAAAA,UAAAA,AAAAA,EAAW;YAACL;eAAeE;SAAe;QAC7DzE,aAAa,iBAAiB2E;QAG9BN,cAAcQ,AAAAA,IAAAA,mCAAAA,gBAAAA,AAAAA,EAAiBF,YAAYhE,QAAQ,IAAI,EAAEI;QACzDf,aAAa,2BAA2BqE;IAC1C;IAEA,IAAIS,cAAc9D;IAClB,IAAIqD,aAAa;QACf,MAAMU,gBAAgB,MAAMC,AAAAA,IAAAA,oBAAAA,UAAAA,AAAAA,EAC1BhE,kBACAqD,aACAtD,AAAgB,iBAAhBA;QAEF+D,cAAcC,cAAc,WAAW;QACvCV,YAAY,KAAK,GAAGU,cAAc,KAAK;QACvCV,YAAY,MAAM,GAAGU,cAAc,MAAM;IAC3C;IAEA,OAAO;QACL,MAAMV;QACNS;QACA,OAAOX,OAAO,OAAO,CAAC,KAAK;QAC3B,aAAaX,KAAK,SAAS,CAACW,OAAO,OAAO;QAC1C,OAAOA,OAAO,KAAK;IACrB;AACF;AAEO,eAAec,qBAAwBvE,OAO7C;IACC,MAAM,EAAEwE,SAAS,EAAEvE,OAAO,EAAEwE,aAAa,EAAE/E,gBAAgB,EAAEU,WAAW,EAAE,GACxEJ;IACF,MAAMW,eAAe+D,AAAAA,I
AAAA,8BAAAA,qBAAAA,AAAAA;IACrB,MAAMpE,mBAAmBL,QAAQ,UAAU,CAAC,MAAM;IAElD,MAAM0E,wBAAwBC,AAAAA,IAAAA,8BAAAA,sBAAAA,AAAAA,EAC5B5E,QAAQ,eAAe,IAAI,IAC3BwE;IAGF,MAAMK,cAAyD,EAAE;IAEjE,IAAIJ,eAAe,uBAAuB,OACxCI,YAAY,IAAI,CAAC;QACf,MAAM;QACN,WAAW;YACT,KAAKvE;YACL,QAAQ;QACV;IACF;IAGFuE,YAAY,IAAI,CAAC;QACf,MAAM;QACN,MAAMF;IACR;IAEA,MAAMhF,OAAe;QACnB;YAAE,MAAM;YAAU,SAASgB;QAAa;QACxC;YACE,MAAM;YACN,SAASkE;QACX;KACD;IAED,IAAInF,kBAAkB;QACpB,MAAM4B,SAAS,MAAM7B,mBAAmB;YACtC,QAAQC,iBAAiB,MAAM;YAC/B,yBAAyBA,iBAAiB,uBAAuB;QACnE;QACAC,KAAK,IAAI,IAAI2B;IACf;IAEA,MAAMmC,SAAS,MAAMC,AAAAA,IAAAA,yBAAAA,wBAAAA,AAAAA,EACnB/D,MACAS;IAEF,OAAO;QACL,aAAaqD,OAAO,OAAO;QAC3B,OAAOA,OAAO,KAAK;QACnB,mBAAmBA,OAAO,iBAAiB;IAC7C;AACF;AAEO,eAAeqB,sBACpBC,WAAmB,EACnB5E,QAAwE,EACxEC,WAAyB;IAKzB,MAAMO,eAAeqE,AAAAA,IAAAA,yCAAAA,iCAAAA,AAAAA;IACrB,MAAMC,aAAaC,AAAAA,IAAAA,yCAAAA,yBAAAA,AAAAA,EAA0BH;IAE7C,MAAMpF,OAAe;QACnB;YAAE,MAAM;YAAU,SAASgB;QAAa;QACxC;YACE,MAAM;YACN,SAASsE;QACX;KACD;IAED,MAAMxB,SAAS,MAAMtD,SAASR,MAAMS;IAEpC,OAAO;QACL,kBAAkBqD,OAAO,OAAO,CAAC,gBAAgB,IAAI;QACrD,OAAOA,OAAO,KAAK;IACrB;AACF"}
|
|
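Worth noting from the inspect.ts source embedded in that map: auto-GLM answers with a point on a 0-999 grid, which inspect.ts scales to pixels and widens into a 10-px rect. A worked sketch of that arithmetic in TypeScript (the formula, box size, and clamping come from the embedded source; the screenshot size and model output are invented sample values):

    // Sample values (invented): a 1280x720 screenshot, model answers (500, 250).
    const imageWidth = 1280, imageHeight = 720;
    const x = 500, y = 250; // auto-GLM point on the [0, 999] grid

    const pixelX = Math.round((x * imageWidth) / 1000);  // 640
    const pixelY = Math.round((y * imageHeight) / 1000); // 180

    // Widen the point into a small bbox, clamped to the image bounds.
    const bboxSize = 10;
    const x1 = Math.max(pixelX - bboxSize / 2, 0);           // 635
    const y1 = Math.max(pixelY - bboxSize / 2, 0);           // 175
    const x2 = Math.min(pixelX + bboxSize / 2, imageWidth);  // 645
    const y2 = Math.min(pixelY + bboxSize / 2, imageHeight); // 185
    const rect = { left: x1, top: y1, width: x2 - x1, height: y2 - y1 }; // a 10 x 10 rect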
package/dist/lib/ai-model/llm-planning.js:

@@ -24,6 +24,7 @@ var __webpack_require__ = {};
 var __webpack_exports__ = {};
 __webpack_require__.r(__webpack_exports__);
 __webpack_require__.d(__webpack_exports__, {
+    parseXMLPlanningResponse: ()=>parseXMLPlanningResponse,
     plan: ()=>plan
 });
 const img_namespaceObject = require("@midscene/shared/img");
@@ -33,14 +34,57 @@ const external_common_js_namespaceObject = require("../common.js");
 const llm_planning_js_namespaceObject = require("./prompt/llm-planning.js");
 const index_js_namespaceObject = require("./service-caller/index.js");
 const debug = (0, logger_namespaceObject.getDebug)('planning');
+function parseXMLPlanningResponse(xmlString) {
+    const extractTag = (tagName)=>{
+        const regex = new RegExp(`<${tagName}>([\\s\\S]*?)</${tagName}>`, 'i');
+        const match = xmlString.match(regex);
+        return match ? match[1].trim() : void 0;
+    };
+    const thought = extractTag('thought');
+    const note = extractTag('note');
+    const log = extractTag('log');
+    const error = extractTag('error');
+    const actionType = extractTag('action-type');
+    const actionParamStr = extractTag('action-param-json');
+    if (!log) throw new Error('Missing required field: log');
+    let action = null;
+    if (actionType && 'null' !== actionType.toLowerCase()) {
+        const type = actionType.trim();
+        let param;
+        if (actionParamStr) try {
+            param = (0, index_js_namespaceObject.safeParseJson)(actionParamStr, void 0);
+        } catch (e) {
+            throw new Error(`Failed to parse action-param-json: ${e}`);
+        }
+        action = {
+            type,
+            ...void 0 !== param ? {
+                param
+            } : {}
+        };
+    }
+    return {
+        ...thought ? {
+            thought
+        } : {},
+        ...note ? {
+            note
+        } : {},
+        log,
+        ...error ? {
+            error
+        } : {},
+        action
+    };
+}
 async function plan(userInstruction, opts) {
     const { context, modelConfig, conversationHistory } = opts;
     const { size } = context;
     const screenshotBase64 = context.screenshot.base64;
-    const {
+    const { modelFamily } = modelConfig;
     const systemPrompt = await (0, llm_planning_js_namespaceObject.systemPromptToTaskPlanning)({
         actionSpace: opts.actionSpace,
-
+        modelFamily,
         includeBbox: opts.includeBbox,
         includeThought: true !== opts.deepThink
     });
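A minimal usage sketch for the new parser. The tag names (thought, note, log, error, action-type, action-param-json) come from the function above; the planning text is an invented sample. The parser throws when <log> is missing, and action is null when <action-type> is absent or reads "null".

    // Invented sample reply in the XML shape parseXMLPlanningResponse expects.
    const raw = [
      '<thought>The search box is visible at the top.</thought>',
      '<log>Tap the search box</log>',
      '<action-type>Tap</action-type>',
      '<action-param-json>{"locate":{"prompt":"the search box"}}</action-param-json>',
    ].join('\n');

    const parsed = parseXMLPlanningResponse(raw);
    // => {
    //      thought: 'The search box is visible at the top.',
    //      log: 'Tap the search box',
    //      action: { type: 'Tap', param: { locate: { prompt: 'the search box' } } }
    //    }
    // `note` and `error` are omitted from the result because those tags are absent.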
@@ -49,7 +93,7 @@ async function plan(userInstruction, opts) {
     let imageHeight = size.height;
     const rightLimit = imageWidth;
     const bottomLimit = imageHeight;
-    if ('qwen2.5-vl' ===
+    if ('qwen2.5-vl' === modelFamily) {
         const paddedResult = await (0, img_namespaceObject.paddingToMatchBlockByBase64)(imagePayload);
         imageWidth = paddedResult.width;
         imageHeight = paddedResult.height;
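For the qwen2.5-vl family the screenshot is padded via paddingToMatchBlockByBase64 before planning. As a purely hypothetical illustration (this diff does not state the block size; 28 px is an assumption about qwen2.5-vl's patch grid, not something the package confirms), the dimension rounding would look like:

    // Hypothetical: assume each dimension is rounded up to a block multiple.
    const block = 28; // assumed value, not confirmed by this package
    const padTo = (n: number): number => Math.ceil(n / block) * block;
    console.log(padTo(1280), padTo(721)); // 1288 728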
@@ -112,9 +156,10 @@ async function plan(userInstruction, opts) {
         ...instruction,
         ...historyLog
     ];
-    const { content:
+    const { content: rawResponse, usage, reasoning_content } = await (0, index_js_namespaceObject.callAI)(msgs, modelConfig, {
         deepThink: 'unset' === opts.deepThink ? void 0 : opts.deepThink
     });
+    const planFromAI = parseXMLPlanningResponse(rawResponse);
     const actions = planFromAI.action ? [
         planFromAI.action
     ] : [];
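Because plan() now routes the raw callAI string through parseXMLPlanningResponse instead of expecting a structured object response, a malformed model reply surfaces as a thrown Error. A sketch of how a caller might guard that step; the try/catch wrapper is illustrative and not part of the package, only the two calls inside it are:

    // Illustrative guard around the new XML parsing step.
    let planFromAI;
    try {
      planFromAI = parseXMLPlanningResponse(rawResponse);
    } catch (e) {
      // e.g. 'Missing required field: log' or 'Failed to parse action-param-json: ...'
      throw new Error(`planning response could not be parsed: ${e}`);
    }
    const actions = planFromAI.action ? [planFromAI.action] : [];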
@@ -141,7 +186,7 @@ async function plan(userInstruction, opts) {
     debug('locateFields', locateFields);
     locateFields.forEach((field)=>{
         const locateResult = action.param[field];
-        if (locateResult && void 0 !==
+        if (locateResult && void 0 !== modelFamily) action.param[field] = (0, external_common_js_namespaceObject.fillBboxParam)(locateResult, imageWidth, imageHeight, rightLimit, bottomLimit, modelFamily);
     });
 });
 conversationHistory.append({
@@ -155,8 +200,10 @@ async function plan(userInstruction, opts) {
 });
 return returnValue;
 }
+exports.parseXMLPlanningResponse = __webpack_exports__.parseXMLPlanningResponse;
 exports.plan = __webpack_exports__.plan;
 for(var __rspack_i in __webpack_exports__)if (-1 === [
+    "parseXMLPlanningResponse",
     "plan"
 ].indexOf(__rspack_i)) exports[__rspack_i] = __webpack_exports__[__rspack_i];
 Object.defineProperty(exports, '__esModule', {