@midscene/core 1.2.2-beta-20260120033218.0 → 1.2.3-beta-20260120082504.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/es/agent/agent.mjs +3 -3
- package/dist/es/agent/agent.mjs.map +1 -1
- package/dist/es/agent/task-builder.mjs +4 -2
- package/dist/es/agent/task-builder.mjs.map +1 -1
- package/dist/es/agent/tasks.mjs +9 -5
- package/dist/es/agent/tasks.mjs.map +1 -1
- package/dist/es/agent/utils.mjs +1 -1
- package/dist/es/ai-model/inspect.mjs +7 -6
- package/dist/es/ai-model/inspect.mjs.map +1 -1
- package/dist/es/ai-model/llm-planning.mjs +60 -6
- package/dist/es/ai-model/llm-planning.mjs.map +1 -1
- package/dist/es/ai-model/prompt/extraction.mjs +51 -53
- package/dist/es/ai-model/prompt/extraction.mjs.map +1 -1
- package/dist/es/ai-model/prompt/llm-planning.mjs +64 -49
- package/dist/es/ai-model/prompt/llm-planning.mjs.map +1 -1
- package/dist/es/ai-model/prompt/util.mjs +6 -1
- package/dist/es/ai-model/prompt/util.mjs.map +1 -1
- package/dist/es/device/index.mjs +2 -14
- package/dist/es/device/index.mjs.map +1 -1
- package/dist/es/types.mjs.map +1 -1
- package/dist/es/utils.mjs +2 -2
- package/dist/lib/agent/agent.js +2 -2
- package/dist/lib/agent/agent.js.map +1 -1
- package/dist/lib/agent/task-builder.js +4 -2
- package/dist/lib/agent/task-builder.js.map +1 -1
- package/dist/lib/agent/tasks.js +9 -5
- package/dist/lib/agent/tasks.js.map +1 -1
- package/dist/lib/agent/utils.js +1 -1
- package/dist/lib/ai-model/inspect.js +5 -4
- package/dist/lib/ai-model/inspect.js.map +1 -1
- package/dist/lib/ai-model/llm-planning.js +60 -3
- package/dist/lib/ai-model/llm-planning.js.map +1 -1
- package/dist/lib/ai-model/prompt/extraction.js +53 -55
- package/dist/lib/ai-model/prompt/extraction.js.map +1 -1
- package/dist/lib/ai-model/prompt/llm-planning.js +64 -49
- package/dist/lib/ai-model/prompt/llm-planning.js.map +1 -1
- package/dist/lib/ai-model/prompt/util.js +8 -0
- package/dist/lib/ai-model/prompt/util.js.map +1 -1
- package/dist/lib/device/index.js +16 -34
- package/dist/lib/device/index.js.map +1 -1
- package/dist/lib/types.js.map +1 -1
- package/dist/lib/utils.js +2 -2
- package/dist/types/agent/task-builder.d.ts +3 -1
- package/dist/types/agent/tasks.d.ts +2 -0
- package/dist/types/ai-model/inspect.d.ts +2 -2
- package/dist/types/ai-model/llm-planning.d.ts +6 -2
- package/dist/types/ai-model/prompt/extraction.d.ts +5 -2
- package/dist/types/ai-model/prompt/util.d.ts +7 -0
- package/dist/types/device/index.d.ts +0 -11
- package/dist/types/types.d.ts +8 -0
- package/dist/types/yaml.d.ts +1 -5
- package/package.json +2 -2
|
@@ -24,11 +24,39 @@ var __webpack_require__ = {};
|
|
|
24
24
|
var __webpack_exports__ = {};
|
|
25
25
|
__webpack_require__.r(__webpack_exports__);
|
|
26
26
|
__webpack_require__.d(__webpack_exports__, {
|
|
27
|
-
|
|
27
|
+
parseXMLExtractionResponse: ()=>parseXMLExtractionResponse,
|
|
28
28
|
systemPromptToExtract: ()=>systemPromptToExtract,
|
|
29
29
|
extractDataQueryPrompt: ()=>extractDataQueryPrompt
|
|
30
30
|
});
|
|
31
31
|
const env_namespaceObject = require("@midscene/shared/env");
|
|
32
|
+
const index_js_namespaceObject = require("../service-caller/index.js");
|
|
33
|
+
const external_util_js_namespaceObject = require("./util.js");
|
|
34
|
+
function parseXMLExtractionResponse(xmlString) {
|
|
35
|
+
const thought = (0, external_util_js_namespaceObject.extractXMLTag)(xmlString, 'thought');
|
|
36
|
+
const dataJsonStr = (0, external_util_js_namespaceObject.extractXMLTag)(xmlString, 'data-json');
|
|
37
|
+
const errorsStr = (0, external_util_js_namespaceObject.extractXMLTag)(xmlString, 'errors');
|
|
38
|
+
if (!dataJsonStr) throw new Error('Missing required field: data-json');
|
|
39
|
+
let data;
|
|
40
|
+
try {
|
|
41
|
+
data = (0, index_js_namespaceObject.safeParseJson)(dataJsonStr, void 0);
|
|
42
|
+
} catch (e) {
|
|
43
|
+
throw new Error(`Failed to parse data-json: ${e}`);
|
|
44
|
+
}
|
|
45
|
+
let errors;
|
|
46
|
+
if (errorsStr) try {
|
|
47
|
+
const parsedErrors = (0, index_js_namespaceObject.safeParseJson)(errorsStr, void 0);
|
|
48
|
+
if (Array.isArray(parsedErrors)) errors = parsedErrors;
|
|
49
|
+
} catch (e) {}
|
|
50
|
+
return {
|
|
51
|
+
...thought ? {
|
|
52
|
+
thought
|
|
53
|
+
} : {},
|
|
54
|
+
data,
|
|
55
|
+
...errors && errors.length > 0 ? {
|
|
56
|
+
errors
|
|
57
|
+
} : {}
|
|
58
|
+
};
|
|
59
|
+
}
|
|
32
60
|
function systemPromptToExtract() {
|
|
33
61
|
const preferredLanguage = (0, env_namespaceObject.getPreferredLanguage)();
|
|
34
62
|
return `
|
|
@@ -41,12 +69,10 @@ If a key specifies a JSON data type (such as Number, String, Boolean, Object, Ar
|
|
|
41
69
|
If the user provides multiple reference images, please carefully review the reference images with the screenshot and provide the correct answer for <DATA_DEMAND>.
|
|
42
70
|
|
|
43
71
|
|
|
44
|
-
Return in the following
|
|
45
|
-
{
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
errors: [], // string[], error message if any
|
|
49
|
-
}
|
|
72
|
+
Return in the following XML format:
|
|
73
|
+
<thought>the thinking process of the extraction, less than 300 words. Use ${preferredLanguage} in this field.</thought>
|
|
74
|
+
<data-json>the extracted data as JSON. Make sure both the value and scheme meet the DATA_DEMAND. If you want to write some description in this field, use the same language as the DATA_DEMAND.</data-json>
|
|
75
|
+
<errors>optional error messages as JSON array, e.g., ["error1", "error2"]</errors>
|
|
50
76
|
|
|
51
77
|
# Example 1
|
|
52
78
|
For example, if the DATA_DEMAND is:
|
|
@@ -61,14 +87,14 @@ For example, if the DATA_DEMAND is:
|
|
|
61
87
|
|
|
62
88
|
By viewing the screenshot and page contents, you can extract the following data:
|
|
63
89
|
|
|
90
|
+
<thought>According to the screenshot, i can see ...</thought>
|
|
91
|
+
<data-json>
|
|
64
92
|
{
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
age: 30,
|
|
69
|
-
isAdmin: true
|
|
70
|
-
},
|
|
93
|
+
"name": "John",
|
|
94
|
+
"age": 30,
|
|
95
|
+
"isAdmin": true
|
|
71
96
|
}
|
|
97
|
+
</data-json>
|
|
72
98
|
|
|
73
99
|
# Example 2
|
|
74
100
|
If the DATA_DEMAND is:
|
|
@@ -79,10 +105,10 @@ the todo items list, string[]
|
|
|
79
105
|
|
|
80
106
|
By viewing the screenshot and page contents, you can extract the following data:
|
|
81
107
|
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
108
|
+
<thought>According to the screenshot, i can see ...</thought>
|
|
109
|
+
<data-json>
|
|
110
|
+
["todo 1", "todo 2", "todo 3"]
|
|
111
|
+
</data-json>
|
|
86
112
|
|
|
87
113
|
# Example 3
|
|
88
114
|
If the DATA_DEMAND is:
|
|
@@ -93,10 +119,10 @@ the page title, string
|
|
|
93
119
|
|
|
94
120
|
By viewing the screenshot and page contents, you can extract the following data:
|
|
95
121
|
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
122
|
+
<thought>According to the screenshot, i can see ...</thought>
|
|
123
|
+
<data-json>
|
|
124
|
+
"todo list"
|
|
125
|
+
</data-json>
|
|
100
126
|
|
|
101
127
|
# Example 4
|
|
102
128
|
If the DATA_DEMAND is:
|
|
@@ -109,10 +135,10 @@ If the DATA_DEMAND is:
|
|
|
109
135
|
|
|
110
136
|
By viewing the screenshot and page contents, you can extract the following data:
|
|
111
137
|
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
138
|
+
<thought>According to the screenshot, i can see ...</thought>
|
|
139
|
+
<data-json>
|
|
140
|
+
{ "result": true }
|
|
141
|
+
</data-json>
|
|
116
142
|
`;
|
|
117
143
|
}
|
|
118
144
|
const extractDataQueryPrompt = (pageDescription, dataQuery)=>{
|
|
@@ -128,40 +154,12 @@ ${dataQueryText}
|
|
|
128
154
|
</DATA_DEMAND>
|
|
129
155
|
`;
|
|
130
156
|
};
|
|
131
|
-
const extractDataSchema = {
|
|
132
|
-
type: 'json_schema',
|
|
133
|
-
json_schema: {
|
|
134
|
-
name: 'extract_data',
|
|
135
|
-
strict: true,
|
|
136
|
-
schema: {
|
|
137
|
-
type: 'object',
|
|
138
|
-
properties: {
|
|
139
|
-
data: {
|
|
140
|
-
type: 'object',
|
|
141
|
-
description: 'The extracted data'
|
|
142
|
-
},
|
|
143
|
-
errors: {
|
|
144
|
-
type: 'array',
|
|
145
|
-
items: {
|
|
146
|
-
type: 'string'
|
|
147
|
-
},
|
|
148
|
-
description: 'Error messages, if any'
|
|
149
|
-
}
|
|
150
|
-
},
|
|
151
|
-
required: [
|
|
152
|
-
'data',
|
|
153
|
-
'errors'
|
|
154
|
-
],
|
|
155
|
-
additionalProperties: false
|
|
156
|
-
}
|
|
157
|
-
}
|
|
158
|
-
};
|
|
159
157
|
exports.extractDataQueryPrompt = __webpack_exports__.extractDataQueryPrompt;
|
|
160
|
-
exports.
|
|
158
|
+
exports.parseXMLExtractionResponse = __webpack_exports__.parseXMLExtractionResponse;
|
|
161
159
|
exports.systemPromptToExtract = __webpack_exports__.systemPromptToExtract;
|
|
162
160
|
for(var __rspack_i in __webpack_exports__)if (-1 === [
|
|
163
161
|
"extractDataQueryPrompt",
|
|
164
|
-
"
|
|
162
|
+
"parseXMLExtractionResponse",
|
|
165
163
|
"systemPromptToExtract"
|
|
166
164
|
].indexOf(__rspack_i)) exports[__rspack_i] = __webpack_exports__[__rspack_i];
|
|
167
165
|
Object.defineProperty(exports, '__esModule', {
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"ai-model/prompt/extraction.js","sources":["webpack/runtime/define_property_getters","webpack/runtime/has_own_property","webpack/runtime/make_namespace_object","../../../../src/ai-model/prompt/extraction.ts"],"sourcesContent":["__webpack_require__.d = (exports, definition) => {\n\tfor(var key in definition) {\n if(__webpack_require__.o(definition, key) && !__webpack_require__.o(exports, key)) {\n Object.defineProperty(exports, key, { enumerable: true, get: definition[key] });\n }\n }\n};","__webpack_require__.o = (obj, prop) => (Object.prototype.hasOwnProperty.call(obj, prop))","// define __esModule on exports\n__webpack_require__.r = (exports) => {\n\tif(typeof Symbol !== 'undefined' && Symbol.toStringTag) {\n\t\tObject.defineProperty(exports, Symbol.toStringTag, { value: 'Module' });\n\t}\n\tObject.defineProperty(exports, '__esModule', { value: true });\n};","import { getPreferredLanguage } from '@midscene/shared/env';\nimport type { ResponseFormatJSONSchema } from 'openai/resources/index';\n\nexport function systemPromptToExtract() {\n const preferredLanguage = getPreferredLanguage();\n\n return `\nYou are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.\n\nThe user will give you a screenshot, the contents of it (optional), and some data requirements in <DATA_DEMAND>. You need to understand the user's requirements and extract the data satisfying the <DATA_DEMAND>.\n\nIf a key specifies a JSON data type (such as Number, String, Boolean, Object, Array), ensure the returned value strictly matches that data type.\n\nIf the user provides multiple reference images, please carefully review the reference images with the screenshot and provide the correct answer for <DATA_DEMAND>.\n\n\nReturn in the following
|
|
1
|
+
{"version":3,"file":"ai-model/prompt/extraction.js","sources":["webpack/runtime/define_property_getters","webpack/runtime/has_own_property","webpack/runtime/make_namespace_object","../../../../src/ai-model/prompt/extraction.ts"],"sourcesContent":["__webpack_require__.d = (exports, definition) => {\n\tfor(var key in definition) {\n if(__webpack_require__.o(definition, key) && !__webpack_require__.o(exports, key)) {\n Object.defineProperty(exports, key, { enumerable: true, get: definition[key] });\n }\n }\n};","__webpack_require__.o = (obj, prop) => (Object.prototype.hasOwnProperty.call(obj, prop))","// define __esModule on exports\n__webpack_require__.r = (exports) => {\n\tif(typeof Symbol !== 'undefined' && Symbol.toStringTag) {\n\t\tObject.defineProperty(exports, Symbol.toStringTag, { value: 'Module' });\n\t}\n\tObject.defineProperty(exports, '__esModule', { value: true });\n};","import type { AIDataExtractionResponse } from '@/types';\nimport { getPreferredLanguage } from '@midscene/shared/env';\nimport type { ResponseFormatJSONSchema } from 'openai/resources/index';\nimport { safeParseJson } from '../service-caller/index';\nimport { extractXMLTag } from './util';\n\n/**\n * Parse XML response from LLM and convert to AIDataExtractionResponse\n */\nexport function parseXMLExtractionResponse<T>(\n xmlString: string,\n): AIDataExtractionResponse<T> {\n const thought = extractXMLTag(xmlString, 'thought');\n const dataJsonStr = extractXMLTag(xmlString, 'data-json');\n const errorsStr = extractXMLTag(xmlString, 'errors');\n\n // Parse data-json (required)\n if (!dataJsonStr) {\n throw new Error('Missing required field: data-json');\n }\n\n let data: T;\n try {\n data = safeParseJson(dataJsonStr, undefined) as T;\n } catch (e) {\n throw new Error(`Failed to parse data-json: ${e}`);\n }\n\n // Parse errors (optional)\n let errors: string[] | undefined;\n if (errorsStr) {\n try {\n const parsedErrors = safeParseJson(errorsStr, undefined);\n if (Array.isArray(parsedErrors)) {\n errors = parsedErrors;\n }\n } catch (e) {\n // If errors parsing fails, just ignore it\n }\n }\n\n return {\n ...(thought ? { thought } : {}),\n data,\n ...(errors && errors.length > 0 ? { errors } : {}),\n };\n}\n\nexport function systemPromptToExtract() {\n const preferredLanguage = getPreferredLanguage();\n\n return `\nYou are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.\n\nThe user will give you a screenshot, the contents of it (optional), and some data requirements in <DATA_DEMAND>. You need to understand the user's requirements and extract the data satisfying the <DATA_DEMAND>.\n\nIf a key specifies a JSON data type (such as Number, String, Boolean, Object, Array), ensure the returned value strictly matches that data type.\n\nIf the user provides multiple reference images, please carefully review the reference images with the screenshot and provide the correct answer for <DATA_DEMAND>.\n\n\nReturn in the following XML format:\n<thought>the thinking process of the extraction, less than 300 words. Use ${preferredLanguage} in this field.</thought>\n<data-json>the extracted data as JSON. Make sure both the value and scheme meet the DATA_DEMAND. If you want to write some description in this field, use the same language as the DATA_DEMAND.</data-json>\n<errors>optional error messages as JSON array, e.g., [\"error1\", \"error2\"]</errors>\n\n# Example 1\nFor example, if the DATA_DEMAND is:\n\n<DATA_DEMAND>\n{\n \"name\": \"name shows on the left panel, string\",\n \"age\": \"age shows on the right panel, number\",\n \"isAdmin\": \"if the user is admin, boolean\"\n}\n</DATA_DEMAND>\n\nBy viewing the screenshot and page contents, you can extract the following data:\n\n<thought>According to the screenshot, i can see ...</thought>\n<data-json>\n{\n \"name\": \"John\",\n \"age\": 30,\n \"isAdmin\": true\n}\n</data-json>\n\n# Example 2\nIf the DATA_DEMAND is:\n\n<DATA_DEMAND>\nthe todo items list, string[]\n</DATA_DEMAND>\n\nBy viewing the screenshot and page contents, you can extract the following data:\n\n<thought>According to the screenshot, i can see ...</thought>\n<data-json>\n[\"todo 1\", \"todo 2\", \"todo 3\"]\n</data-json>\n\n# Example 3\nIf the DATA_DEMAND is:\n\n<DATA_DEMAND>\nthe page title, string\n</DATA_DEMAND>\n\nBy viewing the screenshot and page contents, you can extract the following data:\n\n<thought>According to the screenshot, i can see ...</thought>\n<data-json>\n\"todo list\"\n</data-json>\n\n# Example 4\nIf the DATA_DEMAND is:\n\n<DATA_DEMAND>\n{\n \"result\": \"Boolean, is it currently the SMS page?\"\n}\n</DATA_DEMAND>\n\nBy viewing the screenshot and page contents, you can extract the following data:\n\n<thought>According to the screenshot, i can see ...</thought>\n<data-json>\n{ \"result\": true }\n</data-json>\n`;\n}\n\nexport const extractDataQueryPrompt = (\n pageDescription: string,\n dataQuery: string | Record<string, string>,\n) => {\n let dataQueryText = '';\n if (typeof dataQuery === 'string') {\n dataQueryText = dataQuery;\n } else {\n dataQueryText = JSON.stringify(dataQuery, null, 2);\n }\n\n return `\n<PageDescription>\n${pageDescription}\n</PageDescription>\n\n<DATA_DEMAND>\n${dataQueryText}\n</DATA_DEMAND>\n `;\n};\n"],"names":["__webpack_require__","definition","key","Object","obj","prop","Symbol","parseXMLExtractionResponse","xmlString","thought","extractXMLTag","dataJsonStr","errorsStr","Error","data","safeParseJson","undefined","e","errors","parsedErrors","Array","systemPromptToExtract","preferredLanguage","getPreferredLanguage","extractDataQueryPrompt","pageDescription","dataQuery","dataQueryText","JSON"],"mappings":";;;IAAAA,oBAAoB,CAAC,GAAG,CAAC,UAASC;QACjC,IAAI,IAAIC,OAAOD,WACR,IAAGD,oBAAoB,CAAC,CAACC,YAAYC,QAAQ,CAACF,oBAAoB,CAAC,CAAC,UAASE,MACzEC,OAAO,cAAc,CAAC,UAASD,KAAK;YAAE,YAAY;YAAM,KAAKD,UAAU,CAACC,IAAI;QAAC;IAGzF;;;ICNAF,oBAAoB,CAAC,GAAG,CAACI,KAAKC,OAAUF,OAAO,SAAS,CAAC,cAAc,CAAC,IAAI,CAACC,KAAKC;;;ICClFL,oBAAoB,CAAC,GAAG,CAAC;QACxB,IAAG,AAAkB,eAAlB,OAAOM,UAA0BA,OAAO,WAAW,EACrDH,OAAO,cAAc,CAAC,UAASG,OAAO,WAAW,EAAE;YAAE,OAAO;QAAS;QAEtEH,OAAO,cAAc,CAAC,UAAS,cAAc;YAAE,OAAO;QAAK;IAC5D;;;;;;;;;;;;ACGO,SAASI,2BACdC,SAAiB;IAEjB,MAAMC,UAAUC,AAAAA,IAAAA,iCAAAA,aAAAA,AAAAA,EAAcF,WAAW;IACzC,MAAMG,cAAcD,AAAAA,IAAAA,iCAAAA,aAAAA,AAAAA,EAAcF,WAAW;IAC7C,MAAMI,YAAYF,AAAAA,IAAAA,iCAAAA,aAAAA,AAAAA,EAAcF,WAAW;IAG3C,IAAI,CAACG,aACH,MAAM,IAAIE,MAAM;IAGlB,IAAIC;IACJ,IAAI;QACFA,OAAOC,AAAAA,IAAAA,yBAAAA,aAAAA,AAAAA,EAAcJ,aAAaK;IACpC,EAAE,OAAOC,GAAG;QACV,MAAM,IAAIJ,MAAM,CAAC,2BAA2B,EAAEI,GAAG;IACnD;IAGA,IAAIC;IACJ,IAAIN,WACF,IAAI;QACF,MAAMO,eAAeJ,AAAAA,IAAAA,yBAAAA,aAAAA,AAAAA,EAAcH,WAAWI;QAC9C,IAAII,MAAM,OAAO,CAACD,eAChBD,SAASC;IAEb,EAAE,OAAOF,GAAG,CAEZ;IAGF,OAAO;QACL,GAAIR,UAAU;YAAEA;QAAQ,IAAI,CAAC,CAAC;QAC9BK;QACA,GAAII,UAAUA,OAAO,MAAM,GAAG,IAAI;YAAEA;QAAO,IAAI,CAAC,CAAC;IACnD;AACF;AAEO,SAASG;IACd,MAAMC,oBAAoBC,AAAAA,IAAAA,oBAAAA,oBAAAA,AAAAA;IAE1B,OAAO,CAAC;;;;;;;;;;;0EAWgE,EAAED,kBAAkB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAqE9F,CAAC;AACD;AAEO,MAAME,yBAAyB,CACpCC,iBACAC;IAEA,IAAIC,gBAAgB;IAElBA,gBADE,AAAqB,YAArB,OAAOD,YACOA,YAEAE,KAAK,SAAS,CAACF,WAAW,MAAM;IAGlD,OAAO,CAAC;;AAEV,EAAED,gBAAgB;;;;AAIlB,EAAEE,cAAc;;EAEd,CAAC;AACH"}
|
|
@@ -30,15 +30,6 @@ __webpack_require__.d(__webpack_exports__, {
|
|
|
30
30
|
const env_namespaceObject = require("@midscene/shared/env");
|
|
31
31
|
const zod_schema_utils_namespaceObject = require("@midscene/shared/zod-schema-utils");
|
|
32
32
|
const external_common_js_namespaceObject = require("./common.js");
|
|
33
|
-
const buildCommonOutputFields = (includeThought, preferredLanguage)=>{
|
|
34
|
-
const fields = [
|
|
35
|
-
`"note"?: string, // some important notes to finish the follow-up action should be written here, and the agent executing the subsequent steps will focus on this information. For example, the data extracted from the current screenshot which will be used in the follow-up action. Use ${preferredLanguage}.`,
|
|
36
|
-
`"log": string, // a brief preamble to the user explaining what you're about to do. Use ${preferredLanguage}.`,
|
|
37
|
-
`"error"?: string, // Error messages about unexpected situations, if any. Only think it is an error when the situation is not foreseeable according to the instruction. Use ${preferredLanguage}.`
|
|
38
|
-
];
|
|
39
|
-
if (includeThought) fields.unshift('"thought": string, // your thought process about the next action');
|
|
40
|
-
return fields.join('\n ');
|
|
41
|
-
};
|
|
42
33
|
const vlLocateParam = (modelFamily)=>{
|
|
43
34
|
if (modelFamily) return `{bbox: [number, number, number, number], prompt: string } // ${(0, external_common_js_namespaceObject.bboxDescription)(modelFamily)}`;
|
|
44
35
|
return "{ prompt: string /* description of the target element */ }";
|
|
@@ -113,7 +104,7 @@ The \`log\` field is a brief preamble message to the user explaining what you're
|
|
|
113
104
|
|
|
114
105
|
- **Use ${preferredLanguage}**
|
|
115
106
|
- **Keep it concise**: be no more than 1-2 sentences, focused on immediate, tangible next steps. (8–12 words or Chinese characters for quick updates).
|
|
116
|
-
- **Build on prior context**: if this is not the first action to be done, use the preamble message to connect the dots with what
|
|
107
|
+
- **Build on prior context**: if this is not the first action to be done, use the preamble message to connect the dots with what's been done so far and create a sense of momentum and clarity for the user to understand your next actions.
|
|
117
108
|
- **Keep your tone light, friendly and curious**: add small touches of personality in preambles feel collaborative and engaging.
|
|
118
109
|
|
|
119
110
|
**Examples:**
|
|
@@ -123,13 +114,27 @@ The \`log\` field is a brief preamble message to the user explaining what you're
|
|
|
123
114
|
- "Go back to find the login button"
|
|
124
115
|
`;
|
|
125
116
|
const shouldIncludeThought = includeThought ?? true;
|
|
126
|
-
const
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
117
|
+
const locateExample1 = includeBbox ? `{
|
|
118
|
+
"prompt": "Add to cart button for Sauce Labs Backpack",
|
|
119
|
+
"bbox": [345, 442, 458, 483]
|
|
120
|
+
}` : `{
|
|
121
|
+
"prompt": "Add to cart button for Sauce Labs Backpack"
|
|
122
|
+
}`;
|
|
123
|
+
const locateExample2 = includeBbox ? `{
|
|
124
|
+
"prompt": "Add to cart button for Sauce Labs Bike Light",
|
|
125
|
+
"bbox": [732, 442, 844, 483]
|
|
126
|
+
}` : `{
|
|
127
|
+
"prompt": "Add to cart button for Sauce Labs Bike Light"
|
|
128
|
+
}`;
|
|
129
|
+
const locateExample3 = includeBbox ? `{
|
|
130
|
+
"prompt": "Cart icon in top right corner",
|
|
131
|
+
"bbox": [956, 17, 982, 54]
|
|
132
|
+
}` : `{
|
|
133
|
+
"prompt": "Cart icon in top right corner"
|
|
134
|
+
}`;
|
|
135
|
+
const thoughtTag = (content)=>shouldIncludeThought ? `<thought>${content}</thought>\n` : '';
|
|
131
136
|
return `
|
|
132
|
-
Target: User will give you an instruction, some screenshots and previous logs indicating what have been done. Your task is to
|
|
137
|
+
Target: User will give you an instruction, some screenshots and previous logs indicating what have been done. Your task is to accomplish the instruction.
|
|
133
138
|
|
|
134
139
|
Please tell what the next one action is (or null if no action should be done) to do the tasks the instruction requires.
|
|
135
140
|
|
|
@@ -139,9 +144,9 @@ Please tell what the next one action is (or null if no action should be done) to
|
|
|
139
144
|
- Give just the next ONE action you should do
|
|
140
145
|
- Consider the current screenshot and give the action that is most likely to accomplish the instruction. For example, if the next step is to click a button but it's not visible in the screenshot, you should try to find it first instead of give a click action.
|
|
141
146
|
- Make sure the previous actions are completed successfully before performing the next step
|
|
142
|
-
- If there are some error messages reported by the previous actions, don't give up, try parse a new action to recover. If the error persists for more than
|
|
147
|
+
- If there are some error messages reported by the previous actions, don't give up, try parse a new action to recover. If the error persists for more than 3 times, you should think this is an error and set the "error" field to the error message.
|
|
143
148
|
- Assertions are also important steps. When getting the assertion instruction, a solid conclusion is required. You should explicitly state your conclusion by calling the "Print_Assert_Result" action.
|
|
144
|
-
-
|
|
149
|
+
- Return the "complete-task" tag when the task is completed and no more actions should be done.
|
|
145
150
|
|
|
146
151
|
## Supporting actions
|
|
147
152
|
${actionList}
|
|
@@ -150,47 +155,57 @@ ${logFieldInstruction}
|
|
|
150
155
|
|
|
151
156
|
## Return format
|
|
152
157
|
|
|
153
|
-
Return in
|
|
158
|
+
Return in XML format with the following structure:
|
|
159
|
+
${shouldIncludeThought ? "<thought>Think through the following: What is the user's requirement? What is the current state based on the screenshot? What should be the next action and which action-type to use (or error, or complete-task)? Write your thoughts naturally without numbering or section headers.</thought>" : ''}
|
|
160
|
+
<note>CRITICAL: If any information from the current screenshot will be needed in follow-up actions, you MUST record it here completely. The current screenshot will NOT be available in subsequent steps, so this note is your only way to preserve essential information for later use. Examples: extracted data, element states, content that needs to be referenced. Leave empty if no follow-up information is needed.</note>
|
|
161
|
+
<log>a brief preamble to the user</log>
|
|
162
|
+
<error>error messages (optional)</error>
|
|
163
|
+
<action-type>the type of the action, or null if no action</action-type>
|
|
164
|
+
<action-param-json>JSON object containing the action parameters</action-param-json>
|
|
165
|
+
<complete-task success="true|false">Optional: Use this tag to finalize the task when all instructions have been completed. Set success="true" if the task succeeded, or success="false" if it failed. When success="true", the message should contain the conclusion, data, or return value that the user needs. When success="false", the message MUST explain why the task failed and what went wrong. When this tag is present, no action-type or action-param-json is needed.</complete-task>
|
|
166
|
+
|
|
167
|
+
## Example
|
|
168
|
+
|
|
169
|
+
This is an example of a complete interaction flow:
|
|
170
|
+
|
|
171
|
+
user: <user_instruction>Add first two items to the cart and tell me the total price of the cart. Just the price number, no other text</user_instruction>
|
|
172
|
+
|
|
173
|
+
user: this is the latest screenshot
|
|
174
|
+
(image ignored due to size optimization)
|
|
175
|
+
|
|
176
|
+
assistant: ${thoughtTag('The instruction is to add the first two items to the cart and report the total price (number only). The screenshot shows a product listing page with the first two items being "Sauce Labs Backpack" ($29.99) and "Sauce Labs Bike Light" ($9.99), both with "Add to cart" buttons visible. I should click the "Add to cart" button for the first item (Sauce Labs Backpack) using the Tap action.')}<log>Click 'Add to cart' for the Sauce Labs Backpack</log>
|
|
177
|
+
<action-type>Tap</action-type>
|
|
178
|
+
<action-param-json>
|
|
154
179
|
{
|
|
155
|
-
${
|
|
156
|
-
"action":
|
|
157
|
-
{
|
|
158
|
-
"type": string, // the type of the action
|
|
159
|
-
"param"?: { // The parameter of the action, if any
|
|
160
|
-
// k-v style parameter fields
|
|
161
|
-
},
|
|
162
|
-
} | null
|
|
180
|
+
"locate": ${locateExample1}
|
|
163
181
|
}
|
|
182
|
+
</action-param-json>
|
|
164
183
|
|
|
165
|
-
|
|
184
|
+
user: Time: 2026-01-20 14:38:03 (YYYY-MM-DD HH:mm:ss), I have finished the action previously planned.. The last screenshot is attached. Please going on according to the instruction.
|
|
185
|
+
(image ignored due to size optimization)
|
|
166
186
|
|
|
187
|
+
assistant: ${thoughtTag('The instruction is to add the first two items to the cart and report the total price (number only). The first item (Sauce Labs Backpack) has been successfully added - its button now shows "Remove" and the cart icon displays 1 item. The second item (Sauce Labs Bike Light) still shows "Add to cart" button. I should now click the "Add to cart" button for the second item using the Tap action.')}<log>Click 'Add to cart' for the Sauce Labs Bike Light</log>
|
|
188
|
+
<action-type>Tap</action-type>
|
|
189
|
+
<action-param-json>
|
|
167
190
|
{
|
|
168
|
-
|
|
169
|
-
"action": {
|
|
170
|
-
"type": "Tap",
|
|
171
|
-
"param": {
|
|
172
|
-
"locate": {
|
|
173
|
-
"prompt": "The login button"${modelFamily && includeBbox ? ', "bbox": [100, 200, 300, 400]' : ''}
|
|
174
|
-
}
|
|
175
|
-
}
|
|
176
|
-
}
|
|
191
|
+
"locate": ${locateExample2}
|
|
177
192
|
}
|
|
193
|
+
</action-param-json>
|
|
178
194
|
|
|
179
|
-
|
|
195
|
+
user: Time: 2026-01-20 14:38:08 (YYYY-MM-DD HH:mm:ss), I have finished the action previously planned.. The last screenshot is attached. Please going on according to the instruction.
|
|
180
196
|
|
|
197
|
+
assistant: ${thoughtTag('The instruction is to add the first two items to the cart and report the total price (number only). Both items have been successfully added - both show "Remove" buttons and the cart icon displays 2 items. The total should be $29.99 + $9.99 = $39.98, but I need to verify by viewing the cart. I should click the cart icon to view the cart details and confirm the total price using the Tap action.')}<log>Click the cart icon to view the cart and confirm the total price</log>
|
|
198
|
+
<action-type>Tap</action-type>
|
|
199
|
+
<action-param-json>
|
|
181
200
|
{
|
|
182
|
-
|
|
183
|
-
"log": "Scroll to find more titles",
|
|
184
|
-
"action": {
|
|
185
|
-
"type": "Scroll",
|
|
186
|
-
"param": {
|
|
187
|
-
"locate": {
|
|
188
|
-
"prompt": "The page content area"
|
|
189
|
-
},
|
|
190
|
-
"direction": "down"
|
|
191
|
-
}
|
|
192
|
-
}
|
|
201
|
+
"locate": ${locateExample3}
|
|
193
202
|
}
|
|
203
|
+
</action-param-json>
|
|
204
|
+
|
|
205
|
+
user: Time: 2026-01-20 14:38:13 (YYYY-MM-DD HH:mm:ss), I have finished the action previously planned.. The last screenshot is attached. Please going on according to the instruction.
|
|
206
|
+
|
|
207
|
+
assistant: ${thoughtTag('The instruction is to add the first two items to the cart and report the total price (number only). The cart now displays both items: Sauce Labs Backpack ($29.99) and Sauce Labs Bike Light ($9.99). The total price is $29.99 + $9.99 = $39.98. All required actions are complete. I should use complete-task with success="true" to return the total price as requested (just the number 39.98).')}<log>Report the total price: 39.98</log>
|
|
208
|
+
<complete-task success="true">39.98</complete-task>
|
|
194
209
|
`;
|
|
195
210
|
}
|
|
196
211
|
exports.descriptionForAction = __webpack_exports__.descriptionForAction;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"ai-model/prompt/llm-planning.js","sources":["webpack/runtime/define_property_getters","webpack/runtime/has_own_property","webpack/runtime/make_namespace_object","../../../../src/ai-model/prompt/llm-planning.ts"],"sourcesContent":["__webpack_require__.d = (exports, definition) => {\n\tfor(var key in definition) {\n if(__webpack_require__.o(definition, key) && !__webpack_require__.o(exports, key)) {\n Object.defineProperty(exports, key, { enumerable: true, get: definition[key] });\n }\n }\n};","__webpack_require__.o = (obj, prop) => (Object.prototype.hasOwnProperty.call(obj, prop))","// define __esModule on exports\n__webpack_require__.r = (exports) => {\n\tif(typeof Symbol !== 'undefined' && Symbol.toStringTag) {\n\t\tObject.defineProperty(exports, Symbol.toStringTag, { value: 'Module' });\n\t}\n\tObject.defineProperty(exports, '__esModule', { value: true });\n};","import type { DeviceAction } from '@/types';\nimport type { TModelFamily } from '@midscene/shared/env';\nimport { getPreferredLanguage } from '@midscene/shared/env';\nimport {\n getZodDescription,\n getZodTypeName,\n} from '@midscene/shared/zod-schema-utils';\nimport type { ResponseFormatJSONSchema } from 'openai/resources/index';\nimport type { z } from 'zod';\nimport { bboxDescription } from './common';\n\n// Note: put the log field first to trigger the CoT\n\nconst buildCommonOutputFields = (\n includeThought: boolean,\n preferredLanguage: string,\n) => {\n const fields = [\n `\"note\"?: string, // some important notes to finish the follow-up action should be written here, and the agent executing the subsequent steps will focus on this information. For example, the data extracted from the current screenshot which will be used in the follow-up action. Use ${preferredLanguage}.`,\n `\"log\": string, // a brief preamble to the user explaining what you're about to do. Use ${preferredLanguage}.`,\n `\"error\"?: string, // Error messages about unexpected situations, if any. Only think it is an error when the situation is not foreseeable according to the instruction. Use ${preferredLanguage}.`,\n ];\n\n if (includeThought) {\n fields.unshift(\n `\"thought\": string, // your thought process about the next action`,\n );\n }\n\n return fields.join('\\n ');\n};\n\nconst vlLocateParam = (modelFamily: TModelFamily | undefined) => {\n if (modelFamily) {\n return `{bbox: [number, number, number, number], prompt: string } // ${bboxDescription(modelFamily)}`;\n }\n return '{ prompt: string /* description of the target element */ }';\n};\n\n/**\n * Find ZodDefault in the wrapper chain and return its default value\n */\nconst findDefaultValue = (field: unknown): any | undefined => {\n let current = field;\n const visited = new Set<unknown>();\n\n while (current && !visited.has(current)) {\n visited.add(current);\n const currentWithDef = current as {\n _def?: {\n typeName?: string;\n defaultValue?: () => any;\n innerType?: unknown;\n };\n };\n\n if (!currentWithDef._def?.typeName) break;\n\n if (currentWithDef._def.typeName === 'ZodDefault') {\n return currentWithDef._def.defaultValue?.();\n }\n\n // Continue unwrapping if it's a wrapper type\n if (\n currentWithDef._def.typeName === 'ZodOptional' ||\n currentWithDef._def.typeName === 'ZodNullable'\n ) {\n current = currentWithDef._def.innerType;\n } else {\n break;\n }\n }\n\n return undefined;\n};\n\nexport const descriptionForAction = (\n action: DeviceAction<any>,\n locatorSchemaTypeDescription: string,\n) => {\n const tab = ' ';\n const fields: string[] = [];\n\n // Add the action type field\n fields.push(`- type: \"${action.name}\"`);\n\n // Handle paramSchema if it exists\n if (action.paramSchema) {\n const paramLines: string[] = [];\n\n // Check if paramSchema is a ZodObject with shape\n const schema = action.paramSchema as {\n _def?: { typeName?: string };\n shape?: Record<string, unknown>;\n };\n const isZodObject = schema._def?.typeName === 'ZodObject';\n\n if (isZodObject && schema.shape) {\n // Original logic for ZodObject schemas\n const shape = schema.shape;\n\n for (const [key, field] of Object.entries(shape)) {\n if (field && typeof field === 'object') {\n // Check if field is optional\n const isOptional =\n typeof (field as { isOptional?: () => boolean }).isOptional ===\n 'function' &&\n (field as { isOptional: () => boolean }).isOptional();\n const keyWithOptional = isOptional ? `${key}?` : key;\n\n // Get the type name using extracted helper\n const typeName = getZodTypeName(field, locatorSchemaTypeDescription);\n\n // Get description using extracted helper\n const description = getZodDescription(field as z.ZodTypeAny);\n\n // Check if field has a default value by searching the wrapper chain\n const defaultValue = findDefaultValue(field);\n const hasDefault = defaultValue !== undefined;\n\n // Build param line for this field\n let paramLine = `${keyWithOptional}: ${typeName}`;\n const comments: string[] = [];\n if (description) {\n comments.push(description);\n }\n if (hasDefault) {\n const defaultStr =\n typeof defaultValue === 'string'\n ? `\"${defaultValue}\"`\n : JSON.stringify(defaultValue);\n comments.push(`default: ${defaultStr}`);\n }\n if (comments.length > 0) {\n paramLine += ` // ${comments.join(', ')}`;\n }\n\n paramLines.push(paramLine);\n }\n }\n\n // Add the param section to fields if there are paramLines\n if (paramLines.length > 0) {\n fields.push('- param:');\n paramLines.forEach((line) => {\n fields.push(` - ${line}`);\n });\n }\n } else {\n // Handle non-object schemas (string, number, etc.)\n const typeName = getZodTypeName(schema);\n const description = getZodDescription(schema as z.ZodTypeAny);\n\n // For simple types, indicate that param should be the direct value, not an object\n let paramDescription = `- param: ${typeName}`;\n if (description) {\n paramDescription += ` // ${description}`;\n }\n paramDescription += ' (pass the value directly, not as an object)';\n\n fields.push(paramDescription);\n }\n }\n\n return `- ${action.name}, ${action.description || 'No description provided'}\n${tab}${fields.join(`\\n${tab}`)}\n`.trim();\n};\n\nexport async function systemPromptToTaskPlanning({\n actionSpace,\n modelFamily,\n includeBbox,\n includeThought,\n}: {\n actionSpace: DeviceAction<any>[];\n modelFamily: TModelFamily | undefined;\n includeBbox: boolean;\n includeThought?: boolean;\n}) {\n const preferredLanguage = getPreferredLanguage();\n\n // Validate parameters: if includeBbox is true, modelFamily must be defined\n if (includeBbox && !modelFamily) {\n throw new Error(\n 'modelFamily cannot be undefined when includeBbox is true. A valid modelFamily is required for bbox-based location.',\n );\n }\n\n const actionDescriptionList = actionSpace.map((action) => {\n return descriptionForAction(\n action,\n vlLocateParam(includeBbox ? modelFamily : undefined),\n );\n });\n const actionList = actionDescriptionList.join('\\n');\n\n const logFieldInstruction = `\n## About the \\`log\\` field (preamble message)\n\nThe \\`log\\` field is a brief preamble message to the user explaining what you're about to do. It should follow these principles and examples:\n\n- **Use ${preferredLanguage}**\n- **Keep it concise**: be no more than 1-2 sentences, focused on immediate, tangible next steps. (8–12 words or Chinese characters for quick updates).\n- **Build on prior context**: if this is not the first action to be done, use the preamble message to connect the dots with what’s been done so far and create a sense of momentum and clarity for the user to understand your next actions.\n- **Keep your tone light, friendly and curious**: add small touches of personality in preambles feel collaborative and engaging.\n\n**Examples:**\n- \"Click the login button\"\n- \"Scroll to find the 'Yes' button in popup\"\n- \"Previous actions failed to find the 'Yes' button, i will try again\"\n- \"Go back to find the login button\"\n`;\n\n const shouldIncludeThought = includeThought ?? true;\n const commonOutputFields = buildCommonOutputFields(\n shouldIncludeThought,\n preferredLanguage,\n );\n const exampleThoughtLine = shouldIncludeThought\n ? ` \"thought\": \"The form has already been filled, I need to click the login button to login\",\n`\n : '';\n const exampleThoughtLineWithNote = shouldIncludeThought\n ? ` \"thought\": \"I need to note the titles in the current screenshot for further processing and scroll to find more titles\",\n`\n : '';\n\n return `\nTarget: User will give you an instruction, some screenshots and previous logs indicating what have been done. Your task is to plan the next one action according to current situation to accomplish the instruction.\n\nPlease tell what the next one action is (or null if no action should be done) to do the tasks the instruction requires. \n\n## Rules\n\n- Don't give extra actions or plans beyond the instruction. For example, don't try to submit the form if the instruction is only to fill something.\n- Give just the next ONE action you should do\n- Consider the current screenshot and give the action that is most likely to accomplish the instruction. For example, if the next step is to click a button but it's not visible in the screenshot, you should try to find it first instead of give a click action.\n- Make sure the previous actions are completed successfully before performing the next step\n- If there are some error messages reported by the previous actions, don't give up, try parse a new action to recover. If the error persists for more than 5 times, you should think this is an error and set the \"error\" field to the error message.\n- Assertions are also important steps. When getting the assertion instruction, a solid conclusion is required. You should explicitly state your conclusion by calling the \"Print_Assert_Result\" action.\n- Call the \"Finalize\" action when the task is completed and no more actions should be done.\n\n## Supporting actions\n${actionList}\n\n${logFieldInstruction}\n\n## Return format\n\nReturn in JSON format:\n{\n ${commonOutputFields}\n \"action\": \n {\n \"type\": string, // the type of the action\n \"param\"?: { // The parameter of the action, if any\n // k-v style parameter fields\n }, \n } | null\n}\n\nFor example, if the instruction is to login and the form has already been filled, this is a valid return value:\n\n{\n${exampleThoughtLine} \"log\": \"Click the login button\",\n \"action\": {\n \"type\": \"Tap\",\n \"param\": {\n \"locate\": { \n \"prompt\": \"The login button\"${modelFamily && includeBbox ? `, \"bbox\": [100, 200, 300, 400]` : ''}\n }\n }\n }\n}\n\nFor example, if the instruction is to find out every title in the screenshot, the return value should be:\n\n{\n${exampleThoughtLineWithNote} \"note\": \"The titles in the current screenshot are: 'Hello, world!', 'Midscene 101', 'Model strategy'\",\n \"log\": \"Scroll to find more titles\",\n \"action\": {\n \"type\": \"Scroll\",\n \"param\": {\n \"locate\": {\n \"prompt\": \"The page content area\"\n },\n \"direction\": \"down\"\n }\n }\n}\n`;\n}\n"],"names":["__webpack_require__","definition","key","Object","obj","prop","Symbol","buildCommonOutputFields","includeThought","preferredLanguage","fields","vlLocateParam","modelFamily","bboxDescription","findDefaultValue","field","current","visited","Set","currentWithDef","descriptionForAction","action","locatorSchemaTypeDescription","tab","paramLines","schema","isZodObject","shape","isOptional","keyWithOptional","typeName","getZodTypeName","description","getZodDescription","defaultValue","hasDefault","undefined","paramLine","comments","defaultStr","JSON","line","paramDescription","systemPromptToTaskPlanning","actionSpace","includeBbox","getPreferredLanguage","Error","actionDescriptionList","actionList","logFieldInstruction","shouldIncludeThought","commonOutputFields","exampleThoughtLine","exampleThoughtLineWithNote"],"mappings":";;;IAAAA,oBAAoB,CAAC,GAAG,CAAC,UAASC;QACjC,IAAI,IAAIC,OAAOD,WACR,IAAGD,oBAAoB,CAAC,CAACC,YAAYC,QAAQ,CAACF,oBAAoB,CAAC,CAAC,UAASE,MACzEC,OAAO,cAAc,CAAC,UAASD,KAAK;YAAE,YAAY;YAAM,KAAKD,UAAU,CAACC,IAAI;QAAC;IAGzF;;;ICNAF,oBAAoB,CAAC,GAAG,CAACI,KAAKC,OAAUF,OAAO,SAAS,CAAC,cAAc,CAAC,IAAI,CAACC,KAAKC;;;ICClFL,oBAAoB,CAAC,GAAG,CAAC;QACxB,IAAG,AAAkB,eAAlB,OAAOM,UAA0BA,OAAO,WAAW,EACrDH,OAAO,cAAc,CAAC,UAASG,OAAO,WAAW,EAAE;YAAE,OAAO;QAAS;QAEtEH,OAAO,cAAc,CAAC,UAAS,cAAc;YAAE,OAAO;QAAK;IAC5D;;;;;;;;;;;ACOA,MAAMI,0BAA0B,CAC9BC,gBACAC;IAEA,MAAMC,SAAS;QACb,CAAC,yRAAyR,EAAED,kBAAkB,CAAC,CAAC;QAChT,CAAC,uFAAuF,EAAEA,kBAAkB,CAAC,CAAC;QAC9G,CAAC,2KAA2K,EAAEA,kBAAkB,CAAC,CAAC;KACnM;IAED,IAAID,gBACFE,OAAO,OAAO,CACZ;IAIJ,OAAOA,OAAO,IAAI,CAAC;AACrB;AAEA,MAAMC,gBAAgB,CAACC;IACrB,IAAIA,aACF,OAAO,CAAC,6DAA6D,EAAEC,AAAAA,IAAAA,mCAAAA,eAAAA,AAAAA,EAAgBD,cAAc;IAEvG,OAAO;AACT;AAKA,MAAME,mBAAmB,CAACC;IACxB,IAAIC,UAAUD;IACd,MAAME,UAAU,IAAIC;IAEpB,MAAOF,WAAW,CAACC,QAAQ,GAAG,CAACD,SAAU;QACvCC,QAAQ,GAAG,CAACD;QACZ,MAAMG,iBAAiBH;QAQvB,IAAI,CAACG,eAAe,IAAI,EAAE,UAAU;QAEpC,IAAIA,AAAiC,iBAAjCA,eAAe,IAAI,CAAC,QAAQ,EAC9B,OAAOA,eAAe,IAAI,CAAC,YAAY;QAIzC,IACEA,AAAiC,kBAAjCA,eAAe,IAAI,CAAC,QAAQ,IAC5BA,AAAiC,kBAAjCA,eAAe,IAAI,CAAC,QAAQ,EAE5BH,UAAUG,eAAe,IAAI,CAAC,SAAS;aAEvC;IAEJ;AAGF;AAEO,MAAMC,uBAAuB,CAClCC,QACAC;IAEA,MAAMC,MAAM;IACZ,MAAMb,SAAmB,EAAE;IAG3BA,OAAO,IAAI,CAAC,CAAC,SAAS,EAAEW,OAAO,IAAI,CAAC,CAAC,CAAC;IAGtC,IAAIA,OAAO,WAAW,EAAE;QACtB,MAAMG,aAAuB,EAAE;QAG/B,MAAMC,SAASJ,OAAO,WAAW;QAIjC,MAAMK,cAAcD,OAAO,IAAI,EAAE,aAAa;QAE9C,IAAIC,eAAeD,OAAO,KAAK,EAAE;YAE/B,MAAME,QAAQF,OAAO,KAAK;YAE1B,KAAK,MAAM,CAACvB,KAAKa,MAAM,IAAIZ,OAAO,OAAO,CAACwB,OACxC,IAAIZ,SAAS,AAAiB,YAAjB,OAAOA,OAAoB;gBAEtC,MAAMa,aACJ,AACE,cADF,OAAQb,MAAyC,UAAU,IAE1DA,MAAwC,UAAU;gBACrD,MAAMc,kBAAkBD,aAAa,GAAG1B,IAAI,CAAC,CAAC,GAAGA;gBAGjD,MAAM4B,WAAWC,AAAAA,IAAAA,iCAAAA,cAAAA,AAAAA,EAAehB,OAAOO;gBAGvC,MAAMU,cAAcC,AAAAA,IAAAA,iCAAAA,iBAAAA,AAAAA,EAAkBlB;gBAGtC,MAAMmB,eAAepB,iBAAiBC;gBACtC,MAAMoB,aAAaD,AAAiBE,WAAjBF;gBAGnB,IAAIG,YAAY,GAAGR,gBAAgB,EAAE,EAAEC,UAAU;gBACjD,MAAMQ,WAAqB,EAAE;gBAC7B,IAAIN,aACFM,SAAS,IAAI,CAACN;gBAEhB,IAAIG,YAAY;oBACd,MAAMI,aACJ,AAAwB,YAAxB,OAAOL,eACH,CAAC,CAAC,EAAEA,aAAa,CAAC,CAAC,GACnBM,KAAK,SAAS,CAACN;oBACrBI,SAAS,IAAI,CAAC,CAAC,SAAS,EAAEC,YAAY;gBACxC;gBACA,IAAID,SAAS,MAAM,GAAG,GACpBD,aAAa,CAAC,IAAI,EAAEC,SAAS,IAAI,CAAC,OAAO;gBAG3Cd,WAAW,IAAI,CAACa;YAClB;YAIF,IAAIb,WAAW,MAAM,GAAG,GAAG;gBACzBd,OAAO,IAAI,CAAC;gBACZc,WAAW,OAAO,CAAC,CAACiB;oBAClB/B,OAAO,IAAI,CAAC,CAAC,IAAI,EAAE+B,MAAM;gBAC3B;YACF;QACF,OAAO;YAEL,MAAMX,WAAWC,AAAAA,IAAAA,iCAAAA,cAAAA,AAAAA,EAAeN;YAChC,MAAMO,cAAcC,AAAAA,IAAAA,iCAAAA,iBAAAA,AAAAA,EAAkBR;YAGtC,IAAIiB,mBAAmB,CAAC,SAAS,EAAEZ,UAAU;YAC7C,IAAIE,aACFU,oBAAoB,CAAC,IAAI,EAAEV,aAAa;YAE1CU,oBAAoB;YAEpBhC,OAAO,IAAI,CAACgC;QACd;IACF;IAEA,OAAO,CAAC,EAAE,EAAErB,OAAO,IAAI,CAAC,EAAE,EAAEA,OAAO,WAAW,IAAI,0BAA0B;AAC9E,EAAEE,MAAMb,OAAO,IAAI,CAAC,CAAC,EAAE,EAAEa,KAAK,EAAE;AAChC,CAAC,CAAC,IAAI;AACN;AAEO,eAAeoB,2BAA2B,EAC/CC,WAAW,EACXhC,WAAW,EACXiC,WAAW,EACXrC,cAAc,EAMf;IACC,MAAMC,oBAAoBqC,AAAAA,IAAAA,oBAAAA,oBAAAA,AAAAA;IAG1B,IAAID,eAAe,CAACjC,aAClB,MAAM,IAAImC,MACR;IAIJ,MAAMC,wBAAwBJ,YAAY,GAAG,CAAC,CAACvB,SACtCD,qBACLC,QACAV,cAAckC,cAAcjC,cAAcwB;IAG9C,MAAMa,aAAaD,sBAAsB,IAAI,CAAC;IAE9C,MAAME,sBAAsB,CAAC;;;;;QAKvB,EAAEzC,kBAAkB;;;;;;;;;;AAU5B,CAAC;IAEC,MAAM0C,uBAAuB3C,kBAAkB;IAC/C,MAAM4C,qBAAqB7C,wBACzB4C,sBACA1C;IAEF,MAAM4C,qBAAqBF,uBACvB,CAAC;AACP,CAAC,GACK;IACJ,MAAMG,6BAA6BH,uBAC/B,CAAC;AACP,CAAC,GACK;IAEJ,OAAO,CAAC;;;;;;;;;;;;;;;;AAgBV,EAAEF,WAAW;;AAEb,EAAEC,oBAAoB;;;;;;EAMpB,EAAEE,mBAAmB;;;;;;;;;;;;;AAavB,EAAEC,mBAAmB;;;;;oCAKe,EAAEzC,eAAeiC,cAAc,mCAAmC,GAAG;;;;;;;;;AASzG,EAAES,2BAA2B;;;;;;;;;;;;AAY7B,CAAC;AACD"}
|
|
1
|
+
{"version":3,"file":"ai-model/prompt/llm-planning.js","sources":["webpack/runtime/define_property_getters","webpack/runtime/has_own_property","webpack/runtime/make_namespace_object","../../../../src/ai-model/prompt/llm-planning.ts"],"sourcesContent":["__webpack_require__.d = (exports, definition) => {\n\tfor(var key in definition) {\n if(__webpack_require__.o(definition, key) && !__webpack_require__.o(exports, key)) {\n Object.defineProperty(exports, key, { enumerable: true, get: definition[key] });\n }\n }\n};","__webpack_require__.o = (obj, prop) => (Object.prototype.hasOwnProperty.call(obj, prop))","// define __esModule on exports\n__webpack_require__.r = (exports) => {\n\tif(typeof Symbol !== 'undefined' && Symbol.toStringTag) {\n\t\tObject.defineProperty(exports, Symbol.toStringTag, { value: 'Module' });\n\t}\n\tObject.defineProperty(exports, '__esModule', { value: true });\n};","import type { DeviceAction } from '@/types';\nimport type { TModelFamily } from '@midscene/shared/env';\nimport { getPreferredLanguage } from '@midscene/shared/env';\nimport {\n getZodDescription,\n getZodTypeName,\n} from '@midscene/shared/zod-schema-utils';\nimport type { z } from 'zod';\nimport { bboxDescription } from './common';\n\nconst vlLocateParam = (modelFamily: TModelFamily | undefined) => {\n if (modelFamily) {\n return `{bbox: [number, number, number, number], prompt: string } // ${bboxDescription(modelFamily)}`;\n }\n return '{ prompt: string /* description of the target element */ }';\n};\n\n/**\n * Find ZodDefault in the wrapper chain and return its default value\n */\nconst findDefaultValue = (field: unknown): any | undefined => {\n let current = field;\n const visited = new Set<unknown>();\n\n while (current && !visited.has(current)) {\n visited.add(current);\n const currentWithDef = current as {\n _def?: {\n typeName?: string;\n defaultValue?: () => any;\n innerType?: unknown;\n };\n };\n\n if (!currentWithDef._def?.typeName) break;\n\n if (currentWithDef._def.typeName === 'ZodDefault') {\n return currentWithDef._def.defaultValue?.();\n }\n\n // Continue unwrapping if it's a wrapper type\n if (\n currentWithDef._def.typeName === 'ZodOptional' ||\n currentWithDef._def.typeName === 'ZodNullable'\n ) {\n current = currentWithDef._def.innerType;\n } else {\n break;\n }\n }\n\n return undefined;\n};\n\nexport const descriptionForAction = (\n action: DeviceAction<any>,\n locatorSchemaTypeDescription: string,\n) => {\n const tab = ' ';\n const fields: string[] = [];\n\n // Add the action type field\n fields.push(`- type: \"${action.name}\"`);\n\n // Handle paramSchema if it exists\n if (action.paramSchema) {\n const paramLines: string[] = [];\n\n // Check if paramSchema is a ZodObject with shape\n const schema = action.paramSchema as {\n _def?: { typeName?: string };\n shape?: Record<string, unknown>;\n };\n const isZodObject = schema._def?.typeName === 'ZodObject';\n\n if (isZodObject && schema.shape) {\n // Original logic for ZodObject schemas\n const shape = schema.shape;\n\n for (const [key, field] of Object.entries(shape)) {\n if (field && typeof field === 'object') {\n // Check if field is optional\n const isOptional =\n typeof (field as { isOptional?: () => boolean }).isOptional ===\n 'function' &&\n (field as { isOptional: () => boolean }).isOptional();\n const keyWithOptional = isOptional ? `${key}?` : key;\n\n // Get the type name using extracted helper\n const typeName = getZodTypeName(field, locatorSchemaTypeDescription);\n\n // Get description using extracted helper\n const description = getZodDescription(field as z.ZodTypeAny);\n\n // Check if field has a default value by searching the wrapper chain\n const defaultValue = findDefaultValue(field);\n const hasDefault = defaultValue !== undefined;\n\n // Build param line for this field\n let paramLine = `${keyWithOptional}: ${typeName}`;\n const comments: string[] = [];\n if (description) {\n comments.push(description);\n }\n if (hasDefault) {\n const defaultStr =\n typeof defaultValue === 'string'\n ? `\"${defaultValue}\"`\n : JSON.stringify(defaultValue);\n comments.push(`default: ${defaultStr}`);\n }\n if (comments.length > 0) {\n paramLine += ` // ${comments.join(', ')}`;\n }\n\n paramLines.push(paramLine);\n }\n }\n\n // Add the param section to fields if there are paramLines\n if (paramLines.length > 0) {\n fields.push('- param:');\n paramLines.forEach((line) => {\n fields.push(` - ${line}`);\n });\n }\n } else {\n // Handle non-object schemas (string, number, etc.)\n const typeName = getZodTypeName(schema);\n const description = getZodDescription(schema as z.ZodTypeAny);\n\n // For simple types, indicate that param should be the direct value, not an object\n let paramDescription = `- param: ${typeName}`;\n if (description) {\n paramDescription += ` // ${description}`;\n }\n paramDescription += ' (pass the value directly, not as an object)';\n\n fields.push(paramDescription);\n }\n }\n\n return `- ${action.name}, ${action.description || 'No description provided'}\n${tab}${fields.join(`\\n${tab}`)}\n`.trim();\n};\n\nexport async function systemPromptToTaskPlanning({\n actionSpace,\n modelFamily,\n includeBbox,\n includeThought,\n}: {\n actionSpace: DeviceAction<any>[];\n modelFamily: TModelFamily | undefined;\n includeBbox: boolean;\n includeThought?: boolean;\n}) {\n const preferredLanguage = getPreferredLanguage();\n\n // Validate parameters: if includeBbox is true, modelFamily must be defined\n if (includeBbox && !modelFamily) {\n throw new Error(\n 'modelFamily cannot be undefined when includeBbox is true. A valid modelFamily is required for bbox-based location.',\n );\n }\n\n const actionDescriptionList = actionSpace.map((action) => {\n return descriptionForAction(\n action,\n vlLocateParam(includeBbox ? modelFamily : undefined),\n );\n });\n const actionList = actionDescriptionList.join('\\n');\n\n const logFieldInstruction = `\n## About the \\`log\\` field (preamble message)\n\nThe \\`log\\` field is a brief preamble message to the user explaining what you're about to do. It should follow these principles and examples:\n\n- **Use ${preferredLanguage}**\n- **Keep it concise**: be no more than 1-2 sentences, focused on immediate, tangible next steps. (8–12 words or Chinese characters for quick updates).\n- **Build on prior context**: if this is not the first action to be done, use the preamble message to connect the dots with what's been done so far and create a sense of momentum and clarity for the user to understand your next actions.\n- **Keep your tone light, friendly and curious**: add small touches of personality in preambles feel collaborative and engaging.\n\n**Examples:**\n- \"Click the login button\"\n- \"Scroll to find the 'Yes' button in popup\"\n- \"Previous actions failed to find the 'Yes' button, i will try again\"\n- \"Go back to find the login button\"\n`;\n\n const shouldIncludeThought = includeThought ?? true;\n\n // Generate locate object examples based on includeBbox\n const locateExample1 = includeBbox\n ? `{\n \"prompt\": \"Add to cart button for Sauce Labs Backpack\",\n \"bbox\": [345, 442, 458, 483]\n }`\n : `{\n \"prompt\": \"Add to cart button for Sauce Labs Backpack\"\n }`;\n\n const locateExample2 = includeBbox\n ? `{\n \"prompt\": \"Add to cart button for Sauce Labs Bike Light\",\n \"bbox\": [732, 442, 844, 483]\n }`\n : `{\n \"prompt\": \"Add to cart button for Sauce Labs Bike Light\"\n }`;\n\n const locateExample3 = includeBbox\n ? `{\n \"prompt\": \"Cart icon in top right corner\",\n \"bbox\": [956, 17, 982, 54]\n }`\n : `{\n \"prompt\": \"Cart icon in top right corner\"\n }`;\n\n const thoughtTag = (content: string) =>\n shouldIncludeThought ? `<thought>${content}</thought>\\n` : '';\n\n return `\nTarget: User will give you an instruction, some screenshots and previous logs indicating what have been done. Your task is to accomplish the instruction.\n\nPlease tell what the next one action is (or null if no action should be done) to do the tasks the instruction requires. \n\n## Rules\n\n- Don't give extra actions or plans beyond the instruction. For example, don't try to submit the form if the instruction is only to fill something.\n- Give just the next ONE action you should do\n- Consider the current screenshot and give the action that is most likely to accomplish the instruction. For example, if the next step is to click a button but it's not visible in the screenshot, you should try to find it first instead of give a click action.\n- Make sure the previous actions are completed successfully before performing the next step\n- If there are some error messages reported by the previous actions, don't give up, try parse a new action to recover. If the error persists for more than 3 times, you should think this is an error and set the \"error\" field to the error message.\n- Assertions are also important steps. When getting the assertion instruction, a solid conclusion is required. You should explicitly state your conclusion by calling the \"Print_Assert_Result\" action.\n- Return the \"complete-task\" tag when the task is completed and no more actions should be done.\n\n## Supporting actions\n${actionList}\n\n${logFieldInstruction}\n\n## Return format\n\nReturn in XML format with the following structure:\n${shouldIncludeThought ? \"<thought>Think through the following: What is the user's requirement? What is the current state based on the screenshot? What should be the next action and which action-type to use (or error, or complete-task)? Write your thoughts naturally without numbering or section headers.</thought>\" : ''}\n<note>CRITICAL: If any information from the current screenshot will be needed in follow-up actions, you MUST record it here completely. The current screenshot will NOT be available in subsequent steps, so this note is your only way to preserve essential information for later use. Examples: extracted data, element states, content that needs to be referenced. Leave empty if no follow-up information is needed.</note>\n<log>a brief preamble to the user</log>\n<error>error messages (optional)</error>\n<action-type>the type of the action, or null if no action</action-type>\n<action-param-json>JSON object containing the action parameters</action-param-json>\n<complete-task success=\"true|false\">Optional: Use this tag to finalize the task when all instructions have been completed. Set success=\"true\" if the task succeeded, or success=\"false\" if it failed. When success=\"true\", the message should contain the conclusion, data, or return value that the user needs. When success=\"false\", the message MUST explain why the task failed and what went wrong. When this tag is present, no action-type or action-param-json is needed.</complete-task>\n\n## Example \n\nThis is an example of a complete interaction flow:\n\nuser: <user_instruction>Add first two items to the cart and tell me the total price of the cart. Just the price number, no other text</user_instruction>\n\nuser: this is the latest screenshot\n(image ignored due to size optimization)\n\nassistant: ${thoughtTag('The instruction is to add the first two items to the cart and report the total price (number only). The screenshot shows a product listing page with the first two items being \"Sauce Labs Backpack\" ($29.99) and \"Sauce Labs Bike Light\" ($9.99), both with \"Add to cart\" buttons visible. I should click the \"Add to cart\" button for the first item (Sauce Labs Backpack) using the Tap action.')}<log>Click 'Add to cart' for the Sauce Labs Backpack</log>\n<action-type>Tap</action-type>\n<action-param-json>\n{\n \"locate\": ${locateExample1}\n}\n</action-param-json>\n\nuser: Time: 2026-01-20 14:38:03 (YYYY-MM-DD HH:mm:ss), I have finished the action previously planned.. The last screenshot is attached. Please going on according to the instruction.\n(image ignored due to size optimization)\n\nassistant: ${thoughtTag('The instruction is to add the first two items to the cart and report the total price (number only). The first item (Sauce Labs Backpack) has been successfully added - its button now shows \"Remove\" and the cart icon displays 1 item. The second item (Sauce Labs Bike Light) still shows \"Add to cart\" button. I should now click the \"Add to cart\" button for the second item using the Tap action.')}<log>Click 'Add to cart' for the Sauce Labs Bike Light</log>\n<action-type>Tap</action-type>\n<action-param-json>\n{\n \"locate\": ${locateExample2}\n}\n</action-param-json>\n\nuser: Time: 2026-01-20 14:38:08 (YYYY-MM-DD HH:mm:ss), I have finished the action previously planned.. The last screenshot is attached. Please going on according to the instruction.\n\nassistant: ${thoughtTag('The instruction is to add the first two items to the cart and report the total price (number only). Both items have been successfully added - both show \"Remove\" buttons and the cart icon displays 2 items. The total should be $29.99 + $9.99 = $39.98, but I need to verify by viewing the cart. I should click the cart icon to view the cart details and confirm the total price using the Tap action.')}<log>Click the cart icon to view the cart and confirm the total price</log>\n<action-type>Tap</action-type>\n<action-param-json>\n{\n \"locate\": ${locateExample3}\n}\n</action-param-json>\n\nuser: Time: 2026-01-20 14:38:13 (YYYY-MM-DD HH:mm:ss), I have finished the action previously planned.. The last screenshot is attached. Please going on according to the instruction.\n\nassistant: ${thoughtTag('The instruction is to add the first two items to the cart and report the total price (number only). The cart now displays both items: Sauce Labs Backpack ($29.99) and Sauce Labs Bike Light ($9.99). The total price is $29.99 + $9.99 = $39.98. All required actions are complete. I should use complete-task with success=\"true\" to return the total price as requested (just the number 39.98).')}<log>Report the total price: 39.98</log>\n<complete-task success=\"true\">39.98</complete-task>\n`;\n}\n"],"names":["__webpack_require__","definition","key","Object","obj","prop","Symbol","vlLocateParam","modelFamily","bboxDescription","findDefaultValue","field","current","visited","Set","currentWithDef","descriptionForAction","action","locatorSchemaTypeDescription","tab","fields","paramLines","schema","isZodObject","shape","isOptional","keyWithOptional","typeName","getZodTypeName","description","getZodDescription","defaultValue","hasDefault","undefined","paramLine","comments","defaultStr","JSON","line","paramDescription","systemPromptToTaskPlanning","actionSpace","includeBbox","includeThought","preferredLanguage","getPreferredLanguage","Error","actionDescriptionList","actionList","logFieldInstruction","shouldIncludeThought","locateExample1","locateExample2","locateExample3","thoughtTag","content"],"mappings":";;;IAAAA,oBAAoB,CAAC,GAAG,CAAC,UAASC;QACjC,IAAI,IAAIC,OAAOD,WACR,IAAGD,oBAAoB,CAAC,CAACC,YAAYC,QAAQ,CAACF,oBAAoB,CAAC,CAAC,UAASE,MACzEC,OAAO,cAAc,CAAC,UAASD,KAAK;YAAE,YAAY;YAAM,KAAKD,UAAU,CAACC,IAAI;QAAC;IAGzF;;;ICNAF,oBAAoB,CAAC,GAAG,CAACI,KAAKC,OAAUF,OAAO,SAAS,CAAC,cAAc,CAAC,IAAI,CAACC,KAAKC;;;ICClFL,oBAAoB,CAAC,GAAG,CAAC;QACxB,IAAG,AAAkB,eAAlB,OAAOM,UAA0BA,OAAO,WAAW,EACrDH,OAAO,cAAc,CAAC,UAASG,OAAO,WAAW,EAAE;YAAE,OAAO;QAAS;QAEtEH,OAAO,cAAc,CAAC,UAAS,cAAc;YAAE,OAAO;QAAK;IAC5D;;;;;;;;;;;ACIA,MAAMI,gBAAgB,CAACC;IACrB,IAAIA,aACF,OAAO,CAAC,6DAA6D,EAAEC,AAAAA,IAAAA,mCAAAA,eAAAA,AAAAA,EAAgBD,cAAc;IAEvG,OAAO;AACT;AAKA,MAAME,mBAAmB,CAACC;IACxB,IAAIC,UAAUD;IACd,MAAME,UAAU,IAAIC;IAEpB,MAAOF,WAAW,CAACC,QAAQ,GAAG,CAACD,SAAU;QACvCC,QAAQ,GAAG,CAACD;QACZ,MAAMG,iBAAiBH;QAQvB,IAAI,CAACG,eAAe,IAAI,EAAE,UAAU;QAEpC,IAAIA,AAAiC,iBAAjCA,eAAe,IAAI,CAAC,QAAQ,EAC9B,OAAOA,eAAe,IAAI,CAAC,YAAY;QAIzC,IACEA,AAAiC,kBAAjCA,eAAe,IAAI,CAAC,QAAQ,IAC5BA,AAAiC,kBAAjCA,eAAe,IAAI,CAAC,QAAQ,EAE5BH,UAAUG,eAAe,IAAI,CAAC,SAAS;aAEvC;IAEJ;AAGF;AAEO,MAAMC,uBAAuB,CAClCC,QACAC;IAEA,MAAMC,MAAM;IACZ,MAAMC,SAAmB,EAAE;IAG3BA,OAAO,IAAI,CAAC,CAAC,SAAS,EAAEH,OAAO,IAAI,CAAC,CAAC,CAAC;IAGtC,IAAIA,OAAO,WAAW,EAAE;QACtB,MAAMI,aAAuB,EAAE;QAG/B,MAAMC,SAASL,OAAO,WAAW;QAIjC,MAAMM,cAAcD,OAAO,IAAI,EAAE,aAAa;QAE9C,IAAIC,eAAeD,OAAO,KAAK,EAAE;YAE/B,MAAME,QAAQF,OAAO,KAAK;YAE1B,KAAK,MAAM,CAACpB,KAAKS,MAAM,IAAIR,OAAO,OAAO,CAACqB,OACxC,IAAIb,SAAS,AAAiB,YAAjB,OAAOA,OAAoB;gBAEtC,MAAMc,aACJ,AACE,cADF,OAAQd,MAAyC,UAAU,IAE1DA,MAAwC,UAAU;gBACrD,MAAMe,kBAAkBD,aAAa,GAAGvB,IAAI,CAAC,CAAC,GAAGA;gBAGjD,MAAMyB,WAAWC,AAAAA,IAAAA,iCAAAA,cAAAA,AAAAA,EAAejB,OAAOO;gBAGvC,MAAMW,cAAcC,AAAAA,IAAAA,iCAAAA,iBAAAA,AAAAA,EAAkBnB;gBAGtC,MAAMoB,eAAerB,iBAAiBC;gBACtC,MAAMqB,aAAaD,AAAiBE,WAAjBF;gBAGnB,IAAIG,YAAY,GAAGR,gBAAgB,EAAE,EAAEC,UAAU;gBACjD,MAAMQ,WAAqB,EAAE;gBAC7B,IAAIN,aACFM,SAAS,IAAI,CAACN;gBAEhB,IAAIG,YAAY;oBACd,MAAMI,aACJ,AAAwB,YAAxB,OAAOL,eACH,CAAC,CAAC,EAAEA,aAAa,CAAC,CAAC,GACnBM,KAAK,SAAS,CAACN;oBACrBI,SAAS,IAAI,CAAC,CAAC,SAAS,EAAEC,YAAY;gBACxC;gBACA,IAAID,SAAS,MAAM,GAAG,GACpBD,aAAa,CAAC,IAAI,EAAEC,SAAS,IAAI,CAAC,OAAO;gBAG3Cd,WAAW,IAAI,CAACa;YAClB;YAIF,IAAIb,WAAW,MAAM,GAAG,GAAG;gBACzBD,OAAO,IAAI,CAAC;gBACZC,WAAW,OAAO,CAAC,CAACiB;oBAClBlB,OAAO,IAAI,CAAC,CAAC,IAAI,EAAEkB,MAAM;gBAC3B;YACF;QACF,OAAO;YAEL,MAAMX,WAAWC,AAAAA,IAAAA,iCAAAA,cAAAA,AAAAA,EAAeN;YAChC,MAAMO,cAAcC,AAAAA,IAAAA,iCAAAA,iBAAAA,AAAAA,EAAkBR;YAGtC,IAAIiB,mBAAmB,CAAC,SAAS,EAAEZ,UAAU;YAC7C,IAAIE,aACFU,oBAAoB,CAAC,IAAI,EAAEV,aAAa;YAE1CU,oBAAoB;YAEpBnB,OAAO,IAAI,CAACmB;QACd;IACF;IAEA,OAAO,CAAC,EAAE,EAAEtB,OAAO,IAAI,CAAC,EAAE,EAAEA,OAAO,WAAW,IAAI,0BAA0B;AAC9E,EAAEE,MAAMC,OAAO,IAAI,CAAC,CAAC,EAAE,EAAED,KAAK,EAAE;AAChC,CAAC,CAAC,IAAI;AACN;AAEO,eAAeqB,2BAA2B,EAC/CC,WAAW,EACXjC,WAAW,EACXkC,WAAW,EACXC,cAAc,EAMf;IACC,MAAMC,oBAAoBC,AAAAA,IAAAA,oBAAAA,oBAAAA,AAAAA;IAG1B,IAAIH,eAAe,CAAClC,aAClB,MAAM,IAAIsC,MACR;IAIJ,MAAMC,wBAAwBN,YAAY,GAAG,CAAC,CAACxB,SACtCD,qBACLC,QACAV,cAAcmC,cAAclC,cAAcyB;IAG9C,MAAMe,aAAaD,sBAAsB,IAAI,CAAC;IAE9C,MAAME,sBAAsB,CAAC;;;;;QAKvB,EAAEL,kBAAkB;;;;;;;;;;AAU5B,CAAC;IAEC,MAAMM,uBAAuBP,kBAAkB;IAG/C,MAAMQ,iBAAiBT,cACnB,CAAC;;;GAGJ,CAAC,GACE,CAAC;;GAEJ,CAAC;IAEF,MAAMU,iBAAiBV,cACnB,CAAC;;;GAGJ,CAAC,GACE,CAAC;;GAEJ,CAAC;IAEF,MAAMW,iBAAiBX,cACnB,CAAC;;;GAGJ,CAAC,GACE,CAAC;;GAEJ,CAAC;IAEF,MAAMY,aAAa,CAACC,UAClBL,uBAAuB,CAAC,SAAS,EAAEK,QAAQ,YAAY,CAAC,GAAG;IAE7D,OAAO,CAAC;;;;;;;;;;;;;;;;AAgBV,EAAEP,WAAW;;AAEb,EAAEC,oBAAoB;;;;;AAKtB,EAAEC,uBAAuB,qSAAqS,GAAG;;;;;;;;;;;;;;;;;WAiBtT,EAAEI,WAAW,sYAAsY;;;;YAIlZ,EAAEH,eAAe;;;;;;;WAOlB,EAAEG,WAAW,2YAA2Y;;;;YAIvZ,EAAEF,eAAe;;;;;;WAMlB,EAAEE,WAAW,+YAA+Y;;;;YAI3Z,EAAED,eAAe;;;;;;WAMlB,EAAEC,WAAW,uYAAuY;;AAE/Z,CAAC;AACD"}
|
|
@@ -29,8 +29,14 @@ __webpack_require__.d(__webpack_exports__, {
|
|
|
29
29
|
describeUserPage: ()=>describeUserPage,
|
|
30
30
|
distance: ()=>distance,
|
|
31
31
|
distanceThreshold: ()=>distanceThreshold,
|
|
32
|
+
extractXMLTag: ()=>extractXMLTag,
|
|
32
33
|
samplePageDescription: ()=>samplePageDescription
|
|
33
34
|
});
|
|
35
|
+
function extractXMLTag(xmlString, tagName) {
|
|
36
|
+
const regex = new RegExp(`<${tagName}>([\\s\\S]*?)</${tagName}>`, 'i');
|
|
37
|
+
const match = xmlString.match(regex);
|
|
38
|
+
return match ? match[1].trim() : void 0;
|
|
39
|
+
}
|
|
34
40
|
function describeSize(size) {
|
|
35
41
|
return `${size.width} x ${size.height}`;
|
|
36
42
|
}
|
|
@@ -72,6 +78,7 @@ exports.describeSize = __webpack_exports__.describeSize;
|
|
|
72
78
|
exports.describeUserPage = __webpack_exports__.describeUserPage;
|
|
73
79
|
exports.distance = __webpack_exports__.distance;
|
|
74
80
|
exports.distanceThreshold = __webpack_exports__.distanceThreshold;
|
|
81
|
+
exports.extractXMLTag = __webpack_exports__.extractXMLTag;
|
|
75
82
|
exports.samplePageDescription = __webpack_exports__.samplePageDescription;
|
|
76
83
|
for(var __rspack_i in __webpack_exports__)if (-1 === [
|
|
77
84
|
"describeElement",
|
|
@@ -79,6 +86,7 @@ for(var __rspack_i in __webpack_exports__)if (-1 === [
|
|
|
79
86
|
"describeUserPage",
|
|
80
87
|
"distance",
|
|
81
88
|
"distanceThreshold",
|
|
89
|
+
"extractXMLTag",
|
|
82
90
|
"samplePageDescription"
|
|
83
91
|
].indexOf(__rspack_i)) exports[__rspack_i] = __webpack_exports__[__rspack_i];
|
|
84
92
|
Object.defineProperty(exports, '__esModule', {
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"ai-model/prompt/util.js","sources":["webpack/runtime/define_property_getters","webpack/runtime/has_own_property","webpack/runtime/make_namespace_object","../../../../src/ai-model/prompt/util.ts"],"sourcesContent":["__webpack_require__.d = (exports, definition) => {\n\tfor(var key in definition) {\n if(__webpack_require__.o(definition, key) && !__webpack_require__.o(exports, key)) {\n Object.defineProperty(exports, key, { enumerable: true, get: definition[key] });\n }\n }\n};","__webpack_require__.o = (obj, prop) => (Object.prototype.hasOwnProperty.call(obj, prop))","// define __esModule on exports\n__webpack_require__.r = (exports) => {\n\tif(typeof Symbol !== 'undefined' && Symbol.toStringTag) {\n\t\tObject.defineProperty(exports, Symbol.toStringTag, { value: 'Module' });\n\t}\n\tObject.defineProperty(exports, '__esModule', { value: true });\n};","import type { BaseElement, ElementTreeNode, Size, UIContext } from '@/types';\nimport { NodeType } from '@midscene/shared/constants';\nimport { assert } from '@midscene/shared/utils';\n\nexport function describeSize(size: Size) {\n return `${size.width} x ${size.height}`;\n}\n\nexport function describeElement(\n elements: (Pick<BaseElement, 'rect' | 'content'> & { id: string })[],\n) {\n const sliceLength = 80;\n return elements\n .map((item) =>\n [\n item.id,\n item.rect.left,\n item.rect.top,\n item.rect.left + item.rect.width,\n item.rect.top + item.rect.height,\n item.content.length > sliceLength\n ? `${item.content.slice(0, sliceLength)}...`\n : item.content,\n ].join(', '),\n )\n .join('\\n');\n}\nexport const distanceThreshold = 16;\n\n// export function elementByPositionWithElementInfo(\n// treeRoot: ElementTreeNode<BaseElement>,\n// position: {\n// x: number;\n// y: number;\n// },\n// options?: {\n// requireStrictDistance?: boolean;\n// filterPositionElements?: boolean;\n// },\n// ) {\n// const requireStrictDistance = options?.requireStrictDistance ?? true;\n// const filterPositionElements = options?.filterPositionElements ?? false;\n\n// assert(typeof position !== 'undefined', 'position is required for query');\n\n// const matchingElements: BaseElement[] = [];\n\n// function dfs(node: ElementTreeNode<BaseElement>) {\n// if (node?.node) {\n// const item = node.node;\n// if (\n// item.rect.left <= position.x &&\n// position.x <= item.rect.left + item.rect.width &&\n// item.rect.top <= position.y &&\n// position.y <= item.rect.top + item.rect.height\n// ) {\n// if (\n// !(\n// filterPositionElements &&\n// item.attributes?.nodeType === NodeType.POSITION\n// ) &&\n// item.isVisible\n// ) {\n// matchingElements.push(item);\n// }\n// }\n// }\n\n// for (const child of node.children) {\n// dfs(child);\n// }\n// }\n\n// dfs(treeRoot);\n\n// if (matchingElements.length === 0) {\n// return undefined;\n// }\n\n// // Find the smallest element by area\n// const element = matchingElements.reduce((smallest, current) => {\n// const smallestArea = smallest.rect.width * smallest.rect.height;\n// const currentArea = current.rect.width * current.rect.height;\n// return currentArea < smallestArea ? current : smallest;\n// });\n\n// const distanceToCenter = distance(\n// { x: element.center[0], y: element.center[1] },\n// position,\n// );\n\n// if (requireStrictDistance) {\n// return distanceToCenter <= distanceThreshold ? element : undefined;\n// }\n\n// return element;\n// }\n\nexport function distance(\n point1: { x: number; y: number },\n point2: { x: number; y: number },\n) {\n return Math.sqrt((point1.x - point2.x) ** 2 + (point1.y - point2.y) ** 2);\n}\n\nexport const samplePageDescription = `\nAnd the page is described as follows:\n====================\nThe size of the page: 1280 x 720\nSome of the elements are marked with a rectangle in the screenshot corresponding to the markerId, some are not.\n\nDescription of all the elements in screenshot:\n<div id=\"969f1637\" markerId=\"1\" left=\"100\" top=\"100\" width=\"100\" height=\"100\"> // The markerId indicated by the rectangle label in the screenshot\n <h4 id=\"b211ecb2\" markerId=\"5\" left=\"150\" top=\"150\" width=\"90\" height=\"60\">\n The username is accepted\n </h4>\n ...many more\n</div>\n====================\n`;\n\nexport async function describeUserPage(context: UIContext) {\n return `The size of the page: ${describeSize(context.size)}`;\n}\n"],"names":["__webpack_require__","definition","key","Object","obj","prop","Symbol","describeSize","size","describeElement","elements","sliceLength","item","distanceThreshold","distance","point1","point2","Math","samplePageDescription","describeUserPage","context"],"mappings":";;;IAAAA,oBAAoB,CAAC,GAAG,CAAC,UAASC;QACjC,IAAI,IAAIC,OAAOD,WACR,IAAGD,oBAAoB,CAAC,CAACC,YAAYC,QAAQ,CAACF,oBAAoB,CAAC,CAAC,UAASE,MACzEC,OAAO,cAAc,CAAC,UAASD,KAAK;YAAE,YAAY;YAAM,KAAKD,UAAU,CAACC,IAAI;QAAC;IAGzF;;;ICNAF,oBAAoB,CAAC,GAAG,CAACI,KAAKC,OAAUF,OAAO,SAAS,CAAC,cAAc,CAAC,IAAI,CAACC,KAAKC;;;ICClFL,oBAAoB,CAAC,GAAG,CAAC;QACxB,IAAG,AAAkB,eAAlB,OAAOM,UAA0BA,OAAO,WAAW,EACrDH,OAAO,cAAc,CAAC,UAASG,OAAO,WAAW,EAAE;YAAE,OAAO;QAAS;QAEtEH,OAAO,cAAc,CAAC,UAAS,cAAc;YAAE,OAAO;QAAK;IAC5D
|
|
1
|
+
{"version":3,"file":"ai-model/prompt/util.js","sources":["webpack/runtime/define_property_getters","webpack/runtime/has_own_property","webpack/runtime/make_namespace_object","../../../../src/ai-model/prompt/util.ts"],"sourcesContent":["__webpack_require__.d = (exports, definition) => {\n\tfor(var key in definition) {\n if(__webpack_require__.o(definition, key) && !__webpack_require__.o(exports, key)) {\n Object.defineProperty(exports, key, { enumerable: true, get: definition[key] });\n }\n }\n};","__webpack_require__.o = (obj, prop) => (Object.prototype.hasOwnProperty.call(obj, prop))","// define __esModule on exports\n__webpack_require__.r = (exports) => {\n\tif(typeof Symbol !== 'undefined' && Symbol.toStringTag) {\n\t\tObject.defineProperty(exports, Symbol.toStringTag, { value: 'Module' });\n\t}\n\tObject.defineProperty(exports, '__esModule', { value: true });\n};","import type { BaseElement, ElementTreeNode, Size, UIContext } from '@/types';\nimport { NodeType } from '@midscene/shared/constants';\nimport { assert } from '@midscene/shared/utils';\n\n/**\n * Extract content from an XML tag in a string\n * @param xmlString - The XML string to parse\n * @param tagName - The name of the tag to extract (case-insensitive)\n * @returns The trimmed content of the tag, or undefined if not found\n */\nexport function extractXMLTag(\n xmlString: string,\n tagName: string,\n): string | undefined {\n const regex = new RegExp(`<${tagName}>([\\\\s\\\\S]*?)</${tagName}>`, 'i');\n const match = xmlString.match(regex);\n return match ? match[1].trim() : undefined;\n}\n\nexport function describeSize(size: Size) {\n return `${size.width} x ${size.height}`;\n}\n\nexport function describeElement(\n elements: (Pick<BaseElement, 'rect' | 'content'> & { id: string })[],\n) {\n const sliceLength = 80;\n return elements\n .map((item) =>\n [\n item.id,\n item.rect.left,\n item.rect.top,\n item.rect.left + item.rect.width,\n item.rect.top + item.rect.height,\n item.content.length > sliceLength\n ? `${item.content.slice(0, sliceLength)}...`\n : item.content,\n ].join(', '),\n )\n .join('\\n');\n}\nexport const distanceThreshold = 16;\n\n// export function elementByPositionWithElementInfo(\n// treeRoot: ElementTreeNode<BaseElement>,\n// position: {\n// x: number;\n// y: number;\n// },\n// options?: {\n// requireStrictDistance?: boolean;\n// filterPositionElements?: boolean;\n// },\n// ) {\n// const requireStrictDistance = options?.requireStrictDistance ?? true;\n// const filterPositionElements = options?.filterPositionElements ?? false;\n\n// assert(typeof position !== 'undefined', 'position is required for query');\n\n// const matchingElements: BaseElement[] = [];\n\n// function dfs(node: ElementTreeNode<BaseElement>) {\n// if (node?.node) {\n// const item = node.node;\n// if (\n// item.rect.left <= position.x &&\n// position.x <= item.rect.left + item.rect.width &&\n// item.rect.top <= position.y &&\n// position.y <= item.rect.top + item.rect.height\n// ) {\n// if (\n// !(\n// filterPositionElements &&\n// item.attributes?.nodeType === NodeType.POSITION\n// ) &&\n// item.isVisible\n// ) {\n// matchingElements.push(item);\n// }\n// }\n// }\n\n// for (const child of node.children) {\n// dfs(child);\n// }\n// }\n\n// dfs(treeRoot);\n\n// if (matchingElements.length === 0) {\n// return undefined;\n// }\n\n// // Find the smallest element by area\n// const element = matchingElements.reduce((smallest, current) => {\n// const smallestArea = smallest.rect.width * smallest.rect.height;\n// const currentArea = current.rect.width * current.rect.height;\n// return currentArea < smallestArea ? current : smallest;\n// });\n\n// const distanceToCenter = distance(\n// { x: element.center[0], y: element.center[1] },\n// position,\n// );\n\n// if (requireStrictDistance) {\n// return distanceToCenter <= distanceThreshold ? element : undefined;\n// }\n\n// return element;\n// }\n\nexport function distance(\n point1: { x: number; y: number },\n point2: { x: number; y: number },\n) {\n return Math.sqrt((point1.x - point2.x) ** 2 + (point1.y - point2.y) ** 2);\n}\n\nexport const samplePageDescription = `\nAnd the page is described as follows:\n====================\nThe size of the page: 1280 x 720\nSome of the elements are marked with a rectangle in the screenshot corresponding to the markerId, some are not.\n\nDescription of all the elements in screenshot:\n<div id=\"969f1637\" markerId=\"1\" left=\"100\" top=\"100\" width=\"100\" height=\"100\"> // The markerId indicated by the rectangle label in the screenshot\n <h4 id=\"b211ecb2\" markerId=\"5\" left=\"150\" top=\"150\" width=\"90\" height=\"60\">\n The username is accepted\n </h4>\n ...many more\n</div>\n====================\n`;\n\nexport async function describeUserPage(context: UIContext) {\n return `The size of the page: ${describeSize(context.size)}`;\n}\n"],"names":["__webpack_require__","definition","key","Object","obj","prop","Symbol","extractXMLTag","xmlString","tagName","regex","RegExp","match","undefined","describeSize","size","describeElement","elements","sliceLength","item","distanceThreshold","distance","point1","point2","Math","samplePageDescription","describeUserPage","context"],"mappings":";;;IAAAA,oBAAoB,CAAC,GAAG,CAAC,UAASC;QACjC,IAAI,IAAIC,OAAOD,WACR,IAAGD,oBAAoB,CAAC,CAACC,YAAYC,QAAQ,CAACF,oBAAoB,CAAC,CAAC,UAASE,MACzEC,OAAO,cAAc,CAAC,UAASD,KAAK;YAAE,YAAY;YAAM,KAAKD,UAAU,CAACC,IAAI;QAAC;IAGzF;;;ICNAF,oBAAoB,CAAC,GAAG,CAACI,KAAKC,OAAUF,OAAO,SAAS,CAAC,cAAc,CAAC,IAAI,CAACC,KAAKC;;;ICClFL,oBAAoB,CAAC,GAAG,CAAC;QACxB,IAAG,AAAkB,eAAlB,OAAOM,UAA0BA,OAAO,WAAW,EACrDH,OAAO,cAAc,CAAC,UAASG,OAAO,WAAW,EAAE;YAAE,OAAO;QAAS;QAEtEH,OAAO,cAAc,CAAC,UAAS,cAAc;YAAE,OAAO;QAAK;IAC5D;;;;;;;;;;;;;ACIO,SAASI,cACdC,SAAiB,EACjBC,OAAe;IAEf,MAAMC,QAAQ,IAAIC,OAAO,CAAC,CAAC,EAAEF,QAAQ,eAAe,EAAEA,QAAQ,CAAC,CAAC,EAAE;IAClE,MAAMG,QAAQJ,UAAU,KAAK,CAACE;IAC9B,OAAOE,QAAQA,KAAK,CAAC,EAAE,CAAC,IAAI,KAAKC;AACnC;AAEO,SAASC,aAAaC,IAAU;IACrC,OAAO,GAAGA,KAAK,KAAK,CAAC,GAAG,EAAEA,KAAK,MAAM,EAAE;AACzC;AAEO,SAASC,gBACdC,QAAoE;IAEpE,MAAMC,cAAc;IACpB,OAAOD,SACJ,GAAG,CAAC,CAACE,OACJ;YACEA,KAAK,EAAE;YACPA,KAAK,IAAI,CAAC,IAAI;YACdA,KAAK,IAAI,CAAC,GAAG;YACbA,KAAK,IAAI,CAAC,IAAI,GAAGA,KAAK,IAAI,CAAC,KAAK;YAChCA,KAAK,IAAI,CAAC,GAAG,GAAGA,KAAK,IAAI,CAAC,MAAM;YAChCA,KAAK,OAAO,CAAC,MAAM,GAAGD,cAClB,GAAGC,KAAK,OAAO,CAAC,KAAK,CAAC,GAAGD,aAAa,GAAG,CAAC,GAC1CC,KAAK,OAAO;SACjB,CAAC,IAAI,CAAC,OAER,IAAI,CAAC;AACV;AACO,MAAMC,oBAAoB;AAuE1B,SAASC,SACdC,MAAgC,EAChCC,MAAgC;IAEhC,OAAOC,KAAK,IAAI,CAAEF,AAAAA,CAAAA,OAAO,CAAC,GAAGC,OAAO,CAAC,AAAD,KAAM,IAAKD,AAAAA,CAAAA,OAAO,CAAC,GAAGC,OAAO,CAAC,AAAD,KAAM;AACzE;AAEO,MAAME,wBAAwB,CAAC;;;;;;;;;;;;;;AActC,CAAC;AAEM,eAAeC,iBAAiBC,OAAkB;IACvD,OAAO,CAAC,sBAAsB,EAAEb,aAAaa,QAAQ,IAAI,GAAG;AAC9D"}
|